avxintrin.h revision 321369
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
27
28#ifndef __AVXINTRIN_H
29#define __AVXINTRIN_H
30
/* Internal 32-byte vector types, one per plain/signed element type. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned element variants. */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* An explicitly signed byte vector is needed in addition to the plain char
 * one. It is not supposed to show up in the interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 256-bit vector types used in the intrinsic signatures. */
typedef float __m256 __attribute__((__vector_size__(32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
51
/* Attribute set applied to every intrinsic function defined in this file:
 * always_inline, nodebug and target("avx"). */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
54
55/* Arithmetic */
56/// \brief Adds two 256-bit vectors of [4 x double].
57///
58/// \headerfile <x86intrin.h>
59///
60/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
61///
62/// \param __a
63///    A 256-bit vector of [4 x double] containing one of the source operands.
64/// \param __b
65///    A 256-bit vector of [4 x double] containing one of the source operands.
66/// \returns A 256-bit vector of [4 x double] containing the sums of both
67///    operands.
68static __inline __m256d __DEFAULT_FN_ATTRS
69_mm256_add_pd(__m256d __a, __m256d __b)
70{
71  return (__m256d)((__v4df)__a+(__v4df)__b);
72}
73
74/// \brief Adds two 256-bit vectors of [8 x float].
75///
76/// \headerfile <x86intrin.h>
77///
78/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
79///
80/// \param __a
81///    A 256-bit vector of [8 x float] containing one of the source operands.
82/// \param __b
83///    A 256-bit vector of [8 x float] containing one of the source operands.
84/// \returns A 256-bit vector of [8 x float] containing the sums of both
85///    operands.
86static __inline __m256 __DEFAULT_FN_ATTRS
87_mm256_add_ps(__m256 __a, __m256 __b)
88{
89  return (__m256)((__v8sf)__a+(__v8sf)__b);
90}
91
92/// \brief Subtracts two 256-bit vectors of [4 x double].
93///
94/// \headerfile <x86intrin.h>
95///
96/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
97///
98/// \param __a
99///    A 256-bit vector of [4 x double] containing the minuend.
100/// \param __b
101///    A 256-bit vector of [4 x double] containing the subtrahend.
102/// \returns A 256-bit vector of [4 x double] containing the differences between
103///    both operands.
104static __inline __m256d __DEFAULT_FN_ATTRS
105_mm256_sub_pd(__m256d __a, __m256d __b)
106{
107  return (__m256d)((__v4df)__a-(__v4df)__b);
108}
109
110/// \brief Subtracts two 256-bit vectors of [8 x float].
111///
112/// \headerfile <x86intrin.h>
113///
114/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
115///
116/// \param __a
117///    A 256-bit vector of [8 x float] containing the minuend.
118/// \param __b
119///    A 256-bit vector of [8 x float] containing the subtrahend.
120/// \returns A 256-bit vector of [8 x float] containing the differences between
121///    both operands.
122static __inline __m256 __DEFAULT_FN_ATTRS
123_mm256_sub_ps(__m256 __a, __m256 __b)
124{
125  return (__m256)((__v8sf)__a-(__v8sf)__b);
126}
127
/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
///    two 256-bit vectors of [4 x double].
130///
131/// \headerfile <x86intrin.h>
132///
133/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
134///
135/// \param __a
136///    A 256-bit vector of [4 x double] containing the left source operand.
137/// \param __b
138///    A 256-bit vector of [4 x double] containing the right source operand.
139/// \returns A 256-bit vector of [4 x double] containing the alternating sums
140///    and differences between both operands.
141static __inline __m256d __DEFAULT_FN_ATTRS
142_mm256_addsub_pd(__m256d __a, __m256d __b)
143{
144  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
145}
146
/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
///    two 256-bit vectors of [8 x float].
149///
150/// \headerfile <x86intrin.h>
151///
152/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
153///
154/// \param __a
155///    A 256-bit vector of [8 x float] containing the left source operand.
156/// \param __b
157///    A 256-bit vector of [8 x float] containing the right source operand.
158/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
159///    differences between both operands.
160static __inline __m256 __DEFAULT_FN_ATTRS
161_mm256_addsub_ps(__m256 __a, __m256 __b)
162{
163  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
164}
165
166/// \brief Divides two 256-bit vectors of [4 x double].
167///
168/// \headerfile <x86intrin.h>
169///
170/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
171///
172/// \param __a
173///    A 256-bit vector of [4 x double] containing the dividend.
174/// \param __b
175///    A 256-bit vector of [4 x double] containing the divisor.
176/// \returns A 256-bit vector of [4 x double] containing the quotients of both
177///    operands.
178static __inline __m256d __DEFAULT_FN_ATTRS
179_mm256_div_pd(__m256d __a, __m256d __b)
180{
181  return (__m256d)((__v4df)__a/(__v4df)__b);
182}
183
184/// \brief Divides two 256-bit vectors of [8 x float].
185///
186/// \headerfile <x86intrin.h>
187///
188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
189///
190/// \param __a
191///    A 256-bit vector of [8 x float] containing the dividend.
192/// \param __b
193///    A 256-bit vector of [8 x float] containing the divisor.
194/// \returns A 256-bit vector of [8 x float] containing the quotients of both
195///    operands.
196static __inline __m256 __DEFAULT_FN_ATTRS
197_mm256_div_ps(__m256 __a, __m256 __b)
198{
199  return (__m256)((__v8sf)__a/(__v8sf)__b);
200}
201
202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
203///    of each pair of values.
204///
205/// \headerfile <x86intrin.h>
206///
207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
208///
209/// \param __a
210///    A 256-bit vector of [4 x double] containing one of the operands.
211/// \param __b
212///    A 256-bit vector of [4 x double] containing one of the operands.
213/// \returns A 256-bit vector of [4 x double] containing the maximum values
214///    between both operands.
215static __inline __m256d __DEFAULT_FN_ATTRS
216_mm256_max_pd(__m256d __a, __m256d __b)
217{
218  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
219}
220
221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
222///    of each pair of values.
223///
224/// \headerfile <x86intrin.h>
225///
226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
227///
228/// \param __a
229///    A 256-bit vector of [8 x float] containing one of the operands.
230/// \param __b
231///    A 256-bit vector of [8 x float] containing one of the operands.
232/// \returns A 256-bit vector of [8 x float] containing the maximum values
233///    between both operands.
234static __inline __m256 __DEFAULT_FN_ATTRS
235_mm256_max_ps(__m256 __a, __m256 __b)
236{
237  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
238}
239
240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
241///    of each pair of values.
242///
243/// \headerfile <x86intrin.h>
244///
245/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
246///
247/// \param __a
248///    A 256-bit vector of [4 x double] containing one of the operands.
249/// \param __b
250///    A 256-bit vector of [4 x double] containing one of the operands.
251/// \returns A 256-bit vector of [4 x double] containing the minimum values
252///    between both operands.
253static __inline __m256d __DEFAULT_FN_ATTRS
254_mm256_min_pd(__m256d __a, __m256d __b)
255{
256  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
257}
258
259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
260///    of each pair of values.
261///
262/// \headerfile <x86intrin.h>
263///
264/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
265///
266/// \param __a
267///    A 256-bit vector of [8 x float] containing one of the operands.
268/// \param __b
269///    A 256-bit vector of [8 x float] containing one of the operands.
270/// \returns A 256-bit vector of [8 x float] containing the minimum values
271///    between both operands.
272static __inline __m256 __DEFAULT_FN_ATTRS
273_mm256_min_ps(__m256 __a, __m256 __b)
274{
275  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
276}
277
278/// \brief Multiplies two 256-bit vectors of [4 x double].
279///
280/// \headerfile <x86intrin.h>
281///
282/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
283///
284/// \param __a
285///    A 256-bit vector of [4 x double] containing one of the operands.
286/// \param __b
287///    A 256-bit vector of [4 x double] containing one of the operands.
288/// \returns A 256-bit vector of [4 x double] containing the products of both
289///    operands.
290static __inline __m256d __DEFAULT_FN_ATTRS
291_mm256_mul_pd(__m256d __a, __m256d __b)
292{
293  return (__m256d)((__v4df)__a * (__v4df)__b);
294}
295
296/// \brief Multiplies two 256-bit vectors of [8 x float].
297///
298/// \headerfile <x86intrin.h>
299///
300/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
301///
302/// \param __a
303///    A 256-bit vector of [8 x float] containing one of the operands.
304/// \param __b
305///    A 256-bit vector of [8 x float] containing one of the operands.
306/// \returns A 256-bit vector of [8 x float] containing the products of both
307///    operands.
308static __inline __m256 __DEFAULT_FN_ATTRS
309_mm256_mul_ps(__m256 __a, __m256 __b)
310{
311  return (__m256)((__v8sf)__a * (__v8sf)__b);
312}
313
314/// \brief Calculates the square roots of the values in a 256-bit vector of
315///    [4 x double].
316///
317/// \headerfile <x86intrin.h>
318///
319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
320///
321/// \param __a
322///    A 256-bit vector of [4 x double].
323/// \returns A 256-bit vector of [4 x double] containing the square roots of the
324///    values in the operand.
325static __inline __m256d __DEFAULT_FN_ATTRS
326_mm256_sqrt_pd(__m256d __a)
327{
328  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
329}
330
331/// \brief Calculates the square roots of the values in a 256-bit vector of
332///    [8 x float].
333///
334/// \headerfile <x86intrin.h>
335///
336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
337///
338/// \param __a
339///    A 256-bit vector of [8 x float].
340/// \returns A 256-bit vector of [8 x float] containing the square roots of the
341///    values in the operand.
342static __inline __m256 __DEFAULT_FN_ATTRS
343_mm256_sqrt_ps(__m256 __a)
344{
345  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
346}
347
348/// \brief Calculates the reciprocal square roots of the values in a 256-bit
349///    vector of [8 x float].
350///
351/// \headerfile <x86intrin.h>
352///
353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
354///
355/// \param __a
356///    A 256-bit vector of [8 x float].
357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
358///    roots of the values in the operand.
359static __inline __m256 __DEFAULT_FN_ATTRS
360_mm256_rsqrt_ps(__m256 __a)
361{
362  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
363}
364
365/// \brief Calculates the reciprocals of the values in a 256-bit vector of
366///    [8 x float].
367///
368/// \headerfile <x86intrin.h>
369///
370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
371///
372/// \param __a
373///    A 256-bit vector of [8 x float].
374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
375///    values in the operand.
376static __inline __m256 __DEFAULT_FN_ATTRS
377_mm256_rcp_ps(__m256 __a)
378{
379  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
380}
381
382/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
383///    by the byte operand. The source values are rounded to integer values and
384///    returned as 64-bit double-precision floating-point values.
385///
386/// \headerfile <x86intrin.h>
387///
388/// \code
389/// __m256d _mm256_round_pd(__m256d V, const int M);
390/// \endcode
391///
392/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
393///
394/// \param V
395///    A 256-bit vector of [4 x double].
396/// \param M
397///    An integer value that specifies the rounding operation. \n
398///    Bits [7:4] are reserved. \n
399///    Bit [3] is a precision exception value: \n
400///      0: A normal PE exception is used. \n
401///      1: The PE field is not updated. \n
402///    Bit [2] is the rounding control source: \n
403///      0: Use bits [1:0] of \a M. \n
404///      1: Use the current MXCSR setting. \n
405///    Bits [1:0] contain the rounding control definition: \n
406///      00: Nearest. \n
407///      01: Downward (toward negative infinity). \n
408///      10: Upward (toward positive infinity). \n
409///      11: Truncated.
410/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
413
414/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
415///    specified by the byte operand. The source values are rounded to integer
416///    values and returned as floating-point values.
417///
418/// \headerfile <x86intrin.h>
419///
420/// \code
421/// __m256 _mm256_round_ps(__m256 V, const int M);
422/// \endcode
423///
424/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
425///
426/// \param V
427///    A 256-bit vector of [8 x float].
428/// \param M
429///    An integer value that specifies the rounding operation. \n
430///    Bits [7:4] are reserved. \n
431///    Bit [3] is a precision exception value: \n
432///      0: A normal PE exception is used. \n
433///      1: The PE field is not updated. \n
434///    Bit [2] is the rounding control source: \n
435///      0: Use bits [1:0] of \a M. \n
436///      1: Use the current MXCSR setting. \n
437///    Bits [1:0] contain the rounding control definition: \n
438///      00: Nearest. \n
439///      01: Downward (toward negative infinity). \n
440///      10: Upward (toward positive infinity). \n
441///      11: Truncated.
442/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
445
446/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
447///    source values are rounded up to integer values and returned as 64-bit
448///    double-precision floating-point values.
449///
450/// \headerfile <x86intrin.h>
451///
452/// \code
453/// __m256d _mm256_ceil_pd(__m256d V);
454/// \endcode
455///
456/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
457///
458/// \param V
459///    A 256-bit vector of [4 x double].
460/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) (_mm256_round_pd((V), (_MM_FROUND_CEIL)))
462
463/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
464///    The source values are rounded down to integer values and returned as
465///    64-bit double-precision floating-point values.
466///
467/// \headerfile <x86intrin.h>
468///
469/// \code
470/// __m256d _mm256_floor_pd(__m256d V);
471/// \endcode
472///
473/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
474///
475/// \param V
476///    A 256-bit vector of [4 x double].
477/// \returns A 256-bit vector of [4 x double] containing the rounded down
478///    values.
#define _mm256_floor_pd(V) (_mm256_round_pd((V), (_MM_FROUND_FLOOR)))
480
481/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
482///    source values are rounded up to integer values and returned as
483///    floating-point values.
484///
485/// \headerfile <x86intrin.h>
486///
487/// \code
488/// __m256 _mm256_ceil_ps(__m256 V);
489/// \endcode
490///
491/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
492///
493/// \param V
494///    A 256-bit vector of [8 x float].
495/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) (_mm256_round_ps((V), (_MM_FROUND_CEIL)))
497
498/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
499///    source values are rounded down to integer values and returned as
500///    floating-point values.
501///
502/// \headerfile <x86intrin.h>
503///
504/// \code
505/// __m256 _mm256_floor_ps(__m256 V);
506/// \endcode
507///
508/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
509///
510/// \param V
511///    A 256-bit vector of [8 x float].
512/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) (_mm256_round_ps((V), (_MM_FROUND_FLOOR)))
514
515/* Logical */
516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
517///
518/// \headerfile <x86intrin.h>
519///
520/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
521///
522/// \param __a
523///    A 256-bit vector of [4 x double] containing one of the source operands.
524/// \param __b
525///    A 256-bit vector of [4 x double] containing one of the source operands.
526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
527///    values between both operands.
528static __inline __m256d __DEFAULT_FN_ATTRS
529_mm256_and_pd(__m256d __a, __m256d __b)
530{
531  return (__m256d)((__v4du)__a & (__v4du)__b);
532}
533
534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
535///
536/// \headerfile <x86intrin.h>
537///
538/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
539///
540/// \param __a
541///    A 256-bit vector of [8 x float] containing one of the source operands.
542/// \param __b
543///    A 256-bit vector of [8 x float] containing one of the source operands.
544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
545///    values between both operands.
546static __inline __m256 __DEFAULT_FN_ATTRS
547_mm256_and_ps(__m256 __a, __m256 __b)
548{
549  return (__m256)((__v8su)__a & (__v8su)__b);
550}
551
552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
553///    the one's complement of the values contained in the first source operand.
554///
555/// \headerfile <x86intrin.h>
556///
557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
558///
559/// \param __a
560///    A 256-bit vector of [4 x double] containing the left source operand. The
561///    one's complement of this value is used in the bitwise AND.
562/// \param __b
563///    A 256-bit vector of [4 x double] containing the right source operand.
564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
565///    values of the second operand and the one's complement of the first
566///    operand.
567static __inline __m256d __DEFAULT_FN_ATTRS
568_mm256_andnot_pd(__m256d __a, __m256d __b)
569{
570  return (__m256d)(~(__v4du)__a & (__v4du)__b);
571}
572
573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
574///    the one's complement of the values contained in the first source operand.
575///
576/// \headerfile <x86intrin.h>
577///
578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
579///
580/// \param __a
581///    A 256-bit vector of [8 x float] containing the left source operand. The
582///    one's complement of this value is used in the bitwise AND.
583/// \param __b
584///    A 256-bit vector of [8 x float] containing the right source operand.
585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
586///    values of the second operand and the one's complement of the first
587///    operand.
588static __inline __m256 __DEFAULT_FN_ATTRS
589_mm256_andnot_ps(__m256 __a, __m256 __b)
590{
591  return (__m256)(~(__v8su)__a & (__v8su)__b);
592}
593
594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
595///
596/// \headerfile <x86intrin.h>
597///
598/// This intrinsic corresponds to the <c> VORPD </c> instruction.
599///
600/// \param __a
601///    A 256-bit vector of [4 x double] containing one of the source operands.
602/// \param __b
603///    A 256-bit vector of [4 x double] containing one of the source operands.
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
605///    values between both operands.
606static __inline __m256d __DEFAULT_FN_ATTRS
607_mm256_or_pd(__m256d __a, __m256d __b)
608{
609  return (__m256d)((__v4du)__a | (__v4du)__b);
610}
611
612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
613///
614/// \headerfile <x86intrin.h>
615///
616/// This intrinsic corresponds to the <c> VORPS </c> instruction.
617///
618/// \param __a
619///    A 256-bit vector of [8 x float] containing one of the source operands.
620/// \param __b
621///    A 256-bit vector of [8 x float] containing one of the source operands.
622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
623///    values between both operands.
624static __inline __m256 __DEFAULT_FN_ATTRS
625_mm256_or_ps(__m256 __a, __m256 __b)
626{
627  return (__m256)((__v8su)__a | (__v8su)__b);
628}
629
630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
631///
632/// \headerfile <x86intrin.h>
633///
634/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
635///
636/// \param __a
637///    A 256-bit vector of [4 x double] containing one of the source operands.
638/// \param __b
639///    A 256-bit vector of [4 x double] containing one of the source operands.
640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
641///    values between both operands.
642static __inline __m256d __DEFAULT_FN_ATTRS
643_mm256_xor_pd(__m256d __a, __m256d __b)
644{
645  return (__m256d)((__v4du)__a ^ (__v4du)__b);
646}
647
648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
649///
650/// \headerfile <x86intrin.h>
651///
652/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
653///
654/// \param __a
655///    A 256-bit vector of [8 x float] containing one of the source operands.
656/// \param __b
657///    A 256-bit vector of [8 x float] containing one of the source operands.
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
659///    values between both operands.
660static __inline __m256 __DEFAULT_FN_ATTRS
661_mm256_xor_ps(__m256 __a, __m256 __b)
662{
663  return (__m256)((__v8su)__a ^ (__v8su)__b);
664}
665
666/* Horizontal arithmetic */
667/// \brief Horizontally adds the adjacent pairs of values contained in two
668///    256-bit vectors of [4 x double].
669///
670/// \headerfile <x86intrin.h>
671///
672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
673///
674/// \param __a
675///    A 256-bit vector of [4 x double] containing one of the source operands.
676///    The horizontal sums of the values are returned in the even-indexed
677///    elements of a vector of [4 x double].
678/// \param __b
679///    A 256-bit vector of [4 x double] containing one of the source operands.
680///    The horizontal sums of the values are returned in the odd-indexed
681///    elements of a vector of [4 x double].
682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
683///    both operands.
684static __inline __m256d __DEFAULT_FN_ATTRS
685_mm256_hadd_pd(__m256d __a, __m256d __b)
686{
687  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
688}
689
690/// \brief Horizontally adds the adjacent pairs of values contained in two
691///    256-bit vectors of [8 x float].
692///
693/// \headerfile <x86intrin.h>
694///
695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
696///
697/// \param __a
698///    A 256-bit vector of [8 x float] containing one of the source operands.
699///    The horizontal sums of the values are returned in the elements with
700///    index 0, 1, 4, 5 of a vector of [8 x float].
701/// \param __b
702///    A 256-bit vector of [8 x float] containing one of the source operands.
703///    The horizontal sums of the values are returned in the elements with
704///    index 2, 3, 6, 7 of a vector of [8 x float].
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
706///    both operands.
707static __inline __m256 __DEFAULT_FN_ATTRS
708_mm256_hadd_ps(__m256 __a, __m256 __b)
709{
710  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
711}
712
713/// \brief Horizontally subtracts the adjacent pairs of values contained in two
714///    256-bit vectors of [4 x double].
715///
716/// \headerfile <x86intrin.h>
717///
718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
719///
720/// \param __a
721///    A 256-bit vector of [4 x double] containing one of the source operands.
722///    The horizontal differences between the values are returned in the
723///    even-indexed elements of a vector of [4 x double].
724/// \param __b
725///    A 256-bit vector of [4 x double] containing one of the source operands.
726///    The horizontal differences between the values are returned in the
727///    odd-indexed elements of a vector of [4 x double].
728/// \returns A 256-bit vector of [4 x double] containing the horizontal
729///    differences of both operands.
730static __inline __m256d __DEFAULT_FN_ATTRS
731_mm256_hsub_pd(__m256d __a, __m256d __b)
732{
733  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
734}
735
736/// \brief Horizontally subtracts the adjacent pairs of values contained in two
737///    256-bit vectors of [8 x float].
738///
739/// \headerfile <x86intrin.h>
740///
741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
742///
743/// \param __a
744///    A 256-bit vector of [8 x float] containing one of the source operands.
745///    The horizontal differences between the values are returned in the
746///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
747/// \param __b
748///    A 256-bit vector of [8 x float] containing one of the source operands.
749///    The horizontal differences between the values are returned in the
750///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
751/// \returns A 256-bit vector of [8 x float] containing the horizontal
752///    differences of both operands.
753static __inline __m256 __DEFAULT_FN_ATTRS
754_mm256_hsub_ps(__m256 __a, __m256 __b)
755{
756  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
757}
758
759/* Vector permutations */
760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
761///    by the 128-bit integer vector operand.
762///
763/// \headerfile <x86intrin.h>
764///
765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
766///
767/// \param __a
768///    A 128-bit vector of [2 x double].
769/// \param __c
770///    A 128-bit integer vector operand specifying how the values are to be
771///    copied. \n
772///    Bit [1]: \n
773///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
774///         vector. \n
775///      1: Bits [127:64] of the source are copied to bits [63:0] of the
776///         returned vector. \n
777///    Bit [65]: \n
778///      0: Bits [63:0] of the source are copied to bits [127:64] of the
779///         returned vector. \n
780///      1: Bits [127:64] of the source are copied to bits [127:64] of the
781///         returned vector.
782/// \returns A 128-bit vector of [2 x double] containing the copied values.
783static __inline __m128d __DEFAULT_FN_ATTRS
784_mm_permutevar_pd(__m128d __a, __m128i __c)
785{
786  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
787}
788
789/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
790///    by the 256-bit integer vector operand.
791///
792/// \headerfile <x86intrin.h>
793///
794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
795///
796/// \param __a
797///    A 256-bit vector of [4 x double].
798/// \param __c
799///    A 256-bit integer vector operand specifying how the values are to be
800///    copied. \n
801///    Bit [1]: \n
802///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
803///         vector. \n
804///      1: Bits [127:64] of the source are copied to bits [63:0] of the
805///         returned vector. \n
806///    Bit [65]: \n
807///      0: Bits [63:0] of the source are copied to bits [127:64] of the
808///         returned vector. \n
809///      1: Bits [127:64] of the source are copied to bits [127:64] of the
810///         returned vector. \n
811///    Bit [129]: \n
812///      0: Bits [191:128] of the source are copied to bits [191:128] of the
813///         returned vector. \n
814///      1: Bits [255:192] of the source are copied to bits [191:128] of the
815///         returned vector. \n
816///    Bit [193]: \n
817///      0: Bits [191:128] of the source are copied to bits [255:192] of the
818///         returned vector. \n
819///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
821/// \returns A 256-bit vector of [4 x double] containing the copied values.
822static __inline __m256d __DEFAULT_FN_ATTRS
823_mm256_permutevar_pd(__m256d __a, __m256i __c)
824{
825  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
826}
827
828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
///    specified by the 128-bit integer vector operand.
///
830/// \headerfile <x86intrin.h>
831///
832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
833///
834/// \param __a
835///    A 128-bit vector of [4 x float].
836/// \param __c
837///    A 128-bit integer vector operand specifying how the values are to be
838///    copied. \n
839///    Bits [1:0]: \n
840///      00: Bits [31:0] of the source are copied to bits [31:0] of the
841///          returned vector. \n
842///      01: Bits [63:32] of the source are copied to bits [31:0] of the
843///          returned vector. \n
844///      10: Bits [95:64] of the source are copied to bits [31:0] of the
845///          returned vector. \n
846///      11: Bits [127:96] of the source are copied to bits [31:0] of the
847///          returned vector. \n
848///    Bits [33:32]: \n
849///      00: Bits [31:0] of the source are copied to bits [63:32] of the
850///          returned vector. \n
851///      01: Bits [63:32] of the source are copied to bits [63:32] of the
852///          returned vector. \n
853///      10: Bits [95:64] of the source are copied to bits [63:32] of the
854///          returned vector. \n
855///      11: Bits [127:96] of the source are copied to bits [63:32] of the
856///          returned vector. \n
857///    Bits [65:64]: \n
858///      00: Bits [31:0] of the source are copied to bits [95:64] of the
859///          returned vector. \n
860///      01: Bits [63:32] of the source are copied to bits [95:64] of the
861///          returned vector. \n
862///      10: Bits [95:64] of the source are copied to bits [95:64] of the
863///          returned vector. \n
864///      11: Bits [127:96] of the source are copied to bits [95:64] of the
865///          returned vector. \n
866///    Bits [97:96]: \n
867///      00: Bits [31:0] of the source are copied to bits [127:96] of the
868///          returned vector. \n
869///      01: Bits [63:32] of the source are copied to bits [127:96] of the
870///          returned vector. \n
871///      10: Bits [95:64] of the source are copied to bits [127:96] of the
872///          returned vector. \n
873///      11: Bits [127:96] of the source are copied to bits [127:96] of the
874///          returned vector.
875/// \returns A 128-bit vector of [4 x float] containing the copied values.
876static __inline __m128 __DEFAULT_FN_ATTRS
877_mm_permutevar_ps(__m128 __a, __m128i __c)
878{
879  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
880}
881
882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
883///    specified by the 256-bit integer vector operand.
884///
885/// \headerfile <x86intrin.h>
886///
887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
888///
889/// \param __a
890///    A 256-bit vector of [8 x float].
891/// \param __c
892///    A 256-bit integer vector operand specifying how the values are to be
893///    copied. \n
894///    Bits [1:0]: \n
895///      00: Bits [31:0] of the source are copied to bits [31:0] of the
896///          returned vector. \n
897///      01: Bits [63:32] of the source are copied to bits [31:0] of the
898///          returned vector. \n
899///      10: Bits [95:64] of the source are copied to bits [31:0] of the
900///          returned vector. \n
901///      11: Bits [127:96] of the source are copied to bits [31:0] of the
902///          returned vector. \n
903///    Bits [33:32]: \n
904///      00: Bits [31:0] of the source are copied to bits [63:32] of the
905///          returned vector. \n
906///      01: Bits [63:32] of the source are copied to bits [63:32] of the
907///          returned vector. \n
908///      10: Bits [95:64] of the source are copied to bits [63:32] of the
909///          returned vector. \n
910///      11: Bits [127:96] of the source are copied to bits [63:32] of the
911///          returned vector. \n
912///    Bits [65:64]: \n
913///      00: Bits [31:0] of the source are copied to bits [95:64] of the
914///          returned vector. \n
915///      01: Bits [63:32] of the source are copied to bits [95:64] of the
916///          returned vector. \n
917///      10: Bits [95:64] of the source are copied to bits [95:64] of the
918///          returned vector. \n
919///      11: Bits [127:96] of the source are copied to bits [95:64] of the
920///          returned vector. \n
921///    Bits [97:96]: \n
922///      00: Bits [31:0] of the source are copied to bits [127:96] of the
923///          returned vector. \n
924///      01: Bits [63:32] of the source are copied to bits [127:96] of the
925///          returned vector. \n
926///      10: Bits [95:64] of the source are copied to bits [127:96] of the
927///          returned vector. \n
928///      11: Bits [127:96] of the source are copied to bits [127:96] of the
929///          returned vector. \n
930///    Bits [129:128]: \n
931///      00: Bits [159:128] of the source are copied to bits [159:128] of the
932///          returned vector. \n
933///      01: Bits [191:160] of the source are copied to bits [159:128] of the
934///          returned vector. \n
935///      10: Bits [223:192] of the source are copied to bits [159:128] of the
936///          returned vector. \n
937///      11: Bits [255:224] of the source are copied to bits [159:128] of the
938///          returned vector. \n
939///    Bits [161:160]: \n
940///      00: Bits [159:128] of the source are copied to bits [191:160] of the
941///          returned vector. \n
942///      01: Bits [191:160] of the source are copied to bits [191:160] of the
943///          returned vector. \n
944///      10: Bits [223:192] of the source are copied to bits [191:160] of the
945///          returned vector. \n
946///      11: Bits [255:224] of the source are copied to bits [191:160] of the
947///          returned vector. \n
948///    Bits [193:192]: \n
949///      00: Bits [159:128] of the source are copied to bits [223:192] of the
950///          returned vector. \n
951///      01: Bits [191:160] of the source are copied to bits [223:192] of the
952///          returned vector. \n
953///      10: Bits [223:192] of the source are copied to bits [223:192] of the
954///          returned vector. \n
955///      11: Bits [255:224] of the source are copied to bits [223:192] of the
956///          returned vector. \n
957///    Bits [225:224]: \n
958///      00: Bits [159:128] of the source are copied to bits [255:224] of the
959///          returned vector. \n
960///      01: Bits [191:160] of the source are copied to bits [255:224] of the
961///          returned vector. \n
962///      10: Bits [223:192] of the source are copied to bits [255:224] of the
963///          returned vector. \n
964///      11: Bits [255:224] of the source are copied to bits [255:224] of the
965///          returned vector.
966/// \returns A 256-bit vector of [8 x float] containing the copied values.
967static __inline __m256 __DEFAULT_FN_ATTRS
968_mm256_permutevar_ps(__m256 __a, __m256i __c)
969{
970  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
971}
972
973/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
974///    by the immediate integer operand.
975///
976/// \headerfile <x86intrin.h>
977///
978/// \code
979/// __m128d _mm_permute_pd(__m128d A, const int C);
980/// \endcode
981///
982/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
983///
984/// \param A
985///    A 128-bit vector of [2 x double].
986/// \param C
987///    An immediate integer operand specifying how the values are to be
988///    copied. \n
989///    Bit [0]: \n
990///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
991///         vector. \n
992///      1: Bits [127:64] of the source are copied to bits [63:0] of the
993///         returned vector. \n
994///    Bit [1]: \n
995///      0: Bits [63:0] of the source are copied to bits [127:64] of the
996///         returned vector. \n
997///      1: Bits [127:64] of the source are copied to bits [127:64] of the
998///         returned vector.
999/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) __extension__ ({ \
  /* Result element i takes the source element named by bit i of C. Both \
     indices are 0 or 1, so only the first shuffle operand is ever read. */ \
  (__m128d)__builtin_shufflevector( \
    (__v2df)(__m128d)(A), \
    (__v2df)_mm_undefined_pd(), \
    (((C) >> 0) & 0x1), \
    (((C) >> 1) & 0x1)); })
1004
1005/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
1006///    the immediate integer operand.
1007///
1008/// \headerfile <x86intrin.h>
1009///
1010/// \code
1011/// __m256d _mm256_permute_pd(__m256d A, const int C);
1012/// \endcode
1013///
1014/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1015///
1016/// \param A
1017///    A 256-bit vector of [4 x double].
1018/// \param C
1019///    An immediate integer operand specifying how the values are to be
1020///    copied. \n
1021///    Bit [0]: \n
1022///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1023///         vector. \n
1024///      1: Bits [127:64] of the source are copied to bits [63:0] of the
1025///         returned vector. \n
1026///    Bit [1]: \n
1027///      0: Bits [63:0] of the source are copied to bits [127:64] of the
1028///         returned vector. \n
1029///      1: Bits [127:64] of the source are copied to bits [127:64] of the
1030///         returned vector. \n
1031///    Bit [2]: \n
1032///      0: Bits [191:128] of the source are copied to bits [191:128] of the
1033///         returned vector. \n
1034///      1: Bits [255:192] of the source are copied to bits [191:128] of the
1035///         returned vector. \n
1036///    Bit [3]: \n
1037///      0: Bits [191:128] of the source are copied to bits [255:192] of the
1038///         returned vector. \n
1039///      1: Bits [255:192] of the source are copied to bits [255:192] of the
1040///         returned vector.
1041/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) __extension__ ({ \
  /* One bit of C per result element. The base offset (0 or 2) keeps each \
     selection inside the same 128-bit half as the element being written. */ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(A), \
    (__v4df)_mm256_undefined_pd(), \
    (((C) >> 0) & 0x1) + 0, \
    (((C) >> 1) & 0x1) + 0, \
    (((C) >> 2) & 0x1) + 2, \
    (((C) >> 3) & 0x1) + 2); })
1049
1050/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
1051///    the immediate integer operand.
1052///
1053/// \headerfile <x86intrin.h>
1054///
1055/// \code
1056/// __m128 _mm_permute_ps(__m128 A, const int C);
1057/// \endcode
1058///
1059/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1060///
1061/// \param A
1062///    A 128-bit vector of [4 x float].
1063/// \param C
1064///    An immediate integer operand specifying how the values are to be
1065///    copied. \n
1066///    Bits [1:0]: \n
1067///      00: Bits [31:0] of the source are copied to bits [31:0] of the
1068///          returned vector. \n
1069///      01: Bits [63:32] of the source are copied to bits [31:0] of the
1070///          returned vector. \n
1071///      10: Bits [95:64] of the source are copied to bits [31:0] of the
1072///          returned vector. \n
1073///      11: Bits [127:96] of the source are copied to bits [31:0] of the
1074///          returned vector. \n
1075///    Bits [3:2]: \n
1076///      00: Bits [31:0] of the source are copied to bits [63:32] of the
1077///          returned vector. \n
1078///      01: Bits [63:32] of the source are copied to bits [63:32] of the
1079///          returned vector. \n
1080///      10: Bits [95:64] of the source are copied to bits [63:32] of the
1081///          returned vector. \n
1082///      11: Bits [127:96] of the source are copied to bits [63:32] of the
1083///          returned vector. \n
1084///    Bits [5:4]: \n
1085///      00: Bits [31:0] of the source are copied to bits [95:64] of the
1086///          returned vector. \n
1087///      01: Bits [63:32] of the source are copied to bits [95:64] of the
1088///          returned vector. \n
1089///      10: Bits [95:64] of the source are copied to bits [95:64] of the
1090///          returned vector. \n
1091///      11: Bits [127:96] of the source are copied to bits [95:64] of the
1092///          returned vector. \n
1093///    Bits [7:6]: \n
1094///      00: Bits [31:0] of the source are copied to bits [127:96] of the
1095///          returned vector. \n
1096///      01: Bits [63:32] of the source are copied to bits [127:96] of the
1097///          returned vector. \n
1098///      10: Bits [95:64] of the source are copied to bits [127:96] of the
1099///          returned vector. \n
1100///      11: Bits [127:96] of the source are copied to bits [127:96] of the
1101///          returned vector.
1102/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \
  /* Result element i takes the source element named by the 2-bit field \
     C[2i+1:2i]. Every index is in 0..3, so only the first shuffle operand \
     is ever read. */ \
  (__m128)__builtin_shufflevector( \
    (__v4sf)(__m128)(A), \
    (__v4sf)_mm_undefined_ps(), \
    (((C) >> 0) & 0x3), \
    (((C) >> 2) & 0x3), \
    (((C) >> 4) & 0x3), \
    (((C) >> 6) & 0x3)); })
1108
1109/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
1110///    the immediate integer operand.
1111///
1112/// \headerfile <x86intrin.h>
1113///
1114/// \code
1115/// __m256 _mm256_permute_ps(__m256 A, const int C);
1116/// \endcode
1117///
1118/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1119///
1120/// \param A
1121///    A 256-bit vector of [8 x float].
1122/// \param C
///    An immediate integer operand specifying how the values are to be
1124///    copied. \n
1125///    Bits [1:0]: \n
1126///      00: Bits [31:0] of the source are copied to bits [31:0] of the
1127///          returned vector. \n
1128///      01: Bits [63:32] of the source are copied to bits [31:0] of the
1129///          returned vector. \n
1130///      10: Bits [95:64] of the source are copied to bits [31:0] of the
1131///          returned vector. \n
1132///      11: Bits [127:96] of the source are copied to bits [31:0] of the
1133///          returned vector. \n
1134///    Bits [3:2]: \n
1135///      00: Bits [31:0] of the source are copied to bits [63:32] of the
1136///          returned vector. \n
1137///      01: Bits [63:32] of the source are copied to bits [63:32] of the
1138///          returned vector. \n
1139///      10: Bits [95:64] of the source are copied to bits [63:32] of the
1140///          returned vector. \n
1141///      11: Bits [127:96] of the source are copied to bits [63:32] of the
1142///          returned vector. \n
1143///    Bits [5:4]: \n
1144///      00: Bits [31:0] of the source are copied to bits [95:64] of the
1145///          returned vector. \n
1146///      01: Bits [63:32] of the source are copied to bits [95:64] of the
1147///          returned vector. \n
1148///      10: Bits [95:64] of the source are copied to bits [95:64] of the
1149///          returned vector. \n
1150///      11: Bits [127:96] of the source are copied to bits [95:64] of the
1151///          returned vector. \n
1152///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
1154///          returned vector. \n
1155///      01: Bits [63:32] of the source are copied to bits [127:96] of the
1156///          returned vector. \n
1157///      10: Bits [95:64] of the source are copied to bits [127:96] of the
1158///          returned vector. \n
1159///      11: Bits [127:96] of the source are copied to bits [127:96] of the
1160///          returned vector. \n
1161///    Bits [1:0]: \n
1162///      00: Bits [159:128] of the source are copied to bits [159:128] of the
1163///          returned vector. \n
1164///      01: Bits [191:160] of the source are copied to bits [159:128] of the
1165///          returned vector. \n
1166///      10: Bits [223:192] of the source are copied to bits [159:128] of the
1167///          returned vector. \n
1168///      11: Bits [255:224] of the source are copied to bits [159:128] of the
1169///          returned vector. \n
1170///    Bits [3:2]: \n
1171///      00: Bits [159:128] of the source are copied to bits [191:160] of the
1172///          returned vector. \n
1173///      01: Bits [191:160] of the source are copied to bits [191:160] of the
1174///          returned vector. \n
1175///      10: Bits [223:192] of the source are copied to bits [191:160] of the
1176///          returned vector. \n
1177///      11: Bits [255:224] of the source are copied to bits [191:160] of the
1178///          returned vector. \n
1179///    Bits [5:4]: \n
1180///      00: Bits [159:128] of the source are copied to bits [223:192] of the
1181///          returned vector. \n
1182///      01: Bits [191:160] of the source are copied to bits [223:192] of the
1183///          returned vector. \n
1184///      10: Bits [223:192] of the source are copied to bits [223:192] of the
1185///          returned vector. \n
1186///      11: Bits [255:224] of the source are copied to bits [223:192] of the
1187///          returned vector. \n
1188///    Bits [7:6]: \n
1189///      00: Bits [159:128] of the source are copied to bits [255:224] of the
1190///          returned vector. \n
1191///      01: Bits [191:160] of the source are copied to bits [255:224] of the
1192///          returned vector. \n
1193///      10: Bits [223:192] of the source are copied to bits [255:224] of the
1194///          returned vector. \n
1195///      11: Bits [255:224] of the source are copied to bits [255:224] of the
1196///          returned vector.
1197/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \
  /* The same four 2-bit fields of C are applied to both 128-bit halves: \
     offset 0 selects within elements 0..3, offset 4 within elements 4..7. */ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(A), \
    (__v8sf)_mm256_undefined_ps(), \
    (((C) >> 0) & 0x3) + 0, \
    (((C) >> 2) & 0x3) + 0, \
    (((C) >> 4) & 0x3) + 0, \
    (((C) >> 6) & 0x3) + 0, \
    (((C) >> 0) & 0x3) + 4, \
    (((C) >> 2) & 0x3) + 4, \
    (((C) >> 4) & 0x3) + 4, \
    (((C) >> 6) & 0x3) + 4); })
1209
1210/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
1211///    [4 x double], as specified by the immediate integer operand.
1212///
1213/// \headerfile <x86intrin.h>
1214///
1215/// \code
1216/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1217/// \endcode
1218///
1219/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1220///
1221/// \param V1
1222///    A 256-bit vector of [4 x double].
1223/// \param V2
///    A 256-bit vector of [4 x double].
1225/// \param M
1226///    An immediate integer operand specifying how the values are to be
1227///    permuted. \n
1228///    Bits [1:0]: \n
1229///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1230///          destination. \n
1231///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1232///          destination. \n
1233///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1234///          destination. \n
1235///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1236///          destination. \n
1237///    Bits [5:4]: \n
1238///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1239///          destination. \n
1240///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1241///          destination. \n
1242///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1243///          destination. \n
1244///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1245///          destination.
1246/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  /* Both vectors are cast to [4 x double]; M is forwarded unchanged as the \
     builtin's immediate operand. */ \
  (__m256d)__builtin_ia32_vperm2f128_pd256( \
    (__v4df)(__m256d)(V1), \
    (__v4df)(__m256d)(V2), \
    (M)); })
1250
1251/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
1252///    [8 x float], as specified by the immediate integer operand.
1253///
1254/// \headerfile <x86intrin.h>
1255///
1256/// \code
1257/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1258/// \endcode
1259///
1260/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1261///
1262/// \param V1
1263///    A 256-bit vector of [8 x float].
1264/// \param V2
1265///    A 256-bit vector of [8 x float].
1266/// \param M
1267///    An immediate integer operand specifying how the values are to be
1268///    permuted. \n
1269///    Bits [1:0]: \n
1270///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1271///    destination. \n
1272///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1273///    destination. \n
1274///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1275///    destination. \n
1276///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1277///    destination. \n
1278///    Bits [5:4]: \n
1279///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1280///    destination. \n
1281///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1282///    destination. \n
1283///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1284///    destination. \n
1285///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1286///    destination.
1287/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  /* Both vectors are cast to [8 x float]; M is forwarded unchanged as the \
     builtin's immediate operand. */ \
  (__m256)__builtin_ia32_vperm2f128_ps256( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)(__m256)(V2), \
    (M)); })
1291
1292/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
1293///    as specified by the immediate integer operand.
1294///
1295/// \headerfile <x86intrin.h>
1296///
1297/// \code
1298/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1299/// \endcode
1300///
1301/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1302///
1303/// \param V1
1304///    A 256-bit integer vector.
1305/// \param V2
1306///    A 256-bit integer vector.
1307/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
1309///    Bits [1:0]: \n
1310///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1311///    destination. \n
1312///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1313///    destination. \n
1314///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1315///    destination. \n
1316///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1317///    destination. \n
1318///    Bits [5:4]: \n
1319///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1320///    destination. \n
1321///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1322///    destination. \n
1323///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1324///    destination. \n
1325///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1326///    destination.
1327/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  /* Both integer vectors are cast to the __v8si type the builtin takes; M \
     is forwarded unchanged as the builtin's immediate operand. */ \
  (__m256i)__builtin_ia32_vperm2f128_si256( \
    (__v8si)(__m256i)(V1), \
    (__v8si)(__m256i)(V2), \
    (M)); })
1331
1332/* Vector Blend */
1333/// \brief Merges 64-bit double-precision data values stored in either of the
1334///    two 256-bit vectors of [4 x double], as specified by the immediate
1335///    integer operand.
1336///
1337/// \headerfile <x86intrin.h>
1338///
1339/// \code
1340/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1341/// \endcode
1342///
1343/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1344///
1345/// \param V1
1346///    A 256-bit vector of [4 x double].
1347/// \param V2
1348///    A 256-bit vector of [4 x double].
1349/// \param M
1350///    An immediate integer operand, with mask bits [3:0] specifying how the
1351///    values are to be copied. The position of the mask bit corresponds to the
1352///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
1353///    element in operand \a V1 is copied to the same position in the
1354///    destination. When a mask bit is 1, the corresponding 64-bit element in
1355///    operand \a V2 is copied to the same position in the destination.
1356/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0..3 name elements of V1 and 4..7 the same positions \
     in V2; bit i of M decides which of the two feeds result element i. */ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)(__m256d)(V2), \
    ((((M) & 0x01) != 0) ? 4 : 0), \
    ((((M) & 0x02) != 0) ? 5 : 1), \
    ((((M) & 0x04) != 0) ? 6 : 2), \
    ((((M) & 0x08) != 0) ? 7 : 3)); })
1364
1365/// \brief Merges 32-bit single-precision data values stored in either of the
1366///    two 256-bit vectors of [8 x float], as specified by the immediate
1367///    integer operand.
1368///
1369/// \headerfile <x86intrin.h>
1370///
1371/// \code
1372/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1373/// \endcode
1374///
1375/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1376///
1377/// \param V1
1378///    A 256-bit vector of [8 x float].
1379/// \param V2
1380///    A 256-bit vector of [8 x float].
1381/// \param M
1382///    An immediate integer operand, with mask bits [7:0] specifying how the
1383///    values are to be copied. The position of the mask bit corresponds to the
1384///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
1385///    element in operand \a V1 is copied to the same position in the
1386///    destination. When a mask bit is 1, the corresponding 32-bit element in
1387///    operand \a V2 is copied to the same position in the destination.
1388/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0..7 name elements of V1 and 8..15 the same positions \
     in V2; bit i of M decides which of the two feeds result element i. */ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)(__m256)(V2), \
    ((((M) & 0x01) != 0) ?  8 : 0), \
    ((((M) & 0x02) != 0) ?  9 : 1), \
    ((((M) & 0x04) != 0) ? 10 : 2), \
    ((((M) & 0x08) != 0) ? 11 : 3), \
    ((((M) & 0x10) != 0) ? 12 : 4), \
    ((((M) & 0x20) != 0) ? 13 : 5), \
    ((((M) & 0x40) != 0) ? 14 : 6), \
    ((((M) & 0x80) != 0) ? 15 : 7)); })
1400
1401/// \brief Merges 64-bit double-precision data values stored in either of the
1402///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1403///    operand.
1404///
1405/// \headerfile <x86intrin.h>
1406///
1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1408///
1409/// \param __a
1410///    A 256-bit vector of [4 x double].
1411/// \param __b
1412///    A 256-bit vector of [4 x double].
1413/// \param __c
1414///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1415///    how the values are to be copied. The position of the mask bit corresponds
1416///    to the most significant bit of a copied value. When a mask bit is 0, the
1417///    corresponding 64-bit element in operand \a __a is copied to the same
1418///    position in the destination. When a mask bit is 1, the corresponding
1419///    64-bit element in operand \a __b is copied to the same position in the
1420///    destination.
1421/// \returns A 256-bit vector of [4 x double] containing the copied values.
1422static __inline __m256d __DEFAULT_FN_ATTRS
1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1424{
1425  return (__m256d)__builtin_ia32_blendvpd256(
1426    (__v4df)__a, (__v4df)__b, (__v4df)__c);
1427}
1428
1429/// \brief Merges 32-bit single-precision data values stored in either of the
1430///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1431///    operand.
1432///
1433/// \headerfile <x86intrin.h>
1434///
1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1436///
1437/// \param __a
1438///    A 256-bit vector of [8 x float].
1439/// \param __b
1440///    A 256-bit vector of [8 x float].
1441/// \param __c
1442///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1443///    and 31 specifying how the values are to be copied. The position of the
1444///    mask bit corresponds to the most significant bit of a copied value. When
1445///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1446///    copied to the same position in the destination. When a mask bit is 1, the
1447///    corresponding 32-bit element in operand \a __b is copied to the same
1448///    position in the destination.
1449/// \returns A 256-bit vector of [8 x float] containing the copied values.
1450static __inline __m256 __DEFAULT_FN_ATTRS
1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1452{
1453  return (__m256)__builtin_ia32_blendvps256(
1454    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1455}
1456
1457/* Vector Dot Product */
/// \brief Computes two dot products in parallel: one over the lower halves
///    and one over the upper halves of two [8 x float] vectors. Each dot
///    product is returned in the matching half of the [8 x float] result.
///
///    The immediate integer operand selects which input elements take part in
///    each dot product and which result elements receive it. For each dot
///    product, the four pairs of corresponding input elements are multiplied;
///    the first two products and the last two products are each summed, and
///    the two partial sums are then added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. \n
///    Bits [7:4] select the input elements: bit [4] corresponds to the lowest
///    and bit [7] to the highest element of each [4 x float] subvector. When a
///    bit is set, the corresponding pair of input elements contributes to the
///    dot product; when it is clear, that input is treated as zero. \n
///    Bits [3:0] select the result elements: bit [0] corresponds to the lowest
///    and bit [3] to the highest element of each [4 x float] subvector. When a
///    bit is set, the dot product is written to the corresponding element;
///    when it is clear, that element is set to zero. \n
///    The same bitmask is applied to both of the parallel dot product
///    computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  __extension__ ({ \
    (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                   (__v8sf)(__m256)(V2), \
                                   (M)); \
  })
1498
1499/* Vector shuffle */
/// \brief Builds a 256-bit vector of [8 x float] from eight elements picked
///    out of two 256-bit [8 x float] operands, as directed by the immediate
///    value operand.
///
///    Four elements are selected from each operand. Those selected from the
///    first operand are written to bits [63:0] and bits [191:128] of the
///    destination; those selected from the second operand are written to bits
///    [127:64] and bits [255:192] of the destination. For example, if bits
///    [7:0] of the immediate operand contain a value of 0xFF, the 256-bit
///    destination vector would contain the following values: b[7], b[7], a[7],
///    a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four elements selected from this
///    operand are copied to bits [63:0] and bits [191:128] of the destination,
///    as directed by the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four elements selected from this
///    operand are copied to bits [127:64] and bits [255:192] of the
///    destination, as directed by the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    Each 2-bit field of the mask controls one pair of destination elements:
///    \n
///    Bits [1:0] select the value for bits [31:0] and [159:128] of the
///    destination. \n
///    Bits [3:2] select the value for bits [63:32] and [191:160] of the
///    destination. \n
///    Bits [5:4] select the value for bits [95:64] and [223:192] of the
///    destination. \n
///    Bits [7:6] select the value for bits [127:96] and [255:224] of the
///    destination. \n
///    Meaning of each 2-bit field value: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  __extension__ ({ \
    (__m256)__builtin_shufflevector( \
        (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), \
        (((mask) >> 0) & 0x3) + 0,  \
        (((mask) >> 2) & 0x3) + 0,  \
        (((mask) >> 4) & 0x3) + 8,  \
        (((mask) >> 6) & 0x3) + 8,  \
        (((mask) >> 0) & 0x3) + 4,  \
        (((mask) >> 2) & 0x3) + 4,  \
        (((mask) >> 4) & 0x3) + 12, \
        (((mask) >> 6) & 0x3) + 12); \
  })
1560
/// \brief Builds a 256-bit vector of [4 x double] from four double-precision
///    elements picked out of two 256-bit [4 x double] operands, as directed by
///    the immediate value operand.
///
///    The elements selected from the first operand are written to bits [63:0]
///    and bits [191:128] of the destination; the elements selected from the
///    second operand are written to bits [127:64] and bits [255:192] of the
///    destination. For example, if bits [3:0] of the immediate operand contain
///    a value of 0xF, the 256-bit destination vector would contain the
///    following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] of \a a are copied to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] of \a a are copied to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] of \a b are copied to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] of \a b are copied to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] of \a a are copied to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] of \a a are copied to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] of \a b are copied to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] of \a b are copied to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  __extension__ ({ \
    (__m256d)__builtin_shufflevector( \
        (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), \
        (((mask) >> 0) & 0x1) + 0, \
        (((mask) >> 1) & 0x1) + 4, \
        (((mask) >> 2) & 0x1) + 2, \
        (((mask) >> 3) & 0x1) + 6); \
  })
1610
1611/* Compare */
/* Predicate values for the immediate operand of the _mm_cmp_* and
 * _mm256_cmp_* macros (bits [4:0]). In the two-letter name suffixes,
 * O = ordered, U = unordered, S = signaling, Q = non-signaling. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
1644
/// \brief Compares corresponding double-precision elements of two 128-bit
///    vectors of [2 x double], using the comparison operation selected by the
///    immediate integer operand.
///
///    The result is a [2 x double] vector holding one value per comparison:
///    zero if the comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* constants defined earlier in this header
///    name these values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a: Not-greater-than (unordered, signaling) \n
///    0x0b: False (ordered, non-signaling) \n
///    0x0c: Not-equal (ordered, non-signaling) \n
///    0x0d: Greater-than-or-equal (ordered, signaling) \n
///    0x0e: Greater-than (ordered, signaling) \n
///    0x0f: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a: Not-greater-than (unordered, non-signaling) \n
///    0x1b: False (ordered, signaling) \n
///    0x1c: Not-equal (ordered, signaling) \n
///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e: Greater-than (ordered, non-signaling) \n
///    0x1f: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  __extension__ ({ \
    (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                  (__v2df)(__m128d)(b), \
                                  (c)); \
  })
1704
/// \brief Compares corresponding elements of two 128-bit vectors of
///    [4 x float], using the comparison operation selected by the immediate
///    integer operand.
///
///    The result is a [4 x float] vector holding one value per comparison:
///    zero if the comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* constants defined earlier in this header
///    name these values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a: Not-greater-than (unordered, signaling) \n
///    0x0b: False (ordered, non-signaling) \n
///    0x0c: Not-equal (ordered, non-signaling) \n
///    0x0d: Greater-than-or-equal (ordered, signaling) \n
///    0x0e: Greater-than (ordered, signaling) \n
///    0x0f: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a: Not-greater-than (unordered, non-signaling) \n
///    0x1b: False (ordered, signaling) \n
///    0x1c: Not-equal (ordered, signaling) \n
///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e: Greater-than (ordered, non-signaling) \n
///    0x1f: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  __extension__ ({ \
    (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                 (__v4sf)(__m128)(b), \
                                 (c)); \
  })
1764
/// \brief Compares corresponding double-precision elements of two 256-bit
///    vectors of [4 x double], using the comparison operation selected by the
///    immediate integer operand.
///
///    The result is a [4 x double] vector holding one value per comparison:
///    zero if the comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* constants defined earlier in this header
///    name these values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a: Not-greater-than (unordered, signaling) \n
///    0x0b: False (ordered, non-signaling) \n
///    0x0c: Not-equal (ordered, non-signaling) \n
///    0x0d: Greater-than-or-equal (ordered, signaling) \n
///    0x0e: Greater-than (ordered, signaling) \n
///    0x0f: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a: Not-greater-than (unordered, non-signaling) \n
///    0x1b: False (ordered, signaling) \n
///    0x1c: Not-equal (ordered, signaling) \n
///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e: Greater-than (ordered, non-signaling) \n
///    0x1f: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  __extension__ ({ \
    (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), \
                                     (c)); \
  })
1824
/// \brief Compares corresponding elements of two 256-bit vectors of
///    [8 x float], using the comparison operation selected by the immediate
///    integer operand.
///
///    The result is a [8 x float] vector holding one value per comparison:
///    zero if the comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* constants defined earlier in this header
///    name these values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a: Not-greater-than (unordered, signaling) \n
///    0x0b: False (ordered, non-signaling) \n
///    0x0c: Not-equal (ordered, non-signaling) \n
///    0x0d: Greater-than-or-equal (ordered, signaling) \n
///    0x0e: Greater-than (ordered, signaling) \n
///    0x0f: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a: Not-greater-than (unordered, non-signaling) \n
///    0x1b: False (ordered, signaling) \n
///    0x1c: Not-equal (ordered, signaling) \n
///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e: Greater-than (ordered, non-signaling) \n
///    0x1f: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  __extension__ ({ \
    (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                    (__v8sf)(__m256)(b), \
                                    (c)); \
  })
1884
/// \brief Compares the corresponding scalar double-precision values of two
///    128-bit vectors of [2 x double], using the comparison operation selected
///    by the immediate integer operand.
///
///    If the comparison is true, all 64 bits of the scalar result are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* constants defined earlier in this header
///    name these values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a: Not-greater-than (unordered, signaling) \n
///    0x0b: False (ordered, non-signaling) \n
///    0x0c: Not-equal (ordered, non-signaling) \n
///    0x0d: Greater-than-or-equal (ordered, signaling) \n
///    0x0e: Greater-than (ordered, signaling) \n
///    0x0f: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a: Not-greater-than (unordered, non-signaling) \n
///    0x1b: False (ordered, signaling) \n
///    0x1c: Not-equal (ordered, signaling) \n
///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e: Greater-than (ordered, non-signaling) \n
///    0x1f: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  __extension__ ({ \
    (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                  (__v2df)(__m128d)(b), \
                                  (c)); \
  })
1943
/// \brief Compares the corresponding scalar values of two 128-bit vectors of
///    [4 x float], using the comparison operation selected by the immediate
///    integer operand.
///
///    If the comparison is true, all 32 bits of the scalar result are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* constants defined earlier in this header
///    name these values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a: Not-greater-than (unordered, signaling) \n
///    0x0b: False (ordered, non-signaling) \n
///    0x0c: Not-equal (ordered, non-signaling) \n
///    0x0d: Greater-than-or-equal (ordered, signaling) \n
///    0x0e: Greater-than (ordered, signaling) \n
///    0x0f: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a: Not-greater-than (unordered, non-signaling) \n
///    0x1b: False (ordered, signaling) \n
///    0x1c: Not-equal (ordered, signaling) \n
///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e: Greater-than (ordered, non-signaling) \n
///    0x1f: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  __extension__ ({ \
    (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                 (__v4sf)(__m128)(b), \
                                 (c)); \
  })
2002
2003/// \brief Takes a [8 x i32] vector and returns the vector element value
2004///    indexed by the immediate constant operand.
2005///
2006/// \headerfile <x86intrin.h>
2007///
2008/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2009///   instruction.
2010///
2011/// \param __a
2012///    A 256-bit vector of [8 x i32].
2013/// \param __imm
2014///    An immediate integer operand with bits [2:0] determining which vector
2015///    element is extracted and returned.
2016/// \returns A 32-bit integer containing the extracted 32 bits of extended
2017///    packed data.
2018static __inline int __DEFAULT_FN_ATTRS
2019_mm256_extract_epi32(__m256i __a, const int __imm)
2020{
2021  __v8si __b = (__v8si)__a;
2022  return __b[__imm & 7];
2023}
2024
2025/// \brief Takes a [16 x i16] vector and returns the vector element value
2026///    indexed by the immediate constant operand.
2027///
2028/// \headerfile <x86intrin.h>
2029///
2030/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2031///   instruction.
2032///
2033/// \param __a
2034///    A 256-bit integer vector of [16 x i16].
2035/// \param __imm
2036///    An immediate integer operand with bits [3:0] determining which vector
2037///    element is extracted and returned.
2038/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
2039///    packed data.
2040static __inline int __DEFAULT_FN_ATTRS
2041_mm256_extract_epi16(__m256i __a, const int __imm)
2042{
2043  __v16hi __b = (__v16hi)__a;
2044  return (unsigned short)__b[__imm & 15];
2045}
2046
2047/// \brief Takes a [32 x i8] vector and returns the vector element value
2048///    indexed by the immediate constant operand.
2049///
2050/// \headerfile <x86intrin.h>
2051///
2052/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2053///   instruction.
2054///
2055/// \param __a
2056///    A 256-bit integer vector of [32 x i8].
2057/// \param __imm
2058///    An immediate integer operand with bits [4:0] determining which vector
2059///    element is extracted and returned.
2060/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
2061///    packed data.
2062static __inline int __DEFAULT_FN_ATTRS
2063_mm256_extract_epi8(__m256i __a, const int __imm)
2064{
2065  __v32qi __b = (__v32qi)__a;
2066  return (unsigned char)__b[__imm & 31];
2067}
2068
2069#ifdef __x86_64__
2070/// \brief Takes a [4 x i64] vector and returns the vector element value
2071///    indexed by the immediate constant operand.
2072///
2073/// \headerfile <x86intrin.h>
2074///
2075/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2076///   instruction.
2077///
2078/// \param __a
2079///    A 256-bit integer vector of [4 x i64].
2080/// \param __imm
2081///    An immediate integer operand with bits [1:0] determining which vector
2082///    element is extracted and returned.
2083/// \returns A 64-bit integer containing the extracted 64 bits of extended
2084///    packed data.
2085static __inline long long  __DEFAULT_FN_ATTRS
2086_mm256_extract_epi64(__m256i __a, const int __imm)
2087{
2088  __v4di __b = (__v4di)__a;
2089  return __b[__imm & 3];
2090}
2091#endif
2092
2093/// \brief Takes a [8 x i32] vector and replaces the vector element value
2094///    indexed by the immediate constant operand by a new value. Returns the
2095///    modified vector.
2096///
2097/// \headerfile <x86intrin.h>
2098///
2099/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2100///   instruction.
2101///
2102/// \param __a
2103///    A vector of [8 x i32] to be used by the insert operation.
2104/// \param __b
2105///    An integer value. The replacement value for the insert operation.
2106/// \param __imm
2107///    An immediate integer specifying the index of the vector element to be
2108///    replaced.
2109/// \returns A copy of vector \a __a, after replacing its element indexed by
2110///    \a __imm with \a __b.
2111static __inline __m256i __DEFAULT_FN_ATTRS
2112_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
2113{
2114  __v8si __c = (__v8si)__a;
2115  __c[__imm & 7] = __b;
2116  return (__m256i)__c;
2117}
2118
2119
2120/// \brief Takes a [16 x i16] vector and replaces the vector element value
2121///    indexed by the immediate constant operand with a new value. Returns the
2122///    modified vector.
2123///
2124/// \headerfile <x86intrin.h>
2125///
2126/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2127///   instruction.
2128///
2129/// \param __a
2130///    A vector of [16 x i16] to be used by the insert operation.
2131/// \param __b
2132///    An i16 integer value. The replacement value for the insert operation.
2133/// \param __imm
2134///    An immediate integer specifying the index of the vector element to be
2135///    replaced.
2136/// \returns A copy of vector \a __a, after replacing its element indexed by
2137///    \a __imm with \a __b.
2138static __inline __m256i __DEFAULT_FN_ATTRS
2139_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
2140{
2141  __v16hi __c = (__v16hi)__a;
2142  __c[__imm & 15] = __b;
2143  return (__m256i)__c;
2144}
2145
2146/// \brief Takes a [32 x i8] vector and replaces the vector element value
2147///    indexed by the immediate constant operand with a new value. Returns the
2148///    modified vector.
2149///
2150/// \headerfile <x86intrin.h>
2151///
2152/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2153///   instruction.
2154///
2155/// \param __a
2156///    A vector of [32 x i8] to be used by the insert operation.
2157/// \param __b
2158///    An i8 integer value. The replacement value for the insert operation.
2159/// \param __imm
2160///    An immediate integer specifying the index of the vector element to be
2161///    replaced.
2162/// \returns A copy of vector \a __a, after replacing its element indexed by
2163///    \a __imm with \a __b.
2164static __inline __m256i __DEFAULT_FN_ATTRS
2165_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
2166{
2167  __v32qi __c = (__v32qi)__a;
2168  __c[__imm & 31] = __b;
2169  return (__m256i)__c;
2170}
2171
2172#ifdef __x86_64__
2173/// \brief Takes a [4 x i64] vector and replaces the vector element value
2174///    indexed by the immediate constant operand with a new value. Returns the
2175///    modified vector.
2176///
2177/// \headerfile <x86intrin.h>
2178///
2179/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2180///   instruction.
2181///
2182/// \param __a
2183///    A vector of [4 x i64] to be used by the insert operation.
2184/// \param __b
2185///    A 64-bit integer value. The replacement value for the insert operation.
2186/// \param __imm
2187///    An immediate integer specifying the index of the vector element to be
2188///    replaced.
2189/// \returns A copy of vector \a __a, after replacing its element indexed by
2190///     \a __imm with \a __b.
2191static __inline __m256i __DEFAULT_FN_ATTRS
2192_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
2193{
2194  __v4di __c = (__v4di)__a;
2195  __c[__imm & 3] = __b;
2196  return (__m256i)__c;
2197}
2198#endif
2199
2200/* Conversion */
2201/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
2202///
2203/// \headerfile <x86intrin.h>
2204///
2205/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2206///
2207/// \param __a
2208///    A 128-bit integer vector of [4 x i32].
2209/// \returns A 256-bit vector of [4 x double] containing the converted values.
2210static __inline __m256d __DEFAULT_FN_ATTRS
2211_mm256_cvtepi32_pd(__m128i __a)
2212{
2213  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2214}
2215
2216/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
2217///
2218/// \headerfile <x86intrin.h>
2219///
2220/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2221///
2222/// \param __a
2223///    A 256-bit integer vector.
2224/// \returns A 256-bit vector of [8 x float] containing the converted values.
2225static __inline __m256 __DEFAULT_FN_ATTRS
2226_mm256_cvtepi32_ps(__m256i __a)
2227{
2228  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
2229}
2230
2231/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2232///    [4 x float].
2233///
2234/// \headerfile <x86intrin.h>
2235///
2236/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2237///
2238/// \param __a
2239///    A 256-bit vector of [4 x double].
2240/// \returns A 128-bit vector of [4 x float] containing the converted values.
2241static __inline __m128 __DEFAULT_FN_ATTRS
2242_mm256_cvtpd_ps(__m256d __a)
2243{
2244  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2245}
2246
2247/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
2248///
2249/// \headerfile <x86intrin.h>
2250///
2251/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2252///
2253/// \param __a
2254///    A 256-bit vector of [8 x float].
2255/// \returns A 256-bit integer vector containing the converted values.
2256static __inline __m256i __DEFAULT_FN_ATTRS
2257_mm256_cvtps_epi32(__m256 __a)
2258{
2259  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2260}
2261
2262/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2263///    x double].
2264///
2265/// \headerfile <x86intrin.h>
2266///
2267/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2268///
2269/// \param __a
2270///    A 128-bit vector of [4 x float].
2271/// \returns A 256-bit vector of [4 x double] containing the converted values.
2272static __inline __m256d __DEFAULT_FN_ATTRS
2273_mm256_cvtps_pd(__m128 __a)
2274{
2275  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2276}
2277
2278/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2279///    x i32], truncating the result by rounding towards zero when it is
2280///    inexact.
2281///
2282/// \headerfile <x86intrin.h>
2283///
2284/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2285///
2286/// \param __a
2287///    A 256-bit vector of [4 x double].
2288/// \returns A 128-bit integer vector containing the converted values.
2289static __inline __m128i __DEFAULT_FN_ATTRS
2290_mm256_cvttpd_epi32(__m256d __a)
2291{
2292  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2293}
2294
2295/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2296///    x i32]. When a conversion is inexact, the value returned is rounded
2297///    according to the rounding control bits in the MXCSR register.
2298///
2299/// \headerfile <x86intrin.h>
2300///
2301/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2302///
2303/// \param __a
2304///    A 256-bit vector of [4 x double].
2305/// \returns A 128-bit integer vector containing the converted values.
2306static __inline __m128i __DEFAULT_FN_ATTRS
2307_mm256_cvtpd_epi32(__m256d __a)
2308{
2309  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2310}
2311
2312/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
2313///    truncating the result by rounding towards zero when it is inexact.
2314///
2315/// \headerfile <x86intrin.h>
2316///
2317/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2318///
2319/// \param __a
2320///    A 256-bit vector of [8 x float].
2321/// \returns A 256-bit integer vector containing the converted values.
2322static __inline __m256i __DEFAULT_FN_ATTRS
2323_mm256_cvttps_epi32(__m256 __a)
2324{
2325  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2326}
2327
2328/// \brief Returns the first element of the input vector of [4 x double].
2329///
2330/// \headerfile <avxintrin.h>
2331///
2332/// This intrinsic is a utility function and does not correspond to a specific
2333///    instruction.
2334///
2335/// \param __a
2336///    A 256-bit vector of [4 x double].
2337/// \returns A 64 bit double containing the first element of the input vector.
2338static __inline double __DEFAULT_FN_ATTRS
2339_mm256_cvtsd_f64(__m256d __a)
2340{
2341 return __a[0];
2342}
2343
2344/// \brief Returns the first element of the input vector of [8 x i32].
2345///
2346/// \headerfile <avxintrin.h>
2347///
2348/// This intrinsic is a utility function and does not correspond to a specific
2349///    instruction.
2350///
2351/// \param __a
2352///    A 256-bit vector of [8 x i32].
2353/// \returns A 32 bit integer containing the first element of the input vector.
2354static __inline int __DEFAULT_FN_ATTRS
2355_mm256_cvtsi256_si32(__m256i __a)
2356{
2357 __v8si __b = (__v8si)__a;
2358 return __b[0];
2359}
2360
2361/// \brief Returns the first element of the input vector of [8 x float].
2362///
2363/// \headerfile <avxintrin.h>
2364///
2365/// This intrinsic is a utility function and does not correspond to a specific
2366///    instruction.
2367///
2368/// \param __a
2369///    A 256-bit vector of [8 x float].
2370/// \returns A 32 bit float containing the first element of the input vector.
2371static __inline float __DEFAULT_FN_ATTRS
2372_mm256_cvtss_f32(__m256 __a)
2373{
2374 return __a[0];
2375}
2376
2377/* Vector replicate */
2378/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
2379///    vector of [8 x float] to float values in a 256-bit vector of
2380///    [8 x float].
2381///
2382/// \headerfile <x86intrin.h>
2383///
2384/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2385///
2386/// \param __a
2387///    A 256-bit vector of [8 x float]. \n
2388///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2389///    the return value. \n
2390///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2391///    the return value. \n
2392///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2393///    return value. \n
2394///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2395///    return value.
2396/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2397///    values.
2398static __inline __m256 __DEFAULT_FN_ATTRS
2399_mm256_movehdup_ps(__m256 __a)
2400{
2401  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2402}
2403
2404/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
2405///    vector of [8 x float] to float values in a 256-bit vector of [8 x float].
2406///
2407/// \headerfile <x86intrin.h>
2408///
2409/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2410///
2411/// \param __a
2412///    A 256-bit vector of [8 x float]. \n
2413///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2414///    the return value. \n
2415///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2416///    the return value. \n
2417///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2418///    return value. \n
2419///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2420///    return value.
2421/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2422///    values.
2423static __inline __m256 __DEFAULT_FN_ATTRS
2424_mm256_moveldup_ps(__m256 __a)
2425{
2426  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2427}
2428
2429/// \brief Moves and duplicates double-precision floating point values from a
2430///    256-bit vector of [4 x double] to double-precision values in a 256-bit
2431///    vector of [4 x double].
2432///
2433/// \headerfile <x86intrin.h>
2434///
2435/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2436///
2437/// \param __a
2438///    A 256-bit vector of [4 x double]. \n
2439///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2440///    return value. \n
2441///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2442///    the return value.
2443/// \returns A 256-bit vector of [4 x double] containing the moved and
2444///    duplicated values.
2445static __inline __m256d __DEFAULT_FN_ATTRS
2446_mm256_movedup_pd(__m256d __a)
2447{
2448  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2449}
2450
2451/* Unpack and Interleave */
2452/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
2453///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2454///
2455/// \headerfile <x86intrin.h>
2456///
2457/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2458///
2459/// \param __a
2460///    A 256-bit floating-point vector of [4 x double]. \n
2461///    Bits [127:64] are written to bits [63:0] of the return value. \n
2462///    Bits [255:192] are written to bits [191:128] of the return value. \n
2463/// \param __b
2464///    A 256-bit floating-point vector of [4 x double]. \n
2465///    Bits [127:64] are written to bits [127:64] of the return value. \n
2466///    Bits [255:192] are written to bits [255:192] of the return value. \n
2467/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2468static __inline __m256d __DEFAULT_FN_ATTRS
2469_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2470{
2471  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2472}
2473
2474/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
2475///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2476///
2477/// \headerfile <x86intrin.h>
2478///
2479/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2480///
2481/// \param __a
2482///    A 256-bit floating-point vector of [4 x double]. \n
2483///    Bits [63:0] are written to bits [63:0] of the return value. \n
2484///    Bits [191:128] are written to bits [191:128] of the return value.
2485/// \param __b
2486///    A 256-bit floating-point vector of [4 x double]. \n
2487///    Bits [63:0] are written to bits [127:64] of the return value. \n
2488///    Bits [191:128] are written to bits [255:192] of the return value. \n
2489/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2490static __inline __m256d __DEFAULT_FN_ATTRS
2491_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2492{
2493  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2494}
2495
2496/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2497///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2498///    vector of [8 x float].
2499///
2500/// \headerfile <x86intrin.h>
2501///
2502/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2503///
2504/// \param __a
2505///    A 256-bit vector of [8 x float]. \n
2506///    Bits [95:64] are written to bits [31:0] of the return value. \n
2507///    Bits [127:96] are written to bits [95:64] of the return value. \n
2508///    Bits [223:192] are written to bits [159:128] of the return value. \n
2509///    Bits [255:224] are written to bits [223:192] of the return value.
2510/// \param __b
2511///    A 256-bit vector of [8 x float]. \n
2512///    Bits [95:64] are written to bits [63:32] of the return value. \n
2513///    Bits [127:96] are written to bits [127:96] of the return value. \n
2514///    Bits [223:192] are written to bits [191:160] of the return value. \n
2515///    Bits [255:224] are written to bits [255:224] of the return value.
2516/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2517static __inline __m256 __DEFAULT_FN_ATTRS
2518_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2519{
2520  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2521}
2522
2523/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2524///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2525///    vector of [8 x float].
2526///
2527/// \headerfile <x86intrin.h>
2528///
2529/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2530///
2531/// \param __a
2532///    A 256-bit vector of [8 x float]. \n
2533///    Bits [31:0] are written to bits [31:0] of the return value. \n
2534///    Bits [63:32] are written to bits [95:64] of the return value. \n
2535///    Bits [159:128] are written to bits [159:128] of the return value. \n
2536///    Bits [191:160] are written to bits [223:192] of the return value.
2537/// \param __b
2538///    A 256-bit vector of [8 x float]. \n
2539///    Bits [31:0] are written to bits [63:32] of the return value. \n
2540///    Bits [63:32] are written to bits [127:96] of the return value. \n
2541///    Bits [159:128] are written to bits [191:160] of the return value. \n
2542///    Bits [191:160] are written to bits [255:224] of the return value.
2543/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2544static __inline __m256 __DEFAULT_FN_ATTRS
2545_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2546{
2547  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2548}
2549
2550/* Bit Test */
2551/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2552///    element-by-element comparison of the double-precision element in the
2553///    first source vector and the corresponding element in the second source
2554///    vector.
2555///
2556///    The EFLAGS register is updated as follows: \n
2557///    If there is at least one pair of double-precision elements where the
2558///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2559///    ZF flag is set to 1. \n
2560///    If there is at least one pair of double-precision elements where the
2561///    sign-bit of the first element is 0 and the sign-bit of the second element
2562///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2563///    This intrinsic returns the value of the ZF flag.
2564///
2565/// \headerfile <x86intrin.h>
2566///
2567/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2568///
2569/// \param __a
2570///    A 128-bit vector of [2 x double].
2571/// \param __b
2572///    A 128-bit vector of [2 x double].
2573/// \returns the ZF flag in the EFLAGS register.
2574static __inline int __DEFAULT_FN_ATTRS
2575_mm_testz_pd(__m128d __a, __m128d __b)
2576{
2577  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2578}
2579
2580/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2581///    element-by-element comparison of the double-precision element in the
2582///    first source vector and the corresponding element in the second source
2583///    vector.
2584///
2585///    The EFLAGS register is updated as follows: \n
2586///    If there is at least one pair of double-precision elements where the
2587///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2588///    ZF flag is set to 1. \n
2589///    If there is at least one pair of double-precision elements where the
2590///    sign-bit of the first element is 0 and the sign-bit of the second element
2591///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2592///    This intrinsic returns the value of the CF flag.
2593///
2594/// \headerfile <x86intrin.h>
2595///
2596/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2597///
2598/// \param __a
2599///    A 128-bit vector of [2 x double].
2600/// \param __b
2601///    A 128-bit vector of [2 x double].
2602/// \returns the CF flag in the EFLAGS register.
2603static __inline int __DEFAULT_FN_ATTRS
2604_mm_testc_pd(__m128d __a, __m128d __b)
2605{
2606  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2607}
2608
2609/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2610///    element-by-element comparison of the double-precision element in the
2611///    first source vector and the corresponding element in the second source
2612///    vector.
2613///
2614///    The EFLAGS register is updated as follows: \n
2615///    If there is at least one pair of double-precision elements where the
2616///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2617///    ZF flag is set to 1. \n
2618///    If there is at least one pair of double-precision elements where the
2619///    sign-bit of the first element is 0 and the sign-bit of the second element
2620///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2621///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2622///    otherwise it returns 0.
2623///
2624/// \headerfile <x86intrin.h>
2625///
2626/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2627///
2628/// \param __a
2629///    A 128-bit vector of [2 x double].
2630/// \param __b
2631///    A 128-bit vector of [2 x double].
2632/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2633static __inline int __DEFAULT_FN_ATTRS
2634_mm_testnzc_pd(__m128d __a, __m128d __b)
2635{
2636  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2637}
2638
2639/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2640///    element-by-element comparison of the single-precision element in the
2641///    first source vector and the corresponding element in the second source
2642///    vector.
2643///
2644///    The EFLAGS register is updated as follows: \n
2645///    If there is at least one pair of single-precision elements where the
2646///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2647///    ZF flag is set to 1. \n
2648///    If there is at least one pair of single-precision elements where the
2649///    sign-bit of the first element is 0 and the sign-bit of the second element
2650///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2651///    This intrinsic returns the value of the ZF flag.
2652///
2653/// \headerfile <x86intrin.h>
2654///
2655/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2656///
2657/// \param __a
2658///    A 128-bit vector of [4 x float].
2659/// \param __b
2660///    A 128-bit vector of [4 x float].
2661/// \returns the ZF flag.
2662static __inline int __DEFAULT_FN_ATTRS
2663_mm_testz_ps(__m128 __a, __m128 __b)
2664{
2665  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2666}
2667
2668/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2669///    element-by-element comparison of the single-precision element in the
2670///    first source vector and the corresponding element in the second source
2671///    vector.
2672///
2673///    The EFLAGS register is updated as follows: \n
2674///    If there is at least one pair of single-precision elements where the
2675///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2676///    ZF flag is set to 1. \n
2677///    If there is at least one pair of single-precision elements where the
2678///    sign-bit of the first element is 0 and the sign-bit of the second element
2679///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2680///    This intrinsic returns the value of the CF flag.
2681///
2682/// \headerfile <x86intrin.h>
2683///
2684/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2685///
2686/// \param __a
2687///    A 128-bit vector of [4 x float].
2688/// \param __b
2689///    A 128-bit vector of [4 x float].
2690/// \returns the CF flag.
2691static __inline int __DEFAULT_FN_ATTRS
2692_mm_testc_ps(__m128 __a, __m128 __b)
2693{
2694  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2695}
2696
2697/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2698///    element-by-element comparison of the single-precision element in the
2699///    first source vector and the corresponding element in the second source
2700///    vector.
2701///
2702///    The EFLAGS register is updated as follows: \n
2703///    If there is at least one pair of single-precision elements where the
2704///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2705///    ZF flag is set to 1. \n
2706///    If there is at least one pair of single-precision elements where the
2707///    sign-bit of the first element is 0 and the sign-bit of the second element
2708///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2709///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2710///    otherwise it returns 0.
2711///
2712/// \headerfile <x86intrin.h>
2713///
2714/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2715///
2716/// \param __a
2717///    A 128-bit vector of [4 x float].
2718/// \param __b
2719///    A 128-bit vector of [4 x float].
2720/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2721static __inline int __DEFAULT_FN_ATTRS
2722_mm_testnzc_ps(__m128 __a, __m128 __b)
2723{
2724  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2725}
2726
2727/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2728///    element-by-element comparison of the double-precision elements in the
2729///    first source vector and the corresponding elements in the second source
2730///    vector.
2731///
2732///    The EFLAGS register is updated as follows: \n
2733///    If there is at least one pair of double-precision elements where the
2734///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2735///    ZF flag is set to 1. \n
2736///    If there is at least one pair of double-precision elements where the
2737///    sign-bit of the first element is 0 and the sign-bit of the second element
2738///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2739///    This intrinsic returns the value of the ZF flag.
2740///
2741/// \headerfile <x86intrin.h>
2742///
2743/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2744///
2745/// \param __a
2746///    A 256-bit vector of [4 x double].
2747/// \param __b
2748///    A 256-bit vector of [4 x double].
2749/// \returns the ZF flag.
2750static __inline int __DEFAULT_FN_ATTRS
2751_mm256_testz_pd(__m256d __a, __m256d __b)
2752{
2753  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2754}
2755
2756/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2757///    element-by-element comparison of the double-precision elements in the
2758///    first source vector and the corresponding elements in the second source
2759///    vector.
2760///
2761///    The EFLAGS register is updated as follows: \n
2762///    If there is at least one pair of double-precision elements where the
2763///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2764///    ZF flag is set to 1. \n
2765///    If there is at least one pair of double-precision elements where the
2766///    sign-bit of the first element is 0 and the sign-bit of the second element
2767///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2768///    This intrinsic returns the value of the CF flag.
2769///
2770/// \headerfile <x86intrin.h>
2771///
2772/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2773///
2774/// \param __a
2775///    A 256-bit vector of [4 x double].
2776/// \param __b
2777///    A 256-bit vector of [4 x double].
2778/// \returns the CF flag.
2779static __inline int __DEFAULT_FN_ATTRS
2780_mm256_testc_pd(__m256d __a, __m256d __b)
2781{
2782  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2783}
2784
2785/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2786///    element-by-element comparison of the double-precision elements in the
2787///    first source vector and the corresponding elements in the second source
2788///    vector.
2789///
2790///    The EFLAGS register is updated as follows: \n
2791///    If there is at least one pair of double-precision elements where the
2792///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2793///    ZF flag is set to 1. \n
2794///    If there is at least one pair of double-precision elements where the
2795///    sign-bit of the first element is 0 and the sign-bit of the second element
2796///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2797///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2798///    otherwise it returns 0.
2799///
2800/// \headerfile <x86intrin.h>
2801///
2802/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2803///
2804/// \param __a
2805///    A 256-bit vector of [4 x double].
2806/// \param __b
2807///    A 256-bit vector of [4 x double].
2808/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2809static __inline int __DEFAULT_FN_ATTRS
2810_mm256_testnzc_pd(__m256d __a, __m256d __b)
2811{
2812  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2813}
2814
2815/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2816///    element-by-element comparison of the single-precision element in the
2817///    first source vector and the corresponding element in the second source
2818///    vector.
2819///
2820///    The EFLAGS register is updated as follows: \n
2821///    If there is at least one pair of single-precision elements where the
2822///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2823///    ZF flag is set to 1. \n
2824///    If there is at least one pair of single-precision elements where the
2825///    sign-bit of the first element is 0 and the sign-bit of the second element
2826///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2827///    This intrinsic returns the value of the ZF flag.
2828///
2829/// \headerfile <x86intrin.h>
2830///
2831/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2832///
2833/// \param __a
2834///    A 256-bit vector of [8 x float].
2835/// \param __b
2836///    A 256-bit vector of [8 x float].
2837/// \returns the ZF flag.
2838static __inline int __DEFAULT_FN_ATTRS
2839_mm256_testz_ps(__m256 __a, __m256 __b)
2840{
2841  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2842}
2843
2844/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2845///    element-by-element comparison of the single-precision element in the
2846///    first source vector and the corresponding element in the second source
2847///    vector.
2848///
2849///    The EFLAGS register is updated as follows: \n
2850///    If there is at least one pair of single-precision elements where the
2851///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2852///    ZF flag is set to 1. \n
2853///    If there is at least one pair of single-precision elements where the
2854///    sign-bit of the first element is 0 and the sign-bit of the second element
2855///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2856///    This intrinsic returns the value of the CF flag.
2857///
2858/// \headerfile <x86intrin.h>
2859///
2860/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2861///
2862/// \param __a
2863///    A 256-bit vector of [8 x float].
2864/// \param __b
2865///    A 256-bit vector of [8 x float].
2866/// \returns the CF flag.
2867static __inline int __DEFAULT_FN_ATTRS
2868_mm256_testc_ps(__m256 __a, __m256 __b)
2869{
2870  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2871}
2872
2873/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2874///    element-by-element comparison of the single-precision elements in the
2875///    first source vector and the corresponding elements in the second source
2876///    vector.
2877///
2878///    The EFLAGS register is updated as follows: \n
2879///    If there is at least one pair of single-precision elements where the
2880///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2881///    ZF flag is set to 1. \n
2882///    If there is at least one pair of single-precision elements where the
2883///    sign-bit of the first element is 0 and the sign-bit of the second element
2884///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2885///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2886///    otherwise it returns 0.
2887///
2888/// \headerfile <x86intrin.h>
2889///
2890/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2891///
2892/// \param __a
2893///    A 256-bit vector of [8 x float].
2894/// \param __b
2895///    A 256-bit vector of [8 x float].
2896/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2897static __inline int __DEFAULT_FN_ATTRS
2898_mm256_testnzc_ps(__m256 __a, __m256 __b)
2899{
2900  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2901}
2902
2903/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2904///    of the two source vectors.
2905///
2906///    The EFLAGS register is updated as follows: \n
2907///    If there is at least one pair of bits where both bits are 1, the ZF flag
2908///    is set to 0. Otherwise the ZF flag is set to 1. \n
2909///    If there is at least one pair of bits where the bit from the first source
2910///    vector is 0 and the bit from the second source vector is 1, the CF flag
2911///    is set to 0. Otherwise the CF flag is set to 1. \n
2912///    This intrinsic returns the value of the ZF flag.
2913///
2914/// \headerfile <x86intrin.h>
2915///
2916/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2917///
2918/// \param __a
2919///    A 256-bit integer vector.
2920/// \param __b
2921///    A 256-bit integer vector.
2922/// \returns the ZF flag.
2923static __inline int __DEFAULT_FN_ATTRS
2924_mm256_testz_si256(__m256i __a, __m256i __b)
2925{
2926  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2927}
2928
2929/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2930///    of the two source vectors.
2931///
2932///    The EFLAGS register is updated as follows: \n
2933///    If there is at least one pair of bits where both bits are 1, the ZF flag
2934///    is set to 0. Otherwise the ZF flag is set to 1. \n
2935///    If there is at least one pair of bits where the bit from the first source
2936///    vector is 0 and the bit from the second source vector is 1, the CF flag
2937///    is set to 0. Otherwise the CF flag is set to 1. \n
2938///    This intrinsic returns the value of the CF flag.
2939///
2940/// \headerfile <x86intrin.h>
2941///
2942/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2943///
2944/// \param __a
2945///    A 256-bit integer vector.
2946/// \param __b
2947///    A 256-bit integer vector.
2948/// \returns the CF flag.
2949static __inline int __DEFAULT_FN_ATTRS
2950_mm256_testc_si256(__m256i __a, __m256i __b)
2951{
2952  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2953}
2954
2955/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2956///    of the two source vectors.
2957///
2958///    The EFLAGS register is updated as follows: \n
2959///    If there is at least one pair of bits where both bits are 1, the ZF flag
2960///    is set to 0. Otherwise the ZF flag is set to 1. \n
2961///    If there is at least one pair of bits where the bit from the first source
2962///    vector is 0 and the bit from the second source vector is 1, the CF flag
2963///    is set to 0. Otherwise the CF flag is set to 1. \n
2964///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2965///    otherwise it returns 0.
2966///
2967/// \headerfile <x86intrin.h>
2968///
2969/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2970///
2971/// \param __a
2972///    A 256-bit integer vector.
2973/// \param __b
2974///    A 256-bit integer vector.
2975/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2976static __inline int __DEFAULT_FN_ATTRS
2977_mm256_testnzc_si256(__m256i __a, __m256i __b)
2978{
2979  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2980}
2981
2982/* Vector extract sign mask */
2983/// \brief Extracts the sign bits of double-precision floating point elements
2984///    in a 256-bit vector of [4 x double] and writes them to the lower order
2985///    bits of the return value.
2986///
2987/// \headerfile <x86intrin.h>
2988///
2989/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2990///
2991/// \param __a
2992///    A 256-bit vector of [4 x double] containing the double-precision
2993///    floating point values with sign bits to be extracted.
2994/// \returns The sign bits from the operand, written to bits [3:0].
2995static __inline int __DEFAULT_FN_ATTRS
2996_mm256_movemask_pd(__m256d __a)
2997{
2998  return __builtin_ia32_movmskpd256((__v4df)__a);
2999}
3000
3001/// \brief Extracts the sign bits of double-precision floating point elements
3002///    in a 256-bit vector of [8 x float] and writes them to the lower order
3003///    bits of the return value.
3004///
3005/// \headerfile <x86intrin.h>
3006///
3007/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
3008///
3009/// \param __a
3010///    A 256-bit vector of [8 x float] containing the double-precision floating
3011///    point values with sign bits to be extracted.
3012/// \returns The sign bits from the operand, written to bits [7:0].
3013static __inline int __DEFAULT_FN_ATTRS
3014_mm256_movemask_ps(__m256 __a)
3015{
3016  return __builtin_ia32_movmskps256((__v8sf)__a);
3017}
3018
3019/* Vector __zero */
3020/// \brief Zeroes the contents of all XMM or YMM registers.
3021///
3022/// \headerfile <x86intrin.h>
3023///
3024/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
3025static __inline void __DEFAULT_FN_ATTRS
3026_mm256_zeroall(void)
3027{
3028  __builtin_ia32_vzeroall();
3029}
3030
3031/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
3032///
3033/// \headerfile <x86intrin.h>
3034///
3035/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
3036static __inline void __DEFAULT_FN_ATTRS
3037_mm256_zeroupper(void)
3038{
3039  __builtin_ia32_vzeroupper();
3040}
3041
3042/* Vector load with broadcast */
3043/// \brief Loads a scalar single-precision floating point value from the
3044///    specified address pointed to by \a __a and broadcasts it to the elements
3045///    of a [4 x float] vector.
3046///
3047/// \headerfile <x86intrin.h>
3048///
3049/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3050///
3051/// \param __a
3052///    The single-precision floating point value to be broadcast.
3053/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3054///    equal to the broadcast value.
3055static __inline __m128 __DEFAULT_FN_ATTRS
3056_mm_broadcast_ss(float const *__a)
3057{
3058  float __f = *__a;
3059  return (__m128)(__v4sf){ __f, __f, __f, __f };
3060}
3061
3062/// \brief Loads a scalar double-precision floating point value from the
3063///    specified address pointed to by \a __a and broadcasts it to the elements
3064///    of a [4 x double] vector.
3065///
3066/// \headerfile <x86intrin.h>
3067///
3068/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3069///
3070/// \param __a
3071///    The double-precision floating point value to be broadcast.
3072/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3073///    equal to the broadcast value.
3074static __inline __m256d __DEFAULT_FN_ATTRS
3075_mm256_broadcast_sd(double const *__a)
3076{
3077  double __d = *__a;
3078  return (__m256d)(__v4df){ __d, __d, __d, __d };
3079}
3080
3081/// \brief Loads a scalar single-precision floating point value from the
3082///    specified address pointed to by \a __a and broadcasts it to the elements
3083///    of a [8 x float] vector.
3084///
3085/// \headerfile <x86intrin.h>
3086///
3087/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3088///
3089/// \param __a
3090///    The single-precision floating point value to be broadcast.
3091/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3092///    equal to the broadcast value.
3093static __inline __m256 __DEFAULT_FN_ATTRS
3094_mm256_broadcast_ss(float const *__a)
3095{
3096  float __f = *__a;
3097  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3098}
3099
3100/// \brief Loads the data from a 128-bit vector of [2 x double] from the
3101///    specified address pointed to by \a __a and broadcasts it to 128-bit
3102///    elements in a 256-bit vector of [4 x double].
3103///
3104/// \headerfile <x86intrin.h>
3105///
3106/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3107///
3108/// \param __a
3109///    The 128-bit vector of [2 x double] to be broadcast.
3110/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3111///    equal to the broadcast value.
3112static __inline __m256d __DEFAULT_FN_ATTRS
3113_mm256_broadcast_pd(__m128d const *__a)
3114{
3115  return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
3116}
3117
3118/// \brief Loads the data from a 128-bit vector of [4 x float] from the
3119///    specified address pointed to by \a __a and broadcasts it to 128-bit
3120///    elements in a 256-bit vector of [8 x float].
3121///
3122/// \headerfile <x86intrin.h>
3123///
3124/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3125///
3126/// \param __a
3127///    The 128-bit vector of [4 x float] to be broadcast.
3128/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3129///    equal to the broadcast value.
3130static __inline __m256 __DEFAULT_FN_ATTRS
3131_mm256_broadcast_ps(__m128 const *__a)
3132{
3133  return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
3134}
3135
3136/* SIMD load ops */
3137/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
3138///    memory location pointed to by \a __p into a vector of [4 x double].
3139///
3140/// \headerfile <x86intrin.h>
3141///
3142/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3143///
3144/// \param __p
3145///    A 32-byte aligned pointer to a memory location containing
3146///    double-precision floating point values.
3147/// \returns A 256-bit vector of [4 x double] containing the moved values.
3148static __inline __m256d __DEFAULT_FN_ATTRS
3149_mm256_load_pd(double const *__p)
3150{
3151  return *(__m256d *)__p;
3152}
3153
3154/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
3155///    memory location pointed to by \a __p into a vector of [8 x float].
3156///
3157/// \headerfile <x86intrin.h>
3158///
3159/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3160///
3161/// \param __p
3162///    A 32-byte aligned pointer to a memory location containing float values.
3163/// \returns A 256-bit vector of [8 x float] containing the moved values.
3164static __inline __m256 __DEFAULT_FN_ATTRS
3165_mm256_load_ps(float const *__p)
3166{
3167  return *(__m256 *)__p;
3168}
3169
3170/// \brief Loads 4 double-precision floating point values from an unaligned
3171///    memory location pointed to by \a __p into a vector of [4 x double].
3172///
3173/// \headerfile <x86intrin.h>
3174///
3175/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3176///
3177/// \param __p
3178///    A pointer to a memory location containing double-precision floating
3179///    point values.
3180/// \returns A 256-bit vector of [4 x double] containing the moved values.
3181static __inline __m256d __DEFAULT_FN_ATTRS
3182_mm256_loadu_pd(double const *__p)
3183{
3184  struct __loadu_pd {
3185    __m256d __v;
3186  } __attribute__((__packed__, __may_alias__));
3187  return ((struct __loadu_pd*)__p)->__v;
3188}
3189
3190/// \brief Loads 8 single-precision floating point values from an unaligned
3191///    memory location pointed to by \a __p into a vector of [8 x float].
3192///
3193/// \headerfile <x86intrin.h>
3194///
3195/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3196///
3197/// \param __p
3198///    A pointer to a memory location containing single-precision floating
3199///    point values.
3200/// \returns A 256-bit vector of [8 x float] containing the moved values.
3201static __inline __m256 __DEFAULT_FN_ATTRS
3202_mm256_loadu_ps(float const *__p)
3203{
3204  struct __loadu_ps {
3205    __m256 __v;
3206  } __attribute__((__packed__, __may_alias__));
3207  return ((struct __loadu_ps*)__p)->__v;
3208}
3209
3210/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
3211///    location pointed to by \a __p into elements of a 256-bit integer vector.
3212///
3213/// \headerfile <x86intrin.h>
3214///
3215/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3216///
3217/// \param __p
3218///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
3219///    values.
3220/// \returns A 256-bit integer vector containing the moved values.
3221static __inline __m256i __DEFAULT_FN_ATTRS
3222_mm256_load_si256(__m256i const *__p)
3223{
3224  return *__p;
3225}
3226
3227/// \brief Loads 256 bits of integer data from an unaligned memory location
3228///    pointed to by \a __p into a 256-bit integer vector.
3229///
3230/// \headerfile <x86intrin.h>
3231///
3232/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3233///
3234/// \param __p
3235///    A pointer to a 256-bit integer vector containing integer values.
3236/// \returns A 256-bit integer vector containing the moved values.
3237static __inline __m256i __DEFAULT_FN_ATTRS
3238_mm256_loadu_si256(__m256i const *__p)
3239{
3240  struct __loadu_si256 {
3241    __m256i __v;
3242  } __attribute__((__packed__, __may_alias__));
3243  return ((struct __loadu_si256*)__p)->__v;
3244}
3245
3246/// \brief Loads 256 bits of integer data from an unaligned memory location
3247///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3248///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
3249///    line boundary.
3250///
3251/// \headerfile <x86intrin.h>
3252///
3253/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3254///
3255/// \param __p
3256///    A pointer to a 256-bit integer vector containing integer values.
3257/// \returns A 256-bit integer vector containing the moved values.
3258static __inline __m256i __DEFAULT_FN_ATTRS
3259_mm256_lddqu_si256(__m256i const *__p)
3260{
3261  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3262}
3263
3264/* SIMD store ops */
3265/// \brief Stores double-precision floating point values from a 256-bit vector
3266///    of [4 x double] to a 32-byte aligned memory location pointed to by
3267///    \a __p.
3268///
3269/// \headerfile <x86intrin.h>
3270///
3271/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3272///
3273/// \param __p
3274///    A 32-byte aligned pointer to a memory location that will receive the
3275///    double-precision floaing point values.
3276/// \param __a
3277///    A 256-bit vector of [4 x double] containing the values to be moved.
3278static __inline void __DEFAULT_FN_ATTRS
3279_mm256_store_pd(double *__p, __m256d __a)
3280{
3281  *(__m256d *)__p = __a;
3282}
3283
3284/// \brief Stores single-precision floating point values from a 256-bit vector
3285///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3286///
3287/// \headerfile <x86intrin.h>
3288///
3289/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3290///
3291/// \param __p
3292///    A 32-byte aligned pointer to a memory location that will receive the
3293///    float values.
3294/// \param __a
3295///    A 256-bit vector of [8 x float] containing the values to be moved.
3296static __inline void __DEFAULT_FN_ATTRS
3297_mm256_store_ps(float *__p, __m256 __a)
3298{
3299  *(__m256 *)__p = __a;
3300}
3301
3302/// \brief Stores double-precision floating point values from a 256-bit vector
3303///    of [4 x double] to an unaligned memory location pointed to by \a __p.
3304///
3305/// \headerfile <x86intrin.h>
3306///
3307/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3308///
3309/// \param __p
3310///    A pointer to a memory location that will receive the double-precision
3311///    floating point values.
3312/// \param __a
3313///    A 256-bit vector of [4 x double] containing the values to be moved.
3314static __inline void __DEFAULT_FN_ATTRS
3315_mm256_storeu_pd(double *__p, __m256d __a)
3316{
3317  struct __storeu_pd {
3318    __m256d __v;
3319  } __attribute__((__packed__, __may_alias__));
3320  ((struct __storeu_pd*)__p)->__v = __a;
3321}
3322
3323/// \brief Stores single-precision floating point values from a 256-bit vector
3324///    of [8 x float] to an unaligned memory location pointed to by \a __p.
3325///
3326/// \headerfile <x86intrin.h>
3327///
3328/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3329///
3330/// \param __p
3331///    A pointer to a memory location that will receive the float values.
3332/// \param __a
3333///    A 256-bit vector of [8 x float] containing the values to be moved.
3334static __inline void __DEFAULT_FN_ATTRS
3335_mm256_storeu_ps(float *__p, __m256 __a)
3336{
3337  struct __storeu_ps {
3338    __m256 __v;
3339  } __attribute__((__packed__, __may_alias__));
3340  ((struct __storeu_ps*)__p)->__v = __a;
3341}
3342
3343/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
3344///    aligned memory location pointed to by \a __p.
3345///
3346/// \headerfile <x86intrin.h>
3347///
3348/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3349///
3350/// \param __p
3351///    A 32-byte aligned pointer to a memory location that will receive the
3352///    integer values.
3353/// \param __a
3354///    A 256-bit integer vector containing the values to be moved.
3355static __inline void __DEFAULT_FN_ATTRS
3356_mm256_store_si256(__m256i *__p, __m256i __a)
3357{
3358  *__p = __a;
3359}
3360
3361/// \brief Stores integer values from a 256-bit integer vector to an unaligned
3362///    memory location pointed to by \a __p.
3363///
3364/// \headerfile <x86intrin.h>
3365///
3366/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3367///
3368/// \param __p
3369///    A pointer to a memory location that will receive the integer values.
3370/// \param __a
3371///    A 256-bit integer vector containing the values to be moved.
3372static __inline void __DEFAULT_FN_ATTRS
3373_mm256_storeu_si256(__m256i *__p, __m256i __a)
3374{
3375  struct __storeu_si256 {
3376    __m256i __v;
3377  } __attribute__((__packed__, __may_alias__));
3378  ((struct __storeu_si256*)__p)->__v = __a;
3379}
3380
3381/* Conditional load ops */
3382/// \brief Conditionally loads double-precision floating point elements from a
3383///    memory location pointed to by \a __p into a 128-bit vector of
3384///    [2 x double], depending on the mask bits associated with each data
3385///    element.
3386///
3387/// \headerfile <x86intrin.h>
3388///
3389/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3390///
3391/// \param __p
3392///    A pointer to a memory location that contains the double-precision
3393///    floating point values.
3394/// \param __m
3395///    A 128-bit integer vector containing the mask. The most significant bit of
3396///    each data element represents the mask bits. If a mask bit is zero, the
3397///    corresponding value in the memory location is not loaded and the
3398///    corresponding field in the return value is set to zero.
3399/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3400static __inline __m128d __DEFAULT_FN_ATTRS
3401_mm_maskload_pd(double const *__p, __m128i __m)
3402{
3403  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3404}
3405
3406/// \brief Conditionally loads double-precision floating point elements from a
3407///    memory location pointed to by \a __p into a 256-bit vector of
3408///    [4 x double], depending on the mask bits associated with each data
3409///    element.
3410///
3411/// \headerfile <x86intrin.h>
3412///
3413/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3414///
3415/// \param __p
3416///    A pointer to a memory location that contains the double-precision
3417///    floating point values.
3418/// \param __m
3419///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3420///    significant bit of each quadword element represents the mask bits. If a
3421///    mask bit is zero, the corresponding value in the memory location is not
3422///    loaded and the corresponding field in the return value is set to zero.
3423/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3424static __inline __m256d __DEFAULT_FN_ATTRS
3425_mm256_maskload_pd(double const *__p, __m256i __m)
3426{
3427  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3428                                               (__v4di)__m);
3429}
3430
3431/// \brief Conditionally loads single-precision floating point elements from a
3432///    memory location pointed to by \a __p into a 128-bit vector of
3433///    [4 x float], depending on the mask bits associated with each data
3434///    element.
3435///
3436/// \headerfile <x86intrin.h>
3437///
3438/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3439///
3440/// \param __p
3441///    A pointer to a memory location that contains the single-precision
3442///    floating point values.
3443/// \param __m
3444///    A 128-bit integer vector containing the mask. The most significant bit of
3445///    each data element represents the mask bits. If a mask bit is zero, the
3446///    corresponding value in the memory location is not loaded and the
3447///    corresponding field in the return value is set to zero.
3448/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3449static __inline __m128 __DEFAULT_FN_ATTRS
3450_mm_maskload_ps(float const *__p, __m128i __m)
3451{
3452  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3453}
3454
3455/// \brief Conditionally loads single-precision floating point elements from a
3456///    memory location pointed to by \a __p into a 256-bit vector of
3457///    [8 x float], depending on the mask bits associated with each data
3458///    element.
3459///
3460/// \headerfile <x86intrin.h>
3461///
3462/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3463///
3464/// \param __p
3465///    A pointer to a memory location that contains the single-precision
3466///    floating point values.
3467/// \param __m
3468///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3469///    significant bit of each dword element represents the mask bits. If a mask
3470///    bit is zero, the corresponding value in the memory location is not loaded
3471///    and the corresponding field in the return value is set to zero.
3472/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3473static __inline __m256 __DEFAULT_FN_ATTRS
3474_mm256_maskload_ps(float const *__p, __m256i __m)
3475{
3476  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3477}
3478
3479/* Conditional store ops */
3480/// \brief Moves single-precision floating point values from a 256-bit vector
3481///    of [8 x float] to a memory location pointed to by \a __p, according to
3482///    the specified mask.
3483///
3484/// \headerfile <x86intrin.h>
3485///
3486/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3487///
3488/// \param __p
3489///    A pointer to a memory location that will receive the float values.
3490/// \param __m
3491///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3492///    significant bit of each dword element in the mask vector represents the
3493///    mask bits. If a mask bit is zero, the corresponding value from vector
3494///    \a __a is not stored and the corresponding field in the memory location
3495///    pointed to by \a __p is not changed.
3496/// \param __a
3497///    A 256-bit vector of [8 x float] containing the values to be stored.
3498static __inline void __DEFAULT_FN_ATTRS
3499_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3500{
3501  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3502}
3503
3504/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
3505///    to a memory location pointed to by \a __p, according to the specified
3506///    mask.
3507///
3508/// \headerfile <x86intrin.h>
3509///
3510/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3511///
3512/// \param __p
3513///    A pointer to a memory location that will receive the float values.
3514/// \param __m
3515///    A 128-bit integer vector containing the mask. The most significant bit of
3516///    each field in the mask vector represents the mask bits. If a mask bit is
3517///    zero, the corresponding value from vector \a __a is not stored and the
3518///    corresponding field in the memory location pointed to by \a __p is not
3519///    changed.
3520/// \param __a
3521///    A 128-bit vector of [2 x double] containing the values to be stored.
3522static __inline void __DEFAULT_FN_ATTRS
3523_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3524{
3525  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3526}
3527
3528/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3529///    to a memory location pointed to by \a __p, according to the specified
3530///    mask.
3531///
3532/// \headerfile <x86intrin.h>
3533///
3534/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3535///
3536/// \param __p
3537///    A pointer to a memory location that will receive the float values.
3538/// \param __m
3539///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3540///    significant bit of each quadword element in the mask vector represents
3541///    the mask bits. If a mask bit is zero, the corresponding value from vector
3542///    __a is not stored and the corresponding field in the memory location
3543///    pointed to by \a __p is not changed.
3544/// \param __a
3545///    A 256-bit vector of [4 x double] containing the values to be stored.
3546static __inline void __DEFAULT_FN_ATTRS
3547_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3548{
3549  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3550}
3551
3552/// \brief Moves single-precision floating point values from a 128-bit vector
3553///    of [4 x float] to a memory location pointed to by \a __p, according to
3554///    the specified mask.
3555///
3556/// \headerfile <x86intrin.h>
3557///
3558/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3559///
3560/// \param __p
3561///    A pointer to a memory location that will receive the float values.
3562/// \param __m
3563///    A 128-bit integer vector containing the mask. The most significant bit of
3564///    each field in the mask vector represents the mask bits. If a mask bit is
3565///    zero, the corresponding value from vector __a is not stored and the
3566///    corresponding field in the memory location pointed to by \a __p is not
3567///    changed.
3568/// \param __a
3569///    A 128-bit vector of [4 x float] containing the values to be stored.
3570static __inline void __DEFAULT_FN_ATTRS
3571_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3572{
3573  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3574}
3575
3576/* Cacheability support ops */
3577/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
3578///    aligned memory location. To minimize caching, the data is flagged as
3579///    non-temporal (unlikely to be used again soon).
3580///
3581/// \headerfile <x86intrin.h>
3582///
3583/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3584///
3585/// \param __a
3586///    A pointer to a 32-byte aligned memory location that will receive the
3587///    integer values.
3588/// \param __b
3589///    A 256-bit integer vector containing the values to be moved.
3590static __inline void __DEFAULT_FN_ATTRS
3591_mm256_stream_si256(__m256i *__a, __m256i __b)
3592{
3593  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
3594}
3595
3596/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3597///    to a 32-byte aligned memory location. To minimize caching, the data is
3598///    flagged as non-temporal (unlikely to be used again soon).
3599///
3600/// \headerfile <x86intrin.h>
3601///
3602/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3603///
3604/// \param __a
3605///    A pointer to a 32-byte aligned memory location that will receive the
3606///    double-precision floating-point values.
3607/// \param __b
3608///    A 256-bit vector of [4 x double] containing the values to be moved.
3609static __inline void __DEFAULT_FN_ATTRS
3610_mm256_stream_pd(double *__a, __m256d __b)
3611{
3612  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
3613}
3614
3615/// \brief Moves single-precision floating point values from a 256-bit vector
3616///    of [8 x float] to a 32-byte aligned memory location. To minimize
3617///    caching, the data is flagged as non-temporal (unlikely to be used again
3618///    soon).
3619///
3620/// \headerfile <x86intrin.h>
3621///
3622/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3623///
3624/// \param __p
3625///    A pointer to a 32-byte aligned memory location that will receive the
3626///    single-precision floating point values.
3627/// \param __a
3628///    A 256-bit vector of [8 x float] containing the values to be moved.
3629static __inline void __DEFAULT_FN_ATTRS
3630_mm256_stream_ps(float *__p, __m256 __a)
3631{
3632  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
3633}
3634
3635/* Create vectors */
3636/// \brief Create a 256-bit vector of [4 x double] with undefined values.
3637///
3638/// \headerfile <x86intrin.h>
3639///
3640/// This intrinsic has no corresponding instruction.
3641///
3642/// \returns A 256-bit vector of [4 x double] containing undefined values.
3643static __inline__ __m256d __DEFAULT_FN_ATTRS
3644_mm256_undefined_pd(void)
3645{
3646  return (__m256d)__builtin_ia32_undef256();
3647}
3648
3649/// \brief Create a 256-bit vector of [8 x float] with undefined values.
3650///
3651/// \headerfile <x86intrin.h>
3652///
3653/// This intrinsic has no corresponding instruction.
3654///
3655/// \returns A 256-bit vector of [8 x float] containing undefined values.
3656static __inline__ __m256 __DEFAULT_FN_ATTRS
3657_mm256_undefined_ps(void)
3658{
3659  return (__m256)__builtin_ia32_undef256();
3660}
3661
3662/// \brief Create a 256-bit integer vector with undefined values.
3663///
3664/// \headerfile <x86intrin.h>
3665///
3666/// This intrinsic has no corresponding instruction.
3667///
3668/// \returns A 256-bit integer vector containing undefined values.
3669static __inline__ __m256i __DEFAULT_FN_ATTRS
3670_mm256_undefined_si256(void)
3671{
3672  return (__m256i)__builtin_ia32_undef256();
3673}
3674
3675/// \brief Constructs a 256-bit floating-point vector of [4 x double]
3676///    initialized with the specified double-precision floating-point values.
3677///
3678/// \headerfile <x86intrin.h>
3679///
3680/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3681///   instruction.
3682///
3683/// \param __a
3684///    A double-precision floating-point value used to initialize bits [255:192]
3685///    of the result.
3686/// \param __b
3687///    A double-precision floating-point value used to initialize bits [191:128]
3688///    of the result.
3689/// \param __c
3690///    A double-precision floating-point value used to initialize bits [127:64]
3691///    of the result.
3692/// \param __d
3693///    A double-precision floating-point value used to initialize bits [63:0]
3694///    of the result.
3695/// \returns An initialized 256-bit floating-point vector of [4 x double].
3696static __inline __m256d __DEFAULT_FN_ATTRS
3697_mm256_set_pd(double __a, double __b, double __c, double __d)
3698{
3699  return (__m256d){ __d, __c, __b, __a };
3700}
3701
3702/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
3703///    with the specified single-precision floating-point values.
3704///
3705/// \headerfile <x86intrin.h>
3706///
3707/// This intrinsic is a utility function and does not correspond to a specific
3708///   instruction.
3709///
3710/// \param __a
3711///    A single-precision floating-point value used to initialize bits [255:224]
3712///    of the result.
3713/// \param __b
3714///    A single-precision floating-point value used to initialize bits [223:192]
3715///    of the result.
3716/// \param __c
3717///    A single-precision floating-point value used to initialize bits [191:160]
3718///    of the result.
3719/// \param __d
3720///    A single-precision floating-point value used to initialize bits [159:128]
3721///    of the result.
3722/// \param __e
3723///    A single-precision floating-point value used to initialize bits [127:96]
3724///    of the result.
3725/// \param __f
3726///    A single-precision floating-point value used to initialize bits [95:64]
3727///    of the result.
3728/// \param __g
3729///    A single-precision floating-point value used to initialize bits [63:32]
3730///    of the result.
3731/// \param __h
3732///    A single-precision floating-point value used to initialize bits [31:0]
3733///    of the result.
3734/// \returns An initialized 256-bit floating-point vector of [8 x float].
3735static __inline __m256 __DEFAULT_FN_ATTRS
3736_mm256_set_ps(float __a, float __b, float __c, float __d,
3737              float __e, float __f, float __g, float __h)
3738{
3739  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3740}
3741
3742/// \brief Constructs a 256-bit integer vector initialized with the specified
3743///    32-bit integral values.
3744///
3745/// \headerfile <x86intrin.h>
3746///
3747/// This intrinsic is a utility function and does not correspond to a specific
3748///   instruction.
3749///
3750/// \param __i0
3751///    A 32-bit integral value used to initialize bits [255:224] of the result.
3752/// \param __i1
3753///    A 32-bit integral value used to initialize bits [223:192] of the result.
3754/// \param __i2
3755///    A 32-bit integral value used to initialize bits [191:160] of the result.
3756/// \param __i3
3757///    A 32-bit integral value used to initialize bits [159:128] of the result.
3758/// \param __i4
3759///    A 32-bit integral value used to initialize bits [127:96] of the result.
3760/// \param __i5
3761///    A 32-bit integral value used to initialize bits [95:64] of the result.
3762/// \param __i6
3763///    A 32-bit integral value used to initialize bits [63:32] of the result.
3764/// \param __i7
3765///    A 32-bit integral value used to initialize bits [31:0] of the result.
3766/// \returns An initialized 256-bit integer vector.
3767static __inline __m256i __DEFAULT_FN_ATTRS
3768_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3769                 int __i4, int __i5, int __i6, int __i7)
3770{
3771  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3772}
3773
3774/// \brief Constructs a 256-bit integer vector initialized with the specified
3775///    16-bit integral values.
3776///
3777/// \headerfile <x86intrin.h>
3778///
3779/// This intrinsic is a utility function and does not correspond to a specific
3780///   instruction.
3781///
3782/// \param __w15
3783///    A 16-bit integral value used to initialize bits [255:240] of the result.
3784/// \param __w14
3785///    A 16-bit integral value used to initialize bits [239:224] of the result.
3786/// \param __w13
3787///    A 16-bit integral value used to initialize bits [223:208] of the result.
3788/// \param __w12
3789///    A 16-bit integral value used to initialize bits [207:192] of the result.
3790/// \param __w11
3791///    A 16-bit integral value used to initialize bits [191:176] of the result.
3792/// \param __w10
3793///    A 16-bit integral value used to initialize bits [175:160] of the result.
3794/// \param __w09
3795///    A 16-bit integral value used to initialize bits [159:144] of the result.
3796/// \param __w08
3797///    A 16-bit integral value used to initialize bits [143:128] of the result.
3798/// \param __w07
3799///    A 16-bit integral value used to initialize bits [127:112] of the result.
3800/// \param __w06
3801///    A 16-bit integral value used to initialize bits [111:96] of the result.
3802/// \param __w05
3803///    A 16-bit integral value used to initialize bits [95:80] of the result.
3804/// \param __w04
3805///    A 16-bit integral value used to initialize bits [79:64] of the result.
3806/// \param __w03
3807///    A 16-bit integral value used to initialize bits [63:48] of the result.
3808/// \param __w02
3809///    A 16-bit integral value used to initialize bits [47:32] of the result.
3810/// \param __w01
3811///    A 16-bit integral value used to initialize bits [31:16] of the result.
3812/// \param __w00
3813///    A 16-bit integral value used to initialize bits [15:0] of the result.
3814/// \returns An initialized 256-bit integer vector.
3815static __inline __m256i __DEFAULT_FN_ATTRS
3816_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3817                 short __w11, short __w10, short __w09, short __w08,
3818                 short __w07, short __w06, short __w05, short __w04,
3819                 short __w03, short __w02, short __w01, short __w00)
3820{
3821  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3822    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3823}
3824
3825/// \brief Constructs a 256-bit integer vector initialized with the specified
3826///    8-bit integral values.
3827///
3828/// \headerfile <x86intrin.h>
3829///
3830/// This intrinsic is a utility function and does not correspond to a specific
3831///   instruction.
3832///
3833/// \param __b31
3834///    An 8-bit integral value used to initialize bits [255:248] of the result.
3835/// \param __b30
3836///    An 8-bit integral value used to initialize bits [247:240] of the result.
3837/// \param __b29
3838///    An 8-bit integral value used to initialize bits [239:232] of the result.
3839/// \param __b28
3840///    An 8-bit integral value used to initialize bits [231:224] of the result.
3841/// \param __b27
3842///    An 8-bit integral value used to initialize bits [223:216] of the result.
3843/// \param __b26
3844///    An 8-bit integral value used to initialize bits [215:208] of the result.
3845/// \param __b25
3846///    An 8-bit integral value used to initialize bits [207:200] of the result.
3847/// \param __b24
3848///    An 8-bit integral value used to initialize bits [199:192] of the result.
3849/// \param __b23
3850///    An 8-bit integral value used to initialize bits [191:184] of the result.
3851/// \param __b22
3852///    An 8-bit integral value used to initialize bits [183:176] of the result.
3853/// \param __b21
3854///    An 8-bit integral value used to initialize bits [175:168] of the result.
3855/// \param __b20
3856///    An 8-bit integral value used to initialize bits [167:160] of the result.
3857/// \param __b19
3858///    An 8-bit integral value used to initialize bits [159:152] of the result.
3859/// \param __b18
3860///    An 8-bit integral value used to initialize bits [151:144] of the result.
3861/// \param __b17
3862///    An 8-bit integral value used to initialize bits [143:136] of the result.
3863/// \param __b16
3864///    An 8-bit integral value used to initialize bits [135:128] of the result.
3865/// \param __b15
3866///    An 8-bit integral value used to initialize bits [127:120] of the result.
3867/// \param __b14
3868///    An 8-bit integral value used to initialize bits [119:112] of the result.
3869/// \param __b13
3870///    An 8-bit integral value used to initialize bits [111:104] of the result.
3871/// \param __b12
3872///    An 8-bit integral value used to initialize bits [103:96] of the result.
3873/// \param __b11
3874///    An 8-bit integral value used to initialize bits [95:88] of the result.
3875/// \param __b10
3876///    An 8-bit integral value used to initialize bits [87:80] of the result.
3877/// \param __b09
3878///    An 8-bit integral value used to initialize bits [79:72] of the result.
3879/// \param __b08
3880///    An 8-bit integral value used to initialize bits [71:64] of the result.
3881/// \param __b07
3882///    An 8-bit integral value used to initialize bits [63:56] of the result.
3883/// \param __b06
3884///    An 8-bit integral value used to initialize bits [55:48] of the result.
3885/// \param __b05
3886///    An 8-bit integral value used to initialize bits [47:40] of the result.
3887/// \param __b04
3888///    An 8-bit integral value used to initialize bits [39:32] of the result.
3889/// \param __b03
3890///    An 8-bit integral value used to initialize bits [31:24] of the result.
3891/// \param __b02
3892///    An 8-bit integral value used to initialize bits [23:16] of the result.
3893/// \param __b01
3894///    An 8-bit integral value used to initialize bits [15:8] of the result.
3895/// \param __b00
3896///    An 8-bit integral value used to initialize bits [7:0] of the result.
3897/// \returns An initialized 256-bit integer vector.
3898static __inline __m256i __DEFAULT_FN_ATTRS
3899_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3900                char __b27, char __b26, char __b25, char __b24,
3901                char __b23, char __b22, char __b21, char __b20,
3902                char __b19, char __b18, char __b17, char __b16,
3903                char __b15, char __b14, char __b13, char __b12,
3904                char __b11, char __b10, char __b09, char __b08,
3905                char __b07, char __b06, char __b05, char __b04,
3906                char __b03, char __b02, char __b01, char __b00)
3907{
3908  return (__m256i)(__v32qi){
3909    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3910    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3911    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3912    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3913  };
3914}
3915
3916/// \brief Constructs a 256-bit integer vector initialized with the specified
3917///    64-bit integral values.
3918///
3919/// \headerfile <x86intrin.h>
3920///
3921/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3922///   instruction.
3923///
3924/// \param __a
3925///    A 64-bit integral value used to initialize bits [255:192] of the result.
3926/// \param __b
3927///    A 64-bit integral value used to initialize bits [191:128] of the result.
3928/// \param __c
3929///    A 64-bit integral value used to initialize bits [127:64] of the result.
3930/// \param __d
3931///    A 64-bit integral value used to initialize bits [63:0] of the result.
3932/// \returns An initialized 256-bit integer vector.
3933static __inline __m256i __DEFAULT_FN_ATTRS
3934_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3935{
3936  return (__m256i)(__v4di){ __d, __c, __b, __a };
3937}
3938
3939/* Create vectors with elements in reverse order */
3940/// \brief Constructs a 256-bit floating-point vector of [4 x double],
3941///    initialized in reverse order with the specified double-precision
3942///    floating-point values.
3943///
3944/// \headerfile <x86intrin.h>
3945///
3946/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3947///   instruction.
3948///
3949/// \param __a
3950///    A double-precision floating-point value used to initialize bits [63:0]
3951///    of the result.
3952/// \param __b
3953///    A double-precision floating-point value used to initialize bits [127:64]
3954///    of the result.
3955/// \param __c
3956///    A double-precision floating-point value used to initialize bits [191:128]
3957///    of the result.
3958/// \param __d
3959///    A double-precision floating-point value used to initialize bits [255:192]
3960///    of the result.
3961/// \returns An initialized 256-bit floating-point vector of [4 x double].
3962static __inline __m256d __DEFAULT_FN_ATTRS
3963_mm256_setr_pd(double __a, double __b, double __c, double __d)
3964{
3965  return (__m256d){ __a, __b, __c, __d };
3966}
3967
3968/// \brief Constructs a 256-bit floating-point vector of [8 x float],
3969///    initialized in reverse order with the specified single-precision
3970///    float-point values.
3971///
3972/// \headerfile <x86intrin.h>
3973///
3974/// This intrinsic is a utility function and does not correspond to a specific
3975///   instruction.
3976///
3977/// \param __a
3978///    A single-precision floating-point value used to initialize bits [31:0]
3979///    of the result.
3980/// \param __b
3981///    A single-precision floating-point value used to initialize bits [63:32]
3982///    of the result.
3983/// \param __c
3984///    A single-precision floating-point value used to initialize bits [95:64]
3985///    of the result.
3986/// \param __d
3987///    A single-precision floating-point value used to initialize bits [127:96]
3988///    of the result.
3989/// \param __e
3990///    A single-precision floating-point value used to initialize bits [159:128]
3991///    of the result.
3992/// \param __f
3993///    A single-precision floating-point value used to initialize bits [191:160]
3994///    of the result.
3995/// \param __g
3996///    A single-precision floating-point value used to initialize bits [223:192]
3997///    of the result.
3998/// \param __h
3999///    A single-precision floating-point value used to initialize bits [255:224]
4000///    of the result.
4001/// \returns An initialized 256-bit floating-point vector of [8 x float].
4002static __inline __m256 __DEFAULT_FN_ATTRS
4003_mm256_setr_ps(float __a, float __b, float __c, float __d,
4004               float __e, float __f, float __g, float __h)
4005{
4006  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
4007}
4008
4009/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4010///    with the specified 32-bit integral values.
4011///
4012/// \headerfile <x86intrin.h>
4013///
4014/// This intrinsic is a utility function and does not correspond to a specific
4015///   instruction.
4016///
4017/// \param __i0
4018///    A 32-bit integral value used to initialize bits [31:0] of the result.
4019/// \param __i1
4020///    A 32-bit integral value used to initialize bits [63:32] of the result.
4021/// \param __i2
4022///    A 32-bit integral value used to initialize bits [95:64] of the result.
4023/// \param __i3
4024///    A 32-bit integral value used to initialize bits [127:96] of the result.
4025/// \param __i4
4026///    A 32-bit integral value used to initialize bits [159:128] of the result.
4027/// \param __i5
4028///    A 32-bit integral value used to initialize bits [191:160] of the result.
4029/// \param __i6
4030///    A 32-bit integral value used to initialize bits [223:192] of the result.
4031/// \param __i7
4032///    A 32-bit integral value used to initialize bits [255:224] of the result.
4033/// \returns An initialized 256-bit integer vector.
4034static __inline __m256i __DEFAULT_FN_ATTRS
4035_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4036                  int __i4, int __i5, int __i6, int __i7)
4037{
4038  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
4039}
4040
4041/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4042///    with the specified 16-bit integral values.
4043///
4044/// \headerfile <x86intrin.h>
4045///
4046/// This intrinsic is a utility function and does not correspond to a specific
4047///   instruction.
4048///
4049/// \param __w15
4050///    A 16-bit integral value used to initialize bits [15:0] of the result.
4051/// \param __w14
4052///    A 16-bit integral value used to initialize bits [31:16] of the result.
4053/// \param __w13
4054///    A 16-bit integral value used to initialize bits [47:32] of the result.
4055/// \param __w12
4056///    A 16-bit integral value used to initialize bits [63:48] of the result.
4057/// \param __w11
4058///    A 16-bit integral value used to initialize bits [79:64] of the result.
4059/// \param __w10
4060///    A 16-bit integral value used to initialize bits [95:80] of the result.
4061/// \param __w09
4062///    A 16-bit integral value used to initialize bits [111:96] of the result.
4063/// \param __w08
4064///    A 16-bit integral value used to initialize bits [127:112] of the result.
4065/// \param __w07
4066///    A 16-bit integral value used to initialize bits [143:128] of the result.
4067/// \param __w06
4068///    A 16-bit integral value used to initialize bits [159:144] of the result.
4069/// \param __w05
4070///    A 16-bit integral value used to initialize bits [175:160] of the result.
4071/// \param __w04
4072///    A 16-bit integral value used to initialize bits [191:176] of the result.
4073/// \param __w03
4074///    A 16-bit integral value used to initialize bits [207:192] of the result.
4075/// \param __w02
4076///    A 16-bit integral value used to initialize bits [223:208] of the result.
4077/// \param __w01
4078///    A 16-bit integral value used to initialize bits [239:224] of the result.
4079/// \param __w00
4080///    A 16-bit integral value used to initialize bits [255:240] of the result.
4081/// \returns An initialized 256-bit integer vector.
4082static __inline __m256i __DEFAULT_FN_ATTRS
4083_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4084       short __w11, short __w10, short __w09, short __w08,
4085       short __w07, short __w06, short __w05, short __w04,
4086       short __w03, short __w02, short __w01, short __w00)
4087{
4088  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
4089    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
4090}
4091
4092/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4093///    with the specified 8-bit integral values.
4094///
4095/// \headerfile <x86intrin.h>
4096///
4097/// This intrinsic is a utility function and does not correspond to a specific
4098///   instruction.
4099///
4100/// \param __b31
4101///    An 8-bit integral value used to initialize bits [7:0] of the result.
4102/// \param __b30
4103///    An 8-bit integral value used to initialize bits [15:8] of the result.
4104/// \param __b29
4105///    An 8-bit integral value used to initialize bits [23:16] of the result.
4106/// \param __b28
4107///    An 8-bit integral value used to initialize bits [31:24] of the result.
4108/// \param __b27
4109///    An 8-bit integral value used to initialize bits [39:32] of the result.
4110/// \param __b26
4111///    An 8-bit integral value used to initialize bits [47:40] of the result.
4112/// \param __b25
4113///    An 8-bit integral value used to initialize bits [55:48] of the result.
4114/// \param __b24
4115///    An 8-bit integral value used to initialize bits [63:56] of the result.
4116/// \param __b23
4117///    An 8-bit integral value used to initialize bits [71:64] of the result.
4118/// \param __b22
4119///    An 8-bit integral value used to initialize bits [79:72] of the result.
4120/// \param __b21
4121///    An 8-bit integral value used to initialize bits [87:80] of the result.
4122/// \param __b20
4123///    An 8-bit integral value used to initialize bits [95:88] of the result.
4124/// \param __b19
4125///    An 8-bit integral value used to initialize bits [103:96] of the result.
4126/// \param __b18
4127///    An 8-bit integral value used to initialize bits [111:104] of the result.
4128/// \param __b17
4129///    An 8-bit integral value used to initialize bits [119:112] of the result.
4130/// \param __b16
4131///    An 8-bit integral value used to initialize bits [127:120] of the result.
4132/// \param __b15
4133///    An 8-bit integral value used to initialize bits [135:128] of the result.
4134/// \param __b14
4135///    An 8-bit integral value used to initialize bits [143:136] of the result.
4136/// \param __b13
4137///    An 8-bit integral value used to initialize bits [151:144] of the result.
4138/// \param __b12
4139///    An 8-bit integral value used to initialize bits [159:152] of the result.
4140/// \param __b11
4141///    An 8-bit integral value used to initialize bits [167:160] of the result.
4142/// \param __b10
4143///    An 8-bit integral value used to initialize bits [175:168] of the result.
4144/// \param __b09
4145///    An 8-bit integral value used to initialize bits [183:176] of the result.
4146/// \param __b08
4147///    An 8-bit integral value used to initialize bits [191:184] of the result.
4148/// \param __b07
4149///    An 8-bit integral value used to initialize bits [199:192] of the result.
4150/// \param __b06
4151///    An 8-bit integral value used to initialize bits [207:200] of the result.
4152/// \param __b05
4153///    An 8-bit integral value used to initialize bits [215:208] of the result.
4154/// \param __b04
4155///    An 8-bit integral value used to initialize bits [223:216] of the result.
4156/// \param __b03
4157///    An 8-bit integral value used to initialize bits [231:224] of the result.
4158/// \param __b02
4159///    An 8-bit integral value used to initialize bits [239:232] of the result.
4160/// \param __b01
4161///    An 8-bit integral value used to initialize bits [247:240] of the result.
4162/// \param __b00
4163///    An 8-bit integral value used to initialize bits [255:248] of the result.
4164/// \returns An initialized 256-bit integer vector.
4165static __inline __m256i __DEFAULT_FN_ATTRS
4166_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4167                 char __b27, char __b26, char __b25, char __b24,
4168                 char __b23, char __b22, char __b21, char __b20,
4169                 char __b19, char __b18, char __b17, char __b16,
4170                 char __b15, char __b14, char __b13, char __b12,
4171                 char __b11, char __b10, char __b09, char __b08,
4172                 char __b07, char __b06, char __b05, char __b04,
4173                 char __b03, char __b02, char __b01, char __b00)
4174{
4175  return (__m256i)(__v32qi){
4176    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
4177    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
4178    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
4179    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
4180}
4181
4182/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4183///    with the specified 64-bit integral values.
4184///
4185/// \headerfile <x86intrin.h>
4186///
4187/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4188///   instruction.
4189///
4190/// \param __a
4191///    A 64-bit integral value used to initialize bits [63:0] of the result.
4192/// \param __b
4193///    A 64-bit integral value used to initialize bits [127:64] of the result.
4194/// \param __c
4195///    A 64-bit integral value used to initialize bits [191:128] of the result.
4196/// \param __d
4197///    A 64-bit integral value used to initialize bits [255:192] of the result.
4198/// \returns An initialized 256-bit integer vector.
4199static __inline __m256i __DEFAULT_FN_ATTRS
4200_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4201{
4202  return (__m256i)(__v4di){ __a, __b, __c, __d };
4203}
4204
4205/* Create vectors with repeated elements */
4206/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
4207///    of the four double-precision floating-point vector elements set to the
4208///    specified double-precision floating-point value.
4209///
4210/// \headerfile <x86intrin.h>
4211///
4212/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4213///
4214/// \param __w
4215///    A double-precision floating-point value used to initialize each vector
4216///    element of the result.
4217/// \returns An initialized 256-bit floating-point vector of [4 x double].
4218static __inline __m256d __DEFAULT_FN_ATTRS
4219_mm256_set1_pd(double __w)
4220{
4221  return (__m256d){ __w, __w, __w, __w };
4222}
4223
4224/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
4225///    of the eight single-precision floating-point vector elements set to the
4226///    specified single-precision floating-point value.
4227///
4228/// \headerfile <x86intrin.h>
4229///
4230/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4231///   instruction.
4232///
4233/// \param __w
4234///    A single-precision floating-point value used to initialize each vector
4235///    element of the result.
4236/// \returns An initialized 256-bit floating-point vector of [8 x float].
4237static __inline __m256 __DEFAULT_FN_ATTRS
4238_mm256_set1_ps(float __w)
4239{
4240  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
4241}
4242
4243/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
4244///    32-bit integral vector elements set to the specified 32-bit integral
4245///    value.
4246///
4247/// \headerfile <x86intrin.h>
4248///
4249/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4250///   instruction.
4251///
4252/// \param __i
4253///    A 32-bit integral value used to initialize each vector element of the
4254///    result.
4255/// \returns An initialized 256-bit integer vector of [8 x i32].
4256static __inline __m256i __DEFAULT_FN_ATTRS
4257_mm256_set1_epi32(int __i)
4258{
4259  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
4260}
4261
4262/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
4263///    16-bit integral vector elements set to the specified 16-bit integral
4264///    value.
4265///
4266/// \headerfile <x86intrin.h>
4267///
4268/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4269///
4270/// \param __w
4271///    A 16-bit integral value used to initialize each vector element of the
4272///    result.
4273/// \returns An initialized 256-bit integer vector of [16 x i16].
4274static __inline __m256i __DEFAULT_FN_ATTRS
4275_mm256_set1_epi16(short __w)
4276{
4277  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
4278    __w, __w, __w, __w, __w, __w };
4279}
4280
4281/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
4282///    8-bit integral vector elements set to the specified 8-bit integral value.
4283///
4284/// \headerfile <x86intrin.h>
4285///
4286/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4287///
4288/// \param __b
4289///    An 8-bit integral value used to initialize each vector element of the
4290///    result.
4291/// \returns An initialized 256-bit integer vector of [32 x i8].
4292static __inline __m256i __DEFAULT_FN_ATTRS
4293_mm256_set1_epi8(char __b)
4294{
4295  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4296    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4297    __b, __b, __b, __b, __b, __b, __b };
4298}
4299
4300/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
4301///    64-bit integral vector elements set to the specified 64-bit integral
4302///    value.
4303///
4304/// \headerfile <x86intrin.h>
4305///
4306/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4307///
4308/// \param __q
4309///    A 64-bit integral value used to initialize each vector element of the
4310///    result.
4311/// \returns An initialized 256-bit integer vector of [4 x i64].
4312static __inline __m256i __DEFAULT_FN_ATTRS
4313_mm256_set1_epi64x(long long __q)
4314{
4315  return (__m256i)(__v4di){ __q, __q, __q, __q };
4316}
4317
/* Create zeroed vectors */
4319/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
4320///    vector elements initialized to zero.
4321///
4322/// \headerfile <x86intrin.h>
4323///
4324/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4325///
4326/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4327static __inline __m256d __DEFAULT_FN_ATTRS
4328_mm256_setzero_pd(void)
4329{
4330  return (__m256d){ 0, 0, 0, 0 };
4331}
4332
4333/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
4334///    vector elements initialized to zero.
4335///
4336/// \headerfile <x86intrin.h>
4337///
4338/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4339///
4340/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4341static __inline __m256 __DEFAULT_FN_ATTRS
4342_mm256_setzero_ps(void)
4343{
4344  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
4345}
4346
4347/// \brief Constructs a 256-bit integer vector initialized to zero.
4348///
4349/// \headerfile <x86intrin.h>
4350///
4351/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4352///
4353/// \returns A 256-bit integer vector initialized to zero.
4354static __inline __m256i __DEFAULT_FN_ATTRS
4355_mm256_setzero_si256(void)
4356{
4357  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
4358}
4359
4360/* Cast between vector types */
4361/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4362///    floating-point vector of [8 x float].
4363///
4364/// \headerfile <x86intrin.h>
4365///
4366/// This intrinsic has no corresponding instruction.
4367///
4368/// \param __a
4369///    A 256-bit floating-point vector of [4 x double].
4370/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4371///    bitwise pattern as the parameter.
4372static __inline __m256 __DEFAULT_FN_ATTRS
4373_mm256_castpd_ps(__m256d __a)
4374{
4375  return (__m256)__a;
4376}
4377
4378/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4379///    integer vector.
4380///
4381/// \headerfile <x86intrin.h>
4382///
4383/// This intrinsic has no corresponding instruction.
4384///
4385/// \param __a
4386///    A 256-bit floating-point vector of [4 x double].
4387/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4388///    parameter.
4389static __inline __m256i __DEFAULT_FN_ATTRS
4390_mm256_castpd_si256(__m256d __a)
4391{
4392  return (__m256i)__a;
4393}
4394
4395/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4396///    floating-point vector of [4 x double].
4397///
4398/// \headerfile <x86intrin.h>
4399///
4400/// This intrinsic has no corresponding instruction.
4401///
4402/// \param __a
4403///    A 256-bit floating-point vector of [8 x float].
4404/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4405///    bitwise pattern as the parameter.
4406static __inline __m256d __DEFAULT_FN_ATTRS
4407_mm256_castps_pd(__m256 __a)
4408{
4409  return (__m256d)__a;
4410}
4411
4412/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4413///    integer vector.
4414///
4415/// \headerfile <x86intrin.h>
4416///
4417/// This intrinsic has no corresponding instruction.
4418///
4419/// \param __a
4420///    A 256-bit floating-point vector of [8 x float].
4421/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4422///    parameter.
4423static __inline __m256i __DEFAULT_FN_ATTRS
4424_mm256_castps_si256(__m256 __a)
4425{
4426  return (__m256i)__a;
4427}
4428
4429/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4430///    of [8 x float].
4431///
4432/// \headerfile <x86intrin.h>
4433///
4434/// This intrinsic has no corresponding instruction.
4435///
4436/// \param __a
4437///    A 256-bit integer vector.
4438/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4439///    bitwise pattern as the parameter.
4440static __inline __m256 __DEFAULT_FN_ATTRS
4441_mm256_castsi256_ps(__m256i __a)
4442{
4443  return (__m256)__a;
4444}
4445
4446/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4447///    of [4 x double].
4448///
4449/// \headerfile <x86intrin.h>
4450///
4451/// This intrinsic has no corresponding instruction.
4452///
4453/// \param __a
4454///    A 256-bit integer vector.
4455/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4456///    bitwise pattern as the parameter.
4457static __inline __m256d __DEFAULT_FN_ATTRS
4458_mm256_castsi256_pd(__m256i __a)
4459{
4460  return (__m256d)__a;
4461}
4462
4463/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4464///    [4 x double] as a 128-bit floating-point vector of [2 x double].
4465///
4466/// \headerfile <x86intrin.h>
4467///
4468/// This intrinsic has no corresponding instruction.
4469///
4470/// \param __a
4471///    A 256-bit floating-point vector of [4 x double].
4472/// \returns A 128-bit floating-point vector of [2 x double] containing the
4473///    lower 128 bits of the parameter.
4474static __inline __m128d __DEFAULT_FN_ATTRS
4475_mm256_castpd256_pd128(__m256d __a)
4476{
4477  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4478}
4479
4480/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4481///    [8 x float] as a 128-bit floating-point vector of [4 x float].
4482///
4483/// \headerfile <x86intrin.h>
4484///
4485/// This intrinsic has no corresponding instruction.
4486///
4487/// \param __a
4488///    A 256-bit floating-point vector of [8 x float].
4489/// \returns A 128-bit floating-point vector of [4 x float] containing the
4490///    lower 128 bits of the parameter.
4491static __inline __m128 __DEFAULT_FN_ATTRS
4492_mm256_castps256_ps128(__m256 __a)
4493{
4494  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4495}
4496
4497/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
4498///
4499/// \headerfile <x86intrin.h>
4500///
4501/// This intrinsic has no corresponding instruction.
4502///
4503/// \param __a
4504///    A 256-bit integer vector.
4505/// \returns A 128-bit integer vector containing the lower 128 bits of the
4506///    parameter.
4507static __inline __m128i __DEFAULT_FN_ATTRS
4508_mm256_castsi256_si128(__m256i __a)
4509{
4510  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4511}
4512
4513/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
4514///    128-bit floating-point vector of [2 x double].
4515///
4516///    The lower 128 bits contain the value of the source vector. The contents
4517///    of the upper 128 bits are undefined.
4518///
4519/// \headerfile <x86intrin.h>
4520///
4521/// This intrinsic has no corresponding instruction.
4522///
4523/// \param __a
4524///    A 128-bit vector of [2 x double].
4525/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4526///    contain the value of the parameter. The contents of the upper 128 bits
4527///    are undefined.
4528static __inline __m256d __DEFAULT_FN_ATTRS
4529_mm256_castpd128_pd256(__m128d __a)
4530{
4531  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
4532}
4533
4534/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
4535///    128-bit floating-point vector of [4 x float].
4536///
4537///    The lower 128 bits contain the value of the source vector. The contents
4538///    of the upper 128 bits are undefined.
4539///
4540/// \headerfile <x86intrin.h>
4541///
4542/// This intrinsic has no corresponding instruction.
4543///
4544/// \param __a
4545///    A 128-bit vector of [4 x float].
4546/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4547///    contain the value of the parameter. The contents of the upper 128 bits
4548///    are undefined.
4549static __inline __m256 __DEFAULT_FN_ATTRS
4550_mm256_castps128_ps256(__m128 __a)
4551{
4552  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
4553}
4554
4555/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
4556///
4557///    The lower 128 bits contain the value of the source vector. The contents
4558///    of the upper 128 bits are undefined.
4559///
4560/// \headerfile <x86intrin.h>
4561///
4562/// This intrinsic has no corresponding instruction.
4563///
4564/// \param __a
4565///    A 128-bit integer vector.
4566/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4567///    the parameter. The contents of the upper 128 bits are undefined.
4568static __inline __m256i __DEFAULT_FN_ATTRS
4569_mm256_castsi128_si256(__m128i __a)
4570{
4571  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
4572}
4573
4574/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
4575///    128-bit floating-point vector of [2 x double]. The lower 128 bits
4576///    contain the value of the source vector. The upper 128 bits are set
4577///    to zero.
4578///
4579/// \headerfile <x86intrin.h>
4580///
4581/// This intrinsic has no corresponding instruction.
4582///
4583/// \param __a
4584///    A 128-bit vector of [2 x double].
4585/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4586///    contain the value of the parameter. The upper 128 bits are set to zero.
4587static __inline __m256d __DEFAULT_FN_ATTRS
4588_mm256_zextpd128_pd256(__m128d __a)
4589{
4590  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4591}
4592
4593/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
4594///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4595///    the value of the source vector. The upper 128 bits are set to zero.
4596///
4597/// \headerfile <x86intrin.h>
4598///
4599/// This intrinsic has no corresponding instruction.
4600///
4601/// \param __a
4602///    A 128-bit vector of [4 x float].
4603/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4604///    contain the value of the parameter. The upper 128 bits are set to zero.
4605static __inline __m256 __DEFAULT_FN_ATTRS
4606_mm256_zextps128_ps256(__m128 __a)
4607{
4608  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4609}
4610
4611/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
4612///    The lower 128 bits contain the value of the source vector. The upper
4613///    128 bits are set to zero.
4614///
4615/// \headerfile <x86intrin.h>
4616///
4617/// This intrinsic has no corresponding instruction.
4618///
4619/// \param __a
4620///    A 128-bit integer vector.
4621/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4622///    the parameter. The upper 128 bits are set to zero.
4623static __inline __m256i __DEFAULT_FN_ATTRS
4624_mm256_zextsi128_si256(__m128i __a)
4625{
4626  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4627}
4628
4629/*
4630   Vector insert.
4631   We use macros rather than inlines because we only want to accept
4632   invocations where the immediate M is a constant expression.
4633*/
/// \brief Builds a 256-bit vector of [8 x float] that is a copy of the first
///    parameter with one of its 128-bit halves replaced by the 128-bit vector
///    of [4 x float] given in the second parameter.
///
///    Bit [0] of the immediate parameter chooses which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. The half of this vector that is not
///    selected by \a M is carried over to the result unchanged.
/// \param V2
///    A 128-bit vector of [4 x float]. Its contents are written to the half of
///    the result selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
///    [255:128] of the result. \n
///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
///    [127:0] of the result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    ((((M) & 1) == 0) ?  8 :  0), ((((M) & 1) == 0) ?  9 :  1), \
    ((((M) & 1) == 0) ? 10 :  2), ((((M) & 1) == 0) ? 11 :  3), \
    ((((M) & 1) == 0) ?  4 :  8), ((((M) & 1) == 0) ?  5 :  9), \
    ((((M) & 1) == 0) ?  6 : 10), ((((M) & 1) == 0) ?  7 : 11) );})
4680
/// \brief Builds a 256-bit vector of [4 x double] that is a copy of the first
///    parameter with one of its 128-bit halves replaced by the 128-bit vector
///    of [2 x double] given in the second parameter.
///
///    Bit [0] of the immediate parameter chooses which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. The half of this vector that is not
///    selected by \a M is carried over to the result unchanged.
/// \param V2
///    A 128-bit vector of [2 x double]. Its contents are written to the half
///    of the result selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
///    [255:128] of the result. \n
///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
///    [127:0] of the result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), ((((M) & 1) == 0) ? 3 : 5) );})
4723
/// \brief Builds a 256-bit integer vector that is a copy of the first
///    parameter with one of its 128-bit halves replaced by the 128-bit integer
///    vector given in the second parameter.
///
///    Bit [0] of the immediate parameter chooses which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. The half of this vector that is not selected
///    by \a M is carried over to the result unchanged.
/// \param V2
///    A 128-bit integer vector. Its contents are written to the half of the
///    result selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
///    [255:128] of the result. \n
///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
///    [127:0] of the result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), ((((M) & 1) == 0) ? 3 : 5) );})
4766
4767/*
4768   Vector extract.
4769   We use macros rather than inlines because we only want to accept
4770   invocations where the immediate M is a constant expression.
4771*/
/// \brief Returns one 128-bit half of a 256-bit vector of [8 x float] as a
///    128-bit vector of [4 x float]. Bit [0] of the immediate parameter
///    chooses between the lower and the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    ((((M) & 1) == 0) ? 0 : 4), ((((M) & 1) == 0) ? 1 : 5), \
    ((((M) & 1) == 0) ? 2 : 6), ((((M) & 1) == 0) ? 3 : 7) );})
4801
/// \brief Returns one 128-bit half of a 256-bit vector of [4 x double] as a
///    128-bit vector of [2 x double]. Bit [0] of the immediate parameter
///    chooses between the lower and the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    ((((M) & 1) == 0) ? 0 : 2), ((((M) & 1) == 0) ? 1 : 3) );})
4829
/// \brief Returns one 128-bit half of a 256-bit integer vector as a 128-bit
///    integer vector. Bit [0] of the immediate parameter chooses between the
///    lower and the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    ((((M) & 1) == 0) ? 0 : 2), ((((M) & 1) == 0) ? 1 : 3) );})
4857
4858/* SIMD load ops (unaligned) */
4859/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
4860///    unaligned memory locations and constructs a 256-bit floating-point vector
4861///    of [8 x float] by concatenating the two 128-bit vectors.
4862///
4863/// \headerfile <x86intrin.h>
4864///
4865/// This intrinsic corresponds to load instructions followed by the
4866///   <c> VINSERTF128 </c> instruction.
4867///
4868/// \param __addr_hi
4869///    A pointer to a 128-bit memory location containing 4 consecutive
4870///    single-precision floating-point values. These values are to be copied to
4871///    bits[255:128] of the result. The address of the memory location does not
4872///    have to be aligned.
4873/// \param __addr_lo
4874///    A pointer to a 128-bit memory location containing 4 consecutive
4875///    single-precision floating-point values. These values are to be copied to
4876///    bits[127:0] of the result. The address of the memory location does not
4877///    have to be aligned.
4878/// \returns A 256-bit floating-point vector of [8 x float] containing the
4879///    concatenated result.
4880static __inline __m256 __DEFAULT_FN_ATTRS
4881_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4882{
4883  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4884  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
4885}
4886
4887/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
4888///    unaligned memory locations and constructs a 256-bit floating-point vector
4889///    of [4 x double] by concatenating the two 128-bit vectors.
4890///
4891/// \headerfile <x86intrin.h>
4892///
4893/// This intrinsic corresponds to load instructions followed by the
4894///   <c> VINSERTF128 </c> instruction.
4895///
4896/// \param __addr_hi
4897///    A pointer to a 128-bit memory location containing two consecutive
4898///    double-precision floating-point values. These values are to be copied to
4899///    bits[255:128] of the result. The address of the memory location does not
4900///    have to be aligned.
4901/// \param __addr_lo
4902///    A pointer to a 128-bit memory location containing two consecutive
4903///    double-precision floating-point values. These values are to be copied to
4904///    bits[127:0] of the result. The address of the memory location does not
4905///    have to be aligned.
4906/// \returns A 256-bit floating-point vector of [4 x double] containing the
4907///    concatenated result.
4908static __inline __m256d __DEFAULT_FN_ATTRS
4909_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4910{
4911  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4912  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
4913}
4914
4915/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
4916///    constructs a 256-bit integer vector by concatenating the two 128-bit
4917///    vectors.
4918///
4919/// \headerfile <x86intrin.h>
4920///
4921/// This intrinsic corresponds to load instructions followed by the
4922///   <c> VINSERTF128 </c> instruction.
4923///
4924/// \param __addr_hi
4925///    A pointer to a 128-bit memory location containing a 128-bit integer
4926///    vector. This vector is to be copied to bits[255:128] of the result. The
4927///    address of the memory location does not have to be aligned.
4928/// \param __addr_lo
4929///    A pointer to a 128-bit memory location containing a 128-bit integer
4930///    vector. This vector is to be copied to bits[127:0] of the result. The
4931///    address of the memory location does not have to be aligned.
4932/// \returns A 256-bit integer vector containing the concatenated result.
4933static __inline __m256i __DEFAULT_FN_ATTRS
4934_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
4935{
4936  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4937  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
4938}
4939
4940/* SIMD store ops (unaligned) */
4941/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4942///    vector of [8 x float] into two different unaligned memory locations.
4943///
4944/// \headerfile <x86intrin.h>
4945///
4946/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4947///   store instructions.
4948///
4949/// \param __addr_hi
4950///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4951///    copied to this memory location. The address of this memory location does
4952///    not have to be aligned.
4953/// \param __addr_lo
4954///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4955///    copied to this memory location. The address of this memory location does
4956///    not have to be aligned.
4957/// \param __a
4958///    A 256-bit floating-point vector of [8 x float].
4959static __inline void __DEFAULT_FN_ATTRS
4960_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4961{
4962  __m128 __v128;
4963
4964  __v128 = _mm256_castps256_ps128(__a);
4965  _mm_storeu_ps(__addr_lo, __v128);
4966  __v128 = _mm256_extractf128_ps(__a, 1);
4967  _mm_storeu_ps(__addr_hi, __v128);
4968}
4969
4970/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4971///    vector of [4 x double] into two different unaligned memory locations.
4972///
4973/// \headerfile <x86intrin.h>
4974///
4975/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4976///   store instructions.
4977///
4978/// \param __addr_hi
4979///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4980///    copied to this memory location. The address of this memory location does
4981///    not have to be aligned.
4982/// \param __addr_lo
4983///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4984///    copied to this memory location. The address of this memory location does
4985///    not have to be aligned.
4986/// \param __a
4987///    A 256-bit floating-point vector of [4 x double].
4988static __inline void __DEFAULT_FN_ATTRS
4989_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
4990{
4991  __m128d __v128;
4992
4993  __v128 = _mm256_castpd256_pd128(__a);
4994  _mm_storeu_pd(__addr_lo, __v128);
4995  __v128 = _mm256_extractf128_pd(__a, 1);
4996  _mm_storeu_pd(__addr_hi, __v128);
4997}
4998
4999/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
5000///    two different unaligned memory locations.
5001///
5002/// \headerfile <x86intrin.h>
5003///
5004/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5005///   store instructions.
5006///
5007/// \param __addr_hi
5008///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5009///    copied to this memory location. The address of this memory location does
5010///    not have to be aligned.
5011/// \param __addr_lo
5012///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5013///    copied to this memory location. The address of this memory location does
5014///    not have to be aligned.
5015/// \param __a
5016///    A 256-bit integer vector.
5017static __inline void __DEFAULT_FN_ATTRS
5018_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
5019{
5020  __m128i __v128;
5021
5022  __v128 = _mm256_castsi256_si128(__a);
5023  _mm_storeu_si128(__addr_lo, __v128);
5024  __v128 = _mm256_extractf128_si256(__a, 1);
5025  _mm_storeu_si128(__addr_hi, __v128);
5026}
5027
5028/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
5029///    concatenating two 128-bit floating-point vectors of [4 x float].
5030///
5031/// \headerfile <x86intrin.h>
5032///
5033/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5034///
5035/// \param __hi
5036///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
5037///    128 bits of the result.
5038/// \param __lo
5039///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
5040///    128 bits of the result.
5041/// \returns A 256-bit floating-point vector of [8 x float] containing the
5042///    concatenated result.
5043static __inline __m256 __DEFAULT_FN_ATTRS
5044_mm256_set_m128 (__m128 __hi, __m128 __lo)
5045{
5046  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
5047}
5048
5049/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
5050///    concatenating two 128-bit floating-point vectors of [2 x double].
5051///
5052/// \headerfile <x86intrin.h>
5053///
5054/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5055///
5056/// \param __hi
5057///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
5058///    128 bits of the result.
5059/// \param __lo
5060///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
5061///    128 bits of the result.
5062/// \returns A 256-bit floating-point vector of [4 x double] containing the
5063///    concatenated result.
5064static __inline __m256d __DEFAULT_FN_ATTRS
5065_mm256_set_m128d (__m128d __hi, __m128d __lo)
5066{
5067  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5068}
5069
5070/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
5071///    integer vectors.
5072///
5073/// \headerfile <x86intrin.h>
5074///
5075/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5076///
5077/// \param __hi
5078///    A 128-bit integer vector to be copied to the upper 128 bits of the
5079///    result.
5080/// \param __lo
5081///    A 128-bit integer vector to be copied to the lower 128 bits of the
5082///    result.
5083/// \returns A 256-bit integer vector containing the concatenated result.
5084static __inline __m256i __DEFAULT_FN_ATTRS
5085_mm256_set_m128i (__m128i __hi, __m128i __lo)
5086{
5087  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5088}
5089
5090/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
5091///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
5092///    similar to _mm256_set_m128, but the order of the input parameters is
5093///    swapped.
5094///
5095/// \headerfile <x86intrin.h>
5096///
5097/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5098///
5099/// \param __lo
5100///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
5101///    128 bits of the result.
5102/// \param __hi
5103///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
5104///    128 bits of the result.
5105/// \returns A 256-bit floating-point vector of [8 x float] containing the
5106///    concatenated result.
5107static __inline __m256 __DEFAULT_FN_ATTRS
5108_mm256_setr_m128 (__m128 __lo, __m128 __hi)
5109{
5110  return _mm256_set_m128(__hi, __lo);
5111}
5112
5113/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
5114///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
5115///    similar to _mm256_set_m128d, but the order of the input parameters is
5116///    swapped.
5117///
5118/// \headerfile <x86intrin.h>
5119///
5120/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5121///
5122/// \param __lo
5123///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
5124///    128 bits of the result.
5125/// \param __hi
5126///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
5127///    128 bits of the result.
5128/// \returns A 256-bit floating-point vector of [4 x double] containing the
5129///    concatenated result.
5130static __inline __m256d __DEFAULT_FN_ATTRS
5131_mm256_setr_m128d (__m128d __lo, __m128d __hi)
5132{
5133  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5134}
5135
5136/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
5137///    integer vectors. This is similar to _mm256_set_m128i, but the order of
5138///    the input parameters is swapped.
5139///
5140/// \headerfile <x86intrin.h>
5141///
5142/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5143///
5144/// \param __lo
5145///    A 128-bit integer vector to be copied to the lower 128 bits of the
5146///    result.
5147/// \param __hi
5148///    A 128-bit integer vector to be copied to the upper 128 bits of the
5149///    result.
5150/// \returns A 256-bit integer vector containing the concatenated result.
5151static __inline __m256i __DEFAULT_FN_ATTRS
5152_mm256_setr_m128i (__m128i __lo, __m128i __hi)
5153{
5154  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5155}
5156
5157#undef __DEFAULT_FN_ATTRS
5158
5159#endif /* __AVXINTRIN_H */
5160