/* avxintrin.h revision 322320 */
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

24#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
27
28#ifndef __AVXINTRIN_H
29#define __AVXINTRIN_H
30
/* Internal 32-byte vector types, one per signed/plain element type. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 32-byte vector types used in the signatures of the intrinsics below. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))

/* Arithmetic */
56/// \brief Adds two 256-bit vectors of [4 x double].
57///
58/// \headerfile <x86intrin.h>
59///
60/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
61///
62/// \param __a
63///    A 256-bit vector of [4 x double] containing one of the source operands.
64/// \param __b
65///    A 256-bit vector of [4 x double] containing one of the source operands.
66/// \returns A 256-bit vector of [4 x double] containing the sums of both
67///    operands.
68static __inline __m256d __DEFAULT_FN_ATTRS
69_mm256_add_pd(__m256d __a, __m256d __b)
70{
71  return (__m256d)((__v4df)__a+(__v4df)__b);
72}
73
74/// \brief Adds two 256-bit vectors of [8 x float].
75///
76/// \headerfile <x86intrin.h>
77///
78/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
79///
80/// \param __a
81///    A 256-bit vector of [8 x float] containing one of the source operands.
82/// \param __b
83///    A 256-bit vector of [8 x float] containing one of the source operands.
84/// \returns A 256-bit vector of [8 x float] containing the sums of both
85///    operands.
86static __inline __m256 __DEFAULT_FN_ATTRS
87_mm256_add_ps(__m256 __a, __m256 __b)
88{
89  return (__m256)((__v8sf)__a+(__v8sf)__b);
90}
91
92/// \brief Subtracts two 256-bit vectors of [4 x double].
93///
94/// \headerfile <x86intrin.h>
95///
96/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
97///
98/// \param __a
99///    A 256-bit vector of [4 x double] containing the minuend.
100/// \param __b
101///    A 256-bit vector of [4 x double] containing the subtrahend.
102/// \returns A 256-bit vector of [4 x double] containing the differences between
103///    both operands.
104static __inline __m256d __DEFAULT_FN_ATTRS
105_mm256_sub_pd(__m256d __a, __m256d __b)
106{
107  return (__m256d)((__v4df)__a-(__v4df)__b);
108}
109
110/// \brief Subtracts two 256-bit vectors of [8 x float].
111///
112/// \headerfile <x86intrin.h>
113///
114/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
115///
116/// \param __a
117///    A 256-bit vector of [8 x float] containing the minuend.
118/// \param __b
119///    A 256-bit vector of [8 x float] containing the subtrahend.
120/// \returns A 256-bit vector of [8 x float] containing the differences between
121///    both operands.
122static __inline __m256 __DEFAULT_FN_ATTRS
123_mm256_sub_ps(__m256 __a, __m256 __b)
124{
125  return (__m256)((__v8sf)__a-(__v8sf)__b);
126}
127
128/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
129///    two 256-bit vectors of [4 x double].
130///
131/// \headerfile <x86intrin.h>
132///
133/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
134///
135/// \param __a
136///    A 256-bit vector of [4 x double] containing the left source operand.
137/// \param __b
138///    A 256-bit vector of [4 x double] containing the right source operand.
139/// \returns A 256-bit vector of [4 x double] containing the alternating sums
140///    and differences between both operands.
141static __inline __m256d __DEFAULT_FN_ATTRS
142_mm256_addsub_pd(__m256d __a, __m256d __b)
143{
144  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
145}
146
147/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
148///    two 256-bit vectors of [8 x float].
149///
150/// \headerfile <x86intrin.h>
151///
152/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
153///
154/// \param __a
155///    A 256-bit vector of [8 x float] containing the left source operand.
156/// \param __b
157///    A 256-bit vector of [8 x float] containing the right source operand.
158/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
159///    differences between both operands.
160static __inline __m256 __DEFAULT_FN_ATTRS
161_mm256_addsub_ps(__m256 __a, __m256 __b)
162{
163  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
164}
165
166/// \brief Divides two 256-bit vectors of [4 x double].
167///
168/// \headerfile <x86intrin.h>
169///
170/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
171///
172/// \param __a
173///    A 256-bit vector of [4 x double] containing the dividend.
174/// \param __b
175///    A 256-bit vector of [4 x double] containing the divisor.
176/// \returns A 256-bit vector of [4 x double] containing the quotients of both
177///    operands.
178static __inline __m256d __DEFAULT_FN_ATTRS
179_mm256_div_pd(__m256d __a, __m256d __b)
180{
181  return (__m256d)((__v4df)__a/(__v4df)__b);
182}
183
184/// \brief Divides two 256-bit vectors of [8 x float].
185///
186/// \headerfile <x86intrin.h>
187///
188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
189///
190/// \param __a
191///    A 256-bit vector of [8 x float] containing the dividend.
192/// \param __b
193///    A 256-bit vector of [8 x float] containing the divisor.
194/// \returns A 256-bit vector of [8 x float] containing the quotients of both
195///    operands.
196static __inline __m256 __DEFAULT_FN_ATTRS
197_mm256_div_ps(__m256 __a, __m256 __b)
198{
199  return (__m256)((__v8sf)__a/(__v8sf)__b);
200}
201
202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
203///    of each pair of values.
204///
205/// \headerfile <x86intrin.h>
206///
207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
208///
209/// \param __a
210///    A 256-bit vector of [4 x double] containing one of the operands.
211/// \param __b
212///    A 256-bit vector of [4 x double] containing one of the operands.
213/// \returns A 256-bit vector of [4 x double] containing the maximum values
214///    between both operands.
215static __inline __m256d __DEFAULT_FN_ATTRS
216_mm256_max_pd(__m256d __a, __m256d __b)
217{
218  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
219}
220
221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
222///    of each pair of values.
223///
224/// \headerfile <x86intrin.h>
225///
226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
227///
228/// \param __a
229///    A 256-bit vector of [8 x float] containing one of the operands.
230/// \param __b
231///    A 256-bit vector of [8 x float] containing one of the operands.
232/// \returns A 256-bit vector of [8 x float] containing the maximum values
233///    between both operands.
234static __inline __m256 __DEFAULT_FN_ATTRS
235_mm256_max_ps(__m256 __a, __m256 __b)
236{
237  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
238}
239
240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
241///    of each pair of values.
242///
243/// \headerfile <x86intrin.h>
244///
245/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
246///
247/// \param __a
248///    A 256-bit vector of [4 x double] containing one of the operands.
249/// \param __b
250///    A 256-bit vector of [4 x double] containing one of the operands.
251/// \returns A 256-bit vector of [4 x double] containing the minimum values
252///    between both operands.
253static __inline __m256d __DEFAULT_FN_ATTRS
254_mm256_min_pd(__m256d __a, __m256d __b)
255{
256  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
257}
258
259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
260///    of each pair of values.
261///
262/// \headerfile <x86intrin.h>
263///
264/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
265///
266/// \param __a
267///    A 256-bit vector of [8 x float] containing one of the operands.
268/// \param __b
269///    A 256-bit vector of [8 x float] containing one of the operands.
270/// \returns A 256-bit vector of [8 x float] containing the minimum values
271///    between both operands.
272static __inline __m256 __DEFAULT_FN_ATTRS
273_mm256_min_ps(__m256 __a, __m256 __b)
274{
275  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
276}
277
278/// \brief Multiplies two 256-bit vectors of [4 x double].
279///
280/// \headerfile <x86intrin.h>
281///
282/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
283///
284/// \param __a
285///    A 256-bit vector of [4 x double] containing one of the operands.
286/// \param __b
287///    A 256-bit vector of [4 x double] containing one of the operands.
288/// \returns A 256-bit vector of [4 x double] containing the products of both
289///    operands.
290static __inline __m256d __DEFAULT_FN_ATTRS
291_mm256_mul_pd(__m256d __a, __m256d __b)
292{
293  return (__m256d)((__v4df)__a * (__v4df)__b);
294}
295
296/// \brief Multiplies two 256-bit vectors of [8 x float].
297///
298/// \headerfile <x86intrin.h>
299///
300/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
301///
302/// \param __a
303///    A 256-bit vector of [8 x float] containing one of the operands.
304/// \param __b
305///    A 256-bit vector of [8 x float] containing one of the operands.
306/// \returns A 256-bit vector of [8 x float] containing the products of both
307///    operands.
308static __inline __m256 __DEFAULT_FN_ATTRS
309_mm256_mul_ps(__m256 __a, __m256 __b)
310{
311  return (__m256)((__v8sf)__a * (__v8sf)__b);
312}
313
314/// \brief Calculates the square roots of the values in a 256-bit vector of
315///    [4 x double].
316///
317/// \headerfile <x86intrin.h>
318///
319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
320///
321/// \param __a
322///    A 256-bit vector of [4 x double].
323/// \returns A 256-bit vector of [4 x double] containing the square roots of the
324///    values in the operand.
325static __inline __m256d __DEFAULT_FN_ATTRS
326_mm256_sqrt_pd(__m256d __a)
327{
328  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
329}
330
331/// \brief Calculates the square roots of the values in a 256-bit vector of
332///    [8 x float].
333///
334/// \headerfile <x86intrin.h>
335///
336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
337///
338/// \param __a
339///    A 256-bit vector of [8 x float].
340/// \returns A 256-bit vector of [8 x float] containing the square roots of the
341///    values in the operand.
342static __inline __m256 __DEFAULT_FN_ATTRS
343_mm256_sqrt_ps(__m256 __a)
344{
345  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
346}
347
348/// \brief Calculates the reciprocal square roots of the values in a 256-bit
349///    vector of [8 x float].
350///
351/// \headerfile <x86intrin.h>
352///
353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
354///
355/// \param __a
356///    A 256-bit vector of [8 x float].
357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
358///    roots of the values in the operand.
359static __inline __m256 __DEFAULT_FN_ATTRS
360_mm256_rsqrt_ps(__m256 __a)
361{
362  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
363}
364
365/// \brief Calculates the reciprocals of the values in a 256-bit vector of
366///    [8 x float].
367///
368/// \headerfile <x86intrin.h>
369///
370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
371///
372/// \param __a
373///    A 256-bit vector of [8 x float].
374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
375///    values in the operand.
376static __inline __m256 __DEFAULT_FN_ATTRS
377_mm256_rcp_ps(__m256 __a)
378{
379  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
380}
381
/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Expands to a single parenthesised expression; V is cast to __m256d first so
 * the argument is checked against the documented type. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Expands to a single parenthesised expression; V is cast to __m256 first so
 * the argument is checked against the documented type. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/* NOTE(review): _MM_FROUND_CEIL is not defined in the visible part of this
 * file; presumably it is provided by another intrinsics header - confirm. */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
/* NOTE(review): _MM_FROUND_FLOOR is not defined in the visible part of this
 * file; presumably it is provided by another intrinsics header - confirm. */
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
/* NOTE(review): _MM_FROUND_CEIL is not defined in the visible part of this
 * file; presumably it is provided by another intrinsics header - confirm. */
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
/* NOTE(review): _MM_FROUND_FLOOR is not defined in the visible part of this
 * file; presumably it is provided by another intrinsics header - confirm. */
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

/* Logical */
516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
517///
518/// \headerfile <x86intrin.h>
519///
520/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
521///
522/// \param __a
523///    A 256-bit vector of [4 x double] containing one of the source operands.
524/// \param __b
525///    A 256-bit vector of [4 x double] containing one of the source operands.
526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
527///    values between both operands.
528static __inline __m256d __DEFAULT_FN_ATTRS
529_mm256_and_pd(__m256d __a, __m256d __b)
530{
531  return (__m256d)((__v4du)__a & (__v4du)__b);
532}
533
534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
535///
536/// \headerfile <x86intrin.h>
537///
538/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
539///
540/// \param __a
541///    A 256-bit vector of [8 x float] containing one of the source operands.
542/// \param __b
543///    A 256-bit vector of [8 x float] containing one of the source operands.
544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
545///    values between both operands.
546static __inline __m256 __DEFAULT_FN_ATTRS
547_mm256_and_ps(__m256 __a, __m256 __b)
548{
549  return (__m256)((__v8su)__a & (__v8su)__b);
550}
551
552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
553///    the one's complement of the values contained in the first source operand.
554///
555/// \headerfile <x86intrin.h>
556///
557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
558///
559/// \param __a
560///    A 256-bit vector of [4 x double] containing the left source operand. The
561///    one's complement of this value is used in the bitwise AND.
562/// \param __b
563///    A 256-bit vector of [4 x double] containing the right source operand.
564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
565///    values of the second operand and the one's complement of the first
566///    operand.
567static __inline __m256d __DEFAULT_FN_ATTRS
568_mm256_andnot_pd(__m256d __a, __m256d __b)
569{
570  return (__m256d)(~(__v4du)__a & (__v4du)__b);
571}
572
573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
574///    the one's complement of the values contained in the first source operand.
575///
576/// \headerfile <x86intrin.h>
577///
578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
579///
580/// \param __a
581///    A 256-bit vector of [8 x float] containing the left source operand. The
582///    one's complement of this value is used in the bitwise AND.
583/// \param __b
584///    A 256-bit vector of [8 x float] containing the right source operand.
585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
586///    values of the second operand and the one's complement of the first
587///    operand.
588static __inline __m256 __DEFAULT_FN_ATTRS
589_mm256_andnot_ps(__m256 __a, __m256 __b)
590{
591  return (__m256)(~(__v8su)__a & (__v8su)__b);
592}
593
594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
595///
596/// \headerfile <x86intrin.h>
597///
598/// This intrinsic corresponds to the <c> VORPD </c> instruction.
599///
600/// \param __a
601///    A 256-bit vector of [4 x double] containing one of the source operands.
602/// \param __b
603///    A 256-bit vector of [4 x double] containing one of the source operands.
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
605///    values between both operands.
606static __inline __m256d __DEFAULT_FN_ATTRS
607_mm256_or_pd(__m256d __a, __m256d __b)
608{
609  return (__m256d)((__v4du)__a | (__v4du)__b);
610}
611
612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
613///
614/// \headerfile <x86intrin.h>
615///
616/// This intrinsic corresponds to the <c> VORPS </c> instruction.
617///
618/// \param __a
619///    A 256-bit vector of [8 x float] containing one of the source operands.
620/// \param __b
621///    A 256-bit vector of [8 x float] containing one of the source operands.
622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
623///    values between both operands.
624static __inline __m256 __DEFAULT_FN_ATTRS
625_mm256_or_ps(__m256 __a, __m256 __b)
626{
627  return (__m256)((__v8su)__a | (__v8su)__b);
628}
629
630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
631///
632/// \headerfile <x86intrin.h>
633///
634/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
635///
636/// \param __a
637///    A 256-bit vector of [4 x double] containing one of the source operands.
638/// \param __b
639///    A 256-bit vector of [4 x double] containing one of the source operands.
640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
641///    values between both operands.
642static __inline __m256d __DEFAULT_FN_ATTRS
643_mm256_xor_pd(__m256d __a, __m256d __b)
644{
645  return (__m256d)((__v4du)__a ^ (__v4du)__b);
646}
647
648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
649///
650/// \headerfile <x86intrin.h>
651///
652/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
653///
654/// \param __a
655///    A 256-bit vector of [8 x float] containing one of the source operands.
656/// \param __b
657///    A 256-bit vector of [8 x float] containing one of the source operands.
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
659///    values between both operands.
660static __inline __m256 __DEFAULT_FN_ATTRS
661_mm256_xor_ps(__m256 __a, __m256 __b)
662{
663  return (__m256)((__v8su)__a ^ (__v8su)__b);
664}
665
/* Horizontal arithmetic */
667/// \brief Horizontally adds the adjacent pairs of values contained in two
668///    256-bit vectors of [4 x double].
669///
670/// \headerfile <x86intrin.h>
671///
672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
673///
674/// \param __a
675///    A 256-bit vector of [4 x double] containing one of the source operands.
676///    The horizontal sums of the values are returned in the even-indexed
677///    elements of a vector of [4 x double].
678/// \param __b
679///    A 256-bit vector of [4 x double] containing one of the source operands.
680///    The horizontal sums of the values are returned in the odd-indexed
681///    elements of a vector of [4 x double].
682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
683///    both operands.
684static __inline __m256d __DEFAULT_FN_ATTRS
685_mm256_hadd_pd(__m256d __a, __m256d __b)
686{
687  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
688}
689
690/// \brief Horizontally adds the adjacent pairs of values contained in two
691///    256-bit vectors of [8 x float].
692///
693/// \headerfile <x86intrin.h>
694///
695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
696///
697/// \param __a
698///    A 256-bit vector of [8 x float] containing one of the source operands.
699///    The horizontal sums of the values are returned in the elements with
700///    index 0, 1, 4, 5 of a vector of [8 x float].
701/// \param __b
702///    A 256-bit vector of [8 x float] containing one of the source operands.
703///    The horizontal sums of the values are returned in the elements with
704///    index 2, 3, 6, 7 of a vector of [8 x float].
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
706///    both operands.
707static __inline __m256 __DEFAULT_FN_ATTRS
708_mm256_hadd_ps(__m256 __a, __m256 __b)
709{
710  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
711}
712
713/// \brief Horizontally subtracts the adjacent pairs of values contained in two
714///    256-bit vectors of [4 x double].
715///
716/// \headerfile <x86intrin.h>
717///
718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
719///
720/// \param __a
721///    A 256-bit vector of [4 x double] containing one of the source operands.
722///    The horizontal differences between the values are returned in the
723///    even-indexed elements of a vector of [4 x double].
724/// \param __b
725///    A 256-bit vector of [4 x double] containing one of the source operands.
726///    The horizontal differences between the values are returned in the
727///    odd-indexed elements of a vector of [4 x double].
728/// \returns A 256-bit vector of [4 x double] containing the horizontal
729///    differences of both operands.
730static __inline __m256d __DEFAULT_FN_ATTRS
731_mm256_hsub_pd(__m256d __a, __m256d __b)
732{
733  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
734}
735
736/// \brief Horizontally subtracts the adjacent pairs of values contained in two
737///    256-bit vectors of [8 x float].
738///
739/// \headerfile <x86intrin.h>
740///
741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
742///
743/// \param __a
744///    A 256-bit vector of [8 x float] containing one of the source operands.
745///    The horizontal differences between the values are returned in the
746///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
747/// \param __b
748///    A 256-bit vector of [8 x float] containing one of the source operands.
749///    The horizontal differences between the values are returned in the
750///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
751/// \returns A 256-bit vector of [8 x float] containing the horizontal
752///    differences of both operands.
753static __inline __m256 __DEFAULT_FN_ATTRS
754_mm256_hsub_ps(__m256 __a, __m256 __b)
755{
756  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
757}
758
/* Vector permutations */
760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
761///    by the 128-bit integer vector operand.
762///
763/// \headerfile <x86intrin.h>
764///
765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
766///
767/// \param __a
768///    A 128-bit vector of [2 x double].
769/// \param __c
770///    A 128-bit integer vector operand specifying how the values are to be
771///    copied. \n
772///    Bit [1]: \n
773///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
774///         vector. \n
775///      1: Bits [127:64] of the source are copied to bits [63:0] of the
776///         returned vector. \n
777///    Bit [65]: \n
778///      0: Bits [63:0] of the source are copied to bits [127:64] of the
779///         returned vector. \n
780///      1: Bits [127:64] of the source are copied to bits [127:64] of the
781///         returned vector.
782/// \returns A 128-bit vector of [2 x double] containing the copied values.
783static __inline __m128d __DEFAULT_FN_ATTRS
784_mm_permutevar_pd(__m128d __a, __m128i __c)
785{
786  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
787}
788
789/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
790///    by the 256-bit integer vector operand.
791///
792/// \headerfile <x86intrin.h>
793///
794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
795///
796/// \param __a
797///    A 256-bit vector of [4 x double].
798/// \param __c
799///    A 256-bit integer vector operand specifying how the values are to be
800///    copied. \n
801///    Bit [1]: \n
802///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
803///         vector. \n
804///      1: Bits [127:64] of the source are copied to bits [63:0] of the
805///         returned vector. \n
806///    Bit [65]: \n
807///      0: Bits [63:0] of the source are copied to bits [127:64] of the
808///         returned vector. \n
809///      1: Bits [127:64] of the source are copied to bits [127:64] of the
810///         returned vector. \n
811///    Bit [129]: \n
812///      0: Bits [191:128] of the source are copied to bits [191:128] of the
813///         returned vector. \n
814///      1: Bits [255:192] of the source are copied to bits [191:128] of the
815///         returned vector. \n
816///    Bit [193]: \n
817///      0: Bits [191:128] of the source are copied to bits [255:192] of the
818///         returned vector. \n
819///      1: Bits [255:192] of the source are copied to bits [255:192] of the
820///    returned vector.
821/// \returns A 256-bit vector of [4 x double] containing the copied values.
822static __inline __m256d __DEFAULT_FN_ATTRS
823_mm256_permutevar_pd(__m256d __a, __m256i __c)
824{ /* __a goes to the builtin as __v4df, the selector __c as __v4di. */
825  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
826}
827
828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
829///    specified by the 128-bit integer vector operand.
830/// \headerfile <x86intrin.h>
831///
832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
833///
834/// \param __a
835///    A 128-bit vector of [4 x float].
836/// \param __c
837///    A 128-bit integer vector operand specifying how the values are to be
838///    copied. \n
839///    Bits [1:0]: \n
840///      00: Bits [31:0] of the source are copied to bits [31:0] of the
841///          returned vector. \n
842///      01: Bits [63:32] of the source are copied to bits [31:0] of the
843///          returned vector. \n
844///      10: Bits [95:64] of the source are copied to bits [31:0] of the
845///          returned vector. \n
846///      11: Bits [127:96] of the source are copied to bits [31:0] of the
847///          returned vector. \n
848///    Bits [33:32]: \n
849///      00: Bits [31:0] of the source are copied to bits [63:32] of the
850///          returned vector. \n
851///      01: Bits [63:32] of the source are copied to bits [63:32] of the
852///          returned vector. \n
853///      10: Bits [95:64] of the source are copied to bits [63:32] of the
854///          returned vector. \n
855///      11: Bits [127:96] of the source are copied to bits [63:32] of the
856///          returned vector. \n
857///    Bits [65:64]: \n
858///      00: Bits [31:0] of the source are copied to bits [95:64] of the
859///          returned vector. \n
860///      01: Bits [63:32] of the source are copied to bits [95:64] of the
861///          returned vector. \n
862///      10: Bits [95:64] of the source are copied to bits [95:64] of the
863///          returned vector. \n
864///      11: Bits [127:96] of the source are copied to bits [95:64] of the
865///          returned vector. \n
866///    Bits [97:96]: \n
867///      00: Bits [31:0] of the source are copied to bits [127:96] of the
868///          returned vector. \n
869///      01: Bits [63:32] of the source are copied to bits [127:96] of the
870///          returned vector. \n
871///      10: Bits [95:64] of the source are copied to bits [127:96] of the
872///          returned vector. \n
873///      11: Bits [127:96] of the source are copied to bits [127:96] of the
874///          returned vector.
875/// \returns A 128-bit vector of [4 x float] containing the copied values.
876static __inline __m128 __DEFAULT_FN_ATTRS
877_mm_permutevar_ps(__m128 __a, __m128i __c)
878{ /* __a goes to the builtin as __v4sf, the selector __c as __v4si. */
879  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
880}
881
882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
883///    specified by the 256-bit integer vector operand.
884///
885/// \headerfile <x86intrin.h>
886///
887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
888///
889/// \param __a
890///    A 256-bit vector of [8 x float].
891/// \param __c
892///    A 256-bit integer vector operand specifying how the values are to be
893///    copied. \n
894///    Bits [1:0]: \n
895///      00: Bits [31:0] of the source are copied to bits [31:0] of the
896///          returned vector. \n
897///      01: Bits [63:32] of the source are copied to bits [31:0] of the
898///          returned vector. \n
899///      10: Bits [95:64] of the source are copied to bits [31:0] of the
900///          returned vector. \n
901///      11: Bits [127:96] of the source are copied to bits [31:0] of the
902///          returned vector. \n
903///    Bits [33:32]: \n
904///      00: Bits [31:0] of the source are copied to bits [63:32] of the
905///          returned vector. \n
906///      01: Bits [63:32] of the source are copied to bits [63:32] of the
907///          returned vector. \n
908///      10: Bits [95:64] of the source are copied to bits [63:32] of the
909///          returned vector. \n
910///      11: Bits [127:96] of the source are copied to bits [63:32] of the
911///          returned vector. \n
912///    Bits [65:64]: \n
913///      00: Bits [31:0] of the source are copied to bits [95:64] of the
914///          returned vector. \n
915///      01: Bits [63:32] of the source are copied to bits [95:64] of the
916///          returned vector. \n
917///      10: Bits [95:64] of the source are copied to bits [95:64] of the
918///          returned vector. \n
919///      11: Bits [127:96] of the source are copied to bits [95:64] of the
920///          returned vector. \n
921///    Bits [97:96]: \n
922///      00: Bits [31:0] of the source are copied to bits [127:96] of the
923///          returned vector. \n
924///      01: Bits [63:32] of the source are copied to bits [127:96] of the
925///          returned vector. \n
926///      10: Bits [95:64] of the source are copied to bits [127:96] of the
927///          returned vector. \n
928///      11: Bits [127:96] of the source are copied to bits [127:96] of the
929///          returned vector. \n
930///    Bits [129:128]: \n
931///      00: Bits [159:128] of the source are copied to bits [159:128] of the
932///          returned vector. \n
933///      01: Bits [191:160] of the source are copied to bits [159:128] of the
934///          returned vector. \n
935///      10: Bits [223:192] of the source are copied to bits [159:128] of the
936///          returned vector. \n
937///      11: Bits [255:224] of the source are copied to bits [159:128] of the
938///          returned vector. \n
939///    Bits [161:160]: \n
940///      00: Bits [159:128] of the source are copied to bits [191:160] of the
941///          returned vector. \n
942///      01: Bits [191:160] of the source are copied to bits [191:160] of the
943///          returned vector. \n
944///      10: Bits [223:192] of the source are copied to bits [191:160] of the
945///          returned vector. \n
946///      11: Bits [255:224] of the source are copied to bits [191:160] of the
947///          returned vector. \n
948///    Bits [193:192]: \n
949///      00: Bits [159:128] of the source are copied to bits [223:192] of the
950///          returned vector. \n
951///      01: Bits [191:160] of the source are copied to bits [223:192] of the
952///          returned vector. \n
953///      10: Bits [223:192] of the source are copied to bits [223:192] of the
954///          returned vector. \n
955///      11: Bits [255:224] of the source are copied to bits [223:192] of the
956///          returned vector. \n
957///    Bits [225:224]: \n
958///      00: Bits [159:128] of the source are copied to bits [255:224] of the
959///          returned vector. \n
960///      01: Bits [191:160] of the source are copied to bits [255:224] of the
961///          returned vector. \n
962///      10: Bits [223:192] of the source are copied to bits [255:224] of the
963///          returned vector. \n
964///      11: Bits [255:224] of the source are copied to bits [255:224] of the
965///          returned vector.
966/// \returns A 256-bit vector of [8 x float] containing the copied values.
967static __inline __m256 __DEFAULT_FN_ATTRS
968_mm256_permutevar_ps(__m256 __a, __m256i __c)
969{ /* __a goes to the builtin as __v8sf, the selector __c as __v8si. */
970  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
971}
972
973/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
974///    by the immediate integer operand.
975///
976/// \headerfile <x86intrin.h>
977///
978/// \code
979/// __m128d _mm_permute_pd(__m128d A, const int C);
980/// \endcode
981///
982/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
983///
984/// \param A
985///    A 128-bit vector of [2 x double].
986/// \param C
987///    An immediate integer operand specifying how the values are to be
988///    copied. \n
989///    Bit [0]: \n
990///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
991///         vector. \n
992///      1: Bits [127:64] of the source are copied to bits [63:0] of the
993///         returned vector. \n
994///    Bit [1]: \n
995///      0: Bits [63:0] of the source are copied to bits [127:64] of the
996///         returned vector. \n
997///      1: Bits [127:64] of the source are copied to bits [127:64] of the
998///         returned vector.
999/// \returns A 128-bit vector of [2 x double] containing the copied values.
1000#define _mm_permute_pd(A, C) __extension__ ({ /* result[i] = A[(C >> i) & 1] */ \
1001  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
1002                                   (__v2df)_mm_undefined_pd(), /* never selected: indices < 2 */ \
1003                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
1004
1005/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
1006///    the immediate integer operand.
1007///
1008/// \headerfile <x86intrin.h>
1009///
1010/// \code
1011/// __m256d _mm256_permute_pd(__m256d A, const int C);
1012/// \endcode
1013///
1014/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1015///
1016/// \param A
1017///    A 256-bit vector of [4 x double].
1018/// \param C
1019///    An immediate integer operand specifying how the values are to be
1020///    copied. \n
1021///    Bit [0]: \n
1022///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1023///         vector. \n
1024///      1: Bits [127:64] of the source are copied to bits [63:0] of the
1025///         returned vector. \n
1026///    Bit [1]: \n
1027///      0: Bits [63:0] of the source are copied to bits [127:64] of the
1028///         returned vector. \n
1029///      1: Bits [127:64] of the source are copied to bits [127:64] of the
1030///         returned vector. \n
1031///    Bit [2]: \n
1032///      0: Bits [191:128] of the source are copied to bits [191:128] of the
1033///         returned vector. \n
1034///      1: Bits [255:192] of the source are copied to bits [191:128] of the
1035///         returned vector. \n
1036///    Bit [3]: \n
1037///      0: Bits [191:128] of the source are copied to bits [255:192] of the
1038///         returned vector. \n
1039///      1: Bits [255:192] of the source are copied to bits [255:192] of the
1040///         returned vector.
1041/// \returns A 256-bit vector of [4 x double] containing the copied values.
1042#define _mm256_permute_pd(A, C) __extension__ ({ /* C bit i picks within result[i]'s own 128-bit half */ \
1043  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
1044                                   (__v4df)_mm256_undefined_pd(), /* never selected: indices < 4 */ \
1045                                   0 + (((C) >> 0) & 0x1), /* result[0] = A[0] or A[1] */ \
1046                                   0 + (((C) >> 1) & 0x1), /* result[1] = A[0] or A[1] */ \
1047                                   2 + (((C) >> 2) & 0x1), /* result[2] = A[2] or A[3] */ \
1048                                   2 + (((C) >> 3) & 0x1)); }) /* result[3] = A[2] or A[3] */
1049
1050/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
1051///    the immediate integer operand.
1052///
1053/// \headerfile <x86intrin.h>
1054///
1055/// \code
1056/// __m128 _mm_permute_ps(__m128 A, const int C);
1057/// \endcode
1058///
1059/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1060///
1061/// \param A
1062///    A 128-bit vector of [4 x float].
1063/// \param C
1064///    An immediate integer operand specifying how the values are to be
1065///    copied. \n
1066///    Bits [1:0]: \n
1067///      00: Bits [31:0] of the source are copied to bits [31:0] of the
1068///          returned vector. \n
1069///      01: Bits [63:32] of the source are copied to bits [31:0] of the
1070///          returned vector. \n
1071///      10: Bits [95:64] of the source are copied to bits [31:0] of the
1072///          returned vector. \n
1073///      11: Bits [127:96] of the source are copied to bits [31:0] of the
1074///          returned vector. \n
1075///    Bits [3:2]: \n
1076///      00: Bits [31:0] of the source are copied to bits [63:32] of the
1077///          returned vector. \n
1078///      01: Bits [63:32] of the source are copied to bits [63:32] of the
1079///          returned vector. \n
1080///      10: Bits [95:64] of the source are copied to bits [63:32] of the
1081///          returned vector. \n
1082///      11: Bits [127:96] of the source are copied to bits [63:32] of the
1083///          returned vector. \n
1084///    Bits [5:4]: \n
1085///      00: Bits [31:0] of the source are copied to bits [95:64] of the
1086///          returned vector. \n
1087///      01: Bits [63:32] of the source are copied to bits [95:64] of the
1088///          returned vector. \n
1089///      10: Bits [95:64] of the source are copied to bits [95:64] of the
1090///          returned vector. \n
1091///      11: Bits [127:96] of the source are copied to bits [95:64] of the
1092///          returned vector. \n
1093///    Bits [7:6]: \n
1094///      00: Bits [31:0] of the source are copied to bits [127:96] of the
1095///          returned vector. \n
1096///      01: Bits [63:32] of the source are copied to bits [127:96] of the
1097///          returned vector. \n
1098///      10: Bits [95:64] of the source are copied to bits [127:96] of the
1099///          returned vector. \n
1100///      11: Bits [127:96] of the source are copied to bits [127:96] of the
1101///          returned vector.
1102/// \returns A 128-bit vector of [4 x float] containing the copied values.
1103#define _mm_permute_ps(A, C) __extension__ ({ /* result[i] = A[(C >> 2*i) & 3] */ \
1104  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
1105                                  (__v4sf)_mm_undefined_ps(), /* never selected: indices < 4 */ \
1106                                  ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
1107                                  ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
1108
1109/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
1110///    the immediate integer operand.
1111///
1112/// \headerfile <x86intrin.h>
1113///
1114/// \code
1115/// __m256 _mm256_permute_ps(__m256 A, const int C);
1116/// \endcode
1117///
1118/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1119///
1120/// \param A
1121///    A 256-bit vector of [8 x float].
1122/// \param C
1123///    An immediate integer operand specifying how the values are to be
1124///    copied. \n
1125///    Bits [1:0]: \n
1126///      00: Bits [31:0] of the source are copied to bits [31:0] of the
1127///          returned vector. \n
1128///      01: Bits [63:32] of the source are copied to bits [31:0] of the
1129///          returned vector. \n
1130///      10: Bits [95:64] of the source are copied to bits [31:0] of the
1131///          returned vector. \n
1132///      11: Bits [127:96] of the source are copied to bits [31:0] of the
1133///          returned vector. \n
1134///    Bits [3:2]: \n
1135///      00: Bits [31:0] of the source are copied to bits [63:32] of the
1136///          returned vector. \n
1137///      01: Bits [63:32] of the source are copied to bits [63:32] of the
1138///          returned vector. \n
1139///      10: Bits [95:64] of the source are copied to bits [63:32] of the
1140///          returned vector. \n
1141///      11: Bits [127:96] of the source are copied to bits [63:32] of the
1142///          returned vector. \n
1143///    Bits [5:4]: \n
1144///      00: Bits [31:0] of the source are copied to bits [95:64] of the
1145///          returned vector. \n
1146///      01: Bits [63:32] of the source are copied to bits [95:64] of the
1147///          returned vector. \n
1148///      10: Bits [95:64] of the source are copied to bits [95:64] of the
1149///          returned vector. \n
1150///      11: Bits [127:96] of the source are copied to bits [95:64] of the
1151///          returned vector. \n
1152///    Bits [7:6]: \n
1153///      00: Bits [31:0] of the source are copied to bits [127:96] of the
1154///          returned vector. \n
1155///      01: Bits [63:32] of the source are copied to bits [127:96] of the
1156///          returned vector. \n
1157///      10: Bits [95:64] of the source are copied to bits [127:96] of the
1158///          returned vector. \n
1159///      11: Bits [127:96] of the source are copied to bits [127:96] of the
1160///          returned vector. \n
1161///    Bits [1:0]: \n
1162///      00: Bits [159:128] of the source are copied to bits [159:128] of the
1163///          returned vector. \n
1164///      01: Bits [191:160] of the source are copied to bits [159:128] of the
1165///          returned vector. \n
1166///      10: Bits [223:192] of the source are copied to bits [159:128] of the
1167///          returned vector. \n
1168///      11: Bits [255:224] of the source are copied to bits [159:128] of the
1169///          returned vector. \n
1170///    Bits [3:2]: \n
1171///      00: Bits [159:128] of the source are copied to bits [191:160] of the
1172///          returned vector. \n
1173///      01: Bits [191:160] of the source are copied to bits [191:160] of the
1174///          returned vector. \n
1175///      10: Bits [223:192] of the source are copied to bits [191:160] of the
1176///          returned vector. \n
1177///      11: Bits [255:224] of the source are copied to bits [191:160] of the
1178///          returned vector. \n
1179///    Bits [5:4]: \n
1180///      00: Bits [159:128] of the source are copied to bits [223:192] of the
1181///          returned vector. \n
1182///      01: Bits [191:160] of the source are copied to bits [223:192] of the
1183///          returned vector. \n
1184///      10: Bits [223:192] of the source are copied to bits [223:192] of the
1185///          returned vector. \n
1186///      11: Bits [255:224] of the source are copied to bits [223:192] of the
1187///          returned vector. \n
1188///    Bits [7:6]: \n
1189///      00: Bits [159:128] of the source are copied to bits [255:224] of the
1190///          returned vector. \n
1191///      01: Bits [191:160] of the source are copied to bits [255:224] of the
1192///          returned vector. \n
1193///      10: Bits [223:192] of the source are copied to bits [255:224] of the
1194///          returned vector. \n
1195///      11: Bits [255:224] of the source are copied to bits [255:224] of the
1196///          returned vector.
1197/// \returns A 256-bit vector of [8 x float] containing the copied values.
1198#define _mm256_permute_ps(A, C) __extension__ ({ /* the four 2-bit fields of C are used twice */ \
1199  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
1200                                  (__v8sf)_mm256_undefined_ps(), /* never selected: indices < 8 */ \
1201                                  0 + (((C) >> 0) & 0x3), /* result[0..3] come from A[0..3] */ \
1202                                  0 + (((C) >> 2) & 0x3), \
1203                                  0 + (((C) >> 4) & 0x3), \
1204                                  0 + (((C) >> 6) & 0x3), \
1205                                  4 + (((C) >> 0) & 0x3), /* result[4..7] come from A[4..7] */ \
1206                                  4 + (((C) >> 2) & 0x3), \
1207                                  4 + (((C) >> 4) & 0x3), \
1208                                  4 + (((C) >> 6) & 0x3)); })
1209
1210/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
1211///    [4 x double], as specified by the immediate integer operand.
1212///
1213/// \headerfile <x86intrin.h>
1214///
1215/// \code
1216/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1217/// \endcode
1218///
1219/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1220///
1221/// \param V1
1222///    A 256-bit vector of [4 x double].
1223/// \param V2
1224///    A 256-bit vector of [4 x double].
1225/// \param M
1226///    An immediate integer operand specifying how the values are to be
1227///    permuted. \n
1228///    Bits [1:0]: \n
1229///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1230///          destination. \n
1231///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1232///          destination. \n
1233///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1234///          destination. \n
1235///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1236///          destination. \n
1237///    Bits [5:4]: \n
1238///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1239///          destination. \n
1240///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1241///          destination. \n
1242///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1243///          destination. \n
1244///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1245///          destination.
1246/// \returns A 256-bit vector of [4 x double] containing the copied values.
1247#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ /* M is handed to the builtin unchanged */ \
1248  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
1249                                           (__v4df)(__m256d)(V2), (M)); })
1250
1251/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
1252///    [8 x float], as specified by the immediate integer operand.
1253///
1254/// \headerfile <x86intrin.h>
1255///
1256/// \code
1257/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1258/// \endcode
1259///
1260/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1261///
1262/// \param V1
1263///    A 256-bit vector of [8 x float].
1264/// \param V2
1265///    A 256-bit vector of [8 x float].
1266/// \param M
1267///    An immediate integer operand specifying how the values are to be
1268///    permuted. \n
1269///    Bits [1:0]: \n
1270///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1271///    destination. \n
1272///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1273///    destination. \n
1274///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1275///    destination. \n
1276///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1277///    destination. \n
1278///    Bits [5:4]: \n
1279///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1280///    destination. \n
1281///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1282///    destination. \n
1283///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1284///    destination. \n
1285///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1286///    destination.
1287/// \returns A 256-bit vector of [8 x float] containing the copied values.
1288#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ /* M is handed to the builtin unchanged */ \
1289  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
1290                                          (__v8sf)(__m256)(V2), (M)); })
1291
1292/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
1293///    as specified by the immediate integer operand.
1294///
1295/// \headerfile <x86intrin.h>
1296///
1297/// \code
1298/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1299/// \endcode
1300///
1301/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1302///
1303/// \param V1
1304///    A 256-bit integer vector.
1305/// \param V2
1306///    A 256-bit integer vector.
1307/// \param M
1308///    An immediate integer operand specifying how the values are to be copied. \n
1309///    Bits [1:0]: \n
1310///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1311///    destination. \n
1312///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1313///    destination. \n
1314///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1315///    destination. \n
1316///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1317///    destination. \n
1318///    Bits [5:4]: \n
1319///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1320///    destination. \n
1321///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1322///    destination. \n
1323///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1324///    destination. \n
1325///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1326///    destination.
1327/// \returns A 256-bit integer vector containing the copied values.
1328#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ /* integer operands are passed as __v8si */ \
1329  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
1330                                           (__v8si)(__m256i)(V2), (M)); })
1331
1332/* Vector Blend */
1333/// \brief Merges 64-bit double-precision data values stored in either of the
1334///    two 256-bit vectors of [4 x double], as specified by the immediate
1335///    integer operand.
1336///
1337/// \headerfile <x86intrin.h>
1338///
1339/// \code
1340/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1341/// \endcode
1342///
1343/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1344///
1345/// \param V1
1346///    A 256-bit vector of [4 x double].
1347/// \param V2
1348///    A 256-bit vector of [4 x double].
1349/// \param M
1350///    An immediate integer operand, with mask bits [3:0] specifying how the
1351///    values are to be copied. The position of the mask bit corresponds to the
1352///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
1353///    element in operand \a V1 is copied to the same position in the
1354///    destination. When a mask bit is 1, the corresponding 64-bit element in
1355///    operand \a V2 is copied to the same position in the destination.
1356/// \returns A 256-bit vector of [4 x double] containing the copied values.
1357#define _mm256_blend_pd(V1, V2, M) __extension__ ({ /* index i selects V1[i]; index i + 4 selects V2[i] */ \
1358  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
1359                                   (__v4df)(__m256d)(V2), \
1360                                   (((M) & 0x01) ? 4 : 0), /* mask bit 0 -> result[0] */ \
1361                                   (((M) & 0x02) ? 5 : 1), /* mask bit 1 -> result[1] */ \
1362                                   (((M) & 0x04) ? 6 : 2), /* mask bit 2 -> result[2] */ \
1363                                   (((M) & 0x08) ? 7 : 3)); }) /* mask bit 3 -> result[3] */
1364
1365/// \brief Merges 32-bit single-precision data values stored in either of the
1366///    two 256-bit vectors of [8 x float], as specified by the immediate
1367///    integer operand.
1368///
1369/// \headerfile <x86intrin.h>
1370///
1371/// \code
1372/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1373/// \endcode
1374///
1375/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1376///
1377/// \param V1
1378///    A 256-bit vector of [8 x float].
1379/// \param V2
1380///    A 256-bit vector of [8 x float].
1381/// \param M
1382///    An immediate integer operand, with mask bits [7:0] specifying how the
1383///    values are to be copied. The position of the mask bit corresponds to the
1384///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
1385///    element in operand \a V1 is copied to the same position in the
1386///    destination. When a mask bit is 1, the corresponding 32-bit element in
1387///    operand \a V2 is copied to the same position in the destination.
1388/// \returns A 256-bit vector of [8 x float] containing the copied values.
1389#define _mm256_blend_ps(V1, V2, M) __extension__ ({ /* index i selects V1[i]; index i + 8 selects V2[i] */ \
1390  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
1391                                  (__v8sf)(__m256)(V2), \
1392                                  (((M) & 0x01) ?  8 : 0), /* mask bit 0 -> result[0] */ \
1393                                  (((M) & 0x02) ?  9 : 1), \
1394                                  (((M) & 0x04) ? 10 : 2), \
1395                                  (((M) & 0x08) ? 11 : 3), \
1396                                  (((M) & 0x10) ? 12 : 4), \
1397                                  (((M) & 0x20) ? 13 : 5), \
1398                                  (((M) & 0x40) ? 14 : 6), \
1399                                  (((M) & 0x80) ? 15 : 7)); }) /* mask bit 7 -> result[7] */
1400
1401/// \brief Merges 64-bit double-precision data values stored in either of the
1402///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1403///    operand.
1404///
1405/// \headerfile <x86intrin.h>
1406///
1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1408///
1409/// \param __a
1410///    A 256-bit vector of [4 x double].
1411/// \param __b
1412///    A 256-bit vector of [4 x double].
1413/// \param __c
1414///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1415///    how the values are to be copied. The position of the mask bit corresponds
1416///    to the most significant bit of a copied value. When a mask bit is 0, the
1417///    corresponding 64-bit element in operand \a __a is copied to the same
1418///    position in the destination. When a mask bit is 1, the corresponding
1419///    64-bit element in operand \a __b is copied to the same position in the
1420///    destination.
1421/// \returns A 256-bit vector of [4 x double] containing the copied values.
1422static __inline __m256d __DEFAULT_FN_ATTRS
1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1424{ /* All three operands, including the mask __c, are passed as __v4df. */
1425  return (__m256d)__builtin_ia32_blendvpd256(
1426    (__v4df)__a, (__v4df)__b, (__v4df)__c);
1427}
1428
1429/// \brief Merges 32-bit single-precision data values stored in either of the
1430///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1431///    operand.
1432///
1433/// \headerfile <x86intrin.h>
1434///
1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1436///
1437/// \param __a
1438///    A 256-bit vector of [8 x float].
1439/// \param __b
1440///    A 256-bit vector of [8 x float].
1441/// \param __c
1442///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1443///    and 31 specifying how the values are to be copied. The position of the
1444///    mask bit corresponds to the most significant bit of a copied value. When
1445///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1446///    copied to the same position in the destination. When a mask bit is 1, the
1447///    corresponding 32-bit element in operand \a __b is copied to the same
1448///    position in the destination.
1449/// \returns A 256-bit vector of [8 x float] containing the copied values.
1450static __inline __m256 __DEFAULT_FN_ATTRS
1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1452{ /* All three operands, including the mask __c, are passed as __v8sf. */
1453  return (__m256)__builtin_ia32_blendvps256(
1454    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1455}
1456
/* Vector Dot Product */
/// \brief Computes two dot products in parallel, using the lower and upper
///    halves of two [8 x float] vectors as input to the two computations, and
///    returning the two dot products in the lower and upper halves of the
///    [8 x float] result.
///
///    The immediate integer operand controls which input elements will
///    contribute to the dot product, and where the final results are returned.
///    In general, for each dot product, the four corresponding elements of the
///    input vectors are multiplied; the first two and second two products are
///    summed, then the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                 (__v8sf)(__m256)(V2), (M)); })

/* Vector shuffle */
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand.
///
///    The four selected elements in each operand are copied to the destination
///    according to the bits specified in the immediate operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] of the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
///    the destination. For example, if bits [7:0] of the immediate operand
///    contain a value of 0xFF, the 256-bit destination vector would contain the
///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b: \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///    the destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), \
                                  0  + (((mask) >> 0) & 0x3), \
                                  0  + (((mask) >> 2) & 0x3), \
                                  8  + (((mask) >> 4) & 0x3), \
                                  8  + (((mask) >> 6) & 0x3), \
                                  4  + (((mask) >> 0) & 0x3), \
                                  4  + (((mask) >> 2) & 0x3), \
                                  12 + (((mask) >> 4) & 0x3), \
                                  12 + (((mask) >> 6) & 0x3)); })

/// \brief Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), \
                                   0 + (((mask) >> 0) & 0x1), \
                                   4 + (((mask) >> 1) & 0x1), \
                                   2 + (((mask) >> 2) & 0x1), \
                                   6 + (((mask) >> 3) & 0x1)); })

/* Compare */
/* Comparison predicates for the immediate operand of the _mm*_cmp_* macros.
   Each name at 0x10 and above has the same relation and ordered/unordered
   class as the name 0x10 below it, with the signaling behavior flipped. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns an [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), (c)); })

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })

2003/// \brief Takes a [8 x i32] vector and returns the vector element value
2004///    indexed by the immediate constant operand.
2005///
2006/// \headerfile <x86intrin.h>
2007///
2008/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2009///   instruction.
2010///
2011/// \param __a
2012///    A 256-bit vector of [8 x i32].
2013/// \param __imm
2014///    An immediate integer operand with bits [2:0] determining which vector
2015///    element is extracted and returned.
2016/// \returns A 32-bit integer containing the extracted 32 bits of extended
2017///    packed data.
2018static __inline int __DEFAULT_FN_ATTRS
2019_mm256_extract_epi32(__m256i __a, const int __imm)
2020{
2021  __v8si __b = (__v8si)__a;
2022  return __b[__imm & 7];
2023}
2024
2025/// \brief Takes a [16 x i16] vector and returns the vector element value
2026///    indexed by the immediate constant operand.
2027///
2028/// \headerfile <x86intrin.h>
2029///
2030/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2031///   instruction.
2032///
2033/// \param __a
2034///    A 256-bit integer vector of [16 x i16].
2035/// \param __imm
2036///    An immediate integer operand with bits [3:0] determining which vector
2037///    element is extracted and returned.
2038/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
2039///    packed data.
2040static __inline int __DEFAULT_FN_ATTRS
2041_mm256_extract_epi16(__m256i __a, const int __imm)
2042{
2043  __v16hi __b = (__v16hi)__a;
2044  return (unsigned short)__b[__imm & 15];
2045}
2046
2047/// \brief Takes a [32 x i8] vector and returns the vector element value
2048///    indexed by the immediate constant operand.
2049///
2050/// \headerfile <x86intrin.h>
2051///
2052/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2053///   instruction.
2054///
2055/// \param __a
2056///    A 256-bit integer vector of [32 x i8].
2057/// \param __imm
2058///    An immediate integer operand with bits [4:0] determining which vector
2059///    element is extracted and returned.
2060/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
2061///    packed data.
2062static __inline int __DEFAULT_FN_ATTRS
2063_mm256_extract_epi8(__m256i __a, const int __imm)
2064{
2065  __v32qi __b = (__v32qi)__a;
2066  return (unsigned char)__b[__imm & 31];
2067}
2068
2069#ifdef __x86_64__
2070/// \brief Takes a [4 x i64] vector and returns the vector element value
2071///    indexed by the immediate constant operand.
2072///
2073/// \headerfile <x86intrin.h>
2074///
2075/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2076///   instruction.
2077///
2078/// \param __a
2079///    A 256-bit integer vector of [4 x i64].
2080/// \param __imm
2081///    An immediate integer operand with bits [1:0] determining which vector
2082///    element is extracted and returned.
2083/// \returns A 64-bit integer containing the extracted 64 bits of extended
2084///    packed data.
2085static __inline long long  __DEFAULT_FN_ATTRS
2086_mm256_extract_epi64(__m256i __a, const int __imm)
2087{
2088  __v4di __b = (__v4di)__a;
2089  return __b[__imm & 3];
2090}
2091#endif
2092
2093/// \brief Takes a [8 x i32] vector and replaces the vector element value
2094///    indexed by the immediate constant operand by a new value. Returns the
2095///    modified vector.
2096///
2097/// \headerfile <x86intrin.h>
2098///
2099/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2100///   instruction.
2101///
2102/// \param __a
2103///    A vector of [8 x i32] to be used by the insert operation.
2104/// \param __b
2105///    An integer value. The replacement value for the insert operation.
2106/// \param __imm
2107///    An immediate integer specifying the index of the vector element to be
2108///    replaced.
2109/// \returns A copy of vector \a __a, after replacing its element indexed by
2110///    \a __imm with \a __b.
2111static __inline __m256i __DEFAULT_FN_ATTRS
2112_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
2113{
2114  __v8si __c = (__v8si)__a;
2115  __c[__imm & 7] = __b;
2116  return (__m256i)__c;
2117}
2118
2119
2120/// \brief Takes a [16 x i16] vector and replaces the vector element value
2121///    indexed by the immediate constant operand with a new value. Returns the
2122///    modified vector.
2123///
2124/// \headerfile <x86intrin.h>
2125///
2126/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2127///   instruction.
2128///
2129/// \param __a
2130///    A vector of [16 x i16] to be used by the insert operation.
2131/// \param __b
2132///    An i16 integer value. The replacement value for the insert operation.
2133/// \param __imm
2134///    An immediate integer specifying the index of the vector element to be
2135///    replaced.
2136/// \returns A copy of vector \a __a, after replacing its element indexed by
2137///    \a __imm with \a __b.
2138static __inline __m256i __DEFAULT_FN_ATTRS
2139_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
2140{
2141  __v16hi __c = (__v16hi)__a;
2142  __c[__imm & 15] = __b;
2143  return (__m256i)__c;
2144}
2145
2146/// \brief Takes a [32 x i8] vector and replaces the vector element value
2147///    indexed by the immediate constant operand with a new value. Returns the
2148///    modified vector.
2149///
2150/// \headerfile <x86intrin.h>
2151///
2152/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2153///   instruction.
2154///
2155/// \param __a
2156///    A vector of [32 x i8] to be used by the insert operation.
2157/// \param __b
2158///    An i8 integer value. The replacement value for the insert operation.
2159/// \param __imm
2160///    An immediate integer specifying the index of the vector element to be
2161///    replaced.
2162/// \returns A copy of vector \a __a, after replacing its element indexed by
2163///    \a __imm with \a __b.
2164static __inline __m256i __DEFAULT_FN_ATTRS
2165_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
2166{
2167  __v32qi __c = (__v32qi)__a;
2168  __c[__imm & 31] = __b;
2169  return (__m256i)__c;
2170}
2171
2172#ifdef __x86_64__
2173/// \brief Takes a [4 x i64] vector and replaces the vector element value
2174///    indexed by the immediate constant operand with a new value. Returns the
2175///    modified vector.
2176///
2177/// \headerfile <x86intrin.h>
2178///
2179/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2180///   instruction.
2181///
2182/// \param __a
2183///    A vector of [4 x i64] to be used by the insert operation.
2184/// \param __b
2185///    A 64-bit integer value. The replacement value for the insert operation.
2186/// \param __imm
2187///    An immediate integer specifying the index of the vector element to be
2188///    replaced.
2189/// \returns A copy of vector \a __a, after replacing its element indexed by
2190///     \a __imm with \a __b.
2191static __inline __m256i __DEFAULT_FN_ATTRS
2192_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
2193{
2194  __v4di __c = (__v4di)__a;
2195  __c[__imm & 3] = __b;
2196  return (__m256i)__c;
2197}
2198#endif
2199
2200/* Conversion */
2201/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
2202///
2203/// \headerfile <x86intrin.h>
2204///
2205/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2206///
2207/// \param __a
2208///    A 128-bit integer vector of [4 x i32].
2209/// \returns A 256-bit vector of [4 x double] containing the converted values.
2210static __inline __m256d __DEFAULT_FN_ATTRS
2211_mm256_cvtepi32_pd(__m128i __a)
2212{
2213  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2214}
2215
2216/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
2217///
2218/// \headerfile <x86intrin.h>
2219///
2220/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2221///
2222/// \param __a
2223///    A 256-bit integer vector.
2224/// \returns A 256-bit vector of [8 x float] containing the converted values.
2225static __inline __m256 __DEFAULT_FN_ATTRS
2226_mm256_cvtepi32_ps(__m256i __a)
2227{
2228  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
2229}
2230
2231/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2232///    [4 x float].
2233///
2234/// \headerfile <x86intrin.h>
2235///
2236/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2237///
2238/// \param __a
2239///    A 256-bit vector of [4 x double].
2240/// \returns A 128-bit vector of [4 x float] containing the converted values.
2241static __inline __m128 __DEFAULT_FN_ATTRS
2242_mm256_cvtpd_ps(__m256d __a)
2243{
2244  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2245}
2246
2247/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
2248///
2249/// \headerfile <x86intrin.h>
2250///
2251/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2252///
2253/// \param __a
2254///    A 256-bit vector of [8 x float].
2255/// \returns A 256-bit integer vector containing the converted values.
2256static __inline __m256i __DEFAULT_FN_ATTRS
2257_mm256_cvtps_epi32(__m256 __a)
2258{
2259  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2260}
2261
2262/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2263///    x double].
2264///
2265/// \headerfile <x86intrin.h>
2266///
2267/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2268///
2269/// \param __a
2270///    A 128-bit vector of [4 x float].
2271/// \returns A 256-bit vector of [4 x double] containing the converted values.
2272static __inline __m256d __DEFAULT_FN_ATTRS
2273_mm256_cvtps_pd(__m128 __a)
2274{
2275  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2276}
2277
2278/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2279///    x i32], truncating the result by rounding towards zero when it is
2280///    inexact.
2281///
2282/// \headerfile <x86intrin.h>
2283///
2284/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2285///
2286/// \param __a
2287///    A 256-bit vector of [4 x double].
2288/// \returns A 128-bit integer vector containing the converted values.
2289static __inline __m128i __DEFAULT_FN_ATTRS
2290_mm256_cvttpd_epi32(__m256d __a)
2291{
2292  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2293}
2294
2295/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2296///    x i32]. When a conversion is inexact, the value returned is rounded
2297///    according to the rounding control bits in the MXCSR register.
2298///
2299/// \headerfile <x86intrin.h>
2300///
2301/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2302///
2303/// \param __a
2304///    A 256-bit vector of [4 x double].
2305/// \returns A 128-bit integer vector containing the converted values.
2306static __inline __m128i __DEFAULT_FN_ATTRS
2307_mm256_cvtpd_epi32(__m256d __a)
2308{
2309  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2310}
2311
2312/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
2313///    truncating the result by rounding towards zero when it is inexact.
2314///
2315/// \headerfile <x86intrin.h>
2316///
2317/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2318///
2319/// \param __a
2320///    A 256-bit vector of [8 x float].
2321/// \returns A 256-bit integer vector containing the converted values.
2322static __inline __m256i __DEFAULT_FN_ATTRS
2323_mm256_cvttps_epi32(__m256 __a)
2324{
2325  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2326}
2327
2328/// \brief Returns the first element of the input vector of [4 x double].
2329///
/// \headerfile <x86intrin.h>
2331///
2332/// This intrinsic is a utility function and does not correspond to a specific
2333///    instruction.
2334///
2335/// \param __a
2336///    A 256-bit vector of [4 x double].
/// \returns A 64-bit double containing the first element of the input vector.
2338static __inline double __DEFAULT_FN_ATTRS
2339_mm256_cvtsd_f64(__m256d __a)
2340{
2341 return __a[0];
2342}
2343
2344/// \brief Returns the first element of the input vector of [8 x i32].
2345///
/// \headerfile <x86intrin.h>
2347///
2348/// This intrinsic is a utility function and does not correspond to a specific
2349///    instruction.
2350///
2351/// \param __a
2352///    A 256-bit vector of [8 x i32].
/// \returns A 32-bit integer containing the first element of the input vector.
2354static __inline int __DEFAULT_FN_ATTRS
2355_mm256_cvtsi256_si32(__m256i __a)
2356{
2357 __v8si __b = (__v8si)__a;
2358 return __b[0];
2359}
2360
2361/// \brief Returns the first element of the input vector of [8 x float].
2362///
/// \headerfile <x86intrin.h>
2364///
2365/// This intrinsic is a utility function and does not correspond to a specific
2366///    instruction.
2367///
2368/// \param __a
2369///    A 256-bit vector of [8 x float].
/// \returns A 32-bit float containing the first element of the input vector.
2371static __inline float __DEFAULT_FN_ATTRS
2372_mm256_cvtss_f32(__m256 __a)
2373{
2374 return __a[0];
2375}
2376
2377/* Vector replicate */
2378/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
2379///    vector of [8 x float] to float values in a 256-bit vector of
2380///    [8 x float].
2381///
2382/// \headerfile <x86intrin.h>
2383///
2384/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2385///
2386/// \param __a
2387///    A 256-bit vector of [8 x float]. \n
2388///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2389///    the return value. \n
2390///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2391///    the return value. \n
2392///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2393///    return value. \n
2394///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2395///    return value.
2396/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2397///    values.
2398static __inline __m256 __DEFAULT_FN_ATTRS
2399_mm256_movehdup_ps(__m256 __a)
2400{
2401  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2402}
2403
2404/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
2405///    vector of [8 x float] to float values in a 256-bit vector of [8 x float].
2406///
2407/// \headerfile <x86intrin.h>
2408///
2409/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2410///
2411/// \param __a
2412///    A 256-bit vector of [8 x float]. \n
2413///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2414///    the return value. \n
2415///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2416///    the return value. \n
2417///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2418///    return value. \n
2419///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2420///    return value.
2421/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2422///    values.
2423static __inline __m256 __DEFAULT_FN_ATTRS
2424_mm256_moveldup_ps(__m256 __a)
2425{
2426  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2427}
2428
2429/// \brief Moves and duplicates double-precision floating point values from a
2430///    256-bit vector of [4 x double] to double-precision values in a 256-bit
2431///    vector of [4 x double].
2432///
2433/// \headerfile <x86intrin.h>
2434///
2435/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2436///
2437/// \param __a
2438///    A 256-bit vector of [4 x double]. \n
2439///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2440///    return value. \n
2441///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2442///    the return value.
2443/// \returns A 256-bit vector of [4 x double] containing the moved and
2444///    duplicated values.
2445static __inline __m256d __DEFAULT_FN_ATTRS
2446_mm256_movedup_pd(__m256d __a)
2447{
2448  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2449}
2450
2451/* Unpack and Interleave */
2452/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
2453///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2454///
2455/// \headerfile <x86intrin.h>
2456///
2457/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2458///
2459/// \param __a
2460///    A 256-bit floating-point vector of [4 x double]. \n
2461///    Bits [127:64] are written to bits [63:0] of the return value. \n
///    Bits [255:192] are written to bits [191:128] of the return value.
2463/// \param __b
2464///    A 256-bit floating-point vector of [4 x double]. \n
2465///    Bits [127:64] are written to bits [127:64] of the return value. \n
///    Bits [255:192] are written to bits [255:192] of the return value.
2467/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2468static __inline __m256d __DEFAULT_FN_ATTRS
2469_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2470{
2471  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2472}
2473
2474/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
2475///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2476///
2477/// \headerfile <x86intrin.h>
2478///
2479/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2480///
2481/// \param __a
2482///    A 256-bit floating-point vector of [4 x double]. \n
2483///    Bits [63:0] are written to bits [63:0] of the return value. \n
2484///    Bits [191:128] are written to bits [191:128] of the return value.
2485/// \param __b
2486///    A 256-bit floating-point vector of [4 x double]. \n
2487///    Bits [63:0] are written to bits [127:64] of the return value. \n
///    Bits [191:128] are written to bits [255:192] of the return value.
2489/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2490static __inline __m256d __DEFAULT_FN_ATTRS
2491_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2492{
2493  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2494}
2495
2496/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2497///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2498///    vector of [8 x float].
2499///
2500/// \headerfile <x86intrin.h>
2501///
2502/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2503///
2504/// \param __a
2505///    A 256-bit vector of [8 x float]. \n
2506///    Bits [95:64] are written to bits [31:0] of the return value. \n
2507///    Bits [127:96] are written to bits [95:64] of the return value. \n
2508///    Bits [223:192] are written to bits [159:128] of the return value. \n
2509///    Bits [255:224] are written to bits [223:192] of the return value.
2510/// \param __b
2511///    A 256-bit vector of [8 x float]. \n
2512///    Bits [95:64] are written to bits [63:32] of the return value. \n
2513///    Bits [127:96] are written to bits [127:96] of the return value. \n
2514///    Bits [223:192] are written to bits [191:160] of the return value. \n
2515///    Bits [255:224] are written to bits [255:224] of the return value.
2516/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2517static __inline __m256 __DEFAULT_FN_ATTRS
2518_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2519{
2520  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2521}
2522
2523/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2524///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2525///    vector of [8 x float].
2526///
2527/// \headerfile <x86intrin.h>
2528///
2529/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2530///
2531/// \param __a
2532///    A 256-bit vector of [8 x float]. \n
2533///    Bits [31:0] are written to bits [31:0] of the return value. \n
2534///    Bits [63:32] are written to bits [95:64] of the return value. \n
2535///    Bits [159:128] are written to bits [159:128] of the return value. \n
2536///    Bits [191:160] are written to bits [223:192] of the return value.
2537/// \param __b
2538///    A 256-bit vector of [8 x float]. \n
2539///    Bits [31:0] are written to bits [63:32] of the return value. \n
2540///    Bits [63:32] are written to bits [127:96] of the return value. \n
2541///    Bits [159:128] are written to bits [191:160] of the return value. \n
2542///    Bits [191:160] are written to bits [255:224] of the return value.
2543/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2544static __inline __m256 __DEFAULT_FN_ATTRS
2545_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2546{
2547  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2548}
2549
2550/* Bit Test */
2551/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2552///    element-by-element comparison of the double-precision element in the
2553///    first source vector and the corresponding element in the second source
2554///    vector.
2555///
2556///    The EFLAGS register is updated as follows: \n
2557///    If there is at least one pair of double-precision elements where the
2558///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2559///    ZF flag is set to 1. \n
2560///    If there is at least one pair of double-precision elements where the
2561///    sign-bit of the first element is 0 and the sign-bit of the second element
2562///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2563///    This intrinsic returns the value of the ZF flag.
2564///
2565/// \headerfile <x86intrin.h>
2566///
2567/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2568///
2569/// \param __a
2570///    A 128-bit vector of [2 x double].
2571/// \param __b
2572///    A 128-bit vector of [2 x double].
2573/// \returns the ZF flag in the EFLAGS register.
2574static __inline int __DEFAULT_FN_ATTRS
2575_mm_testz_pd(__m128d __a, __m128d __b)
2576{
2577  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2578}
2579
2580/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2581///    element-by-element comparison of the double-precision element in the
2582///    first source vector and the corresponding element in the second source
2583///    vector.
2584///
2585///    The EFLAGS register is updated as follows: \n
2586///    If there is at least one pair of double-precision elements where the
2587///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2588///    ZF flag is set to 1. \n
2589///    If there is at least one pair of double-precision elements where the
2590///    sign-bit of the first element is 0 and the sign-bit of the second element
2591///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2592///    This intrinsic returns the value of the CF flag.
2593///
2594/// \headerfile <x86intrin.h>
2595///
2596/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2597///
2598/// \param __a
2599///    A 128-bit vector of [2 x double].
2600/// \param __b
2601///    A 128-bit vector of [2 x double].
2602/// \returns the CF flag in the EFLAGS register.
2603static __inline int __DEFAULT_FN_ATTRS
2604_mm_testc_pd(__m128d __a, __m128d __b)
2605{
2606  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2607}
2608
2609/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2610///    element-by-element comparison of the double-precision element in the
2611///    first source vector and the corresponding element in the second source
2612///    vector.
2613///
2614///    The EFLAGS register is updated as follows: \n
2615///    If there is at least one pair of double-precision elements where the
2616///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2617///    ZF flag is set to 1. \n
2618///    If there is at least one pair of double-precision elements where the
2619///    sign-bit of the first element is 0 and the sign-bit of the second element
2620///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2621///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2622///    otherwise it returns 0.
2623///
2624/// \headerfile <x86intrin.h>
2625///
2626/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2627///
2628/// \param __a
2629///    A 128-bit vector of [2 x double].
2630/// \param __b
2631///    A 128-bit vector of [2 x double].
2632/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2633static __inline int __DEFAULT_FN_ATTRS
2634_mm_testnzc_pd(__m128d __a, __m128d __b)
2635{
2636  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2637}
2638
2639/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2640///    element-by-element comparison of the single-precision element in the
2641///    first source vector and the corresponding element in the second source
2642///    vector.
2643///
2644///    The EFLAGS register is updated as follows: \n
2645///    If there is at least one pair of single-precision elements where the
2646///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2647///    ZF flag is set to 1. \n
2648///    If there is at least one pair of single-precision elements where the
2649///    sign-bit of the first element is 0 and the sign-bit of the second element
2650///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2651///    This intrinsic returns the value of the ZF flag.
2652///
2653/// \headerfile <x86intrin.h>
2654///
2655/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2656///
2657/// \param __a
2658///    A 128-bit vector of [4 x float].
2659/// \param __b
2660///    A 128-bit vector of [4 x float].
2661/// \returns the ZF flag.
2662static __inline int __DEFAULT_FN_ATTRS
2663_mm_testz_ps(__m128 __a, __m128 __b)
2664{
2665  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2666}
2667
2668/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2669///    element-by-element comparison of the single-precision element in the
2670///    first source vector and the corresponding element in the second source
2671///    vector.
2672///
2673///    The EFLAGS register is updated as follows: \n
2674///    If there is at least one pair of single-precision elements where the
2675///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2676///    ZF flag is set to 1. \n
2677///    If there is at least one pair of single-precision elements where the
2678///    sign-bit of the first element is 0 and the sign-bit of the second element
2679///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2680///    This intrinsic returns the value of the CF flag.
2681///
2682/// \headerfile <x86intrin.h>
2683///
2684/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2685///
2686/// \param __a
2687///    A 128-bit vector of [4 x float].
2688/// \param __b
2689///    A 128-bit vector of [4 x float].
2690/// \returns the CF flag.
2691static __inline int __DEFAULT_FN_ATTRS
2692_mm_testc_ps(__m128 __a, __m128 __b)
2693{
2694  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2695}
2696
2697/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2698///    element-by-element comparison of the single-precision element in the
2699///    first source vector and the corresponding element in the second source
2700///    vector.
2701///
2702///    The EFLAGS register is updated as follows: \n
2703///    If there is at least one pair of single-precision elements where the
2704///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2705///    ZF flag is set to 1. \n
2706///    If there is at least one pair of single-precision elements where the
2707///    sign-bit of the first element is 0 and the sign-bit of the second element
2708///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2709///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2710///    otherwise it returns 0.
2711///
2712/// \headerfile <x86intrin.h>
2713///
2714/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2715///
2716/// \param __a
2717///    A 128-bit vector of [4 x float].
2718/// \param __b
2719///    A 128-bit vector of [4 x float].
2720/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2721static __inline int __DEFAULT_FN_ATTRS
2722_mm_testnzc_ps(__m128 __a, __m128 __b)
2723{
2724  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2725}
2726
2727/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2728///    element-by-element comparison of the double-precision elements in the
2729///    first source vector and the corresponding elements in the second source
2730///    vector.
2731///
2732///    The EFLAGS register is updated as follows: \n
2733///    If there is at least one pair of double-precision elements where the
2734///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2735///    ZF flag is set to 1. \n
2736///    If there is at least one pair of double-precision elements where the
2737///    sign-bit of the first element is 0 and the sign-bit of the second element
2738///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2739///    This intrinsic returns the value of the ZF flag.
2740///
2741/// \headerfile <x86intrin.h>
2742///
2743/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2744///
2745/// \param __a
2746///    A 256-bit vector of [4 x double].
2747/// \param __b
2748///    A 256-bit vector of [4 x double].
2749/// \returns the ZF flag.
2750static __inline int __DEFAULT_FN_ATTRS
2751_mm256_testz_pd(__m256d __a, __m256d __b)
2752{
2753  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2754}
2755
2756/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2757///    element-by-element comparison of the double-precision elements in the
2758///    first source vector and the corresponding elements in the second source
2759///    vector.
2760///
2761///    The EFLAGS register is updated as follows: \n
2762///    If there is at least one pair of double-precision elements where the
2763///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2764///    ZF flag is set to 1. \n
2765///    If there is at least one pair of double-precision elements where the
2766///    sign-bit of the first element is 0 and the sign-bit of the second element
2767///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2768///    This intrinsic returns the value of the CF flag.
2769///
2770/// \headerfile <x86intrin.h>
2771///
2772/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2773///
2774/// \param __a
2775///    A 256-bit vector of [4 x double].
2776/// \param __b
2777///    A 256-bit vector of [4 x double].
2778/// \returns the CF flag.
2779static __inline int __DEFAULT_FN_ATTRS
2780_mm256_testc_pd(__m256d __a, __m256d __b)
2781{
2782  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2783}
2784
2785/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2786///    element-by-element comparison of the double-precision elements in the
2787///    first source vector and the corresponding elements in the second source
2788///    vector.
2789///
2790///    The EFLAGS register is updated as follows: \n
2791///    If there is at least one pair of double-precision elements where the
2792///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2793///    ZF flag is set to 1. \n
2794///    If there is at least one pair of double-precision elements where the
2795///    sign-bit of the first element is 0 and the sign-bit of the second element
2796///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2797///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2798///    otherwise it returns 0.
2799///
2800/// \headerfile <x86intrin.h>
2801///
2802/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2803///
2804/// \param __a
2805///    A 256-bit vector of [4 x double].
2806/// \param __b
2807///    A 256-bit vector of [4 x double].
2808/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2809static __inline int __DEFAULT_FN_ATTRS
2810_mm256_testnzc_pd(__m256d __a, __m256d __b)
2811{
2812  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2813}
2814
2815/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2816///    element-by-element comparison of the single-precision element in the
2817///    first source vector and the corresponding element in the second source
2818///    vector.
2819///
2820///    The EFLAGS register is updated as follows: \n
2821///    If there is at least one pair of single-precision elements where the
2822///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2823///    ZF flag is set to 1. \n
2824///    If there is at least one pair of single-precision elements where the
2825///    sign-bit of the first element is 0 and the sign-bit of the second element
2826///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2827///    This intrinsic returns the value of the ZF flag.
2828///
2829/// \headerfile <x86intrin.h>
2830///
2831/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2832///
2833/// \param __a
2834///    A 256-bit vector of [8 x float].
2835/// \param __b
2836///    A 256-bit vector of [8 x float].
2837/// \returns the ZF flag.
2838static __inline int __DEFAULT_FN_ATTRS
2839_mm256_testz_ps(__m256 __a, __m256 __b)
2840{
2841  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2842}
2843
2844/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2845///    element-by-element comparison of the single-precision element in the
2846///    first source vector and the corresponding element in the second source
2847///    vector.
2848///
2849///    The EFLAGS register is updated as follows: \n
2850///    If there is at least one pair of single-precision elements where the
2851///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2852///    ZF flag is set to 1. \n
2853///    If there is at least one pair of single-precision elements where the
2854///    sign-bit of the first element is 0 and the sign-bit of the second element
2855///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2856///    This intrinsic returns the value of the CF flag.
2857///
2858/// \headerfile <x86intrin.h>
2859///
2860/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2861///
2862/// \param __a
2863///    A 256-bit vector of [8 x float].
2864/// \param __b
2865///    A 256-bit vector of [8 x float].
2866/// \returns the CF flag.
2867static __inline int __DEFAULT_FN_ATTRS
2868_mm256_testc_ps(__m256 __a, __m256 __b)
2869{
2870  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2871}
2872
2873/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2874///    element-by-element comparison of the single-precision elements in the
2875///    first source vector and the corresponding elements in the second source
2876///    vector.
2877///
2878///    The EFLAGS register is updated as follows: \n
2879///    If there is at least one pair of single-precision elements where the
2880///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2881///    ZF flag is set to 1. \n
2882///    If there is at least one pair of single-precision elements where the
2883///    sign-bit of the first element is 0 and the sign-bit of the second element
2884///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2885///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2886///    otherwise it returns 0.
2887///
2888/// \headerfile <x86intrin.h>
2889///
2890/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2891///
2892/// \param __a
2893///    A 256-bit vector of [8 x float].
2894/// \param __b
2895///    A 256-bit vector of [8 x float].
2896/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2897static __inline int __DEFAULT_FN_ATTRS
2898_mm256_testnzc_ps(__m256 __a, __m256 __b)
2899{
2900  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2901}
2902
2903/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2904///    of the two source vectors.
2905///
2906///    The EFLAGS register is updated as follows: \n
2907///    If there is at least one pair of bits where both bits are 1, the ZF flag
2908///    is set to 0. Otherwise the ZF flag is set to 1. \n
2909///    If there is at least one pair of bits where the bit from the first source
2910///    vector is 0 and the bit from the second source vector is 1, the CF flag
2911///    is set to 0. Otherwise the CF flag is set to 1. \n
2912///    This intrinsic returns the value of the ZF flag.
2913///
2914/// \headerfile <x86intrin.h>
2915///
2916/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2917///
2918/// \param __a
2919///    A 256-bit integer vector.
2920/// \param __b
2921///    A 256-bit integer vector.
2922/// \returns the ZF flag.
2923static __inline int __DEFAULT_FN_ATTRS
2924_mm256_testz_si256(__m256i __a, __m256i __b)
2925{
2926  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2927}
2928
2929/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2930///    of the two source vectors.
2931///
2932///    The EFLAGS register is updated as follows: \n
2933///    If there is at least one pair of bits where both bits are 1, the ZF flag
2934///    is set to 0. Otherwise the ZF flag is set to 1. \n
2935///    If there is at least one pair of bits where the bit from the first source
2936///    vector is 0 and the bit from the second source vector is 1, the CF flag
2937///    is set to 0. Otherwise the CF flag is set to 1. \n
2938///    This intrinsic returns the value of the CF flag.
2939///
2940/// \headerfile <x86intrin.h>
2941///
2942/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2943///
2944/// \param __a
2945///    A 256-bit integer vector.
2946/// \param __b
2947///    A 256-bit integer vector.
2948/// \returns the CF flag.
2949static __inline int __DEFAULT_FN_ATTRS
2950_mm256_testc_si256(__m256i __a, __m256i __b)
2951{
2952  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2953}
2954
2955/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2956///    of the two source vectors.
2957///
2958///    The EFLAGS register is updated as follows: \n
2959///    If there is at least one pair of bits where both bits are 1, the ZF flag
2960///    is set to 0. Otherwise the ZF flag is set to 1. \n
2961///    If there is at least one pair of bits where the bit from the first source
2962///    vector is 0 and the bit from the second source vector is 1, the CF flag
2963///    is set to 0. Otherwise the CF flag is set to 1. \n
2964///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2965///    otherwise it returns 0.
2966///
2967/// \headerfile <x86intrin.h>
2968///
2969/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2970///
2971/// \param __a
2972///    A 256-bit integer vector.
2973/// \param __b
2974///    A 256-bit integer vector.
2975/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2976static __inline int __DEFAULT_FN_ATTRS
2977_mm256_testnzc_si256(__m256i __a, __m256i __b)
2978{
2979  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2980}
2981
2982/* Vector extract sign mask */
2983/// \brief Extracts the sign bits of double-precision floating point elements
2984///    in a 256-bit vector of [4 x double] and writes them to the lower order
2985///    bits of the return value.
2986///
2987/// \headerfile <x86intrin.h>
2988///
2989/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2990///
2991/// \param __a
2992///    A 256-bit vector of [4 x double] containing the double-precision
2993///    floating point values with sign bits to be extracted.
2994/// \returns The sign bits from the operand, written to bits [3:0].
2995static __inline int __DEFAULT_FN_ATTRS
2996_mm256_movemask_pd(__m256d __a)
2997{
2998  return __builtin_ia32_movmskpd256((__v4df)__a);
2999}
3000
3001/// \brief Extracts the sign bits of double-precision floating point elements
3002///    in a 256-bit vector of [8 x float] and writes them to the lower order
3003///    bits of the return value.
3004///
3005/// \headerfile <x86intrin.h>
3006///
3007/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
3008///
3009/// \param __a
3010///    A 256-bit vector of [8 x float] containing the double-precision floating
3011///    point values with sign bits to be extracted.
3012/// \returns The sign bits from the operand, written to bits [7:0].
3013static __inline int __DEFAULT_FN_ATTRS
3014_mm256_movemask_ps(__m256 __a)
3015{
3016  return __builtin_ia32_movmskps256((__v8sf)__a);
3017}
3018
3019/* Vector __zero */
3020/// \brief Zeroes the contents of all XMM or YMM registers.
3021///
3022/// \headerfile <x86intrin.h>
3023///
3024/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
3025static __inline void __DEFAULT_FN_ATTRS
3026_mm256_zeroall(void)
3027{
3028  __builtin_ia32_vzeroall();
3029}
3030
3031/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
3032///
3033/// \headerfile <x86intrin.h>
3034///
3035/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
3036static __inline void __DEFAULT_FN_ATTRS
3037_mm256_zeroupper(void)
3038{
3039  __builtin_ia32_vzeroupper();
3040}
3041
3042/* Vector load with broadcast */
3043/// \brief Loads a scalar single-precision floating point value from the
3044///    specified address pointed to by \a __a and broadcasts it to the elements
3045///    of a [4 x float] vector.
3046///
3047/// \headerfile <x86intrin.h>
3048///
3049/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3050///
3051/// \param __a
3052///    The single-precision floating point value to be broadcast.
3053/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3054///    equal to the broadcast value.
3055static __inline __m128 __DEFAULT_FN_ATTRS
3056_mm_broadcast_ss(float const *__a)
3057{
3058  float __f = *__a;
3059  return (__m128)(__v4sf){ __f, __f, __f, __f };
3060}
3061
3062/// \brief Loads a scalar double-precision floating point value from the
3063///    specified address pointed to by \a __a and broadcasts it to the elements
3064///    of a [4 x double] vector.
3065///
3066/// \headerfile <x86intrin.h>
3067///
3068/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3069///
3070/// \param __a
3071///    The double-precision floating point value to be broadcast.
3072/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3073///    equal to the broadcast value.
3074static __inline __m256d __DEFAULT_FN_ATTRS
3075_mm256_broadcast_sd(double const *__a)
3076{
3077  double __d = *__a;
3078  return (__m256d)(__v4df){ __d, __d, __d, __d };
3079}
3080
3081/// \brief Loads a scalar single-precision floating point value from the
3082///    specified address pointed to by \a __a and broadcasts it to the elements
3083///    of a [8 x float] vector.
3084///
3085/// \headerfile <x86intrin.h>
3086///
3087/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3088///
3089/// \param __a
3090///    The single-precision floating point value to be broadcast.
3091/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3092///    equal to the broadcast value.
3093static __inline __m256 __DEFAULT_FN_ATTRS
3094_mm256_broadcast_ss(float const *__a)
3095{
3096  float __f = *__a;
3097  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3098}
3099
3100/// \brief Loads the data from a 128-bit vector of [2 x double] from the
3101///    specified address pointed to by \a __a and broadcasts it to 128-bit
3102///    elements in a 256-bit vector of [4 x double].
3103///
3104/// \headerfile <x86intrin.h>
3105///
3106/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3107///
3108/// \param __a
3109///    The 128-bit vector of [2 x double] to be broadcast.
3110/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3111///    equal to the broadcast value.
3112static __inline __m256d __DEFAULT_FN_ATTRS
3113_mm256_broadcast_pd(__m128d const *__a)
3114{
3115  return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
3116}
3117
3118/// \brief Loads the data from a 128-bit vector of [4 x float] from the
3119///    specified address pointed to by \a __a and broadcasts it to 128-bit
3120///    elements in a 256-bit vector of [8 x float].
3121///
3122/// \headerfile <x86intrin.h>
3123///
3124/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3125///
3126/// \param __a
3127///    The 128-bit vector of [4 x float] to be broadcast.
3128/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3129///    equal to the broadcast value.
3130static __inline __m256 __DEFAULT_FN_ATTRS
3131_mm256_broadcast_ps(__m128 const *__a)
3132{
3133  return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
3134}
3135
3136/* SIMD load ops */
3137/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
3138///    memory location pointed to by \a __p into a vector of [4 x double].
3139///
3140/// \headerfile <x86intrin.h>
3141///
3142/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3143///
3144/// \param __p
3145///    A 32-byte aligned pointer to a memory location containing
3146///    double-precision floating point values.
3147/// \returns A 256-bit vector of [4 x double] containing the moved values.
3148static __inline __m256d __DEFAULT_FN_ATTRS
3149_mm256_load_pd(double const *__p)
3150{
3151  return *(__m256d *)__p;
3152}
3153
3154/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
3155///    memory location pointed to by \a __p into a vector of [8 x float].
3156///
3157/// \headerfile <x86intrin.h>
3158///
3159/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3160///
3161/// \param __p
3162///    A 32-byte aligned pointer to a memory location containing float values.
3163/// \returns A 256-bit vector of [8 x float] containing the moved values.
3164static __inline __m256 __DEFAULT_FN_ATTRS
3165_mm256_load_ps(float const *__p)
3166{
3167  return *(__m256 *)__p;
3168}
3169
3170/// \brief Loads 4 double-precision floating point values from an unaligned
3171///    memory location pointed to by \a __p into a vector of [4 x double].
3172///
3173/// \headerfile <x86intrin.h>
3174///
3175/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3176///
3177/// \param __p
3178///    A pointer to a memory location containing double-precision floating
3179///    point values.
3180/// \returns A 256-bit vector of [4 x double] containing the moved values.
3181static __inline __m256d __DEFAULT_FN_ATTRS
3182_mm256_loadu_pd(double const *__p)
3183{
3184  struct __loadu_pd {
3185    __m256d __v;
3186  } __attribute__((__packed__, __may_alias__));
3187  return ((struct __loadu_pd*)__p)->__v;
3188}
3189
3190/// \brief Loads 8 single-precision floating point values from an unaligned
3191///    memory location pointed to by \a __p into a vector of [8 x float].
3192///
3193/// \headerfile <x86intrin.h>
3194///
3195/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3196///
3197/// \param __p
3198///    A pointer to a memory location containing single-precision floating
3199///    point values.
3200/// \returns A 256-bit vector of [8 x float] containing the moved values.
3201static __inline __m256 __DEFAULT_FN_ATTRS
3202_mm256_loadu_ps(float const *__p)
3203{
3204  struct __loadu_ps {
3205    __m256 __v;
3206  } __attribute__((__packed__, __may_alias__));
3207  return ((struct __loadu_ps*)__p)->__v;
3208}
3209
3210/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
3211///    location pointed to by \a __p into elements of a 256-bit integer vector.
3212///
3213/// \headerfile <x86intrin.h>
3214///
3215/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3216///
3217/// \param __p
3218///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
3219///    values.
3220/// \returns A 256-bit integer vector containing the moved values.
3221static __inline __m256i __DEFAULT_FN_ATTRS
3222_mm256_load_si256(__m256i const *__p)
3223{
3224  return *__p;
3225}
3226
3227/// \brief Loads 256 bits of integer data from an unaligned memory location
3228///    pointed to by \a __p into a 256-bit integer vector.
3229///
3230/// \headerfile <x86intrin.h>
3231///
3232/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3233///
3234/// \param __p
3235///    A pointer to a 256-bit integer vector containing integer values.
3236/// \returns A 256-bit integer vector containing the moved values.
3237static __inline __m256i __DEFAULT_FN_ATTRS
3238_mm256_loadu_si256(__m256i const *__p)
3239{
3240  struct __loadu_si256 {
3241    __m256i __v;
3242  } __attribute__((__packed__, __may_alias__));
3243  return ((struct __loadu_si256*)__p)->__v;
3244}
3245
3246/// \brief Loads 256 bits of integer data from an unaligned memory location
3247///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3248///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
3249///    line boundary.
3250///
3251/// \headerfile <x86intrin.h>
3252///
3253/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3254///
3255/// \param __p
3256///    A pointer to a 256-bit integer vector containing integer values.
3257/// \returns A 256-bit integer vector containing the moved values.
3258static __inline __m256i __DEFAULT_FN_ATTRS
3259_mm256_lddqu_si256(__m256i const *__p)
3260{
3261  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3262}
3263
3264/* SIMD store ops */
3265/// \brief Stores double-precision floating point values from a 256-bit vector
3266///    of [4 x double] to a 32-byte aligned memory location pointed to by
3267///    \a __p.
3268///
3269/// \headerfile <x86intrin.h>
3270///
3271/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3272///
3273/// \param __p
3274///    A 32-byte aligned pointer to a memory location that will receive the
3275///    double-precision floaing point values.
3276/// \param __a
3277///    A 256-bit vector of [4 x double] containing the values to be moved.
3278static __inline void __DEFAULT_FN_ATTRS
3279_mm256_store_pd(double *__p, __m256d __a)
3280{
3281  *(__m256d *)__p = __a;
3282}
3283
3284/// \brief Stores single-precision floating point values from a 256-bit vector
3285///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3286///
3287/// \headerfile <x86intrin.h>
3288///
3289/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3290///
3291/// \param __p
3292///    A 32-byte aligned pointer to a memory location that will receive the
3293///    float values.
3294/// \param __a
3295///    A 256-bit vector of [8 x float] containing the values to be moved.
3296static __inline void __DEFAULT_FN_ATTRS
3297_mm256_store_ps(float *__p, __m256 __a)
3298{
3299  *(__m256 *)__p = __a;
3300}
3301
3302/// \brief Stores double-precision floating point values from a 256-bit vector
3303///    of [4 x double] to an unaligned memory location pointed to by \a __p.
3304///
3305/// \headerfile <x86intrin.h>
3306///
3307/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3308///
3309/// \param __p
3310///    A pointer to a memory location that will receive the double-precision
3311///    floating point values.
3312/// \param __a
3313///    A 256-bit vector of [4 x double] containing the values to be moved.
3314static __inline void __DEFAULT_FN_ATTRS
3315_mm256_storeu_pd(double *__p, __m256d __a)
3316{
3317  struct __storeu_pd {
3318    __m256d __v;
3319  } __attribute__((__packed__, __may_alias__));
3320  ((struct __storeu_pd*)__p)->__v = __a;
3321}
3322
3323/// \brief Stores single-precision floating point values from a 256-bit vector
3324///    of [8 x float] to an unaligned memory location pointed to by \a __p.
3325///
3326/// \headerfile <x86intrin.h>
3327///
3328/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3329///
3330/// \param __p
3331///    A pointer to a memory location that will receive the float values.
3332/// \param __a
3333///    A 256-bit vector of [8 x float] containing the values to be moved.
3334static __inline void __DEFAULT_FN_ATTRS
3335_mm256_storeu_ps(float *__p, __m256 __a)
3336{
3337  struct __storeu_ps {
3338    __m256 __v;
3339  } __attribute__((__packed__, __may_alias__));
3340  ((struct __storeu_ps*)__p)->__v = __a;
3341}
3342
3343/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
3344///    aligned memory location pointed to by \a __p.
3345///
3346/// \headerfile <x86intrin.h>
3347///
3348/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3349///
3350/// \param __p
3351///    A 32-byte aligned pointer to a memory location that will receive the
3352///    integer values.
3353/// \param __a
3354///    A 256-bit integer vector containing the values to be moved.
3355static __inline void __DEFAULT_FN_ATTRS
3356_mm256_store_si256(__m256i *__p, __m256i __a)
3357{
3358  *__p = __a;
3359}
3360
3361/// \brief Stores integer values from a 256-bit integer vector to an unaligned
3362///    memory location pointed to by \a __p.
3363///
3364/// \headerfile <x86intrin.h>
3365///
3366/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3367///
3368/// \param __p
3369///    A pointer to a memory location that will receive the integer values.
3370/// \param __a
3371///    A 256-bit integer vector containing the values to be moved.
3372static __inline void __DEFAULT_FN_ATTRS
3373_mm256_storeu_si256(__m256i *__p, __m256i __a)
3374{
3375  struct __storeu_si256 {
3376    __m256i __v;
3377  } __attribute__((__packed__, __may_alias__));
3378  ((struct __storeu_si256*)__p)->__v = __a;
3379}
3380
3381/* Conditional load ops */
3382/// \brief Conditionally loads double-precision floating point elements from a
3383///    memory location pointed to by \a __p into a 128-bit vector of
3384///    [2 x double], depending on the mask bits associated with each data
3385///    element.
3386///
3387/// \headerfile <x86intrin.h>
3388///
3389/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3390///
3391/// \param __p
3392///    A pointer to a memory location that contains the double-precision
3393///    floating point values.
3394/// \param __m
3395///    A 128-bit integer vector containing the mask. The most significant bit of
3396///    each data element represents the mask bits. If a mask bit is zero, the
3397///    corresponding value in the memory location is not loaded and the
3398///    corresponding field in the return value is set to zero.
3399/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3400static __inline __m128d __DEFAULT_FN_ATTRS
3401_mm_maskload_pd(double const *__p, __m128i __m)
3402{
3403  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3404}
3405
3406/// \brief Conditionally loads double-precision floating point elements from a
3407///    memory location pointed to by \a __p into a 256-bit vector of
3408///    [4 x double], depending on the mask bits associated with each data
3409///    element.
3410///
3411/// \headerfile <x86intrin.h>
3412///
3413/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3414///
3415/// \param __p
3416///    A pointer to a memory location that contains the double-precision
3417///    floating point values.
3418/// \param __m
3419///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3420///    significant bit of each quadword element represents the mask bits. If a
3421///    mask bit is zero, the corresponding value in the memory location is not
3422///    loaded and the corresponding field in the return value is set to zero.
3423/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3424static __inline __m256d __DEFAULT_FN_ATTRS
3425_mm256_maskload_pd(double const *__p, __m256i __m)
3426{
3427  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3428                                               (__v4di)__m);
3429}
3430
3431/// \brief Conditionally loads single-precision floating point elements from a
3432///    memory location pointed to by \a __p into a 128-bit vector of
3433///    [4 x float], depending on the mask bits associated with each data
3434///    element.
3435///
3436/// \headerfile <x86intrin.h>
3437///
3438/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3439///
3440/// \param __p
3441///    A pointer to a memory location that contains the single-precision
3442///    floating point values.
3443/// \param __m
3444///    A 128-bit integer vector containing the mask. The most significant bit of
3445///    each data element represents the mask bits. If a mask bit is zero, the
3446///    corresponding value in the memory location is not loaded and the
3447///    corresponding field in the return value is set to zero.
3448/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3449static __inline __m128 __DEFAULT_FN_ATTRS
3450_mm_maskload_ps(float const *__p, __m128i __m)
3451{
3452  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3453}
3454
3455/// \brief Conditionally loads single-precision floating point elements from a
3456///    memory location pointed to by \a __p into a 256-bit vector of
3457///    [8 x float], depending on the mask bits associated with each data
3458///    element.
3459///
3460/// \headerfile <x86intrin.h>
3461///
3462/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3463///
3464/// \param __p
3465///    A pointer to a memory location that contains the single-precision
3466///    floating point values.
3467/// \param __m
3468///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3469///    significant bit of each dword element represents the mask bits. If a mask
3470///    bit is zero, the corresponding value in the memory location is not loaded
3471///    and the corresponding field in the return value is set to zero.
3472/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3473static __inline __m256 __DEFAULT_FN_ATTRS
3474_mm256_maskload_ps(float const *__p, __m256i __m)
3475{
3476  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3477}
3478
3479/* Conditional store ops */
3480/// \brief Moves single-precision floating point values from a 256-bit vector
3481///    of [8 x float] to a memory location pointed to by \a __p, according to
3482///    the specified mask.
3483///
3484/// \headerfile <x86intrin.h>
3485///
3486/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3487///
3488/// \param __p
3489///    A pointer to a memory location that will receive the float values.
3490/// \param __m
3491///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3492///    significant bit of each dword element in the mask vector represents the
3493///    mask bits. If a mask bit is zero, the corresponding value from vector
3494///    \a __a is not stored and the corresponding field in the memory location
3495///    pointed to by \a __p is not changed.
3496/// \param __a
3497///    A 256-bit vector of [8 x float] containing the values to be stored.
3498static __inline void __DEFAULT_FN_ATTRS
3499_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3500{
3501  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3502}
3503
3504/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
3505///    to a memory location pointed to by \a __p, according to the specified
3506///    mask.
3507///
3508/// \headerfile <x86intrin.h>
3509///
3510/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3511///
3512/// \param __p
3513///    A pointer to a memory location that will receive the float values.
3514/// \param __m
3515///    A 128-bit integer vector containing the mask. The most significant bit of
3516///    each field in the mask vector represents the mask bits. If a mask bit is
3517///    zero, the corresponding value from vector \a __a is not stored and the
3518///    corresponding field in the memory location pointed to by \a __p is not
3519///    changed.
3520/// \param __a
3521///    A 128-bit vector of [2 x double] containing the values to be stored.
3522static __inline void __DEFAULT_FN_ATTRS
3523_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3524{
3525  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3526}
3527
3528/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3529///    to a memory location pointed to by \a __p, according to the specified
3530///    mask.
3531///
3532/// \headerfile <x86intrin.h>
3533///
3534/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3535///
3536/// \param __p
3537///    A pointer to a memory location that will receive the float values.
3538/// \param __m
3539///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3540///    significant bit of each quadword element in the mask vector represents
3541///    the mask bits. If a mask bit is zero, the corresponding value from vector
3542///    __a is not stored and the corresponding field in the memory location
3543///    pointed to by \a __p is not changed.
3544/// \param __a
3545///    A 256-bit vector of [4 x double] containing the values to be stored.
3546static __inline void __DEFAULT_FN_ATTRS
3547_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3548{
3549  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3550}
3551
3552/// \brief Moves single-precision floating point values from a 128-bit vector
3553///    of [4 x float] to a memory location pointed to by \a __p, according to
3554///    the specified mask.
3555///
3556/// \headerfile <x86intrin.h>
3557///
3558/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3559///
3560/// \param __p
3561///    A pointer to a memory location that will receive the float values.
3562/// \param __m
3563///    A 128-bit integer vector containing the mask. The most significant bit of
3564///    each field in the mask vector represents the mask bits. If a mask bit is
3565///    zero, the corresponding value from vector __a is not stored and the
3566///    corresponding field in the memory location pointed to by \a __p is not
3567///    changed.
3568/// \param __a
3569///    A 128-bit vector of [4 x float] containing the values to be stored.
3570static __inline void __DEFAULT_FN_ATTRS
3571_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3572{
3573  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3574}
3575
3576/* Cacheability support ops */
3577/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
3578///    aligned memory location. To minimize caching, the data is flagged as
3579///    non-temporal (unlikely to be used again soon).
3580///
3581/// \headerfile <x86intrin.h>
3582///
3583/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3584///
3585/// \param __a
3586///    A pointer to a 32-byte aligned memory location that will receive the
3587///    integer values.
3588/// \param __b
3589///    A 256-bit integer vector containing the values to be moved.
3590static __inline void __DEFAULT_FN_ATTRS
3591_mm256_stream_si256(__m256i *__a, __m256i __b)
3592{
3593  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3594  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3595}
3596
3597/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3598///    to a 32-byte aligned memory location. To minimize caching, the data is
3599///    flagged as non-temporal (unlikely to be used again soon).
3600///
3601/// \headerfile <x86intrin.h>
3602///
3603/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3604///
3605/// \param __a
3606///    A pointer to a 32-byte aligned memory location that will receive the
3607///    double-precision floating-point values.
3608/// \param __b
3609///    A 256-bit vector of [4 x double] containing the values to be moved.
3610static __inline void __DEFAULT_FN_ATTRS
3611_mm256_stream_pd(double *__a, __m256d __b)
3612{
3613  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3614  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3615}
3616
3617/// \brief Moves single-precision floating point values from a 256-bit vector
3618///    of [8 x float] to a 32-byte aligned memory location. To minimize
3619///    caching, the data is flagged as non-temporal (unlikely to be used again
3620///    soon).
3621///
3622/// \headerfile <x86intrin.h>
3623///
3624/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3625///
3626/// \param __p
3627///    A pointer to a 32-byte aligned memory location that will receive the
3628///    single-precision floating point values.
3629/// \param __a
3630///    A 256-bit vector of [8 x float] containing the values to be moved.
3631static __inline void __DEFAULT_FN_ATTRS
3632_mm256_stream_ps(float *__p, __m256 __a)
3633{
3634  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3635  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3636}
3637
3638/* Create vectors */
3639/// \brief Create a 256-bit vector of [4 x double] with undefined values.
3640///
3641/// \headerfile <x86intrin.h>
3642///
3643/// This intrinsic has no corresponding instruction.
3644///
3645/// \returns A 256-bit vector of [4 x double] containing undefined values.
3646static __inline__ __m256d __DEFAULT_FN_ATTRS
3647_mm256_undefined_pd(void)
3648{
3649  return (__m256d)__builtin_ia32_undef256();
3650}
3651
3652/// \brief Create a 256-bit vector of [8 x float] with undefined values.
3653///
3654/// \headerfile <x86intrin.h>
3655///
3656/// This intrinsic has no corresponding instruction.
3657///
3658/// \returns A 256-bit vector of [8 x float] containing undefined values.
3659static __inline__ __m256 __DEFAULT_FN_ATTRS
3660_mm256_undefined_ps(void)
3661{
3662  return (__m256)__builtin_ia32_undef256();
3663}
3664
3665/// \brief Create a 256-bit integer vector with undefined values.
3666///
3667/// \headerfile <x86intrin.h>
3668///
3669/// This intrinsic has no corresponding instruction.
3670///
3671/// \returns A 256-bit integer vector containing undefined values.
3672static __inline__ __m256i __DEFAULT_FN_ATTRS
3673_mm256_undefined_si256(void)
3674{
3675  return (__m256i)__builtin_ia32_undef256();
3676}
3677
3678/// \brief Constructs a 256-bit floating-point vector of [4 x double]
3679///    initialized with the specified double-precision floating-point values.
3680///
3681/// \headerfile <x86intrin.h>
3682///
3683/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3684///   instruction.
3685///
3686/// \param __a
3687///    A double-precision floating-point value used to initialize bits [255:192]
3688///    of the result.
3689/// \param __b
3690///    A double-precision floating-point value used to initialize bits [191:128]
3691///    of the result.
3692/// \param __c
3693///    A double-precision floating-point value used to initialize bits [127:64]
3694///    of the result.
3695/// \param __d
3696///    A double-precision floating-point value used to initialize bits [63:0]
3697///    of the result.
3698/// \returns An initialized 256-bit floating-point vector of [4 x double].
3699static __inline __m256d __DEFAULT_FN_ATTRS
3700_mm256_set_pd(double __a, double __b, double __c, double __d)
3701{
3702  return (__m256d){ __d, __c, __b, __a };
3703}
3704
3705/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
3706///    with the specified single-precision floating-point values.
3707///
3708/// \headerfile <x86intrin.h>
3709///
3710/// This intrinsic is a utility function and does not correspond to a specific
3711///   instruction.
3712///
3713/// \param __a
3714///    A single-precision floating-point value used to initialize bits [255:224]
3715///    of the result.
3716/// \param __b
3717///    A single-precision floating-point value used to initialize bits [223:192]
3718///    of the result.
3719/// \param __c
3720///    A single-precision floating-point value used to initialize bits [191:160]
3721///    of the result.
3722/// \param __d
3723///    A single-precision floating-point value used to initialize bits [159:128]
3724///    of the result.
3725/// \param __e
3726///    A single-precision floating-point value used to initialize bits [127:96]
3727///    of the result.
3728/// \param __f
3729///    A single-precision floating-point value used to initialize bits [95:64]
3730///    of the result.
3731/// \param __g
3732///    A single-precision floating-point value used to initialize bits [63:32]
3733///    of the result.
3734/// \param __h
3735///    A single-precision floating-point value used to initialize bits [31:0]
3736///    of the result.
3737/// \returns An initialized 256-bit floating-point vector of [8 x float].
3738static __inline __m256 __DEFAULT_FN_ATTRS
3739_mm256_set_ps(float __a, float __b, float __c, float __d,
3740              float __e, float __f, float __g, float __h)
3741{
3742  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3743}
3744
3745/// \brief Constructs a 256-bit integer vector initialized with the specified
3746///    32-bit integral values.
3747///
3748/// \headerfile <x86intrin.h>
3749///
3750/// This intrinsic is a utility function and does not correspond to a specific
3751///   instruction.
3752///
3753/// \param __i0
3754///    A 32-bit integral value used to initialize bits [255:224] of the result.
3755/// \param __i1
3756///    A 32-bit integral value used to initialize bits [223:192] of the result.
3757/// \param __i2
3758///    A 32-bit integral value used to initialize bits [191:160] of the result.
3759/// \param __i3
3760///    A 32-bit integral value used to initialize bits [159:128] of the result.
3761/// \param __i4
3762///    A 32-bit integral value used to initialize bits [127:96] of the result.
3763/// \param __i5
3764///    A 32-bit integral value used to initialize bits [95:64] of the result.
3765/// \param __i6
3766///    A 32-bit integral value used to initialize bits [63:32] of the result.
3767/// \param __i7
3768///    A 32-bit integral value used to initialize bits [31:0] of the result.
3769/// \returns An initialized 256-bit integer vector.
3770static __inline __m256i __DEFAULT_FN_ATTRS
3771_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3772                 int __i4, int __i5, int __i6, int __i7)
3773{
3774  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3775}
3776
3777/// \brief Constructs a 256-bit integer vector initialized with the specified
3778///    16-bit integral values.
3779///
3780/// \headerfile <x86intrin.h>
3781///
3782/// This intrinsic is a utility function and does not correspond to a specific
3783///   instruction.
3784///
3785/// \param __w15
3786///    A 16-bit integral value used to initialize bits [255:240] of the result.
3787/// \param __w14
3788///    A 16-bit integral value used to initialize bits [239:224] of the result.
3789/// \param __w13
3790///    A 16-bit integral value used to initialize bits [223:208] of the result.
3791/// \param __w12
3792///    A 16-bit integral value used to initialize bits [207:192] of the result.
3793/// \param __w11
3794///    A 16-bit integral value used to initialize bits [191:176] of the result.
3795/// \param __w10
3796///    A 16-bit integral value used to initialize bits [175:160] of the result.
3797/// \param __w09
3798///    A 16-bit integral value used to initialize bits [159:144] of the result.
3799/// \param __w08
3800///    A 16-bit integral value used to initialize bits [143:128] of the result.
3801/// \param __w07
3802///    A 16-bit integral value used to initialize bits [127:112] of the result.
3803/// \param __w06
3804///    A 16-bit integral value used to initialize bits [111:96] of the result.
3805/// \param __w05
3806///    A 16-bit integral value used to initialize bits [95:80] of the result.
3807/// \param __w04
3808///    A 16-bit integral value used to initialize bits [79:64] of the result.
3809/// \param __w03
3810///    A 16-bit integral value used to initialize bits [63:48] of the result.
3811/// \param __w02
3812///    A 16-bit integral value used to initialize bits [47:32] of the result.
3813/// \param __w01
3814///    A 16-bit integral value used to initialize bits [31:16] of the result.
3815/// \param __w00
3816///    A 16-bit integral value used to initialize bits [15:0] of the result.
3817/// \returns An initialized 256-bit integer vector.
3818static __inline __m256i __DEFAULT_FN_ATTRS
3819_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3820                 short __w11, short __w10, short __w09, short __w08,
3821                 short __w07, short __w06, short __w05, short __w04,
3822                 short __w03, short __w02, short __w01, short __w00)
3823{
3824  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3825    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3826}
3827
3828/// \brief Constructs a 256-bit integer vector initialized with the specified
3829///    8-bit integral values.
3830///
3831/// \headerfile <x86intrin.h>
3832///
3833/// This intrinsic is a utility function and does not correspond to a specific
3834///   instruction.
3835///
3836/// \param __b31
3837///    An 8-bit integral value used to initialize bits [255:248] of the result.
3838/// \param __b30
3839///    An 8-bit integral value used to initialize bits [247:240] of the result.
3840/// \param __b29
3841///    An 8-bit integral value used to initialize bits [239:232] of the result.
3842/// \param __b28
3843///    An 8-bit integral value used to initialize bits [231:224] of the result.
3844/// \param __b27
3845///    An 8-bit integral value used to initialize bits [223:216] of the result.
3846/// \param __b26
3847///    An 8-bit integral value used to initialize bits [215:208] of the result.
3848/// \param __b25
3849///    An 8-bit integral value used to initialize bits [207:200] of the result.
3850/// \param __b24
3851///    An 8-bit integral value used to initialize bits [199:192] of the result.
3852/// \param __b23
3853///    An 8-bit integral value used to initialize bits [191:184] of the result.
3854/// \param __b22
3855///    An 8-bit integral value used to initialize bits [183:176] of the result.
3856/// \param __b21
3857///    An 8-bit integral value used to initialize bits [175:168] of the result.
3858/// \param __b20
3859///    An 8-bit integral value used to initialize bits [167:160] of the result.
3860/// \param __b19
3861///    An 8-bit integral value used to initialize bits [159:152] of the result.
3862/// \param __b18
3863///    An 8-bit integral value used to initialize bits [151:144] of the result.
3864/// \param __b17
3865///    An 8-bit integral value used to initialize bits [143:136] of the result.
3866/// \param __b16
3867///    An 8-bit integral value used to initialize bits [135:128] of the result.
3868/// \param __b15
3869///    An 8-bit integral value used to initialize bits [127:120] of the result.
3870/// \param __b14
3871///    An 8-bit integral value used to initialize bits [119:112] of the result.
3872/// \param __b13
3873///    An 8-bit integral value used to initialize bits [111:104] of the result.
3874/// \param __b12
3875///    An 8-bit integral value used to initialize bits [103:96] of the result.
3876/// \param __b11
3877///    An 8-bit integral value used to initialize bits [95:88] of the result.
3878/// \param __b10
3879///    An 8-bit integral value used to initialize bits [87:80] of the result.
3880/// \param __b09
3881///    An 8-bit integral value used to initialize bits [79:72] of the result.
3882/// \param __b08
3883///    An 8-bit integral value used to initialize bits [71:64] of the result.
3884/// \param __b07
3885///    An 8-bit integral value used to initialize bits [63:56] of the result.
3886/// \param __b06
3887///    An 8-bit integral value used to initialize bits [55:48] of the result.
3888/// \param __b05
3889///    An 8-bit integral value used to initialize bits [47:40] of the result.
3890/// \param __b04
3891///    An 8-bit integral value used to initialize bits [39:32] of the result.
3892/// \param __b03
3893///    An 8-bit integral value used to initialize bits [31:24] of the result.
3894/// \param __b02
3895///    An 8-bit integral value used to initialize bits [23:16] of the result.
3896/// \param __b01
3897///    An 8-bit integral value used to initialize bits [15:8] of the result.
3898/// \param __b00
3899///    An 8-bit integral value used to initialize bits [7:0] of the result.
3900/// \returns An initialized 256-bit integer vector.
3901static __inline __m256i __DEFAULT_FN_ATTRS
3902_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3903                char __b27, char __b26, char __b25, char __b24,
3904                char __b23, char __b22, char __b21, char __b20,
3905                char __b19, char __b18, char __b17, char __b16,
3906                char __b15, char __b14, char __b13, char __b12,
3907                char __b11, char __b10, char __b09, char __b08,
3908                char __b07, char __b06, char __b05, char __b04,
3909                char __b03, char __b02, char __b01, char __b00)
3910{
3911  return (__m256i)(__v32qi){
3912    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3913    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3914    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3915    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3916  };
3917}
3918
3919/// \brief Constructs a 256-bit integer vector initialized with the specified
3920///    64-bit integral values.
3921///
3922/// \headerfile <x86intrin.h>
3923///
3924/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3925///   instruction.
3926///
3927/// \param __a
3928///    A 64-bit integral value used to initialize bits [255:192] of the result.
3929/// \param __b
3930///    A 64-bit integral value used to initialize bits [191:128] of the result.
3931/// \param __c
3932///    A 64-bit integral value used to initialize bits [127:64] of the result.
3933/// \param __d
3934///    A 64-bit integral value used to initialize bits [63:0] of the result.
3935/// \returns An initialized 256-bit integer vector.
3936static __inline __m256i __DEFAULT_FN_ATTRS
3937_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3938{
3939  return (__m256i)(__v4di){ __d, __c, __b, __a };
3940}
3941
3942/* Create vectors with elements in reverse order */
3943/// \brief Constructs a 256-bit floating-point vector of [4 x double],
3944///    initialized in reverse order with the specified double-precision
3945///    floating-point values.
3946///
3947/// \headerfile <x86intrin.h>
3948///
3949/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3950///   instruction.
3951///
3952/// \param __a
3953///    A double-precision floating-point value used to initialize bits [63:0]
3954///    of the result.
3955/// \param __b
3956///    A double-precision floating-point value used to initialize bits [127:64]
3957///    of the result.
3958/// \param __c
3959///    A double-precision floating-point value used to initialize bits [191:128]
3960///    of the result.
3961/// \param __d
3962///    A double-precision floating-point value used to initialize bits [255:192]
3963///    of the result.
3964/// \returns An initialized 256-bit floating-point vector of [4 x double].
3965static __inline __m256d __DEFAULT_FN_ATTRS
3966_mm256_setr_pd(double __a, double __b, double __c, double __d)
3967{
3968  return (__m256d){ __a, __b, __c, __d };
3969}
3970
3971/// \brief Constructs a 256-bit floating-point vector of [8 x float],
3972///    initialized in reverse order with the specified single-precision
///    floating-point values.
3974///
3975/// \headerfile <x86intrin.h>
3976///
3977/// This intrinsic is a utility function and does not correspond to a specific
3978///   instruction.
3979///
3980/// \param __a
3981///    A single-precision floating-point value used to initialize bits [31:0]
3982///    of the result.
3983/// \param __b
3984///    A single-precision floating-point value used to initialize bits [63:32]
3985///    of the result.
3986/// \param __c
3987///    A single-precision floating-point value used to initialize bits [95:64]
3988///    of the result.
3989/// \param __d
3990///    A single-precision floating-point value used to initialize bits [127:96]
3991///    of the result.
3992/// \param __e
3993///    A single-precision floating-point value used to initialize bits [159:128]
3994///    of the result.
3995/// \param __f
3996///    A single-precision floating-point value used to initialize bits [191:160]
3997///    of the result.
3998/// \param __g
3999///    A single-precision floating-point value used to initialize bits [223:192]
4000///    of the result.
4001/// \param __h
4002///    A single-precision floating-point value used to initialize bits [255:224]
4003///    of the result.
4004/// \returns An initialized 256-bit floating-point vector of [8 x float].
4005static __inline __m256 __DEFAULT_FN_ATTRS
4006_mm256_setr_ps(float __a, float __b, float __c, float __d,
4007               float __e, float __f, float __g, float __h)
4008{
4009  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
4010}
4011
4012/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4013///    with the specified 32-bit integral values.
4014///
4015/// \headerfile <x86intrin.h>
4016///
4017/// This intrinsic is a utility function and does not correspond to a specific
4018///   instruction.
4019///
4020/// \param __i0
4021///    A 32-bit integral value used to initialize bits [31:0] of the result.
4022/// \param __i1
4023///    A 32-bit integral value used to initialize bits [63:32] of the result.
4024/// \param __i2
4025///    A 32-bit integral value used to initialize bits [95:64] of the result.
4026/// \param __i3
4027///    A 32-bit integral value used to initialize bits [127:96] of the result.
4028/// \param __i4
4029///    A 32-bit integral value used to initialize bits [159:128] of the result.
4030/// \param __i5
4031///    A 32-bit integral value used to initialize bits [191:160] of the result.
4032/// \param __i6
4033///    A 32-bit integral value used to initialize bits [223:192] of the result.
4034/// \param __i7
4035///    A 32-bit integral value used to initialize bits [255:224] of the result.
4036/// \returns An initialized 256-bit integer vector.
4037static __inline __m256i __DEFAULT_FN_ATTRS
4038_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4039                  int __i4, int __i5, int __i6, int __i7)
4040{
4041  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
4042}
4043
4044/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4045///    with the specified 16-bit integral values.
4046///
4047/// \headerfile <x86intrin.h>
4048///
4049/// This intrinsic is a utility function and does not correspond to a specific
4050///   instruction.
4051///
4052/// \param __w15
4053///    A 16-bit integral value used to initialize bits [15:0] of the result.
4054/// \param __w14
4055///    A 16-bit integral value used to initialize bits [31:16] of the result.
4056/// \param __w13
4057///    A 16-bit integral value used to initialize bits [47:32] of the result.
4058/// \param __w12
4059///    A 16-bit integral value used to initialize bits [63:48] of the result.
4060/// \param __w11
4061///    A 16-bit integral value used to initialize bits [79:64] of the result.
4062/// \param __w10
4063///    A 16-bit integral value used to initialize bits [95:80] of the result.
4064/// \param __w09
4065///    A 16-bit integral value used to initialize bits [111:96] of the result.
4066/// \param __w08
4067///    A 16-bit integral value used to initialize bits [127:112] of the result.
4068/// \param __w07
4069///    A 16-bit integral value used to initialize bits [143:128] of the result.
4070/// \param __w06
4071///    A 16-bit integral value used to initialize bits [159:144] of the result.
4072/// \param __w05
4073///    A 16-bit integral value used to initialize bits [175:160] of the result.
4074/// \param __w04
4075///    A 16-bit integral value used to initialize bits [191:176] of the result.
4076/// \param __w03
4077///    A 16-bit integral value used to initialize bits [207:192] of the result.
4078/// \param __w02
4079///    A 16-bit integral value used to initialize bits [223:208] of the result.
4080/// \param __w01
4081///    A 16-bit integral value used to initialize bits [239:224] of the result.
4082/// \param __w00
4083///    A 16-bit integral value used to initialize bits [255:240] of the result.
4084/// \returns An initialized 256-bit integer vector.
4085static __inline __m256i __DEFAULT_FN_ATTRS
4086_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4087       short __w11, short __w10, short __w09, short __w08,
4088       short __w07, short __w06, short __w05, short __w04,
4089       short __w03, short __w02, short __w01, short __w00)
4090{
4091  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
4092    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
4093}
4094
4095/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4096///    with the specified 8-bit integral values.
4097///
4098/// \headerfile <x86intrin.h>
4099///
4100/// This intrinsic is a utility function and does not correspond to a specific
4101///   instruction.
4102///
4103/// \param __b31
4104///    An 8-bit integral value used to initialize bits [7:0] of the result.
4105/// \param __b30
4106///    An 8-bit integral value used to initialize bits [15:8] of the result.
4107/// \param __b29
4108///    An 8-bit integral value used to initialize bits [23:16] of the result.
4109/// \param __b28
4110///    An 8-bit integral value used to initialize bits [31:24] of the result.
4111/// \param __b27
4112///    An 8-bit integral value used to initialize bits [39:32] of the result.
4113/// \param __b26
4114///    An 8-bit integral value used to initialize bits [47:40] of the result.
4115/// \param __b25
4116///    An 8-bit integral value used to initialize bits [55:48] of the result.
4117/// \param __b24
4118///    An 8-bit integral value used to initialize bits [63:56] of the result.
4119/// \param __b23
4120///    An 8-bit integral value used to initialize bits [71:64] of the result.
4121/// \param __b22
4122///    An 8-bit integral value used to initialize bits [79:72] of the result.
4123/// \param __b21
4124///    An 8-bit integral value used to initialize bits [87:80] of the result.
4125/// \param __b20
4126///    An 8-bit integral value used to initialize bits [95:88] of the result.
4127/// \param __b19
4128///    An 8-bit integral value used to initialize bits [103:96] of the result.
4129/// \param __b18
4130///    An 8-bit integral value used to initialize bits [111:104] of the result.
4131/// \param __b17
4132///    An 8-bit integral value used to initialize bits [119:112] of the result.
4133/// \param __b16
4134///    An 8-bit integral value used to initialize bits [127:120] of the result.
4135/// \param __b15
4136///    An 8-bit integral value used to initialize bits [135:128] of the result.
4137/// \param __b14
4138///    An 8-bit integral value used to initialize bits [143:136] of the result.
4139/// \param __b13
4140///    An 8-bit integral value used to initialize bits [151:144] of the result.
4141/// \param __b12
4142///    An 8-bit integral value used to initialize bits [159:152] of the result.
4143/// \param __b11
4144///    An 8-bit integral value used to initialize bits [167:160] of the result.
4145/// \param __b10
4146///    An 8-bit integral value used to initialize bits [175:168] of the result.
4147/// \param __b09
4148///    An 8-bit integral value used to initialize bits [183:176] of the result.
4149/// \param __b08
4150///    An 8-bit integral value used to initialize bits [191:184] of the result.
4151/// \param __b07
4152///    An 8-bit integral value used to initialize bits [199:192] of the result.
4153/// \param __b06
4154///    An 8-bit integral value used to initialize bits [207:200] of the result.
4155/// \param __b05
4156///    An 8-bit integral value used to initialize bits [215:208] of the result.
4157/// \param __b04
4158///    An 8-bit integral value used to initialize bits [223:216] of the result.
4159/// \param __b03
4160///    An 8-bit integral value used to initialize bits [231:224] of the result.
4161/// \param __b02
4162///    An 8-bit integral value used to initialize bits [239:232] of the result.
4163/// \param __b01
4164///    An 8-bit integral value used to initialize bits [247:240] of the result.
4165/// \param __b00
4166///    An 8-bit integral value used to initialize bits [255:248] of the result.
4167/// \returns An initialized 256-bit integer vector.
4168static __inline __m256i __DEFAULT_FN_ATTRS
4169_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4170                 char __b27, char __b26, char __b25, char __b24,
4171                 char __b23, char __b22, char __b21, char __b20,
4172                 char __b19, char __b18, char __b17, char __b16,
4173                 char __b15, char __b14, char __b13, char __b12,
4174                 char __b11, char __b10, char __b09, char __b08,
4175                 char __b07, char __b06, char __b05, char __b04,
4176                 char __b03, char __b02, char __b01, char __b00)
4177{
4178  return (__m256i)(__v32qi){
4179    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
4180    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
4181    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
4182    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
4183}
4184
4185/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4186///    with the specified 64-bit integral values.
4187///
4188/// \headerfile <x86intrin.h>
4189///
4190/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4191///   instruction.
4192///
4193/// \param __a
4194///    A 64-bit integral value used to initialize bits [63:0] of the result.
4195/// \param __b
4196///    A 64-bit integral value used to initialize bits [127:64] of the result.
4197/// \param __c
4198///    A 64-bit integral value used to initialize bits [191:128] of the result.
4199/// \param __d
4200///    A 64-bit integral value used to initialize bits [255:192] of the result.
4201/// \returns An initialized 256-bit integer vector.
4202static __inline __m256i __DEFAULT_FN_ATTRS
4203_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4204{
4205  return (__m256i)(__v4di){ __a, __b, __c, __d };
4206}
4207
4208/* Create vectors with repeated elements */
4209/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
4210///    of the four double-precision floating-point vector elements set to the
4211///    specified double-precision floating-point value.
4212///
4213/// \headerfile <x86intrin.h>
4214///
4215/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4216///
4217/// \param __w
4218///    A double-precision floating-point value used to initialize each vector
4219///    element of the result.
4220/// \returns An initialized 256-bit floating-point vector of [4 x double].
4221static __inline __m256d __DEFAULT_FN_ATTRS
4222_mm256_set1_pd(double __w)
4223{
4224  return (__m256d){ __w, __w, __w, __w };
4225}
4226
4227/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
4228///    of the eight single-precision floating-point vector elements set to the
4229///    specified single-precision floating-point value.
4230///
4231/// \headerfile <x86intrin.h>
4232///
4233/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4234///   instruction.
4235///
4236/// \param __w
4237///    A single-precision floating-point value used to initialize each vector
4238///    element of the result.
4239/// \returns An initialized 256-bit floating-point vector of [8 x float].
4240static __inline __m256 __DEFAULT_FN_ATTRS
4241_mm256_set1_ps(float __w)
4242{
4243  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
4244}
4245
4246/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
4247///    32-bit integral vector elements set to the specified 32-bit integral
4248///    value.
4249///
4250/// \headerfile <x86intrin.h>
4251///
4252/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4253///   instruction.
4254///
4255/// \param __i
4256///    A 32-bit integral value used to initialize each vector element of the
4257///    result.
4258/// \returns An initialized 256-bit integer vector of [8 x i32].
4259static __inline __m256i __DEFAULT_FN_ATTRS
4260_mm256_set1_epi32(int __i)
4261{
4262  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
4263}
4264
4265/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
4266///    16-bit integral vector elements set to the specified 16-bit integral
4267///    value.
4268///
4269/// \headerfile <x86intrin.h>
4270///
4271/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4272///
4273/// \param __w
4274///    A 16-bit integral value used to initialize each vector element of the
4275///    result.
4276/// \returns An initialized 256-bit integer vector of [16 x i16].
4277static __inline __m256i __DEFAULT_FN_ATTRS
4278_mm256_set1_epi16(short __w)
4279{
4280  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
4281    __w, __w, __w, __w, __w, __w };
4282}
4283
4284/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
4285///    8-bit integral vector elements set to the specified 8-bit integral value.
4286///
4287/// \headerfile <x86intrin.h>
4288///
4289/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4290///
4291/// \param __b
4292///    An 8-bit integral value used to initialize each vector element of the
4293///    result.
4294/// \returns An initialized 256-bit integer vector of [32 x i8].
4295static __inline __m256i __DEFAULT_FN_ATTRS
4296_mm256_set1_epi8(char __b)
4297{
4298  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4299    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4300    __b, __b, __b, __b, __b, __b, __b };
4301}
4302
4303/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
4304///    64-bit integral vector elements set to the specified 64-bit integral
4305///    value.
4306///
4307/// \headerfile <x86intrin.h>
4308///
4309/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4310///
4311/// \param __q
4312///    A 64-bit integral value used to initialize each vector element of the
4313///    result.
4314/// \returns An initialized 256-bit integer vector of [4 x i64].
4315static __inline __m256i __DEFAULT_FN_ATTRS
4316_mm256_set1_epi64x(long long __q)
4317{
4318  return (__m256i)(__v4di){ __q, __q, __q, __q };
4319}
4320
/* Create zeroed vectors */
4322/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
4323///    vector elements initialized to zero.
4324///
4325/// \headerfile <x86intrin.h>
4326///
4327/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4328///
4329/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4330static __inline __m256d __DEFAULT_FN_ATTRS
4331_mm256_setzero_pd(void)
4332{
4333  return (__m256d){ 0, 0, 0, 0 };
4334}
4335
4336/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
4337///    vector elements initialized to zero.
4338///
4339/// \headerfile <x86intrin.h>
4340///
4341/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4342///
4343/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4344static __inline __m256 __DEFAULT_FN_ATTRS
4345_mm256_setzero_ps(void)
4346{
4347  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
4348}
4349
4350/// \brief Constructs a 256-bit integer vector initialized to zero.
4351///
4352/// \headerfile <x86intrin.h>
4353///
4354/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4355///
4356/// \returns A 256-bit integer vector initialized to zero.
4357static __inline __m256i __DEFAULT_FN_ATTRS
4358_mm256_setzero_si256(void)
4359{
4360  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
4361}
4362
4363/* Cast between vector types */
4364/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4365///    floating-point vector of [8 x float].
4366///
4367/// \headerfile <x86intrin.h>
4368///
4369/// This intrinsic has no corresponding instruction.
4370///
4371/// \param __a
4372///    A 256-bit floating-point vector of [4 x double].
4373/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4374///    bitwise pattern as the parameter.
4375static __inline __m256 __DEFAULT_FN_ATTRS
4376_mm256_castpd_ps(__m256d __a)
4377{
4378  return (__m256)__a;
4379}
4380
4381/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4382///    integer vector.
4383///
4384/// \headerfile <x86intrin.h>
4385///
4386/// This intrinsic has no corresponding instruction.
4387///
4388/// \param __a
4389///    A 256-bit floating-point vector of [4 x double].
4390/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4391///    parameter.
4392static __inline __m256i __DEFAULT_FN_ATTRS
4393_mm256_castpd_si256(__m256d __a)
4394{
4395  return (__m256i)__a;
4396}
4397
4398/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4399///    floating-point vector of [4 x double].
4400///
4401/// \headerfile <x86intrin.h>
4402///
4403/// This intrinsic has no corresponding instruction.
4404///
4405/// \param __a
4406///    A 256-bit floating-point vector of [8 x float].
4407/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4408///    bitwise pattern as the parameter.
4409static __inline __m256d __DEFAULT_FN_ATTRS
4410_mm256_castps_pd(__m256 __a)
4411{
4412  return (__m256d)__a;
4413}
4414
4415/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4416///    integer vector.
4417///
4418/// \headerfile <x86intrin.h>
4419///
4420/// This intrinsic has no corresponding instruction.
4421///
4422/// \param __a
4423///    A 256-bit floating-point vector of [8 x float].
4424/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4425///    parameter.
4426static __inline __m256i __DEFAULT_FN_ATTRS
4427_mm256_castps_si256(__m256 __a)
4428{
4429  return (__m256i)__a;
4430}
4431
4432/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4433///    of [8 x float].
4434///
4435/// \headerfile <x86intrin.h>
4436///
4437/// This intrinsic has no corresponding instruction.
4438///
4439/// \param __a
4440///    A 256-bit integer vector.
4441/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4442///    bitwise pattern as the parameter.
4443static __inline __m256 __DEFAULT_FN_ATTRS
4444_mm256_castsi256_ps(__m256i __a)
4445{
4446  return (__m256)__a;
4447}
4448
4449/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4450///    of [4 x double].
4451///
4452/// \headerfile <x86intrin.h>
4453///
4454/// This intrinsic has no corresponding instruction.
4455///
4456/// \param __a
4457///    A 256-bit integer vector.
4458/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4459///    bitwise pattern as the parameter.
4460static __inline __m256d __DEFAULT_FN_ATTRS
4461_mm256_castsi256_pd(__m256i __a)
4462{
4463  return (__m256d)__a;
4464}
4465
4466/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4467///    [4 x double] as a 128-bit floating-point vector of [2 x double].
4468///
4469/// \headerfile <x86intrin.h>
4470///
4471/// This intrinsic has no corresponding instruction.
4472///
4473/// \param __a
4474///    A 256-bit floating-point vector of [4 x double].
4475/// \returns A 128-bit floating-point vector of [2 x double] containing the
4476///    lower 128 bits of the parameter.
4477static __inline __m128d __DEFAULT_FN_ATTRS
4478_mm256_castpd256_pd128(__m256d __a)
4479{
4480  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4481}
4482
4483/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4484///    [8 x float] as a 128-bit floating-point vector of [4 x float].
4485///
4486/// \headerfile <x86intrin.h>
4487///
4488/// This intrinsic has no corresponding instruction.
4489///
4490/// \param __a
4491///    A 256-bit floating-point vector of [8 x float].
4492/// \returns A 128-bit floating-point vector of [4 x float] containing the
4493///    lower 128 bits of the parameter.
4494static __inline __m128 __DEFAULT_FN_ATTRS
4495_mm256_castps256_ps128(__m256 __a)
4496{
4497  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4498}
4499
4500/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
4501///
4502/// \headerfile <x86intrin.h>
4503///
4504/// This intrinsic has no corresponding instruction.
4505///
4506/// \param __a
4507///    A 256-bit integer vector.
4508/// \returns A 128-bit integer vector containing the lower 128 bits of the
4509///    parameter.
4510static __inline __m128i __DEFAULT_FN_ATTRS
4511_mm256_castsi256_si128(__m256i __a)
4512{
4513  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4514}
4515
4516/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
4517///    128-bit floating-point vector of [2 x double].
4518///
4519///    The lower 128 bits contain the value of the source vector. The contents
4520///    of the upper 128 bits are undefined.
4521///
4522/// \headerfile <x86intrin.h>
4523///
4524/// This intrinsic has no corresponding instruction.
4525///
4526/// \param __a
4527///    A 128-bit vector of [2 x double].
4528/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4529///    contain the value of the parameter. The contents of the upper 128 bits
4530///    are undefined.
4531static __inline __m256d __DEFAULT_FN_ATTRS
4532_mm256_castpd128_pd256(__m128d __a)
4533{
4534  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
4535}
4536
4537/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
4538///    128-bit floating-point vector of [4 x float].
4539///
4540///    The lower 128 bits contain the value of the source vector. The contents
4541///    of the upper 128 bits are undefined.
4542///
4543/// \headerfile <x86intrin.h>
4544///
4545/// This intrinsic has no corresponding instruction.
4546///
4547/// \param __a
4548///    A 128-bit vector of [4 x float].
4549/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4550///    contain the value of the parameter. The contents of the upper 128 bits
4551///    are undefined.
4552static __inline __m256 __DEFAULT_FN_ATTRS
4553_mm256_castps128_ps256(__m128 __a)
4554{
4555  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
4556}
4557
4558/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
4559///
4560///    The lower 128 bits contain the value of the source vector. The contents
4561///    of the upper 128 bits are undefined.
4562///
4563/// \headerfile <x86intrin.h>
4564///
4565/// This intrinsic has no corresponding instruction.
4566///
4567/// \param __a
4568///    A 128-bit integer vector.
4569/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4570///    the parameter. The contents of the upper 128 bits are undefined.
4571static __inline __m256i __DEFAULT_FN_ATTRS
4572_mm256_castsi128_si256(__m128i __a)
4573{
4574  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
4575}
4576
4577/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
4578///    128-bit floating-point vector of [2 x double]. The lower 128 bits
4579///    contain the value of the source vector. The upper 128 bits are set
4580///    to zero.
4581///
4582/// \headerfile <x86intrin.h>
4583///
4584/// This intrinsic has no corresponding instruction.
4585///
4586/// \param __a
4587///    A 128-bit vector of [2 x double].
4588/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4589///    contain the value of the parameter. The upper 128 bits are set to zero.
4590static __inline __m256d __DEFAULT_FN_ATTRS
4591_mm256_zextpd128_pd256(__m128d __a)
4592{
4593  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4594}
4595
4596/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
4597///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4598///    the value of the source vector. The upper 128 bits are set to zero.
4599///
4600/// \headerfile <x86intrin.h>
4601///
4602/// This intrinsic has no corresponding instruction.
4603///
4604/// \param __a
4605///    A 128-bit vector of [4 x float].
4606/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4607///    contain the value of the parameter. The upper 128 bits are set to zero.
4608static __inline __m256 __DEFAULT_FN_ATTRS
4609_mm256_zextps128_ps256(__m128 __a)
4610{
4611  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4612}
4613
4614/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
4615///    The lower 128 bits contain the value of the source vector. The upper
4616///    128 bits are set to zero.
4617///
4618/// \headerfile <x86intrin.h>
4619///
4620/// This intrinsic has no corresponding instruction.
4621///
4622/// \param __a
4623///    A 128-bit integer vector.
4624/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4625///    the parameter. The upper 128 bits are set to zero.
4626static __inline __m256i __DEFAULT_FN_ATTRS
4627_mm256_zextsi128_si256(__m128i __a)
4628{
4629  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4630}
4631
/*
   Vector insert.
   These are macros rather than inline functions so that only invocations
   whose immediate M is a constant expression are accepted.
*/
/// \brief Builds a 256-bit vector of [8 x float] from a copy of the 256-bit
///    vector of [8 x float] in the first parameter, in which either the upper
///    or the lower 128 bits have been replaced by the 128-bit vector of
///    [4 x float] in the second parameter.
///
///    The immediate integer parameter selects which 128-bit half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It supplies the 128-bit half of the
///    result that is not replaced by \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. It is written to the upper or the lower
///    128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only its least significant bit is examined: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the combined halves.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    (((M) & 1) == 0 ?  8 :  0), \
    (((M) & 1) == 0 ?  9 :  1), \
    (((M) & 1) == 0 ? 10 :  2), \
    (((M) & 1) == 0 ? 11 :  3), \
    (((M) & 1) == 0 ?  4 :  8), \
    (((M) & 1) == 0 ?  5 :  9), \
    (((M) & 1) == 0 ?  6 : 10), \
    (((M) & 1) == 0 ?  7 : 11) );})
4683
/// \brief Builds a 256-bit vector of [4 x double] from a copy of the 256-bit
///    vector of [4 x double] in the first parameter, in which either the upper
///    or the lower 128 bits have been replaced by the 128-bit vector of
///    [2 x double] in the second parameter.
///
///    The immediate integer parameter selects which 128-bit half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the 128-bit half of the
///    result that is not replaced by \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. It is written to the upper or the
///    lower 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only its least significant bit is examined: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined halves.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    (((M) & 1) == 0 ? 4 : 0), \
    (((M) & 1) == 0 ? 5 : 1), \
    (((M) & 1) == 0 ? 2 : 4), \
    (((M) & 1) == 0 ? 3 : 5) );})
4726
/// \brief Builds a 256-bit integer vector from a copy of the 256-bit integer
///    vector in the first parameter, in which either the upper or the lower
///    128 bits have been replaced by the 128-bit integer vector in the second
///    parameter.
///
///    The immediate integer parameter selects which 128-bit half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the 128-bit half of the result
///    that is not replaced by \a V2.
/// \param V2
///    A 128-bit integer vector. It is written to the upper or the lower 128
///    bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only its least significant bit is examined: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined halves.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    (((M) & 1) == 0 ? 4 : 0), \
    (((M) & 1) == 0 ? 5 : 1), \
    (((M) & 1) == 0 ? 2 : 4), \
    (((M) & 1) == 0 ? 3 : 5) );})
4769
/*
   Vector extract.
   These are macros rather than inline functions so that only invocations
   whose immediate M is a constant expression are accepted.
*/
/// \brief Extracts the upper or the lower 128 bits of a 256-bit vector of
///    [8 x float], as selected by the immediate integer parameter, and
///    returns them as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. Only its least significant bit is examined: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    (((M) & 1) == 0 ? 0 : 4), \
    (((M) & 1) == 0 ? 1 : 5), \
    (((M) & 1) == 0 ? 2 : 6), \
    (((M) & 1) == 0 ? 3 : 7) );})
4804
/// \brief Extracts the upper or the lower 128 bits of a 256-bit vector of
///    [4 x double], as selected by the immediate integer parameter, and
///    returns them as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. Only its least significant bit is examined: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    (((M) & 1) == 0 ? 0 : 2), \
    (((M) & 1) == 0 ? 1 : 3) );})
4832
/// \brief Extracts the upper or the lower 128 bits of a 256-bit integer
///    vector, as selected by the immediate integer parameter, and returns
///    them as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. Only its least significant bit is examined: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) == 0 ? 0 : 2), \
    (((M) & 1) == 0 ? 1 : 3) );})
4860
4861/* SIMD load ops (unaligned) */
4862/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
4863///    unaligned memory locations and constructs a 256-bit floating-point vector
4864///    of [8 x float] by concatenating the two 128-bit vectors.
4865///
4866/// \headerfile <x86intrin.h>
4867///
4868/// This intrinsic corresponds to load instructions followed by the
4869///   <c> VINSERTF128 </c> instruction.
4870///
4871/// \param __addr_hi
4872///    A pointer to a 128-bit memory location containing 4 consecutive
4873///    single-precision floating-point values. These values are to be copied to
4874///    bits[255:128] of the result. The address of the memory location does not
4875///    have to be aligned.
4876/// \param __addr_lo
4877///    A pointer to a 128-bit memory location containing 4 consecutive
4878///    single-precision floating-point values. These values are to be copied to
4879///    bits[127:0] of the result. The address of the memory location does not
4880///    have to be aligned.
4881/// \returns A 256-bit floating-point vector of [8 x float] containing the
4882///    concatenated result.
4883static __inline __m256 __DEFAULT_FN_ATTRS
4884_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4885{
4886  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4887  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
4888}
4889
4890/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
4891///    unaligned memory locations and constructs a 256-bit floating-point vector
4892///    of [4 x double] by concatenating the two 128-bit vectors.
4893///
4894/// \headerfile <x86intrin.h>
4895///
4896/// This intrinsic corresponds to load instructions followed by the
4897///   <c> VINSERTF128 </c> instruction.
4898///
4899/// \param __addr_hi
4900///    A pointer to a 128-bit memory location containing two consecutive
4901///    double-precision floating-point values. These values are to be copied to
4902///    bits[255:128] of the result. The address of the memory location does not
4903///    have to be aligned.
4904/// \param __addr_lo
4905///    A pointer to a 128-bit memory location containing two consecutive
4906///    double-precision floating-point values. These values are to be copied to
4907///    bits[127:0] of the result. The address of the memory location does not
4908///    have to be aligned.
4909/// \returns A 256-bit floating-point vector of [4 x double] containing the
4910///    concatenated result.
4911static __inline __m256d __DEFAULT_FN_ATTRS
4912_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4913{
4914  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4915  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
4916}
4917
4918/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
4919///    constructs a 256-bit integer vector by concatenating the two 128-bit
4920///    vectors.
4921///
4922/// \headerfile <x86intrin.h>
4923///
4924/// This intrinsic corresponds to load instructions followed by the
4925///   <c> VINSERTF128 </c> instruction.
4926///
4927/// \param __addr_hi
4928///    A pointer to a 128-bit memory location containing a 128-bit integer
4929///    vector. This vector is to be copied to bits[255:128] of the result. The
4930///    address of the memory location does not have to be aligned.
4931/// \param __addr_lo
4932///    A pointer to a 128-bit memory location containing a 128-bit integer
4933///    vector. This vector is to be copied to bits[127:0] of the result. The
4934///    address of the memory location does not have to be aligned.
4935/// \returns A 256-bit integer vector containing the concatenated result.
4936static __inline __m256i __DEFAULT_FN_ATTRS
4937_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
4938{
4939  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4940  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
4941}
4942
4943/* SIMD store ops (unaligned) */
4944/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4945///    vector of [8 x float] into two different unaligned memory locations.
4946///
4947/// \headerfile <x86intrin.h>
4948///
4949/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4950///   store instructions.
4951///
4952/// \param __addr_hi
4953///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4954///    copied to this memory location. The address of this memory location does
4955///    not have to be aligned.
4956/// \param __addr_lo
4957///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4958///    copied to this memory location. The address of this memory location does
4959///    not have to be aligned.
4960/// \param __a
4961///    A 256-bit floating-point vector of [8 x float].
4962static __inline void __DEFAULT_FN_ATTRS
4963_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4964{
4965  __m128 __v128;
4966
4967  __v128 = _mm256_castps256_ps128(__a);
4968  _mm_storeu_ps(__addr_lo, __v128);
4969  __v128 = _mm256_extractf128_ps(__a, 1);
4970  _mm_storeu_ps(__addr_hi, __v128);
4971}
4972
4973/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4974///    vector of [4 x double] into two different unaligned memory locations.
4975///
4976/// \headerfile <x86intrin.h>
4977///
4978/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4979///   store instructions.
4980///
4981/// \param __addr_hi
4982///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4983///    copied to this memory location. The address of this memory location does
4984///    not have to be aligned.
4985/// \param __addr_lo
4986///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4987///    copied to this memory location. The address of this memory location does
4988///    not have to be aligned.
4989/// \param __a
4990///    A 256-bit floating-point vector of [4 x double].
4991static __inline void __DEFAULT_FN_ATTRS
4992_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
4993{
4994  __m128d __v128;
4995
4996  __v128 = _mm256_castpd256_pd128(__a);
4997  _mm_storeu_pd(__addr_lo, __v128);
4998  __v128 = _mm256_extractf128_pd(__a, 1);
4999  _mm_storeu_pd(__addr_hi, __v128);
5000}
5001
5002/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
5003///    two different unaligned memory locations.
5004///
5005/// \headerfile <x86intrin.h>
5006///
5007/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5008///   store instructions.
5009///
5010/// \param __addr_hi
5011///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5012///    copied to this memory location. The address of this memory location does
5013///    not have to be aligned.
5014/// \param __addr_lo
5015///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5016///    copied to this memory location. The address of this memory location does
5017///    not have to be aligned.
5018/// \param __a
5019///    A 256-bit integer vector.
5020static __inline void __DEFAULT_FN_ATTRS
5021_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
5022{
5023  __m128i __v128;
5024
5025  __v128 = _mm256_castsi256_si128(__a);
5026  _mm_storeu_si128(__addr_lo, __v128);
5027  __v128 = _mm256_extractf128_si256(__a, 1);
5028  _mm_storeu_si128(__addr_hi, __v128);
5029}
5030
5031/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
5032///    concatenating two 128-bit floating-point vectors of [4 x float].
5033///
5034/// \headerfile <x86intrin.h>
5035///
5036/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5037///
5038/// \param __hi
5039///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
5040///    128 bits of the result.
5041/// \param __lo
5042///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
5043///    128 bits of the result.
5044/// \returns A 256-bit floating-point vector of [8 x float] containing the
5045///    concatenated result.
5046static __inline __m256 __DEFAULT_FN_ATTRS
5047_mm256_set_m128 (__m128 __hi, __m128 __lo)
5048{
5049  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
5050}
5051
5052/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
5053///    concatenating two 128-bit floating-point vectors of [2 x double].
5054///
5055/// \headerfile <x86intrin.h>
5056///
5057/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5058///
5059/// \param __hi
5060///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
5061///    128 bits of the result.
5062/// \param __lo
5063///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
5064///    128 bits of the result.
5065/// \returns A 256-bit floating-point vector of [4 x double] containing the
5066///    concatenated result.
5067static __inline __m256d __DEFAULT_FN_ATTRS
5068_mm256_set_m128d (__m128d __hi, __m128d __lo)
5069{
5070  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5071}
5072
5073/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
5074///    integer vectors.
5075///
5076/// \headerfile <x86intrin.h>
5077///
5078/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5079///
5080/// \param __hi
5081///    A 128-bit integer vector to be copied to the upper 128 bits of the
5082///    result.
5083/// \param __lo
5084///    A 128-bit integer vector to be copied to the lower 128 bits of the
5085///    result.
5086/// \returns A 256-bit integer vector containing the concatenated result.
5087static __inline __m256i __DEFAULT_FN_ATTRS
5088_mm256_set_m128i (__m128i __hi, __m128i __lo)
5089{
5090  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5091}
5092
5093/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
5094///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
5095///    similar to _mm256_set_m128, but the order of the input parameters is
5096///    swapped.
5097///
5098/// \headerfile <x86intrin.h>
5099///
5100/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5101///
5102/// \param __lo
5103///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
5104///    128 bits of the result.
5105/// \param __hi
5106///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
5107///    128 bits of the result.
5108/// \returns A 256-bit floating-point vector of [8 x float] containing the
5109///    concatenated result.
5110static __inline __m256 __DEFAULT_FN_ATTRS
5111_mm256_setr_m128 (__m128 __lo, __m128 __hi)
5112{
5113  return _mm256_set_m128(__hi, __lo);
5114}
5115
5116/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
5117///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
5118///    similar to _mm256_set_m128d, but the order of the input parameters is
5119///    swapped.
5120///
5121/// \headerfile <x86intrin.h>
5122///
5123/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5124///
5125/// \param __lo
5126///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
5127///    128 bits of the result.
5128/// \param __hi
5129///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
5130///    128 bits of the result.
5131/// \returns A 256-bit floating-point vector of [4 x double] containing the
5132///    concatenated result.
5133static __inline __m256d __DEFAULT_FN_ATTRS
5134_mm256_setr_m128d (__m128d __lo, __m128d __hi)
5135{
5136  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5137}
5138
5139/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
5140///    integer vectors. This is similar to _mm256_set_m128i, but the order of
5141///    the input parameters is swapped.
5142///
5143/// \headerfile <x86intrin.h>
5144///
5145/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5146///
5147/// \param __lo
5148///    A 128-bit integer vector to be copied to the lower 128 bits of the
5149///    result.
5150/// \param __hi
5151///    A 128-bit integer vector to be copied to the upper 128 bits of the
5152///    result.
5153/// \returns A 256-bit integer vector containing the concatenated result.
5154static __inline __m256i __DEFAULT_FN_ATTRS
5155_mm256_setr_m128i (__m128i __lo, __m128i __hi)
5156{
5157  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5158}
5159
5160#undef __DEFAULT_FN_ATTRS
5161
5162#endif /* __AVXINTRIN_H */
5163