1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* 256-bit element-typed vectors. The intrinsics below cast their operands to
 * these types before applying vector operators or calling builtins. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned counterparts of the integer vectors above. */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* An explicitly signed byte vector is needed in addition to the plain char
 * one. This type is not supposed to appear in the interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Vector types used in the intrinsic signatures; 32-byte aligned. */
typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* Same vector layouts with the alignment requirement lowered to 1 byte. */
typedef float __m256_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
41
/* Default attributes for the functions in this file: always inlined, no debug
 * info, AVX target feature. The two variants differ only in the minimum
 * vector width they request (256 bits vs. 128 bits). */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(128)))
45
46/* Arithmetic */
47/// Adds two 256-bit vectors of [4 x double].
48///
49/// \headerfile <x86intrin.h>
50///
51/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
52///
53/// \param __a
54///    A 256-bit vector of [4 x double] containing one of the source operands.
55/// \param __b
56///    A 256-bit vector of [4 x double] containing one of the source operands.
57/// \returns A 256-bit vector of [4 x double] containing the sums of both
58///    operands.
59static __inline __m256d __DEFAULT_FN_ATTRS
60_mm256_add_pd(__m256d __a, __m256d __b)
61{
62  return (__m256d)((__v4df)__a+(__v4df)__b);
63}
64
65/// Adds two 256-bit vectors of [8 x float].
66///
67/// \headerfile <x86intrin.h>
68///
69/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
70///
71/// \param __a
72///    A 256-bit vector of [8 x float] containing one of the source operands.
73/// \param __b
74///    A 256-bit vector of [8 x float] containing one of the source operands.
75/// \returns A 256-bit vector of [8 x float] containing the sums of both
76///    operands.
77static __inline __m256 __DEFAULT_FN_ATTRS
78_mm256_add_ps(__m256 __a, __m256 __b)
79{
80  return (__m256)((__v8sf)__a+(__v8sf)__b);
81}
82
83/// Subtracts two 256-bit vectors of [4 x double].
84///
85/// \headerfile <x86intrin.h>
86///
87/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
88///
89/// \param __a
90///    A 256-bit vector of [4 x double] containing the minuend.
91/// \param __b
92///    A 256-bit vector of [4 x double] containing the subtrahend.
93/// \returns A 256-bit vector of [4 x double] containing the differences between
94///    both operands.
95static __inline __m256d __DEFAULT_FN_ATTRS
96_mm256_sub_pd(__m256d __a, __m256d __b)
97{
98  return (__m256d)((__v4df)__a-(__v4df)__b);
99}
100
101/// Subtracts two 256-bit vectors of [8 x float].
102///
103/// \headerfile <x86intrin.h>
104///
105/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
106///
107/// \param __a
108///    A 256-bit vector of [8 x float] containing the minuend.
109/// \param __b
110///    A 256-bit vector of [8 x float] containing the subtrahend.
111/// \returns A 256-bit vector of [8 x float] containing the differences between
112///    both operands.
113static __inline __m256 __DEFAULT_FN_ATTRS
114_mm256_sub_ps(__m256 __a, __m256 __b)
115{
116  return (__m256)((__v8sf)__a-(__v8sf)__b);
117}
118
119/// Adds the even-indexed values and subtracts the odd-indexed values of
120///    two 256-bit vectors of [4 x double].
121///
122/// \headerfile <x86intrin.h>
123///
124/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
125///
126/// \param __a
127///    A 256-bit vector of [4 x double] containing the left source operand.
128/// \param __b
129///    A 256-bit vector of [4 x double] containing the right source operand.
130/// \returns A 256-bit vector of [4 x double] containing the alternating sums
131///    and differences between both operands.
132static __inline __m256d __DEFAULT_FN_ATTRS
133_mm256_addsub_pd(__m256d __a, __m256d __b)
134{
135  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
136}
137
138/// Adds the even-indexed values and subtracts the odd-indexed values of
139///    two 256-bit vectors of [8 x float].
140///
141/// \headerfile <x86intrin.h>
142///
143/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
144///
145/// \param __a
146///    A 256-bit vector of [8 x float] containing the left source operand.
147/// \param __b
148///    A 256-bit vector of [8 x float] containing the right source operand.
149/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
150///    differences between both operands.
151static __inline __m256 __DEFAULT_FN_ATTRS
152_mm256_addsub_ps(__m256 __a, __m256 __b)
153{
154  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
155}
156
157/// Divides two 256-bit vectors of [4 x double].
158///
159/// \headerfile <x86intrin.h>
160///
161/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
162///
163/// \param __a
164///    A 256-bit vector of [4 x double] containing the dividend.
165/// \param __b
166///    A 256-bit vector of [4 x double] containing the divisor.
167/// \returns A 256-bit vector of [4 x double] containing the quotients of both
168///    operands.
169static __inline __m256d __DEFAULT_FN_ATTRS
170_mm256_div_pd(__m256d __a, __m256d __b)
171{
172  return (__m256d)((__v4df)__a/(__v4df)__b);
173}
174
175/// Divides two 256-bit vectors of [8 x float].
176///
177/// \headerfile <x86intrin.h>
178///
179/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
180///
181/// \param __a
182///    A 256-bit vector of [8 x float] containing the dividend.
183/// \param __b
184///    A 256-bit vector of [8 x float] containing the divisor.
185/// \returns A 256-bit vector of [8 x float] containing the quotients of both
186///    operands.
187static __inline __m256 __DEFAULT_FN_ATTRS
188_mm256_div_ps(__m256 __a, __m256 __b)
189{
190  return (__m256)((__v8sf)__a/(__v8sf)__b);
191}
192
193/// Compares two 256-bit vectors of [4 x double] and returns the greater
194///    of each pair of values.
195///
196/// \headerfile <x86intrin.h>
197///
198/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
199///
200/// \param __a
201///    A 256-bit vector of [4 x double] containing one of the operands.
202/// \param __b
203///    A 256-bit vector of [4 x double] containing one of the operands.
204/// \returns A 256-bit vector of [4 x double] containing the maximum values
205///    between both operands.
206static __inline __m256d __DEFAULT_FN_ATTRS
207_mm256_max_pd(__m256d __a, __m256d __b)
208{
209  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
210}
211
212/// Compares two 256-bit vectors of [8 x float] and returns the greater
213///    of each pair of values.
214///
215/// \headerfile <x86intrin.h>
216///
217/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
218///
219/// \param __a
220///    A 256-bit vector of [8 x float] containing one of the operands.
221/// \param __b
222///    A 256-bit vector of [8 x float] containing one of the operands.
223/// \returns A 256-bit vector of [8 x float] containing the maximum values
224///    between both operands.
225static __inline __m256 __DEFAULT_FN_ATTRS
226_mm256_max_ps(__m256 __a, __m256 __b)
227{
228  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
229}
230
231/// Compares two 256-bit vectors of [4 x double] and returns the lesser
232///    of each pair of values.
233///
234/// \headerfile <x86intrin.h>
235///
236/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
237///
238/// \param __a
239///    A 256-bit vector of [4 x double] containing one of the operands.
240/// \param __b
241///    A 256-bit vector of [4 x double] containing one of the operands.
242/// \returns A 256-bit vector of [4 x double] containing the minimum values
243///    between both operands.
244static __inline __m256d __DEFAULT_FN_ATTRS
245_mm256_min_pd(__m256d __a, __m256d __b)
246{
247  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
248}
249
250/// Compares two 256-bit vectors of [8 x float] and returns the lesser
251///    of each pair of values.
252///
253/// \headerfile <x86intrin.h>
254///
255/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
256///
257/// \param __a
258///    A 256-bit vector of [8 x float] containing one of the operands.
259/// \param __b
260///    A 256-bit vector of [8 x float] containing one of the operands.
261/// \returns A 256-bit vector of [8 x float] containing the minimum values
262///    between both operands.
263static __inline __m256 __DEFAULT_FN_ATTRS
264_mm256_min_ps(__m256 __a, __m256 __b)
265{
266  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
267}
268
269/// Multiplies two 256-bit vectors of [4 x double].
270///
271/// \headerfile <x86intrin.h>
272///
273/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
274///
275/// \param __a
276///    A 256-bit vector of [4 x double] containing one of the operands.
277/// \param __b
278///    A 256-bit vector of [4 x double] containing one of the operands.
279/// \returns A 256-bit vector of [4 x double] containing the products of both
280///    operands.
281static __inline __m256d __DEFAULT_FN_ATTRS
282_mm256_mul_pd(__m256d __a, __m256d __b)
283{
284  return (__m256d)((__v4df)__a * (__v4df)__b);
285}
286
287/// Multiplies two 256-bit vectors of [8 x float].
288///
289/// \headerfile <x86intrin.h>
290///
291/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
292///
293/// \param __a
294///    A 256-bit vector of [8 x float] containing one of the operands.
295/// \param __b
296///    A 256-bit vector of [8 x float] containing one of the operands.
297/// \returns A 256-bit vector of [8 x float] containing the products of both
298///    operands.
299static __inline __m256 __DEFAULT_FN_ATTRS
300_mm256_mul_ps(__m256 __a, __m256 __b)
301{
302  return (__m256)((__v8sf)__a * (__v8sf)__b);
303}
304
305/// Calculates the square roots of the values in a 256-bit vector of
306///    [4 x double].
307///
308/// \headerfile <x86intrin.h>
309///
310/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
311///
312/// \param __a
313///    A 256-bit vector of [4 x double].
314/// \returns A 256-bit vector of [4 x double] containing the square roots of the
315///    values in the operand.
316static __inline __m256d __DEFAULT_FN_ATTRS
317_mm256_sqrt_pd(__m256d __a)
318{
319  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
320}
321
322/// Calculates the square roots of the values in a 256-bit vector of
323///    [8 x float].
324///
325/// \headerfile <x86intrin.h>
326///
327/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
328///
329/// \param __a
330///    A 256-bit vector of [8 x float].
331/// \returns A 256-bit vector of [8 x float] containing the square roots of the
332///    values in the operand.
333static __inline __m256 __DEFAULT_FN_ATTRS
334_mm256_sqrt_ps(__m256 __a)
335{
336  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
337}
338
339/// Calculates the reciprocal square roots of the values in a 256-bit
340///    vector of [8 x float].
341///
342/// \headerfile <x86intrin.h>
343///
344/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
345///
346/// \param __a
347///    A 256-bit vector of [8 x float].
348/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
349///    roots of the values in the operand.
350static __inline __m256 __DEFAULT_FN_ATTRS
351_mm256_rsqrt_ps(__m256 __a)
352{
353  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
354}
355
356/// Calculates the reciprocals of the values in a 256-bit vector of
357///    [8 x float].
358///
359/// \headerfile <x86intrin.h>
360///
361/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
362///
363/// \param __a
364///    A 256-bit vector of [8 x float].
365/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
366///    values in the operand.
367static __inline __m256 __DEFAULT_FN_ATTRS
368_mm256_rcp_ps(__m256 __a)
369{
370  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
371}
372
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* The whole expansion is parenthesized so that the result cast stays attached
 * to the builtin call when the macro is followed by a postfix operator such
 * as a subscript. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
404
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* The whole expansion is parenthesized so that the result cast stays attached
 * to the builtin call when the macro is followed by a postfix operator such
 * as a subscript. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
436
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/* Expands to _mm256_round_pd with _MM_FROUND_CEIL; parenthesized as a whole
 * so the expansion always behaves as a single primary expression. */
#define _mm256_ceil_pd(V)  (_mm256_round_pd((V), _MM_FROUND_CEIL))
453
/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
/* Expands to _mm256_round_pd with _MM_FROUND_FLOOR; parenthesized as a whole
 * so the expansion always behaves as a single primary expression. */
#define _mm256_floor_pd(V) (_mm256_round_pd((V), _MM_FROUND_FLOOR))
471
/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
/* Expands to _mm256_round_ps with _MM_FROUND_CEIL; parenthesized as a whole
 * so the expansion always behaves as a single primary expression. */
#define _mm256_ceil_ps(V)  (_mm256_round_ps((V), _MM_FROUND_CEIL))
488
/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
/* Expands to _mm256_round_ps with _MM_FROUND_FLOOR; parenthesized as a whole
 * so the expansion always behaves as a single primary expression. */
#define _mm256_floor_ps(V) (_mm256_round_ps((V), _MM_FROUND_FLOOR))
505
506/* Logical */
507/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
508///
509/// \headerfile <x86intrin.h>
510///
511/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
512///
513/// \param __a
514///    A 256-bit vector of [4 x double] containing one of the source operands.
515/// \param __b
516///    A 256-bit vector of [4 x double] containing one of the source operands.
517/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
518///    values between both operands.
519static __inline __m256d __DEFAULT_FN_ATTRS
520_mm256_and_pd(__m256d __a, __m256d __b)
521{
522  return (__m256d)((__v4du)__a & (__v4du)__b);
523}
524
525/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
526///
527/// \headerfile <x86intrin.h>
528///
529/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
530///
531/// \param __a
532///    A 256-bit vector of [8 x float] containing one of the source operands.
533/// \param __b
534///    A 256-bit vector of [8 x float] containing one of the source operands.
535/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
536///    values between both operands.
537static __inline __m256 __DEFAULT_FN_ATTRS
538_mm256_and_ps(__m256 __a, __m256 __b)
539{
540  return (__m256)((__v8su)__a & (__v8su)__b);
541}
542
543/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
544///    the one's complement of the values contained in the first source operand.
545///
546/// \headerfile <x86intrin.h>
547///
548/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
549///
550/// \param __a
551///    A 256-bit vector of [4 x double] containing the left source operand. The
552///    one's complement of this value is used in the bitwise AND.
553/// \param __b
554///    A 256-bit vector of [4 x double] containing the right source operand.
555/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
556///    values of the second operand and the one's complement of the first
557///    operand.
558static __inline __m256d __DEFAULT_FN_ATTRS
559_mm256_andnot_pd(__m256d __a, __m256d __b)
560{
561  return (__m256d)(~(__v4du)__a & (__v4du)__b);
562}
563
564/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
565///    the one's complement of the values contained in the first source operand.
566///
567/// \headerfile <x86intrin.h>
568///
569/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
570///
571/// \param __a
572///    A 256-bit vector of [8 x float] containing the left source operand. The
573///    one's complement of this value is used in the bitwise AND.
574/// \param __b
575///    A 256-bit vector of [8 x float] containing the right source operand.
576/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
577///    values of the second operand and the one's complement of the first
578///    operand.
579static __inline __m256 __DEFAULT_FN_ATTRS
580_mm256_andnot_ps(__m256 __a, __m256 __b)
581{
582  return (__m256)(~(__v8su)__a & (__v8su)__b);
583}
584
585/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
586///
587/// \headerfile <x86intrin.h>
588///
589/// This intrinsic corresponds to the <c> VORPD </c> instruction.
590///
591/// \param __a
592///    A 256-bit vector of [4 x double] containing one of the source operands.
593/// \param __b
594///    A 256-bit vector of [4 x double] containing one of the source operands.
595/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
596///    values between both operands.
597static __inline __m256d __DEFAULT_FN_ATTRS
598_mm256_or_pd(__m256d __a, __m256d __b)
599{
600  return (__m256d)((__v4du)__a | (__v4du)__b);
601}
602
603/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
604///
605/// \headerfile <x86intrin.h>
606///
607/// This intrinsic corresponds to the <c> VORPS </c> instruction.
608///
609/// \param __a
610///    A 256-bit vector of [8 x float] containing one of the source operands.
611/// \param __b
612///    A 256-bit vector of [8 x float] containing one of the source operands.
613/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
614///    values between both operands.
615static __inline __m256 __DEFAULT_FN_ATTRS
616_mm256_or_ps(__m256 __a, __m256 __b)
617{
618  return (__m256)((__v8su)__a | (__v8su)__b);
619}
620
621/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
622///
623/// \headerfile <x86intrin.h>
624///
625/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
626///
627/// \param __a
628///    A 256-bit vector of [4 x double] containing one of the source operands.
629/// \param __b
630///    A 256-bit vector of [4 x double] containing one of the source operands.
631/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
632///    values between both operands.
633static __inline __m256d __DEFAULT_FN_ATTRS
634_mm256_xor_pd(__m256d __a, __m256d __b)
635{
636  return (__m256d)((__v4du)__a ^ (__v4du)__b);
637}
638
639/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
640///
641/// \headerfile <x86intrin.h>
642///
643/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
644///
645/// \param __a
646///    A 256-bit vector of [8 x float] containing one of the source operands.
647/// \param __b
648///    A 256-bit vector of [8 x float] containing one of the source operands.
649/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
650///    values between both operands.
651static __inline __m256 __DEFAULT_FN_ATTRS
652_mm256_xor_ps(__m256 __a, __m256 __b)
653{
654  return (__m256)((__v8su)__a ^ (__v8su)__b);
655}
656
657/* Horizontal arithmetic */
658/// Horizontally adds the adjacent pairs of values contained in two
659///    256-bit vectors of [4 x double].
660///
661/// \headerfile <x86intrin.h>
662///
663/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
664///
665/// \param __a
666///    A 256-bit vector of [4 x double] containing one of the source operands.
667///    The horizontal sums of the values are returned in the even-indexed
668///    elements of a vector of [4 x double].
669/// \param __b
670///    A 256-bit vector of [4 x double] containing one of the source operands.
671///    The horizontal sums of the values are returned in the odd-indexed
672///    elements of a vector of [4 x double].
673/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
674///    both operands.
675static __inline __m256d __DEFAULT_FN_ATTRS
676_mm256_hadd_pd(__m256d __a, __m256d __b)
677{
678  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
679}
680
681/// Horizontally adds the adjacent pairs of values contained in two
682///    256-bit vectors of [8 x float].
683///
684/// \headerfile <x86intrin.h>
685///
686/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
687///
688/// \param __a
689///    A 256-bit vector of [8 x float] containing one of the source operands.
690///    The horizontal sums of the values are returned in the elements with
691///    index 0, 1, 4, 5 of a vector of [8 x float].
692/// \param __b
693///    A 256-bit vector of [8 x float] containing one of the source operands.
694///    The horizontal sums of the values are returned in the elements with
695///    index 2, 3, 6, 7 of a vector of [8 x float].
696/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
697///    both operands.
698static __inline __m256 __DEFAULT_FN_ATTRS
699_mm256_hadd_ps(__m256 __a, __m256 __b)
700{
701  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
702}
703
704/// Horizontally subtracts the adjacent pairs of values contained in two
705///    256-bit vectors of [4 x double].
706///
707/// \headerfile <x86intrin.h>
708///
709/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
710///
711/// \param __a
712///    A 256-bit vector of [4 x double] containing one of the source operands.
713///    The horizontal differences between the values are returned in the
714///    even-indexed elements of a vector of [4 x double].
715/// \param __b
716///    A 256-bit vector of [4 x double] containing one of the source operands.
717///    The horizontal differences between the values are returned in the
718///    odd-indexed elements of a vector of [4 x double].
719/// \returns A 256-bit vector of [4 x double] containing the horizontal
720///    differences of both operands.
721static __inline __m256d __DEFAULT_FN_ATTRS
722_mm256_hsub_pd(__m256d __a, __m256d __b)
723{
724  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
725}
726
727/// Horizontally subtracts the adjacent pairs of values contained in two
728///    256-bit vectors of [8 x float].
729///
730/// \headerfile <x86intrin.h>
731///
732/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
733///
734/// \param __a
735///    A 256-bit vector of [8 x float] containing one of the source operands.
736///    The horizontal differences between the values are returned in the
737///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
738/// \param __b
739///    A 256-bit vector of [8 x float] containing one of the source operands.
740///    The horizontal differences between the values are returned in the
741///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
742/// \returns A 256-bit vector of [8 x float] containing the horizontal
743///    differences of both operands.
744static __inline __m256 __DEFAULT_FN_ATTRS
745_mm256_hsub_ps(__m256 __a, __m256 __b)
746{
747  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
748}
749
750/* Vector permutations */
751/// Copies the values in a 128-bit vector of [2 x double] as specified
752///    by the 128-bit integer vector operand.
753///
754/// \headerfile <x86intrin.h>
755///
756/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
757///
758/// \param __a
759///    A 128-bit vector of [2 x double].
760/// \param __c
761///    A 128-bit integer vector operand specifying how the values are to be
762///    copied. \n
763///    Bit [1]: \n
764///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
765///         vector. \n
766///      1: Bits [127:64] of the source are copied to bits [63:0] of the
767///         returned vector. \n
768///    Bit [65]: \n
769///      0: Bits [63:0] of the source are copied to bits [127:64] of the
770///         returned vector. \n
771///      1: Bits [127:64] of the source are copied to bits [127:64] of the
772///         returned vector.
773/// \returns A 128-bit vector of [2 x double] containing the copied values.
774static __inline __m128d __DEFAULT_FN_ATTRS128
775_mm_permutevar_pd(__m128d __a, __m128i __c)
776{
777  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
778}
779
780/// Copies the values in a 256-bit vector of [4 x double] as specified
781///    by the 256-bit integer vector operand.
782///
783/// \headerfile <x86intrin.h>
784///
785/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
786///
787/// \param __a
788///    A 256-bit vector of [4 x double].
789/// \param __c
790///    A 256-bit integer vector operand specifying how the values are to be
791///    copied. \n
792///    Bit [1]: \n
793///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
794///         vector. \n
795///      1: Bits [127:64] of the source are copied to bits [63:0] of the
796///         returned vector. \n
797///    Bit [65]: \n
798///      0: Bits [63:0] of the source are copied to bits [127:64] of the
799///         returned vector. \n
800///      1: Bits [127:64] of the source are copied to bits [127:64] of the
801///         returned vector. \n
802///    Bit [129]: \n
803///      0: Bits [191:128] of the source are copied to bits [191:128] of the
804///         returned vector. \n
805///      1: Bits [255:192] of the source are copied to bits [191:128] of the
806///         returned vector. \n
807///    Bit [193]: \n
808///      0: Bits [191:128] of the source are copied to bits [255:192] of the
809///         returned vector. \n
810///      1: Bits [255:192] of the source are copied to bits [255:192] of the
811///    returned vector.
812/// \returns A 256-bit vector of [4 x double] containing the copied values.
813static __inline __m256d __DEFAULT_FN_ATTRS
814_mm256_permutevar_pd(__m256d __a, __m256i __c)
815{
816  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
817}
818
819/// Copies the values stored in a 128-bit vector of [4 x float] as
820///    specified by the 128-bit integer vector operand.
821/// \headerfile <x86intrin.h>
822///
823/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
824///
825/// \param __a
826///    A 128-bit vector of [4 x float].
827/// \param __c
828///    A 128-bit integer vector operand specifying how the values are to be
829///    copied. \n
830///    Bits [1:0]: \n
831///      00: Bits [31:0] of the source are copied to bits [31:0] of the
832///          returned vector. \n
833///      01: Bits [63:32] of the source are copied to bits [31:0] of the
834///          returned vector. \n
835///      10: Bits [95:64] of the source are copied to bits [31:0] of the
836///          returned vector. \n
837///      11: Bits [127:96] of the source are copied to bits [31:0] of the
838///          returned vector. \n
839///    Bits [33:32]: \n
840///      00: Bits [31:0] of the source are copied to bits [63:32] of the
841///          returned vector. \n
842///      01: Bits [63:32] of the source are copied to bits [63:32] of the
843///          returned vector. \n
844///      10: Bits [95:64] of the source are copied to bits [63:32] of the
845///          returned vector. \n
846///      11: Bits [127:96] of the source are copied to bits [63:32] of the
847///          returned vector. \n
848///    Bits [65:64]: \n
849///      00: Bits [31:0] of the source are copied to bits [95:64] of the
850///          returned vector. \n
851///      01: Bits [63:32] of the source are copied to bits [95:64] of the
852///          returned vector. \n
853///      10: Bits [95:64] of the source are copied to bits [95:64] of the
854///          returned vector. \n
855///      11: Bits [127:96] of the source are copied to bits [95:64] of the
856///          returned vector. \n
857///    Bits [97:96]: \n
858///      00: Bits [31:0] of the source are copied to bits [127:96] of the
859///          returned vector. \n
860///      01: Bits [63:32] of the source are copied to bits [127:96] of the
861///          returned vector. \n
862///      10: Bits [95:64] of the source are copied to bits [127:96] of the
863///          returned vector. \n
864///      11: Bits [127:96] of the source are copied to bits [127:96] of the
865///          returned vector.
866/// \returns A 128-bit vector of [4 x float] containing the copied values.
867static __inline __m128 __DEFAULT_FN_ATTRS128
868_mm_permutevar_ps(__m128 __a, __m128i __c)
869{
870  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
871}
872
873/// Copies the values stored in a 256-bit vector of [8 x float] as
874///    specified by the 256-bit integer vector operand.
875///
876/// \headerfile <x86intrin.h>
877///
878/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
879///
880/// \param __a
881///    A 256-bit vector of [8 x float].
882/// \param __c
883///    A 256-bit integer vector operand specifying how the values are to be
884///    copied. \n
885///    Bits [1:0]: \n
886///      00: Bits [31:0] of the source are copied to bits [31:0] of the
887///          returned vector. \n
888///      01: Bits [63:32] of the source are copied to bits [31:0] of the
889///          returned vector. \n
890///      10: Bits [95:64] of the source are copied to bits [31:0] of the
891///          returned vector. \n
892///      11: Bits [127:96] of the source are copied to bits [31:0] of the
893///          returned vector. \n
894///    Bits [33:32]: \n
895///      00: Bits [31:0] of the source are copied to bits [63:32] of the
896///          returned vector. \n
897///      01: Bits [63:32] of the source are copied to bits [63:32] of the
898///          returned vector. \n
899///      10: Bits [95:64] of the source are copied to bits [63:32] of the
900///          returned vector. \n
901///      11: Bits [127:96] of the source are copied to bits [63:32] of the
902///          returned vector. \n
903///    Bits [65:64]: \n
904///      00: Bits [31:0] of the source are copied to bits [95:64] of the
905///          returned vector. \n
906///      01: Bits [63:32] of the source are copied to bits [95:64] of the
907///          returned vector. \n
908///      10: Bits [95:64] of the source are copied to bits [95:64] of the
909///          returned vector. \n
910///      11: Bits [127:96] of the source are copied to bits [95:64] of the
911///          returned vector. \n
912///    Bits [97:96]: \n
913///      00: Bits [31:0] of the source are copied to bits [127:96] of the
914///          returned vector. \n
915///      01: Bits [63:32] of the source are copied to bits [127:96] of the
916///          returned vector. \n
917///      10: Bits [95:64] of the source are copied to bits [127:96] of the
918///          returned vector. \n
919///      11: Bits [127:96] of the source are copied to bits [127:96] of the
920///          returned vector. \n
921///    Bits [129:128]: \n
922///      00: Bits [159:128] of the source are copied to bits [159:128] of the
923///          returned vector. \n
924///      01: Bits [191:160] of the source are copied to bits [159:128] of the
925///          returned vector. \n
926///      10: Bits [223:192] of the source are copied to bits [159:128] of the
927///          returned vector. \n
928///      11: Bits [255:224] of the source are copied to bits [159:128] of the
929///          returned vector. \n
930///    Bits [161:160]: \n
931///      00: Bits [159:128] of the source are copied to bits [191:160] of the
932///          returned vector. \n
933///      01: Bits [191:160] of the source are copied to bits [191:160] of the
934///          returned vector. \n
935///      10: Bits [223:192] of the source are copied to bits [191:160] of the
936///          returned vector. \n
937///      11: Bits [255:224] of the source are copied to bits [191:160] of the
938///          returned vector. \n
939///    Bits [193:192]: \n
940///      00: Bits [159:128] of the source are copied to bits [223:192] of the
941///          returned vector. \n
942///      01: Bits [191:160] of the source are copied to bits [223:192] of the
943///          returned vector. \n
944///      10: Bits [223:192] of the source are copied to bits [223:192] of the
945///          returned vector. \n
946///      11: Bits [255:224] of the source are copied to bits [223:192] of the
947///          returned vector. \n
948///    Bits [225:224]: \n
949///      00: Bits [159:128] of the source are copied to bits [255:224] of the
950///          returned vector. \n
951///      01: Bits [191:160] of the source are copied to bits [255:224] of the
952///          returned vector. \n
953///      10: Bits [223:192] of the source are copied to bits [255:224] of the
954///          returned vector. \n
955///      11: Bits [255:224] of the source are copied to bits [255:224] of the
956///          returned vector.
957/// \returns A 256-bit vector of [8 x float] containing the copied values.
958static __inline __m256 __DEFAULT_FN_ATTRS
959_mm256_permutevar_ps(__m256 __a, __m256i __c)
960{
961  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
962}
963
/// Copies the values in a 128-bit vector of [2 x double] as specified
///    by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
993
/// Copies the values in a 256-bit vector of [4 x double] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
1033
/// Copies the values in a 128-bit vector of [4 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
1089
/// Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. Each 2-bit field is applied to both 128-bit halves of the
///    vector: the first four entries below describe its effect on the lower
///    half, the last four its effect on the upper half. \n
///    Bits [1:0] (lower half): \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2] (lower half): \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4] (lower half): \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6] (lower half): \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [1:0] (upper half): \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [3:2] (upper half): \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [5:4] (upper half): \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [7:6] (upper half): \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
1181
/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (int)(M)))
1222
/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (int)(M)))
1263
/// Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (int)(M)))
1303
1304/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
                                      (__v4df)(__m256d)(V2), (int)(M)))
1332
/// Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
                                     (__v8sf)(__m256)(V2), (int)(M)))
1360
1361/// Merges 64-bit double-precision data values stored in either of the
1362///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1363///    operand.
1364///
1365/// \headerfile <x86intrin.h>
1366///
1367/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1368///
1369/// \param __a
1370///    A 256-bit vector of [4 x double].
1371/// \param __b
1372///    A 256-bit vector of [4 x double].
1373/// \param __c
1374///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1375///    how the values are to be copied. The position of the mask bit corresponds
1376///    to the most significant bit of a copied value. When a mask bit is 0, the
1377///    corresponding 64-bit element in operand \a __a is copied to the same
1378///    position in the destination. When a mask bit is 1, the corresponding
1379///    64-bit element in operand \a __b is copied to the same position in the
1380///    destination.
1381/// \returns A 256-bit vector of [4 x double] containing the copied values.
1382static __inline __m256d __DEFAULT_FN_ATTRS
1383_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1384{
1385  return (__m256d)__builtin_ia32_blendvpd256(
1386    (__v4df)__a, (__v4df)__b, (__v4df)__c);
1387}
1388
1389/// Merges 32-bit single-precision data values stored in either of the
1390///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1391///    operand.
1392///
1393/// \headerfile <x86intrin.h>
1394///
1395/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1396///
1397/// \param __a
1398///    A 256-bit vector of [8 x float].
1399/// \param __b
1400///    A 256-bit vector of [8 x float].
1401/// \param __c
1402///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1403///    and 31 specifying how the values are to be copied. The position of the
1404///    mask bit corresponds to the most significant bit of a copied value. When
1405///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1406///    copied to the same position in the destination. When a mask bit is 1, the
1407///    corresponding 32-bit element in operand \a __b is copied to the same
1408///    position in the destination.
1409/// \returns A 256-bit vector of [8 x float] containing the copied values.
1410static __inline __m256 __DEFAULT_FN_ATTRS
1411_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1412{
1413  return (__m256)__builtin_ia32_blendvps256(
1414    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1415}
1416
1417/* Vector Dot Product */
/// Computes two dot products in parallel, using the lower and upper
///    halves of two [8 x float] vectors as input to the two computations, and
///    returning the two dot products in the lower and upper halves of the
///    [8 x float] result.
///
///    The immediate integer operand controls which input elements will
///    contribute to the dot product, and where the final results are returned.
///    In general, for each dot product, the four corresponding elements of the
///    input vectors are multiplied; the first two and second two products are
///    summed, then the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))
1458
1459/* Vector shuffle */
1460/// Selects 8 float values from the 256-bit operands of [8 x float], as
1461///    specified by the immediate value operand.
1462///
1463///    The four selected elements in each operand are copied to the destination
1464///    according to the bits specified in the immediate operand. The selected
1465///    elements from the first 256-bit operand are copied to bits [63:0] and
1466///    bits [191:128] of the destination, and the selected elements from the
1467///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1468///    the destination. For example, if bits [7:0] of the immediate operand
1469///    contain a value of 0xFF, the 256-bit destination vector would contain the
1470///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1471///
1472/// \headerfile <x86intrin.h>
1473///
1474/// \code
1475/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1476/// \endcode
1477///
1478/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1479///
1480/// \param a
1481///    A 256-bit vector of [8 x float]. The four selected elements in this
1482///    operand are copied to bits [63:0] and bits [191:128] in the destination,
1483///    according to the bits specified in the immediate operand.
1484/// \param b
1485///    A 256-bit vector of [8 x float]. The four selected elements in this
1486///    operand are copied to bits [127:64] and bits [255:192] in the
1487///    destination, according to the bits specified in the immediate operand.
1488/// \param mask
1489///    An immediate value containing an 8-bit value specifying which elements to
1490///    copy from \a a and \a b \n.
1491///    Bits [3:0] specify the values copied from operand \a a. \n
1492///    Bits [7:4] specify the values copied from operand \a b. \n
1493///    The destinations within the 256-bit destination are assigned values as
1494///    follows, according to the bit value assignments described below: \n
1495///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1496///    destination. \n
1497///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1498///    destination. \n
1499///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1500///    destination. \n
1501///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1502///    the destination. \n
1503///    Bit value assignments: \n
1504///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1505///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1506///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1507///    11: Bits [127:96] and [255:224] are copied from the selected operand.
1508/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
                                    (__v8sf)(__m256)(b), (int)(mask)))
1512
/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double]. Supplies the even-numbered elements
///    (bits [63:0] and [191:128]) of the destination.
/// \param b
///    A 256-bit vector of [4 x double]. Supplies the odd-numbered elements
///    (bits [127:64] and [255:192]) of the destination.
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
1558
/* Compare */
/*
 * Comparison predicates, one per 5-bit immediate value (0x00 - 0x1f).
 * Each entry in the second half (0x10 - 0x1f) is the same predicate as the
 * entry 0x10 below it, with the signaling behaviour flipped.
 */

/* 0x00 - 0x07 */
#define _CMP_EQ_OQ    0x00 /* equal; ordered; non-signaling */
#define _CMP_LT_OS    0x01 /* less-than; ordered; signaling */
#define _CMP_LE_OS    0x02 /* less-than-or-equal; ordered; signaling */
#define _CMP_UNORD_Q  0x03 /* unordered; non-signaling */
#define _CMP_NEQ_UQ   0x04 /* not-equal; unordered; non-signaling */
#define _CMP_NLT_US   0x05 /* not-less-than; unordered; signaling */
#define _CMP_NLE_US   0x06 /* not-less-than-or-equal; unordered; signaling */
#define _CMP_ORD_Q    0x07 /* ordered; non-signaling */

/* 0x08 - 0x0f */
#define _CMP_EQ_UQ    0x08 /* equal; unordered; non-signaling */
#define _CMP_NGE_US   0x09 /* not-greater-than-or-equal; unordered; signaling */
#define _CMP_NGT_US   0x0a /* not-greater-than; unordered; signaling */
#define _CMP_FALSE_OQ 0x0b /* always false; ordered; non-signaling */
#define _CMP_NEQ_OQ   0x0c /* not-equal; ordered; non-signaling */
#define _CMP_GE_OS    0x0d /* greater-than-or-equal; ordered; signaling */
#define _CMP_GT_OS    0x0e /* greater-than; ordered; signaling */
#define _CMP_TRUE_UQ  0x0f /* always true; unordered; non-signaling */

/* 0x10 - 0x17 */
#define _CMP_EQ_OS    0x10 /* equal; ordered; signaling */
#define _CMP_LT_OQ    0x11 /* less-than; ordered; non-signaling */
#define _CMP_LE_OQ    0x12 /* less-than-or-equal; ordered; non-signaling */
#define _CMP_UNORD_S  0x13 /* unordered; signaling */
#define _CMP_NEQ_US   0x14 /* not-equal; unordered; signaling */
#define _CMP_NLT_UQ   0x15 /* not-less-than; unordered; non-signaling */
#define _CMP_NLE_UQ   0x16 /* not-less-than-or-equal; unordered; non-signaling */
#define _CMP_ORD_S    0x17 /* ordered; signaling */

/* 0x18 - 0x1f */
#define _CMP_EQ_US    0x18 /* equal; unordered; signaling */
#define _CMP_NGE_UQ   0x19 /* not-greater-than-or-equal; unordered; non-signaling */
#define _CMP_NGT_UQ   0x1a /* not-greater-than; unordered; non-signaling */
#define _CMP_FALSE_OS 0x1b /* always false; ordered; signaling */
#define _CMP_NEQ_OS   0x1c /* not-equal; ordered; signaling */
#define _CMP_GE_OQ    0x1d /* greater-than-or-equal; ordered; non-signaling */
#define _CMP_GT_OQ    0x1e /* greater-than; ordered; non-signaling */
#define _CMP_TRUE_US  0x1f /* always true; unordered; signaling */
1592
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the values match the _CMP_* constants): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1652
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the values match the _CMP_* constants): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1712
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the values match the _CMP_* constants): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1772
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the values match the _CMP_* constants): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1832
/// Compares the scalar double-precision values in the low-order 64 bits of
///    two 128-bit vectors of [2 x double], using the operation specified by
///    the immediate integer operand.
///
///    If the result is true, all of the low-order 64 bits of the destination
///    vector are set; otherwise they are cleared. The upper 64 bits of the
///    destination are copied from the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double]. The low-order 64 bits are compared;
///    the upper 64 bits are copied to the result.
/// \param b
///    A 128-bit vector of [2 x double]. The low-order 64 bits are compared.
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the values match the _CMP_* constants): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison result
///    in its low-order 64 bits.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1891
/// Compares the scalar values in the low-order 32 bits of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all of the low-order 32 bits of the destination
///    vector are set; otherwise they are cleared. The upper 96 bits of the
///    destination are copied from the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float]. The low-order 32 bits are compared;
///    the upper 96 bits are copied to the result.
/// \param b
///    A 128-bit vector of [4 x float]. The low-order 32 bits are compared.
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the values match the _CMP_* constants): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison result
///    in its low-order 32 bits.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1950
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1968
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
1987
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2006
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    Only available when compiling for x86-64.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2026
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2048
2049
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2071
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2093
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
///    Only available when compiling for x86-64.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2117
2118/* Conversion */
2119/// Converts a vector of [4 x i32] into a vector of [4 x double].
2120///
2121/// \headerfile <x86intrin.h>
2122///
2123/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2124///
2125/// \param __a
2126///    A 128-bit integer vector of [4 x i32].
2127/// \returns A 256-bit vector of [4 x double] containing the converted values.
2128static __inline __m256d __DEFAULT_FN_ATTRS
2129_mm256_cvtepi32_pd(__m128i __a)
2130{
2131  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2132}
2133
2134/// Converts a vector of [8 x i32] into a vector of [8 x float].
2135///
2136/// \headerfile <x86intrin.h>
2137///
2138/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2139///
2140/// \param __a
2141///    A 256-bit integer vector.
2142/// \returns A 256-bit vector of [8 x float] containing the converted values.
2143static __inline __m256 __DEFAULT_FN_ATTRS
2144_mm256_cvtepi32_ps(__m256i __a)
2145{
2146  return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2147}
2148
2149/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2150///    [4 x float].
2151///
2152/// \headerfile <x86intrin.h>
2153///
2154/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2155///
2156/// \param __a
2157///    A 256-bit vector of [4 x double].
2158/// \returns A 128-bit vector of [4 x float] containing the converted values.
2159static __inline __m128 __DEFAULT_FN_ATTRS
2160_mm256_cvtpd_ps(__m256d __a)
2161{
2162  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2163}
2164
2165/// Converts a vector of [8 x float] into a vector of [8 x i32].
2166///
2167/// \headerfile <x86intrin.h>
2168///
2169/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2170///
2171/// \param __a
2172///    A 256-bit vector of [8 x float].
2173/// \returns A 256-bit integer vector containing the converted values.
2174static __inline __m256i __DEFAULT_FN_ATTRS
2175_mm256_cvtps_epi32(__m256 __a)
2176{
2177  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2178}
2179
2180/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2181///    x double].
2182///
2183/// \headerfile <x86intrin.h>
2184///
2185/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2186///
2187/// \param __a
2188///    A 128-bit vector of [4 x float].
2189/// \returns A 256-bit vector of [4 x double] containing the converted values.
2190static __inline __m256d __DEFAULT_FN_ATTRS
2191_mm256_cvtps_pd(__m128 __a)
2192{
2193  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2194}
2195
2196/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2197///    x i32], truncating the result by rounding towards zero when it is
2198///    inexact.
2199///
2200/// \headerfile <x86intrin.h>
2201///
2202/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2203///
2204/// \param __a
2205///    A 256-bit vector of [4 x double].
2206/// \returns A 128-bit integer vector containing the converted values.
2207static __inline __m128i __DEFAULT_FN_ATTRS
2208_mm256_cvttpd_epi32(__m256d __a)
2209{
2210  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2211}
2212
2213/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2214///    x i32]. When a conversion is inexact, the value returned is rounded
2215///    according to the rounding control bits in the MXCSR register.
2216///
2217/// \headerfile <x86intrin.h>
2218///
2219/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2220///
2221/// \param __a
2222///    A 256-bit vector of [4 x double].
2223/// \returns A 128-bit integer vector containing the converted values.
2224static __inline __m128i __DEFAULT_FN_ATTRS
2225_mm256_cvtpd_epi32(__m256d __a)
2226{
2227  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2228}
2229
2230/// Converts a vector of [8 x float] into a vector of [8 x i32],
2231///    truncating the result by rounding towards zero when it is inexact.
2232///
2233/// \headerfile <x86intrin.h>
2234///
2235/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2236///
2237/// \param __a
2238///    A 256-bit vector of [8 x float].
2239/// \returns A 256-bit integer vector containing the converted values.
2240static __inline __m256i __DEFAULT_FN_ATTRS
2241_mm256_cvttps_epi32(__m256 __a)
2242{
2243  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2244}
2245
2246/// Returns the first element of the input vector of [4 x double].
2247///
2248/// \headerfile <avxintrin.h>
2249///
2250/// This intrinsic is a utility function and does not correspond to a specific
2251///    instruction.
2252///
2253/// \param __a
2254///    A 256-bit vector of [4 x double].
2255/// \returns A 64 bit double containing the first element of the input vector.
2256static __inline double __DEFAULT_FN_ATTRS
2257_mm256_cvtsd_f64(__m256d __a)
2258{
2259 return __a[0];
2260}
2261
2262/// Returns the first element of the input vector of [8 x i32].
2263///
2264/// \headerfile <avxintrin.h>
2265///
2266/// This intrinsic is a utility function and does not correspond to a specific
2267///    instruction.
2268///
2269/// \param __a
2270///    A 256-bit vector of [8 x i32].
2271/// \returns A 32 bit integer containing the first element of the input vector.
2272static __inline int __DEFAULT_FN_ATTRS
2273_mm256_cvtsi256_si32(__m256i __a)
2274{
2275 __v8si __b = (__v8si)__a;
2276 return __b[0];
2277}
2278
2279/// Returns the first element of the input vector of [8 x float].
2280///
2281/// \headerfile <avxintrin.h>
2282///
2283/// This intrinsic is a utility function and does not correspond to a specific
2284///    instruction.
2285///
2286/// \param __a
2287///    A 256-bit vector of [8 x float].
2288/// \returns A 32 bit float containing the first element of the input vector.
2289static __inline float __DEFAULT_FN_ATTRS
2290_mm256_cvtss_f32(__m256 __a)
2291{
2292 return __a[0];
2293}
2294
2295/* Vector replicate */
2296/// Moves and duplicates odd-indexed values from a 256-bit vector of
2297///    [8 x float] to float values in a 256-bit vector of [8 x float].
2298///
2299/// \headerfile <x86intrin.h>
2300///
2301/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2302///
2303/// \param __a
2304///    A 256-bit vector of [8 x float]. \n
2305///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2306///    the return value. \n
2307///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2308///    the return value. \n
2309///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2310///    return value. \n
2311///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2312///    return value.
2313/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2314///    values.
2315static __inline __m256 __DEFAULT_FN_ATTRS
2316_mm256_movehdup_ps(__m256 __a)
2317{
2318  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2319}
2320
2321/// Moves and duplicates even-indexed values from a 256-bit vector of
2322///    [8 x float] to float values in a 256-bit vector of [8 x float].
2323///
2324/// \headerfile <x86intrin.h>
2325///
2326/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2327///
2328/// \param __a
2329///    A 256-bit vector of [8 x float]. \n
2330///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2331///    the return value. \n
2332///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2333///    the return value. \n
2334///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2335///    return value. \n
2336///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2337///    return value.
2338/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2339///    values.
2340static __inline __m256 __DEFAULT_FN_ATTRS
2341_mm256_moveldup_ps(__m256 __a)
2342{
2343  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2344}
2345
2346/// Moves and duplicates double-precision floating point values from a
2347///    256-bit vector of [4 x double] to double-precision values in a 256-bit
2348///    vector of [4 x double].
2349///
2350/// \headerfile <x86intrin.h>
2351///
2352/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2353///
2354/// \param __a
2355///    A 256-bit vector of [4 x double]. \n
2356///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2357///    return value. \n
2358///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2359///    the return value.
2360/// \returns A 256-bit vector of [4 x double] containing the moved and
2361///    duplicated values.
2362static __inline __m256d __DEFAULT_FN_ATTRS
2363_mm256_movedup_pd(__m256d __a)
2364{
2365  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2366}
2367
2368/* Unpack and Interleave */
2369/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2370///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2371///
2372/// \headerfile <x86intrin.h>
2373///
2374/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2375///
2376/// \param __a
2377///    A 256-bit floating-point vector of [4 x double]. \n
2378///    Bits [127:64] are written to bits [63:0] of the return value. \n
2379///    Bits [255:192] are written to bits [191:128] of the return value. \n
2380/// \param __b
2381///    A 256-bit floating-point vector of [4 x double]. \n
2382///    Bits [127:64] are written to bits [127:64] of the return value. \n
2383///    Bits [255:192] are written to bits [255:192] of the return value. \n
2384/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2385static __inline __m256d __DEFAULT_FN_ATTRS
2386_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2387{
2388  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2389}
2390
2391/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2392///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2393///
2394/// \headerfile <x86intrin.h>
2395///
2396/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2397///
2398/// \param __a
2399///    A 256-bit floating-point vector of [4 x double]. \n
2400///    Bits [63:0] are written to bits [63:0] of the return value. \n
2401///    Bits [191:128] are written to bits [191:128] of the return value.
2402/// \param __b
2403///    A 256-bit floating-point vector of [4 x double]. \n
2404///    Bits [63:0] are written to bits [127:64] of the return value. \n
2405///    Bits [191:128] are written to bits [255:192] of the return value. \n
2406/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2407static __inline __m256d __DEFAULT_FN_ATTRS
2408_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2409{
2410  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2411}
2412
2413/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2414///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2415///    vector of [8 x float].
2416///
2417/// \headerfile <x86intrin.h>
2418///
2419/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2420///
2421/// \param __a
2422///    A 256-bit vector of [8 x float]. \n
2423///    Bits [95:64] are written to bits [31:0] of the return value. \n
2424///    Bits [127:96] are written to bits [95:64] of the return value. \n
2425///    Bits [223:192] are written to bits [159:128] of the return value. \n
2426///    Bits [255:224] are written to bits [223:192] of the return value.
2427/// \param __b
2428///    A 256-bit vector of [8 x float]. \n
2429///    Bits [95:64] are written to bits [63:32] of the return value. \n
2430///    Bits [127:96] are written to bits [127:96] of the return value. \n
2431///    Bits [223:192] are written to bits [191:160] of the return value. \n
2432///    Bits [255:224] are written to bits [255:224] of the return value.
2433/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2434static __inline __m256 __DEFAULT_FN_ATTRS
2435_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2436{
2437  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2438}
2439
2440/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2441///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2442///    vector of [8 x float].
2443///
2444/// \headerfile <x86intrin.h>
2445///
2446/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2447///
2448/// \param __a
2449///    A 256-bit vector of [8 x float]. \n
2450///    Bits [31:0] are written to bits [31:0] of the return value. \n
2451///    Bits [63:32] are written to bits [95:64] of the return value. \n
2452///    Bits [159:128] are written to bits [159:128] of the return value. \n
2453///    Bits [191:160] are written to bits [223:192] of the return value.
2454/// \param __b
2455///    A 256-bit vector of [8 x float]. \n
2456///    Bits [31:0] are written to bits [63:32] of the return value. \n
2457///    Bits [63:32] are written to bits [127:96] of the return value. \n
2458///    Bits [159:128] are written to bits [191:160] of the return value. \n
2459///    Bits [191:160] are written to bits [255:224] of the return value.
2460/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2461static __inline __m256 __DEFAULT_FN_ATTRS
2462_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2463{
2464  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2465}
2466
2467/* Bit Test */
2468/// Given two 128-bit floating-point vectors of [2 x double], perform an
2469///    element-by-element comparison of the double-precision element in the
2470///    first source vector and the corresponding element in the second source
2471///    vector.
2472///
2473///    The EFLAGS register is updated as follows: \n
2474///    If there is at least one pair of double-precision elements where the
2475///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2476///    ZF flag is set to 1. \n
2477///    If there is at least one pair of double-precision elements where the
2478///    sign-bit of the first element is 0 and the sign-bit of the second element
2479///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2480///    This intrinsic returns the value of the ZF flag.
2481///
2482/// \headerfile <x86intrin.h>
2483///
2484/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2485///
2486/// \param __a
2487///    A 128-bit vector of [2 x double].
2488/// \param __b
2489///    A 128-bit vector of [2 x double].
2490/// \returns the ZF flag in the EFLAGS register.
2491static __inline int __DEFAULT_FN_ATTRS128
2492_mm_testz_pd(__m128d __a, __m128d __b)
2493{
2494  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2495}
2496
2497/// Given two 128-bit floating-point vectors of [2 x double], perform an
2498///    element-by-element comparison of the double-precision element in the
2499///    first source vector and the corresponding element in the second source
2500///    vector.
2501///
2502///    The EFLAGS register is updated as follows: \n
2503///    If there is at least one pair of double-precision elements where the
2504///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2505///    ZF flag is set to 1. \n
2506///    If there is at least one pair of double-precision elements where the
2507///    sign-bit of the first element is 0 and the sign-bit of the second element
2508///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2509///    This intrinsic returns the value of the CF flag.
2510///
2511/// \headerfile <x86intrin.h>
2512///
2513/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2514///
2515/// \param __a
2516///    A 128-bit vector of [2 x double].
2517/// \param __b
2518///    A 128-bit vector of [2 x double].
2519/// \returns the CF flag in the EFLAGS register.
2520static __inline int __DEFAULT_FN_ATTRS128
2521_mm_testc_pd(__m128d __a, __m128d __b)
2522{
2523  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2524}
2525
2526/// Given two 128-bit floating-point vectors of [2 x double], perform an
2527///    element-by-element comparison of the double-precision element in the
2528///    first source vector and the corresponding element in the second source
2529///    vector.
2530///
2531///    The EFLAGS register is updated as follows: \n
2532///    If there is at least one pair of double-precision elements where the
2533///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2534///    ZF flag is set to 1. \n
2535///    If there is at least one pair of double-precision elements where the
2536///    sign-bit of the first element is 0 and the sign-bit of the second element
2537///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2538///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2539///    otherwise it returns 0.
2540///
2541/// \headerfile <x86intrin.h>
2542///
2543/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2544///
2545/// \param __a
2546///    A 128-bit vector of [2 x double].
2547/// \param __b
2548///    A 128-bit vector of [2 x double].
2549/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2550static __inline int __DEFAULT_FN_ATTRS128
2551_mm_testnzc_pd(__m128d __a, __m128d __b)
2552{
2553  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2554}
2555
2556/// Given two 128-bit floating-point vectors of [4 x float], perform an
2557///    element-by-element comparison of the single-precision element in the
2558///    first source vector and the corresponding element in the second source
2559///    vector.
2560///
2561///    The EFLAGS register is updated as follows: \n
2562///    If there is at least one pair of single-precision elements where the
2563///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2564///    ZF flag is set to 1. \n
2565///    If there is at least one pair of single-precision elements where the
2566///    sign-bit of the first element is 0 and the sign-bit of the second element
2567///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2568///    This intrinsic returns the value of the ZF flag.
2569///
2570/// \headerfile <x86intrin.h>
2571///
2572/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2573///
2574/// \param __a
2575///    A 128-bit vector of [4 x float].
2576/// \param __b
2577///    A 128-bit vector of [4 x float].
2578/// \returns the ZF flag.
2579static __inline int __DEFAULT_FN_ATTRS128
2580_mm_testz_ps(__m128 __a, __m128 __b)
2581{
2582  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2583}
2584
2585/// Given two 128-bit floating-point vectors of [4 x float], perform an
2586///    element-by-element comparison of the single-precision element in the
2587///    first source vector and the corresponding element in the second source
2588///    vector.
2589///
2590///    The EFLAGS register is updated as follows: \n
2591///    If there is at least one pair of single-precision elements where the
2592///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2593///    ZF flag is set to 1. \n
2594///    If there is at least one pair of single-precision elements where the
2595///    sign-bit of the first element is 0 and the sign-bit of the second element
2596///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2597///    This intrinsic returns the value of the CF flag.
2598///
2599/// \headerfile <x86intrin.h>
2600///
2601/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2602///
2603/// \param __a
2604///    A 128-bit vector of [4 x float].
2605/// \param __b
2606///    A 128-bit vector of [4 x float].
2607/// \returns the CF flag.
2608static __inline int __DEFAULT_FN_ATTRS128
2609_mm_testc_ps(__m128 __a, __m128 __b)
2610{
2611  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2612}
2613
2614/// Given two 128-bit floating-point vectors of [4 x float], perform an
2615///    element-by-element comparison of the single-precision element in the
2616///    first source vector and the corresponding element in the second source
2617///    vector.
2618///
2619///    The EFLAGS register is updated as follows: \n
2620///    If there is at least one pair of single-precision elements where the
2621///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2622///    ZF flag is set to 1. \n
2623///    If there is at least one pair of single-precision elements where the
2624///    sign-bit of the first element is 0 and the sign-bit of the second element
2625///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2626///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2627///    otherwise it returns 0.
2628///
2629/// \headerfile <x86intrin.h>
2630///
2631/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2632///
2633/// \param __a
2634///    A 128-bit vector of [4 x float].
2635/// \param __b
2636///    A 128-bit vector of [4 x float].
2637/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2638static __inline int __DEFAULT_FN_ATTRS128
2639_mm_testnzc_ps(__m128 __a, __m128 __b)
2640{
2641  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2642}
2643
2644/// Given two 256-bit floating-point vectors of [4 x double], perform an
2645///    element-by-element comparison of the double-precision elements in the
2646///    first source vector and the corresponding elements in the second source
2647///    vector.
2648///
2649///    The EFLAGS register is updated as follows: \n
2650///    If there is at least one pair of double-precision elements where the
2651///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2652///    ZF flag is set to 1. \n
2653///    If there is at least one pair of double-precision elements where the
2654///    sign-bit of the first element is 0 and the sign-bit of the second element
2655///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2656///    This intrinsic returns the value of the ZF flag.
2657///
2658/// \headerfile <x86intrin.h>
2659///
2660/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2661///
2662/// \param __a
2663///    A 256-bit vector of [4 x double].
2664/// \param __b
2665///    A 256-bit vector of [4 x double].
2666/// \returns the ZF flag.
2667static __inline int __DEFAULT_FN_ATTRS
2668_mm256_testz_pd(__m256d __a, __m256d __b)
2669{
2670  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2671}
2672
2673/// Given two 256-bit floating-point vectors of [4 x double], perform an
2674///    element-by-element comparison of the double-precision elements in the
2675///    first source vector and the corresponding elements in the second source
2676///    vector.
2677///
2678///    The EFLAGS register is updated as follows: \n
2679///    If there is at least one pair of double-precision elements where the
2680///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2681///    ZF flag is set to 1. \n
2682///    If there is at least one pair of double-precision elements where the
2683///    sign-bit of the first element is 0 and the sign-bit of the second element
2684///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2685///    This intrinsic returns the value of the CF flag.
2686///
2687/// \headerfile <x86intrin.h>
2688///
2689/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2690///
2691/// \param __a
2692///    A 256-bit vector of [4 x double].
2693/// \param __b
2694///    A 256-bit vector of [4 x double].
2695/// \returns the CF flag.
2696static __inline int __DEFAULT_FN_ATTRS
2697_mm256_testc_pd(__m256d __a, __m256d __b)
2698{
2699  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2700}
2701
2702/// Given two 256-bit floating-point vectors of [4 x double], perform an
2703///    element-by-element comparison of the double-precision elements in the
2704///    first source vector and the corresponding elements in the second source
2705///    vector.
2706///
2707///    The EFLAGS register is updated as follows: \n
2708///    If there is at least one pair of double-precision elements where the
2709///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2710///    ZF flag is set to 1. \n
2711///    If there is at least one pair of double-precision elements where the
2712///    sign-bit of the first element is 0 and the sign-bit of the second element
2713///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2714///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2715///    otherwise it returns 0.
2716///
2717/// \headerfile <x86intrin.h>
2718///
2719/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2720///
2721/// \param __a
2722///    A 256-bit vector of [4 x double].
2723/// \param __b
2724///    A 256-bit vector of [4 x double].
2725/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2726static __inline int __DEFAULT_FN_ATTRS
2727_mm256_testnzc_pd(__m256d __a, __m256d __b)
2728{
2729  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2730}
2731
2732/// Given two 256-bit floating-point vectors of [8 x float], perform an
2733///    element-by-element comparison of the single-precision element in the
2734///    first source vector and the corresponding element in the second source
2735///    vector.
2736///
2737///    The EFLAGS register is updated as follows: \n
2738///    If there is at least one pair of single-precision elements where the
2739///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2740///    ZF flag is set to 1. \n
2741///    If there is at least one pair of single-precision elements where the
2742///    sign-bit of the first element is 0 and the sign-bit of the second element
2743///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2744///    This intrinsic returns the value of the ZF flag.
2745///
2746/// \headerfile <x86intrin.h>
2747///
2748/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2749///
2750/// \param __a
2751///    A 256-bit vector of [8 x float].
2752/// \param __b
2753///    A 256-bit vector of [8 x float].
2754/// \returns the ZF flag.
2755static __inline int __DEFAULT_FN_ATTRS
2756_mm256_testz_ps(__m256 __a, __m256 __b)
2757{
2758  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2759}
2760
2761/// Given two 256-bit floating-point vectors of [8 x float], perform an
2762///    element-by-element comparison of the single-precision element in the
2763///    first source vector and the corresponding element in the second source
2764///    vector.
2765///
2766///    The EFLAGS register is updated as follows: \n
2767///    If there is at least one pair of single-precision elements where the
2768///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2769///    ZF flag is set to 1. \n
2770///    If there is at least one pair of single-precision elements where the
2771///    sign-bit of the first element is 0 and the sign-bit of the second element
2772///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2773///    This intrinsic returns the value of the CF flag.
2774///
2775/// \headerfile <x86intrin.h>
2776///
2777/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2778///
2779/// \param __a
2780///    A 256-bit vector of [8 x float].
2781/// \param __b
2782///    A 256-bit vector of [8 x float].
2783/// \returns the CF flag.
2784static __inline int __DEFAULT_FN_ATTRS
2785_mm256_testc_ps(__m256 __a, __m256 __b)
2786{
2787  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2788}
2789
2790/// Given two 256-bit floating-point vectors of [8 x float], perform an
2791///    element-by-element comparison of the single-precision elements in the
2792///    first source vector and the corresponding elements in the second source
2793///    vector.
2794///
2795///    The EFLAGS register is updated as follows: \n
2796///    If there is at least one pair of single-precision elements where the
2797///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2798///    ZF flag is set to 1. \n
2799///    If there is at least one pair of single-precision elements where the
2800///    sign-bit of the first element is 0 and the sign-bit of the second element
2801///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2802///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2803///    otherwise it returns 0.
2804///
2805/// \headerfile <x86intrin.h>
2806///
2807/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2808///
2809/// \param __a
2810///    A 256-bit vector of [8 x float].
2811/// \param __b
2812///    A 256-bit vector of [8 x float].
2813/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2814static __inline int __DEFAULT_FN_ATTRS
2815_mm256_testnzc_ps(__m256 __a, __m256 __b)
2816{
2817  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2818}
2819
2820/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2821///    of the two source vectors.
2822///
2823///    The EFLAGS register is updated as follows: \n
2824///    If there is at least one pair of bits where both bits are 1, the ZF flag
2825///    is set to 0. Otherwise the ZF flag is set to 1. \n
2826///    If there is at least one pair of bits where the bit from the first source
2827///    vector is 0 and the bit from the second source vector is 1, the CF flag
2828///    is set to 0. Otherwise the CF flag is set to 1. \n
2829///    This intrinsic returns the value of the ZF flag.
2830///
2831/// \headerfile <x86intrin.h>
2832///
2833/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2834///
2835/// \param __a
2836///    A 256-bit integer vector.
2837/// \param __b
2838///    A 256-bit integer vector.
2839/// \returns the ZF flag.
2840static __inline int __DEFAULT_FN_ATTRS
2841_mm256_testz_si256(__m256i __a, __m256i __b)
2842{
2843  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2844}
2845
2846/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2847///    of the two source vectors.
2848///
2849///    The EFLAGS register is updated as follows: \n
2850///    If there is at least one pair of bits where both bits are 1, the ZF flag
2851///    is set to 0. Otherwise the ZF flag is set to 1. \n
2852///    If there is at least one pair of bits where the bit from the first source
2853///    vector is 0 and the bit from the second source vector is 1, the CF flag
2854///    is set to 0. Otherwise the CF flag is set to 1. \n
2855///    This intrinsic returns the value of the CF flag.
2856///
2857/// \headerfile <x86intrin.h>
2858///
2859/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2860///
2861/// \param __a
2862///    A 256-bit integer vector.
2863/// \param __b
2864///    A 256-bit integer vector.
2865/// \returns the CF flag.
2866static __inline int __DEFAULT_FN_ATTRS
2867_mm256_testc_si256(__m256i __a, __m256i __b)
2868{
2869  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2870}
2871
2872/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2873///    of the two source vectors.
2874///
2875///    The EFLAGS register is updated as follows: \n
2876///    If there is at least one pair of bits where both bits are 1, the ZF flag
2877///    is set to 0. Otherwise the ZF flag is set to 1. \n
2878///    If there is at least one pair of bits where the bit from the first source
2879///    vector is 0 and the bit from the second source vector is 1, the CF flag
2880///    is set to 0. Otherwise the CF flag is set to 1. \n
2881///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2882///    otherwise it returns 0.
2883///
2884/// \headerfile <x86intrin.h>
2885///
2886/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2887///
2888/// \param __a
2889///    A 256-bit integer vector.
2890/// \param __b
2891///    A 256-bit integer vector.
2892/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2893static __inline int __DEFAULT_FN_ATTRS
2894_mm256_testnzc_si256(__m256i __a, __m256i __b)
2895{
2896  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2897}
2898
2899/* Vector extract sign mask */
2900/// Extracts the sign bits of double-precision floating point elements
2901///    in a 256-bit vector of [4 x double] and writes them to the lower order
2902///    bits of the return value.
2903///
2904/// \headerfile <x86intrin.h>
2905///
2906/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2907///
2908/// \param __a
2909///    A 256-bit vector of [4 x double] containing the double-precision
2910///    floating point values with sign bits to be extracted.
2911/// \returns The sign bits from the operand, written to bits [3:0].
2912static __inline int __DEFAULT_FN_ATTRS
2913_mm256_movemask_pd(__m256d __a)
2914{
2915  return __builtin_ia32_movmskpd256((__v4df)__a);
2916}
2917
2918/// Extracts the sign bits of single-precision floating point elements
2919///    in a 256-bit vector of [8 x float] and writes them to the lower order
2920///    bits of the return value.
2921///
2922/// \headerfile <x86intrin.h>
2923///
2924/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2925///
2926/// \param __a
2927///    A 256-bit vector of [8 x float] containing the single-precision floating
2928///    point values with sign bits to be extracted.
2929/// \returns The sign bits from the operand, written to bits [7:0].
2930static __inline int __DEFAULT_FN_ATTRS
2931_mm256_movemask_ps(__m256 __a)
2932{
2933  return __builtin_ia32_movmskps256((__v8sf)__a);
2934}
2935
/* Vector zero */
2937/// Zeroes the contents of all XMM or YMM registers.
2938///
2939/// \headerfile <x86intrin.h>
2940///
2941/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  /* The attribute list is spelled out instead of using __DEFAULT_FN_ATTRS:
   * it is the same list without __min_vector_width__(256). */
  __builtin_ia32_vzeroall();
}
2947
2948/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
2949///
2950/// \headerfile <x86intrin.h>
2951///
2952/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  /* The attribute list is spelled out instead of using __DEFAULT_FN_ATTRS:
   * it is the same list without __min_vector_width__(256). */
  __builtin_ia32_vzeroupper();
}
2958
2959/* Vector load with broadcast */
2960/// Loads a scalar single-precision floating point value from the
2961///    specified address pointed to by \a __a and broadcasts it to the elements
2962///    of a [4 x float] vector.
2963///
2964/// \headerfile <x86intrin.h>
2965///
2966/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2967///
2968/// \param __a
///    A pointer to the single-precision floating point value to be broadcast.
2970/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2971///    equal to the broadcast value.
2972static __inline __m128 __DEFAULT_FN_ATTRS128
2973_mm_broadcast_ss(float const *__a)
2974{
2975  float __f = *__a;
2976  return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
2977}
2978
2979/// Loads a scalar double-precision floating point value from the
2980///    specified address pointed to by \a __a and broadcasts it to the elements
2981///    of a [4 x double] vector.
2982///
2983/// \headerfile <x86intrin.h>
2984///
2985/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
2986///
2987/// \param __a
///    A pointer to the double-precision floating point value to be broadcast.
2989/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
2990///    equal to the broadcast value.
2991static __inline __m256d __DEFAULT_FN_ATTRS
2992_mm256_broadcast_sd(double const *__a)
2993{
2994  double __d = *__a;
2995  return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
2996}
2997
2998/// Loads a scalar single-precision floating point value from the
2999///    specified address pointed to by \a __a and broadcasts it to the elements
///    of an [8 x float] vector.
3001///
3002/// \headerfile <x86intrin.h>
3003///
3004/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3005///
3006/// \param __a
///    A pointer to the single-precision floating point value to be broadcast.
3008/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3009///    equal to the broadcast value.
3010static __inline __m256 __DEFAULT_FN_ATTRS
3011_mm256_broadcast_ss(float const *__a)
3012{
3013  float __f = *__a;
3014  return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3015}
3016
3017/// Loads the data from a 128-bit vector of [2 x double] from the
3018///    specified address pointed to by \a __a and broadcasts it to 128-bit
3019///    elements in a 256-bit vector of [4 x double].
3020///
3021/// \headerfile <x86intrin.h>
3022///
3023/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3024///
3025/// \param __a
///    A pointer to the 128-bit vector of [2 x double] to be broadcast.
3027/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3028///    equal to the broadcast value.
3029static __inline __m256d __DEFAULT_FN_ATTRS
3030_mm256_broadcast_pd(__m128d const *__a)
3031{
3032  __m128d __b = _mm_loadu_pd((const double *)__a);
3033  return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3034                                          0, 1, 0, 1);
3035}
3036
3037/// Loads the data from a 128-bit vector of [4 x float] from the
3038///    specified address pointed to by \a __a and broadcasts it to 128-bit
3039///    elements in a 256-bit vector of [8 x float].
3040///
3041/// \headerfile <x86intrin.h>
3042///
3043/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3044///
3045/// \param __a
///    A pointer to the 128-bit vector of [4 x float] to be broadcast.
3047/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3048///    equal to the broadcast value.
3049static __inline __m256 __DEFAULT_FN_ATTRS
3050_mm256_broadcast_ps(__m128 const *__a)
3051{
3052  __m128 __b = _mm_loadu_ps((const float *)__a);
3053  return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3054                                         0, 1, 2, 3, 0, 1, 2, 3);
3055}
3056
3057/* SIMD load ops */
3058/// Loads 4 double-precision floating point values from a 32-byte aligned
3059///    memory location pointed to by \a __p into a vector of [4 x double].
3060///
3061/// \headerfile <x86intrin.h>
3062///
3063/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3064///
3065/// \param __p
3066///    A 32-byte aligned pointer to a memory location containing
3067///    double-precision floating point values.
3068/// \returns A 256-bit vector of [4 x double] containing the moved values.
3069static __inline __m256d __DEFAULT_FN_ATTRS
3070_mm256_load_pd(double const *__p)
3071{
3072  return *(const __m256d *)__p;
3073}
3074
3075/// Loads 8 single-precision floating point values from a 32-byte aligned
3076///    memory location pointed to by \a __p into a vector of [8 x float].
3077///
3078/// \headerfile <x86intrin.h>
3079///
3080/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3081///
3082/// \param __p
3083///    A 32-byte aligned pointer to a memory location containing float values.
3084/// \returns A 256-bit vector of [8 x float] containing the moved values.
3085static __inline __m256 __DEFAULT_FN_ATTRS
3086_mm256_load_ps(float const *__p)
3087{
3088  return *(const __m256 *)__p;
3089}
3090
3091/// Loads 4 double-precision floating point values from an unaligned
3092///    memory location pointed to by \a __p into a vector of [4 x double].
3093///
3094/// \headerfile <x86intrin.h>
3095///
3096/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3097///
3098/// \param __p
3099///    A pointer to a memory location containing double-precision floating
3100///    point values.
3101/// \returns A 256-bit vector of [4 x double] containing the moved values.
3102static __inline __m256d __DEFAULT_FN_ATTRS
3103_mm256_loadu_pd(double const *__p)
3104{
3105  struct __loadu_pd {
3106    __m256d_u __v;
3107  } __attribute__((__packed__, __may_alias__));
3108  return ((const struct __loadu_pd*)__p)->__v;
3109}
3110
3111/// Loads 8 single-precision floating point values from an unaligned
3112///    memory location pointed to by \a __p into a vector of [8 x float].
3113///
3114/// \headerfile <x86intrin.h>
3115///
3116/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3117///
3118/// \param __p
3119///    A pointer to a memory location containing single-precision floating
3120///    point values.
3121/// \returns A 256-bit vector of [8 x float] containing the moved values.
3122static __inline __m256 __DEFAULT_FN_ATTRS
3123_mm256_loadu_ps(float const *__p)
3124{
3125  struct __loadu_ps {
3126    __m256_u __v;
3127  } __attribute__((__packed__, __may_alias__));
3128  return ((const struct __loadu_ps*)__p)->__v;
3129}
3130
3131/// Loads 256 bits of integer data from a 32-byte aligned memory
3132///    location pointed to by \a __p into elements of a 256-bit integer vector.
3133///
3134/// \headerfile <x86intrin.h>
3135///
3136/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3137///
3138/// \param __p
3139///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
3140///    values.
3141/// \returns A 256-bit integer vector containing the moved values.
3142static __inline __m256i __DEFAULT_FN_ATTRS
3143_mm256_load_si256(__m256i const *__p)
3144{
3145  return *__p;
3146}
3147
3148/// Loads 256 bits of integer data from an unaligned memory location
3149///    pointed to by \a __p into a 256-bit integer vector.
3150///
3151/// \headerfile <x86intrin.h>
3152///
3153/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3154///
3155/// \param __p
3156///    A pointer to a 256-bit integer vector containing integer values.
3157/// \returns A 256-bit integer vector containing the moved values.
3158static __inline __m256i __DEFAULT_FN_ATTRS
3159_mm256_loadu_si256(__m256i_u const *__p)
3160{
3161  struct __loadu_si256 {
3162    __m256i_u __v;
3163  } __attribute__((__packed__, __may_alias__));
3164  return ((const struct __loadu_si256*)__p)->__v;
3165}
3166
3167/// Loads 256 bits of integer data from an unaligned memory location
3168///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3169///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
3170///    line boundary.
3171///
3172/// \headerfile <x86intrin.h>
3173///
3174/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3175///
3176/// \param __p
3177///    A pointer to a 256-bit integer vector containing integer values.
3178/// \returns A 256-bit integer vector containing the moved values.
3179static __inline __m256i __DEFAULT_FN_ATTRS
3180_mm256_lddqu_si256(__m256i const *__p)
3181{
3182  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3183}
3184
3185/* SIMD store ops */
3186/// Stores double-precision floating point values from a 256-bit vector
3187///    of [4 x double] to a 32-byte aligned memory location pointed to by
3188///    \a __p.
3189///
3190/// \headerfile <x86intrin.h>
3191///
3192/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3193///
3194/// \param __p
3195///    A 32-byte aligned pointer to a memory location that will receive the
///    double-precision floating point values.
3197/// \param __a
3198///    A 256-bit vector of [4 x double] containing the values to be moved.
3199static __inline void __DEFAULT_FN_ATTRS
3200_mm256_store_pd(double *__p, __m256d __a)
3201{
3202  *(__m256d *)__p = __a;
3203}
3204
3205/// Stores single-precision floating point values from a 256-bit vector
3206///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3207///
3208/// \headerfile <x86intrin.h>
3209///
3210/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3211///
3212/// \param __p
3213///    A 32-byte aligned pointer to a memory location that will receive the
3214///    float values.
3215/// \param __a
3216///    A 256-bit vector of [8 x float] containing the values to be moved.
3217static __inline void __DEFAULT_FN_ATTRS
3218_mm256_store_ps(float *__p, __m256 __a)
3219{
3220  *(__m256 *)__p = __a;
3221}
3222
3223/// Stores double-precision floating point values from a 256-bit vector
3224///    of [4 x double] to an unaligned memory location pointed to by \a __p.
3225///
3226/// \headerfile <x86intrin.h>
3227///
3228/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3229///
3230/// \param __p
3231///    A pointer to a memory location that will receive the double-precision
3232///    floating point values.
3233/// \param __a
3234///    A 256-bit vector of [4 x double] containing the values to be moved.
3235static __inline void __DEFAULT_FN_ATTRS
3236_mm256_storeu_pd(double *__p, __m256d __a)
3237{
3238  struct __storeu_pd {
3239    __m256d_u __v;
3240  } __attribute__((__packed__, __may_alias__));
3241  ((struct __storeu_pd*)__p)->__v = __a;
3242}
3243
3244/// Stores single-precision floating point values from a 256-bit vector
3245///    of [8 x float] to an unaligned memory location pointed to by \a __p.
3246///
3247/// \headerfile <x86intrin.h>
3248///
3249/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3250///
3251/// \param __p
3252///    A pointer to a memory location that will receive the float values.
3253/// \param __a
3254///    A 256-bit vector of [8 x float] containing the values to be moved.
3255static __inline void __DEFAULT_FN_ATTRS
3256_mm256_storeu_ps(float *__p, __m256 __a)
3257{
3258  struct __storeu_ps {
3259    __m256_u __v;
3260  } __attribute__((__packed__, __may_alias__));
3261  ((struct __storeu_ps*)__p)->__v = __a;
3262}
3263
3264/// Stores integer values from a 256-bit integer vector to a 32-byte
3265///    aligned memory location pointed to by \a __p.
3266///
3267/// \headerfile <x86intrin.h>
3268///
3269/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3270///
3271/// \param __p
3272///    A 32-byte aligned pointer to a memory location that will receive the
3273///    integer values.
3274/// \param __a
3275///    A 256-bit integer vector containing the values to be moved.
3276static __inline void __DEFAULT_FN_ATTRS
3277_mm256_store_si256(__m256i *__p, __m256i __a)
3278{
3279  *__p = __a;
3280}
3281
3282/// Stores integer values from a 256-bit integer vector to an unaligned
3283///    memory location pointed to by \a __p.
3284///
3285/// \headerfile <x86intrin.h>
3286///
3287/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3288///
3289/// \param __p
3290///    A pointer to a memory location that will receive the integer values.
3291/// \param __a
3292///    A 256-bit integer vector containing the values to be moved.
3293static __inline void __DEFAULT_FN_ATTRS
3294_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3295{
3296  struct __storeu_si256 {
3297    __m256i_u __v;
3298  } __attribute__((__packed__, __may_alias__));
3299  ((struct __storeu_si256*)__p)->__v = __a;
3300}
3301
3302/* Conditional load ops */
3303/// Conditionally loads double-precision floating point elements from a
3304///    memory location pointed to by \a __p into a 128-bit vector of
3305///    [2 x double], depending on the mask bits associated with each data
3306///    element.
3307///
3308/// \headerfile <x86intrin.h>
3309///
3310/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3311///
3312/// \param __p
3313///    A pointer to a memory location that contains the double-precision
3314///    floating point values.
3315/// \param __m
3316///    A 128-bit integer vector containing the mask. The most significant bit of
3317///    each data element represents the mask bits. If a mask bit is zero, the
3318///    corresponding value in the memory location is not loaded and the
3319///    corresponding field in the return value is set to zero.
3320/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3321static __inline __m128d __DEFAULT_FN_ATTRS128
3322_mm_maskload_pd(double const *__p, __m128i __m)
3323{
3324  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3325}
3326
3327/// Conditionally loads double-precision floating point elements from a
3328///    memory location pointed to by \a __p into a 256-bit vector of
3329///    [4 x double], depending on the mask bits associated with each data
3330///    element.
3331///
3332/// \headerfile <x86intrin.h>
3333///
3334/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3335///
3336/// \param __p
3337///    A pointer to a memory location that contains the double-precision
3338///    floating point values.
3339/// \param __m
3340///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3341///    significant bit of each quadword element represents the mask bits. If a
3342///    mask bit is zero, the corresponding value in the memory location is not
3343///    loaded and the corresponding field in the return value is set to zero.
3344/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3345static __inline __m256d __DEFAULT_FN_ATTRS
3346_mm256_maskload_pd(double const *__p, __m256i __m)
3347{
3348  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3349                                               (__v4di)__m);
3350}
3351
3352/// Conditionally loads single-precision floating point elements from a
3353///    memory location pointed to by \a __p into a 128-bit vector of
3354///    [4 x float], depending on the mask bits associated with each data
3355///    element.
3356///
3357/// \headerfile <x86intrin.h>
3358///
3359/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3360///
3361/// \param __p
3362///    A pointer to a memory location that contains the single-precision
3363///    floating point values.
3364/// \param __m
3365///    A 128-bit integer vector containing the mask. The most significant bit of
3366///    each data element represents the mask bits. If a mask bit is zero, the
3367///    corresponding value in the memory location is not loaded and the
3368///    corresponding field in the return value is set to zero.
3369/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3370static __inline __m128 __DEFAULT_FN_ATTRS128
3371_mm_maskload_ps(float const *__p, __m128i __m)
3372{
3373  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3374}
3375
3376/// Conditionally loads single-precision floating point elements from a
3377///    memory location pointed to by \a __p into a 256-bit vector of
3378///    [8 x float], depending on the mask bits associated with each data
3379///    element.
3380///
3381/// \headerfile <x86intrin.h>
3382///
3383/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3384///
3385/// \param __p
3386///    A pointer to a memory location that contains the single-precision
3387///    floating point values.
3388/// \param __m
3389///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3390///    significant bit of each dword element represents the mask bits. If a mask
3391///    bit is zero, the corresponding value in the memory location is not loaded
3392///    and the corresponding field in the return value is set to zero.
3393/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3394static __inline __m256 __DEFAULT_FN_ATTRS
3395_mm256_maskload_ps(float const *__p, __m256i __m)
3396{
3397  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3398}
3399
3400/* Conditional store ops */
3401/// Moves single-precision floating point values from a 256-bit vector
3402///    of [8 x float] to a memory location pointed to by \a __p, according to
3403///    the specified mask.
3404///
3405/// \headerfile <x86intrin.h>
3406///
3407/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3408///
3409/// \param __p
3410///    A pointer to a memory location that will receive the float values.
3411/// \param __m
3412///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3413///    significant bit of each dword element in the mask vector represents the
3414///    mask bits. If a mask bit is zero, the corresponding value from vector
3415///    \a __a is not stored and the corresponding field in the memory location
3416///    pointed to by \a __p is not changed.
3417/// \param __a
3418///    A 256-bit vector of [8 x float] containing the values to be stored.
3419static __inline void __DEFAULT_FN_ATTRS
3420_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3421{
3422  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3423}
3424
3425/// Moves double-precision values from a 128-bit vector of [2 x double]
3426///    to a memory location pointed to by \a __p, according to the specified
3427///    mask.
3428///
3429/// \headerfile <x86intrin.h>
3430///
3431/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3432///
3433/// \param __p
///    A pointer to a memory location that will receive the double-precision
///    floating point values.
3435/// \param __m
3436///    A 128-bit integer vector containing the mask. The most significant bit of
3437///    each field in the mask vector represents the mask bits. If a mask bit is
3438///    zero, the corresponding value from vector \a __a is not stored and the
3439///    corresponding field in the memory location pointed to by \a __p is not
3440///    changed.
3441/// \param __a
3442///    A 128-bit vector of [2 x double] containing the values to be stored.
3443static __inline void __DEFAULT_FN_ATTRS128
3444_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3445{
3446  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3447}
3448
3449/// Moves double-precision values from a 256-bit vector of [4 x double]
3450///    to a memory location pointed to by \a __p, according to the specified
3451///    mask.
3452///
3453/// \headerfile <x86intrin.h>
3454///
3455/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3456///
3457/// \param __p
///    A pointer to a memory location that will receive the double-precision
///    floating point values.
3459/// \param __m
3460///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3461///    significant bit of each quadword element in the mask vector represents
3462///    the mask bits. If a mask bit is zero, the corresponding value from vector
///    \a __a is not stored and the corresponding field in the memory location
3464///    pointed to by \a __p is not changed.
3465/// \param __a
3466///    A 256-bit vector of [4 x double] containing the values to be stored.
3467static __inline void __DEFAULT_FN_ATTRS
3468_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3469{
3470  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3471}
3472
3473/// Moves single-precision floating point values from a 128-bit vector
3474///    of [4 x float] to a memory location pointed to by \a __p, according to
3475///    the specified mask.
3476///
3477/// \headerfile <x86intrin.h>
3478///
3479/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3480///
3481/// \param __p
3482///    A pointer to a memory location that will receive the float values.
3483/// \param __m
3484///    A 128-bit integer vector containing the mask. The most significant bit of
3485///    each field in the mask vector represents the mask bits. If a mask bit is
///    zero, the corresponding value from vector \a __a is not stored and the
3487///    corresponding field in the memory location pointed to by \a __p is not
3488///    changed.
3489/// \param __a
3490///    A 128-bit vector of [4 x float] containing the values to be stored.
3491static __inline void __DEFAULT_FN_ATTRS128
3492_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3493{
3494  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3495}
3496
3497/* Cacheability support ops */
3498/// Moves integer data from a 256-bit integer vector to a 32-byte
3499///    aligned memory location. To minimize caching, the data is flagged as
3500///    non-temporal (unlikely to be used again soon).
3501///
3502/// \headerfile <x86intrin.h>
3503///
3504/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3505///
3506/// \param __a
3507///    A pointer to a 32-byte aligned memory location that will receive the
3508///    integer values.
3509/// \param __b
3510///    A 256-bit integer vector containing the values to be moved.
3511static __inline void __DEFAULT_FN_ATTRS
3512_mm256_stream_si256(__m256i *__a, __m256i __b)
3513{
3514  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3515  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3516}
3517
3518/// Moves double-precision values from a 256-bit vector of [4 x double]
3519///    to a 32-byte aligned memory location. To minimize caching, the data is
3520///    flagged as non-temporal (unlikely to be used again soon).
3521///
3522/// \headerfile <x86intrin.h>
3523///
3524/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3525///
3526/// \param __a
3527///    A pointer to a 32-byte aligned memory location that will receive the
3528///    double-precision floating-point values.
3529/// \param __b
3530///    A 256-bit vector of [4 x double] containing the values to be moved.
3531static __inline void __DEFAULT_FN_ATTRS
3532_mm256_stream_pd(double *__a, __m256d __b)
3533{
3534  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3535  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3536}
3537
3538/// Moves single-precision floating point values from a 256-bit vector
3539///    of [8 x float] to a 32-byte aligned memory location. To minimize
3540///    caching, the data is flagged as non-temporal (unlikely to be used again
3541///    soon).
3542///
3543/// \headerfile <x86intrin.h>
3544///
3545/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3546///
3547/// \param __p
3548///    A pointer to a 32-byte aligned memory location that will receive the
3549///    single-precision floating point values.
3550/// \param __a
3551///    A 256-bit vector of [8 x float] containing the values to be moved.
3552static __inline void __DEFAULT_FN_ATTRS
3553_mm256_stream_ps(float *__p, __m256 __a)
3554{
3555  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3556  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3557}
3558
3559/* Create vectors */
3560/// Create a 256-bit vector of [4 x double] with undefined values.
3561///
3562/// \headerfile <x86intrin.h>
3563///
3564/// This intrinsic has no corresponding instruction.
3565///
3566/// \returns A 256-bit vector of [4 x double] containing undefined values.
3567static __inline__ __m256d __DEFAULT_FN_ATTRS
3568_mm256_undefined_pd(void)
3569{
3570  return (__m256d)__builtin_ia32_undef256();
3571}
3572
3573/// Create a 256-bit vector of [8 x float] with undefined values.
3574///
3575/// \headerfile <x86intrin.h>
3576///
3577/// This intrinsic has no corresponding instruction.
3578///
3579/// \returns A 256-bit vector of [8 x float] containing undefined values.
3580static __inline__ __m256 __DEFAULT_FN_ATTRS
3581_mm256_undefined_ps(void)
3582{
3583  return (__m256)__builtin_ia32_undef256();
3584}
3585
3586/// Create a 256-bit integer vector with undefined values.
3587///
3588/// \headerfile <x86intrin.h>
3589///
3590/// This intrinsic has no corresponding instruction.
3591///
3592/// \returns A 256-bit integer vector containing undefined values.
3593static __inline__ __m256i __DEFAULT_FN_ATTRS
3594_mm256_undefined_si256(void)
3595{
3596  return (__m256i)__builtin_ia32_undef256();
3597}
3598
3599/// Constructs a 256-bit floating-point vector of [4 x double]
3600///    initialized with the specified double-precision floating-point values.
3601///
3602/// \headerfile <x86intrin.h>
3603///
3604/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3605///   instruction.
3606///
3607/// \param __a
3608///    A double-precision floating-point value used to initialize bits [255:192]
3609///    of the result.
3610/// \param __b
3611///    A double-precision floating-point value used to initialize bits [191:128]
3612///    of the result.
3613/// \param __c
3614///    A double-precision floating-point value used to initialize bits [127:64]
3615///    of the result.
3616/// \param __d
3617///    A double-precision floating-point value used to initialize bits [63:0]
3618///    of the result.
3619/// \returns An initialized 256-bit floating-point vector of [4 x double].
3620static __inline __m256d __DEFAULT_FN_ATTRS
3621_mm256_set_pd(double __a, double __b, double __c, double __d)
3622{
3623  return __extension__ (__m256d){ __d, __c, __b, __a };
3624}
3625
3626/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3627///    with the specified single-precision floating-point values.
3628///
3629/// \headerfile <x86intrin.h>
3630///
3631/// This intrinsic is a utility function and does not correspond to a specific
3632///   instruction.
3633///
3634/// \param __a
3635///    A single-precision floating-point value used to initialize bits [255:224]
3636///    of the result.
3637/// \param __b
3638///    A single-precision floating-point value used to initialize bits [223:192]
3639///    of the result.
3640/// \param __c
3641///    A single-precision floating-point value used to initialize bits [191:160]
3642///    of the result.
3643/// \param __d
3644///    A single-precision floating-point value used to initialize bits [159:128]
3645///    of the result.
3646/// \param __e
3647///    A single-precision floating-point value used to initialize bits [127:96]
3648///    of the result.
3649/// \param __f
3650///    A single-precision floating-point value used to initialize bits [95:64]
3651///    of the result.
3652/// \param __g
3653///    A single-precision floating-point value used to initialize bits [63:32]
3654///    of the result.
3655/// \param __h
3656///    A single-precision floating-point value used to initialize bits [31:0]
3657///    of the result.
3658/// \returns An initialized 256-bit floating-point vector of [8 x float].
3659static __inline __m256 __DEFAULT_FN_ATTRS
3660_mm256_set_ps(float __a, float __b, float __c, float __d,
3661              float __e, float __f, float __g, float __h)
3662{
3663  return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3664}
3665
3666/// Constructs a 256-bit integer vector initialized with the specified
3667///    32-bit integral values.
3668///
3669/// \headerfile <x86intrin.h>
3670///
3671/// This intrinsic is a utility function and does not correspond to a specific
3672///   instruction.
3673///
3674/// \param __i0
3675///    A 32-bit integral value used to initialize bits [255:224] of the result.
3676/// \param __i1
3677///    A 32-bit integral value used to initialize bits [223:192] of the result.
3678/// \param __i2
3679///    A 32-bit integral value used to initialize bits [191:160] of the result.
3680/// \param __i3
3681///    A 32-bit integral value used to initialize bits [159:128] of the result.
3682/// \param __i4
3683///    A 32-bit integral value used to initialize bits [127:96] of the result.
3684/// \param __i5
3685///    A 32-bit integral value used to initialize bits [95:64] of the result.
3686/// \param __i6
3687///    A 32-bit integral value used to initialize bits [63:32] of the result.
3688/// \param __i7
3689///    A 32-bit integral value used to initialize bits [31:0] of the result.
3690/// \returns An initialized 256-bit integer vector.
3691static __inline __m256i __DEFAULT_FN_ATTRS
3692_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3693                 int __i4, int __i5, int __i6, int __i7)
3694{
3695  return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3696}
3697
3698/// Constructs a 256-bit integer vector initialized with the specified
3699///    16-bit integral values.
3700///
3701/// \headerfile <x86intrin.h>
3702///
3703/// This intrinsic is a utility function and does not correspond to a specific
3704///   instruction.
3705///
3706/// \param __w15
3707///    A 16-bit integral value used to initialize bits [255:240] of the result.
3708/// \param __w14
3709///    A 16-bit integral value used to initialize bits [239:224] of the result.
3710/// \param __w13
3711///    A 16-bit integral value used to initialize bits [223:208] of the result.
3712/// \param __w12
3713///    A 16-bit integral value used to initialize bits [207:192] of the result.
3714/// \param __w11
3715///    A 16-bit integral value used to initialize bits [191:176] of the result.
3716/// \param __w10
3717///    A 16-bit integral value used to initialize bits [175:160] of the result.
3718/// \param __w09
3719///    A 16-bit integral value used to initialize bits [159:144] of the result.
3720/// \param __w08
3721///    A 16-bit integral value used to initialize bits [143:128] of the result.
3722/// \param __w07
3723///    A 16-bit integral value used to initialize bits [127:112] of the result.
3724/// \param __w06
3725///    A 16-bit integral value used to initialize bits [111:96] of the result.
3726/// \param __w05
3727///    A 16-bit integral value used to initialize bits [95:80] of the result.
3728/// \param __w04
3729///    A 16-bit integral value used to initialize bits [79:64] of the result.
3730/// \param __w03
3731///    A 16-bit integral value used to initialize bits [63:48] of the result.
3732/// \param __w02
3733///    A 16-bit integral value used to initialize bits [47:32] of the result.
3734/// \param __w01
3735///    A 16-bit integral value used to initialize bits [31:16] of the result.
3736/// \param __w00
3737///    A 16-bit integral value used to initialize bits [15:0] of the result.
3738/// \returns An initialized 256-bit integer vector.
3739static __inline __m256i __DEFAULT_FN_ATTRS
3740_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3741                 short __w11, short __w10, short __w09, short __w08,
3742                 short __w07, short __w06, short __w05, short __w04,
3743                 short __w03, short __w02, short __w01, short __w00)
3744{
3745  return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3746    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3747}
3748
3749/// Constructs a 256-bit integer vector initialized with the specified
3750///    8-bit integral values.
3751///
3752/// \headerfile <x86intrin.h>
3753///
3754/// This intrinsic is a utility function and does not correspond to a specific
3755///   instruction.
3756///
3757/// \param __b31
3758///    An 8-bit integral value used to initialize bits [255:248] of the result.
3759/// \param __b30
3760///    An 8-bit integral value used to initialize bits [247:240] of the result.
3761/// \param __b29
3762///    An 8-bit integral value used to initialize bits [239:232] of the result.
3763/// \param __b28
3764///    An 8-bit integral value used to initialize bits [231:224] of the result.
3765/// \param __b27
3766///    An 8-bit integral value used to initialize bits [223:216] of the result.
3767/// \param __b26
3768///    An 8-bit integral value used to initialize bits [215:208] of the result.
3769/// \param __b25
3770///    An 8-bit integral value used to initialize bits [207:200] of the result.
3771/// \param __b24
3772///    An 8-bit integral value used to initialize bits [199:192] of the result.
3773/// \param __b23
3774///    An 8-bit integral value used to initialize bits [191:184] of the result.
3775/// \param __b22
3776///    An 8-bit integral value used to initialize bits [183:176] of the result.
3777/// \param __b21
3778///    An 8-bit integral value used to initialize bits [175:168] of the result.
3779/// \param __b20
3780///    An 8-bit integral value used to initialize bits [167:160] of the result.
3781/// \param __b19
3782///    An 8-bit integral value used to initialize bits [159:152] of the result.
3783/// \param __b18
3784///    An 8-bit integral value used to initialize bits [151:144] of the result.
3785/// \param __b17
3786///    An 8-bit integral value used to initialize bits [143:136] of the result.
3787/// \param __b16
3788///    An 8-bit integral value used to initialize bits [135:128] of the result.
3789/// \param __b15
3790///    An 8-bit integral value used to initialize bits [127:120] of the result.
3791/// \param __b14
3792///    An 8-bit integral value used to initialize bits [119:112] of the result.
3793/// \param __b13
3794///    An 8-bit integral value used to initialize bits [111:104] of the result.
3795/// \param __b12
3796///    An 8-bit integral value used to initialize bits [103:96] of the result.
3797/// \param __b11
3798///    An 8-bit integral value used to initialize bits [95:88] of the result.
3799/// \param __b10
3800///    An 8-bit integral value used to initialize bits [87:80] of the result.
3801/// \param __b09
3802///    An 8-bit integral value used to initialize bits [79:72] of the result.
3803/// \param __b08
3804///    An 8-bit integral value used to initialize bits [71:64] of the result.
3805/// \param __b07
3806///    An 8-bit integral value used to initialize bits [63:56] of the result.
3807/// \param __b06
3808///    An 8-bit integral value used to initialize bits [55:48] of the result.
3809/// \param __b05
3810///    An 8-bit integral value used to initialize bits [47:40] of the result.
3811/// \param __b04
3812///    An 8-bit integral value used to initialize bits [39:32] of the result.
3813/// \param __b03
3814///    An 8-bit integral value used to initialize bits [31:24] of the result.
3815/// \param __b02
3816///    An 8-bit integral value used to initialize bits [23:16] of the result.
3817/// \param __b01
3818///    An 8-bit integral value used to initialize bits [15:8] of the result.
3819/// \param __b00
3820///    An 8-bit integral value used to initialize bits [7:0] of the result.
3821/// \returns An initialized 256-bit integer vector.
3822static __inline __m256i __DEFAULT_FN_ATTRS
3823_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3824                char __b27, char __b26, char __b25, char __b24,
3825                char __b23, char __b22, char __b21, char __b20,
3826                char __b19, char __b18, char __b17, char __b16,
3827                char __b15, char __b14, char __b13, char __b12,
3828                char __b11, char __b10, char __b09, char __b08,
3829                char __b07, char __b06, char __b05, char __b04,
3830                char __b03, char __b02, char __b01, char __b00)
3831{
3832  return __extension__ (__m256i)(__v32qi){
3833    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3834    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3835    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3836    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3837  };
3838}
3839
3840/// Constructs a 256-bit integer vector initialized with the specified
3841///    64-bit integral values.
3842///
3843/// \headerfile <x86intrin.h>
3844///
3845/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3846///   instruction.
3847///
3848/// \param __a
3849///    A 64-bit integral value used to initialize bits [255:192] of the result.
3850/// \param __b
3851///    A 64-bit integral value used to initialize bits [191:128] of the result.
3852/// \param __c
3853///    A 64-bit integral value used to initialize bits [127:64] of the result.
3854/// \param __d
3855///    A 64-bit integral value used to initialize bits [63:0] of the result.
3856/// \returns An initialized 256-bit integer vector.
3857static __inline __m256i __DEFAULT_FN_ATTRS
3858_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3859{
3860  return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3861}
3862
3863/* Create vectors with elements in reverse order */
3864/// Constructs a 256-bit floating-point vector of [4 x double],
3865///    initialized in reverse order with the specified double-precision
3866///    floating-point values.
3867///
3868/// \headerfile <x86intrin.h>
3869///
3870/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3871///   instruction.
3872///
3873/// \param __a
3874///    A double-precision floating-point value used to initialize bits [63:0]
3875///    of the result.
3876/// \param __b
3877///    A double-precision floating-point value used to initialize bits [127:64]
3878///    of the result.
3879/// \param __c
3880///    A double-precision floating-point value used to initialize bits [191:128]
3881///    of the result.
3882/// \param __d
3883///    A double-precision floating-point value used to initialize bits [255:192]
3884///    of the result.
3885/// \returns An initialized 256-bit floating-point vector of [4 x double].
3886static __inline __m256d __DEFAULT_FN_ATTRS
3887_mm256_setr_pd(double __a, double __b, double __c, double __d)
3888{
3889  return _mm256_set_pd(__d, __c, __b, __a);
3890}
3891
3892/// Constructs a 256-bit floating-point vector of [8 x float],
3893///    initialized in reverse order with the specified single-precision
3894///    float-point values.
3895///
3896/// \headerfile <x86intrin.h>
3897///
3898/// This intrinsic is a utility function and does not correspond to a specific
3899///   instruction.
3900///
3901/// \param __a
3902///    A single-precision floating-point value used to initialize bits [31:0]
3903///    of the result.
3904/// \param __b
3905///    A single-precision floating-point value used to initialize bits [63:32]
3906///    of the result.
3907/// \param __c
3908///    A single-precision floating-point value used to initialize bits [95:64]
3909///    of the result.
3910/// \param __d
3911///    A single-precision floating-point value used to initialize bits [127:96]
3912///    of the result.
3913/// \param __e
3914///    A single-precision floating-point value used to initialize bits [159:128]
3915///    of the result.
3916/// \param __f
3917///    A single-precision floating-point value used to initialize bits [191:160]
3918///    of the result.
3919/// \param __g
3920///    A single-precision floating-point value used to initialize bits [223:192]
3921///    of the result.
3922/// \param __h
3923///    A single-precision floating-point value used to initialize bits [255:224]
3924///    of the result.
3925/// \returns An initialized 256-bit floating-point vector of [8 x float].
3926static __inline __m256 __DEFAULT_FN_ATTRS
3927_mm256_setr_ps(float __a, float __b, float __c, float __d,
3928               float __e, float __f, float __g, float __h)
3929{
3930  return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3931}
3932
3933/// Constructs a 256-bit integer vector, initialized in reverse order
3934///    with the specified 32-bit integral values.
3935///
3936/// \headerfile <x86intrin.h>
3937///
3938/// This intrinsic is a utility function and does not correspond to a specific
3939///   instruction.
3940///
3941/// \param __i0
3942///    A 32-bit integral value used to initialize bits [31:0] of the result.
3943/// \param __i1
3944///    A 32-bit integral value used to initialize bits [63:32] of the result.
3945/// \param __i2
3946///    A 32-bit integral value used to initialize bits [95:64] of the result.
3947/// \param __i3
3948///    A 32-bit integral value used to initialize bits [127:96] of the result.
3949/// \param __i4
3950///    A 32-bit integral value used to initialize bits [159:128] of the result.
3951/// \param __i5
3952///    A 32-bit integral value used to initialize bits [191:160] of the result.
3953/// \param __i6
3954///    A 32-bit integral value used to initialize bits [223:192] of the result.
3955/// \param __i7
3956///    A 32-bit integral value used to initialize bits [255:224] of the result.
3957/// \returns An initialized 256-bit integer vector.
3958static __inline __m256i __DEFAULT_FN_ATTRS
3959_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3960                  int __i4, int __i5, int __i6, int __i7)
3961{
3962  return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
3963}
3964
3965/// Constructs a 256-bit integer vector, initialized in reverse order
3966///    with the specified 16-bit integral values.
3967///
3968/// \headerfile <x86intrin.h>
3969///
3970/// This intrinsic is a utility function and does not correspond to a specific
3971///   instruction.
3972///
3973/// \param __w15
3974///    A 16-bit integral value used to initialize bits [15:0] of the result.
3975/// \param __w14
3976///    A 16-bit integral value used to initialize bits [31:16] of the result.
3977/// \param __w13
3978///    A 16-bit integral value used to initialize bits [47:32] of the result.
3979/// \param __w12
3980///    A 16-bit integral value used to initialize bits [63:48] of the result.
3981/// \param __w11
3982///    A 16-bit integral value used to initialize bits [79:64] of the result.
3983/// \param __w10
3984///    A 16-bit integral value used to initialize bits [95:80] of the result.
3985/// \param __w09
3986///    A 16-bit integral value used to initialize bits [111:96] of the result.
3987/// \param __w08
3988///    A 16-bit integral value used to initialize bits [127:112] of the result.
3989/// \param __w07
3990///    A 16-bit integral value used to initialize bits [143:128] of the result.
3991/// \param __w06
3992///    A 16-bit integral value used to initialize bits [159:144] of the result.
3993/// \param __w05
3994///    A 16-bit integral value used to initialize bits [175:160] of the result.
3995/// \param __w04
3996///    A 16-bit integral value used to initialize bits [191:176] of the result.
3997/// \param __w03
3998///    A 16-bit integral value used to initialize bits [207:192] of the result.
3999/// \param __w02
4000///    A 16-bit integral value used to initialize bits [223:208] of the result.
4001/// \param __w01
4002///    A 16-bit integral value used to initialize bits [239:224] of the result.
4003/// \param __w00
4004///    A 16-bit integral value used to initialize bits [255:240] of the result.
4005/// \returns An initialized 256-bit integer vector.
4006static __inline __m256i __DEFAULT_FN_ATTRS
4007_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4008       short __w11, short __w10, short __w09, short __w08,
4009       short __w07, short __w06, short __w05, short __w04,
4010       short __w03, short __w02, short __w01, short __w00)
4011{
4012  return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4013                          __w04, __w05, __w06, __w07,
4014                          __w08, __w09, __w10, __w11,
4015                          __w12, __w13, __w14, __w15);
4016}
4017
4018/// Constructs a 256-bit integer vector, initialized in reverse order
4019///    with the specified 8-bit integral values.
4020///
4021/// \headerfile <x86intrin.h>
4022///
4023/// This intrinsic is a utility function and does not correspond to a specific
4024///   instruction.
4025///
4026/// \param __b31
4027///    An 8-bit integral value used to initialize bits [7:0] of the result.
4028/// \param __b30
4029///    An 8-bit integral value used to initialize bits [15:8] of the result.
4030/// \param __b29
4031///    An 8-bit integral value used to initialize bits [23:16] of the result.
4032/// \param __b28
4033///    An 8-bit integral value used to initialize bits [31:24] of the result.
4034/// \param __b27
4035///    An 8-bit integral value used to initialize bits [39:32] of the result.
4036/// \param __b26
4037///    An 8-bit integral value used to initialize bits [47:40] of the result.
4038/// \param __b25
4039///    An 8-bit integral value used to initialize bits [55:48] of the result.
4040/// \param __b24
4041///    An 8-bit integral value used to initialize bits [63:56] of the result.
4042/// \param __b23
4043///    An 8-bit integral value used to initialize bits [71:64] of the result.
4044/// \param __b22
4045///    An 8-bit integral value used to initialize bits [79:72] of the result.
4046/// \param __b21
4047///    An 8-bit integral value used to initialize bits [87:80] of the result.
4048/// \param __b20
4049///    An 8-bit integral value used to initialize bits [95:88] of the result.
4050/// \param __b19
4051///    An 8-bit integral value used to initialize bits [103:96] of the result.
4052/// \param __b18
4053///    An 8-bit integral value used to initialize bits [111:104] of the result.
4054/// \param __b17
4055///    An 8-bit integral value used to initialize bits [119:112] of the result.
4056/// \param __b16
4057///    An 8-bit integral value used to initialize bits [127:120] of the result.
4058/// \param __b15
4059///    An 8-bit integral value used to initialize bits [135:128] of the result.
4060/// \param __b14
4061///    An 8-bit integral value used to initialize bits [143:136] of the result.
4062/// \param __b13
4063///    An 8-bit integral value used to initialize bits [151:144] of the result.
4064/// \param __b12
4065///    An 8-bit integral value used to initialize bits [159:152] of the result.
4066/// \param __b11
4067///    An 8-bit integral value used to initialize bits [167:160] of the result.
4068/// \param __b10
4069///    An 8-bit integral value used to initialize bits [175:168] of the result.
4070/// \param __b09
4071///    An 8-bit integral value used to initialize bits [183:176] of the result.
4072/// \param __b08
4073///    An 8-bit integral value used to initialize bits [191:184] of the result.
4074/// \param __b07
4075///    An 8-bit integral value used to initialize bits [199:192] of the result.
4076/// \param __b06
4077///    An 8-bit integral value used to initialize bits [207:200] of the result.
4078/// \param __b05
4079///    An 8-bit integral value used to initialize bits [215:208] of the result.
4080/// \param __b04
4081///    An 8-bit integral value used to initialize bits [223:216] of the result.
4082/// \param __b03
4083///    An 8-bit integral value used to initialize bits [231:224] of the result.
4084/// \param __b02
4085///    An 8-bit integral value used to initialize bits [239:232] of the result.
4086/// \param __b01
4087///    An 8-bit integral value used to initialize bits [247:240] of the result.
4088/// \param __b00
4089///    An 8-bit integral value used to initialize bits [255:248] of the result.
4090/// \returns An initialized 256-bit integer vector.
4091static __inline __m256i __DEFAULT_FN_ATTRS
4092_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4093                 char __b27, char __b26, char __b25, char __b24,
4094                 char __b23, char __b22, char __b21, char __b20,
4095                 char __b19, char __b18, char __b17, char __b16,
4096                 char __b15, char __b14, char __b13, char __b12,
4097                 char __b11, char __b10, char __b09, char __b08,
4098                 char __b07, char __b06, char __b05, char __b04,
4099                 char __b03, char __b02, char __b01, char __b00)
4100{
4101  return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4102                         __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4103                         __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4104                         __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4105}
4106
4107/// Constructs a 256-bit integer vector, initialized in reverse order
4108///    with the specified 64-bit integral values.
4109///
4110/// \headerfile <x86intrin.h>
4111///
4112/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4113///   instruction.
4114///
4115/// \param __a
4116///    A 64-bit integral value used to initialize bits [63:0] of the result.
4117/// \param __b
4118///    A 64-bit integral value used to initialize bits [127:64] of the result.
4119/// \param __c
4120///    A 64-bit integral value used to initialize bits [191:128] of the result.
4121/// \param __d
4122///    A 64-bit integral value used to initialize bits [255:192] of the result.
4123/// \returns An initialized 256-bit integer vector.
4124static __inline __m256i __DEFAULT_FN_ATTRS
4125_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4126{
4127  return _mm256_set_epi64x(__d, __c, __b, __a);
4128}
4129
4130/* Create vectors with repeated elements */
4131/// Constructs a 256-bit floating-point vector of [4 x double], with each
4132///    of the four double-precision floating-point vector elements set to the
4133///    specified double-precision floating-point value.
4134///
4135/// \headerfile <x86intrin.h>
4136///
4137/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4138///
4139/// \param __w
4140///    A double-precision floating-point value used to initialize each vector
4141///    element of the result.
4142/// \returns An initialized 256-bit floating-point vector of [4 x double].
4143static __inline __m256d __DEFAULT_FN_ATTRS
4144_mm256_set1_pd(double __w)
4145{
4146  return _mm256_set_pd(__w, __w, __w, __w);
4147}
4148
4149/// Constructs a 256-bit floating-point vector of [8 x float], with each
4150///    of the eight single-precision floating-point vector elements set to the
4151///    specified single-precision floating-point value.
4152///
4153/// \headerfile <x86intrin.h>
4154///
4155/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4156///   instruction.
4157///
4158/// \param __w
4159///    A single-precision floating-point value used to initialize each vector
4160///    element of the result.
4161/// \returns An initialized 256-bit floating-point vector of [8 x float].
4162static __inline __m256 __DEFAULT_FN_ATTRS
4163_mm256_set1_ps(float __w)
4164{
4165  return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4166}
4167
4168/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4169///    32-bit integral vector elements set to the specified 32-bit integral
4170///    value.
4171///
4172/// \headerfile <x86intrin.h>
4173///
4174/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4175///   instruction.
4176///
4177/// \param __i
4178///    A 32-bit integral value used to initialize each vector element of the
4179///    result.
4180/// \returns An initialized 256-bit integer vector of [8 x i32].
4181static __inline __m256i __DEFAULT_FN_ATTRS
4182_mm256_set1_epi32(int __i)
4183{
4184  return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4185}
4186
4187/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4188///    16-bit integral vector elements set to the specified 16-bit integral
4189///    value.
4190///
4191/// \headerfile <x86intrin.h>
4192///
4193/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4194///
4195/// \param __w
4196///    A 16-bit integral value used to initialize each vector element of the
4197///    result.
4198/// \returns An initialized 256-bit integer vector of [16 x i16].
4199static __inline __m256i __DEFAULT_FN_ATTRS
4200_mm256_set1_epi16(short __w)
4201{
4202  return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4203                          __w, __w, __w, __w, __w, __w, __w, __w);
4204}
4205
4206/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4207///    8-bit integral vector elements set to the specified 8-bit integral value.
4208///
4209/// \headerfile <x86intrin.h>
4210///
4211/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4212///
4213/// \param __b
4214///    An 8-bit integral value used to initialize each vector element of the
4215///    result.
4216/// \returns An initialized 256-bit integer vector of [32 x i8].
4217static __inline __m256i __DEFAULT_FN_ATTRS
4218_mm256_set1_epi8(char __b)
4219{
4220  return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4221                         __b, __b, __b, __b, __b, __b, __b, __b,
4222                         __b, __b, __b, __b, __b, __b, __b, __b,
4223                         __b, __b, __b, __b, __b, __b, __b, __b);
4224}
4225
4226/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4227///    64-bit integral vector elements set to the specified 64-bit integral
4228///    value.
4229///
4230/// \headerfile <x86intrin.h>
4231///
4232/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4233///
4234/// \param __q
4235///    A 64-bit integral value used to initialize each vector element of the
4236///    result.
4237/// \returns An initialized 256-bit integer vector of [4 x i64].
4238static __inline __m256i __DEFAULT_FN_ATTRS
4239_mm256_set1_epi64x(long long __q)
4240{
4241  return _mm256_set_epi64x(__q, __q, __q, __q);
4242}
4243
/* Create zeroed vectors */
4245/// Constructs a 256-bit floating-point vector of [4 x double] with all
4246///    vector elements initialized to zero.
4247///
4248/// \headerfile <x86intrin.h>
4249///
4250/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4251///
4252/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4253static __inline __m256d __DEFAULT_FN_ATTRS
4254_mm256_setzero_pd(void)
4255{
4256  return __extension__ (__m256d){ 0, 0, 0, 0 };
4257}
4258
4259/// Constructs a 256-bit floating-point vector of [8 x float] with all
4260///    vector elements initialized to zero.
4261///
4262/// \headerfile <x86intrin.h>
4263///
4264/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4265///
4266/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4267static __inline __m256 __DEFAULT_FN_ATTRS
4268_mm256_setzero_ps(void)
4269{
4270  return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
4271}
4272
4273/// Constructs a 256-bit integer vector initialized to zero.
4274///
4275/// \headerfile <x86intrin.h>
4276///
4277/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4278///
4279/// \returns A 256-bit integer vector initialized to zero.
4280static __inline __m256i __DEFAULT_FN_ATTRS
4281_mm256_setzero_si256(void)
4282{
4283  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4284}
4285
4286/* Cast between vector types */
4287/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4288///    floating-point vector of [8 x float].
4289///
4290/// \headerfile <x86intrin.h>
4291///
4292/// This intrinsic has no corresponding instruction.
4293///
4294/// \param __a
4295///    A 256-bit floating-point vector of [4 x double].
4296/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4297///    bitwise pattern as the parameter.
4298static __inline __m256 __DEFAULT_FN_ATTRS
4299_mm256_castpd_ps(__m256d __a)
4300{
4301  return (__m256)__a;
4302}
4303
4304/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4305///    integer vector.
4306///
4307/// \headerfile <x86intrin.h>
4308///
4309/// This intrinsic has no corresponding instruction.
4310///
4311/// \param __a
4312///    A 256-bit floating-point vector of [4 x double].
4313/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4314///    parameter.
4315static __inline __m256i __DEFAULT_FN_ATTRS
4316_mm256_castpd_si256(__m256d __a)
4317{
4318  return (__m256i)__a;
4319}
4320
4321/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4322///    floating-point vector of [4 x double].
4323///
4324/// \headerfile <x86intrin.h>
4325///
4326/// This intrinsic has no corresponding instruction.
4327///
4328/// \param __a
4329///    A 256-bit floating-point vector of [8 x float].
4330/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4331///    bitwise pattern as the parameter.
4332static __inline __m256d __DEFAULT_FN_ATTRS
4333_mm256_castps_pd(__m256 __a)
4334{
4335  return (__m256d)__a;
4336}
4337
4338/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4339///    integer vector.
4340///
4341/// \headerfile <x86intrin.h>
4342///
4343/// This intrinsic has no corresponding instruction.
4344///
4345/// \param __a
4346///    A 256-bit floating-point vector of [8 x float].
4347/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4348///    parameter.
4349static __inline __m256i __DEFAULT_FN_ATTRS
4350_mm256_castps_si256(__m256 __a)
4351{
4352  return (__m256i)__a;
4353}
4354
4355/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4356///    of [8 x float].
4357///
4358/// \headerfile <x86intrin.h>
4359///
4360/// This intrinsic has no corresponding instruction.
4361///
4362/// \param __a
4363///    A 256-bit integer vector.
4364/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4365///    bitwise pattern as the parameter.
4366static __inline __m256 __DEFAULT_FN_ATTRS
4367_mm256_castsi256_ps(__m256i __a)
4368{
4369  return (__m256)__a;
4370}
4371
4372/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4373///    of [4 x double].
4374///
4375/// \headerfile <x86intrin.h>
4376///
4377/// This intrinsic has no corresponding instruction.
4378///
4379/// \param __a
4380///    A 256-bit integer vector.
4381/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4382///    bitwise pattern as the parameter.
4383static __inline __m256d __DEFAULT_FN_ATTRS
4384_mm256_castsi256_pd(__m256i __a)
4385{
4386  return (__m256d)__a;
4387}
4388
4389/// Returns the lower 128 bits of a 256-bit floating-point vector of
4390///    [4 x double] as a 128-bit floating-point vector of [2 x double].
4391///
4392/// \headerfile <x86intrin.h>
4393///
4394/// This intrinsic has no corresponding instruction.
4395///
4396/// \param __a
4397///    A 256-bit floating-point vector of [4 x double].
4398/// \returns A 128-bit floating-point vector of [2 x double] containing the
4399///    lower 128 bits of the parameter.
4400static __inline __m128d __DEFAULT_FN_ATTRS
4401_mm256_castpd256_pd128(__m256d __a)
4402{
4403  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4404}
4405
4406/// Returns the lower 128 bits of a 256-bit floating-point vector of
4407///    [8 x float] as a 128-bit floating-point vector of [4 x float].
4408///
4409/// \headerfile <x86intrin.h>
4410///
4411/// This intrinsic has no corresponding instruction.
4412///
4413/// \param __a
4414///    A 256-bit floating-point vector of [8 x float].
4415/// \returns A 128-bit floating-point vector of [4 x float] containing the
4416///    lower 128 bits of the parameter.
4417static __inline __m128 __DEFAULT_FN_ATTRS
4418_mm256_castps256_ps128(__m256 __a)
4419{
4420  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4421}
4422
4423/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4424///
4425/// \headerfile <x86intrin.h>
4426///
4427/// This intrinsic has no corresponding instruction.
4428///
4429/// \param __a
4430///    A 256-bit integer vector.
4431/// \returns A 128-bit integer vector containing the lower 128 bits of the
4432///    parameter.
4433static __inline __m128i __DEFAULT_FN_ATTRS
4434_mm256_castsi256_si128(__m256i __a)
4435{
4436  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4437}
4438
4439/// Constructs a 256-bit floating-point vector of [4 x double] from a
4440///    128-bit floating-point vector of [2 x double].
4441///
4442///    The lower 128 bits contain the value of the source vector. The contents
4443///    of the upper 128 bits are undefined.
4444///
4445/// \headerfile <x86intrin.h>
4446///
4447/// This intrinsic has no corresponding instruction.
4448///
4449/// \param __a
4450///    A 128-bit vector of [2 x double].
4451/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4452///    contain the value of the parameter. The contents of the upper 128 bits
4453///    are undefined.
4454static __inline __m256d __DEFAULT_FN_ATTRS
4455_mm256_castpd128_pd256(__m128d __a)
4456{
4457  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
4458}
4459
4460/// Constructs a 256-bit floating-point vector of [8 x float] from a
4461///    128-bit floating-point vector of [4 x float].
4462///
4463///    The lower 128 bits contain the value of the source vector. The contents
4464///    of the upper 128 bits are undefined.
4465///
4466/// \headerfile <x86intrin.h>
4467///
4468/// This intrinsic has no corresponding instruction.
4469///
4470/// \param __a
4471///    A 128-bit vector of [4 x float].
4472/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4473///    contain the value of the parameter. The contents of the upper 128 bits
4474///    are undefined.
4475static __inline __m256 __DEFAULT_FN_ATTRS
4476_mm256_castps128_ps256(__m128 __a)
4477{
4478  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
4479}
4480
4481/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4482///
4483///    The lower 128 bits contain the value of the source vector. The contents
4484///    of the upper 128 bits are undefined.
4485///
4486/// \headerfile <x86intrin.h>
4487///
4488/// This intrinsic has no corresponding instruction.
4489///
4490/// \param __a
4491///    A 128-bit integer vector.
4492/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4493///    the parameter. The contents of the upper 128 bits are undefined.
4494static __inline __m256i __DEFAULT_FN_ATTRS
4495_mm256_castsi128_si256(__m128i __a)
4496{
4497  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
4498}
4499
4500/// Constructs a 256-bit floating-point vector of [4 x double] from a
4501///    128-bit floating-point vector of [2 x double]. The lower 128 bits
4502///    contain the value of the source vector. The upper 128 bits are set
4503///    to zero.
4504///
4505/// \headerfile <x86intrin.h>
4506///
4507/// This intrinsic has no corresponding instruction.
4508///
4509/// \param __a
4510///    A 128-bit vector of [2 x double].
4511/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4512///    contain the value of the parameter. The upper 128 bits are set to zero.
4513static __inline __m256d __DEFAULT_FN_ATTRS
4514_mm256_zextpd128_pd256(__m128d __a)
4515{
4516  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4517}
4518
4519/// Constructs a 256-bit floating-point vector of [8 x float] from a
4520///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4521///    the value of the source vector. The upper 128 bits are set to zero.
4522///
4523/// \headerfile <x86intrin.h>
4524///
4525/// This intrinsic has no corresponding instruction.
4526///
4527/// \param __a
4528///    A 128-bit vector of [4 x float].
4529/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4530///    contain the value of the parameter. The upper 128 bits are set to zero.
4531static __inline __m256 __DEFAULT_FN_ATTRS
4532_mm256_zextps128_ps256(__m128 __a)
4533{
4534  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4535}
4536
4537/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4538///    The lower 128 bits contain the value of the source vector. The upper
4539///    128 bits are set to zero.
4540///
4541/// \headerfile <x86intrin.h>
4542///
4543/// This intrinsic has no corresponding instruction.
4544///
4545/// \param __a
4546///    A 128-bit integer vector.
4547/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4548///    the parameter. The upper 128 bits are set to zero.
4549static __inline __m256i __DEFAULT_FN_ATTRS
4550_mm256_zextsi128_si256(__m128i __a)
4551{
4552  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4553}
4554
4555/*
4556   Vector insert.
4557   We use macros rather than inlines because we only want to accept
4558   invocations where the immediate M is a constant expression.
4559*/
/// Constructs a new 256-bit vector of [8 x float] from a copy of the
///    256-bit vector of [8 x float] given in the first parameter, in which
///    either the upper or the lower 128 bits have been replaced with the
///    contents of the 128-bit vector of [4 x float] in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced. The macro expands to a single parenthesized
///    expression.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It supplies the 128-bit half of the
///    result that is not replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result
///    depending on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit selects which half of
///    the result is taken from \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
4597
/// Constructs a new 256-bit vector of [4 x double] from a copy of the
///    256-bit vector of [4 x double] given in the first parameter, in which
///    either the upper or the lower 128 bits have been replaced with the
///    contents of the 128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced. The macro expands to a single parenthesized
///    expression.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the 128-bit half of the
///    result that is not replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result
///    depending on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit selects which half of
///    the result is taken from \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
4635
/// Constructs a new 256-bit integer vector from a copy of the 256-bit
///    integer vector given in the first parameter, in which either the upper
///    or the lower 128 bits have been replaced with the contents of the
///    128-bit integer vector in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced. The macro expands to a single parenthesized
///    expression.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the 128-bit half of the result
///    that is not replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit selects which half of
///    the result is taken from \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
4673
4674/*
4675   Vector extract.
4676   We use macros rather than inlines because we only want to accept
4677   invocations where the immediate M is a constant expression.
4678*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as selected by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
///    The macro expands to a single parenthesized expression, so the result
///    can be used directly as an operand (for example, subscripted).
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
4702
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as selected by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
///    The macro expands to a single parenthesized expression, so the result
///    can be used directly as an operand (for example, subscripted).
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
4726
/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as selected by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
///    The macro expands to a single parenthesized expression, so the result
///    can be used directly as an operand (for example, subscripted).
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
4750
4751/* SIMD load ops (unaligned) */
4752/// Loads two 128-bit floating-point vectors of [4 x float] from
4753///    unaligned memory locations and constructs a 256-bit floating-point vector
4754///    of [8 x float] by concatenating the two 128-bit vectors.
4755///
4756/// \headerfile <x86intrin.h>
4757///
4758/// This intrinsic corresponds to load instructions followed by the
4759///   <c> VINSERTF128 </c> instruction.
4760///
4761/// \param __addr_hi
4762///    A pointer to a 128-bit memory location containing 4 consecutive
4763///    single-precision floating-point values. These values are to be copied to
4764///    bits[255:128] of the result. The address of the memory location does not
4765///    have to be aligned.
4766/// \param __addr_lo
4767///    A pointer to a 128-bit memory location containing 4 consecutive
4768///    single-precision floating-point values. These values are to be copied to
4769///    bits[127:0] of the result. The address of the memory location does not
4770///    have to be aligned.
4771/// \returns A 256-bit floating-point vector of [8 x float] containing the
4772///    concatenated result.
4773static __inline __m256 __DEFAULT_FN_ATTRS
4774_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4775{
4776  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4777  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
4778}
4779
4780/// Loads two 128-bit floating-point vectors of [2 x double] from
4781///    unaligned memory locations and constructs a 256-bit floating-point vector
4782///    of [4 x double] by concatenating the two 128-bit vectors.
4783///
4784/// \headerfile <x86intrin.h>
4785///
4786/// This intrinsic corresponds to load instructions followed by the
4787///   <c> VINSERTF128 </c> instruction.
4788///
4789/// \param __addr_hi
4790///    A pointer to a 128-bit memory location containing two consecutive
4791///    double-precision floating-point values. These values are to be copied to
4792///    bits[255:128] of the result. The address of the memory location does not
4793///    have to be aligned.
4794/// \param __addr_lo
4795///    A pointer to a 128-bit memory location containing two consecutive
4796///    double-precision floating-point values. These values are to be copied to
4797///    bits[127:0] of the result. The address of the memory location does not
4798///    have to be aligned.
4799/// \returns A 256-bit floating-point vector of [4 x double] containing the
4800///    concatenated result.
4801static __inline __m256d __DEFAULT_FN_ATTRS
4802_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4803{
4804  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4805  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
4806}
4807
4808/// Loads two 128-bit integer vectors from unaligned memory locations and
4809///    constructs a 256-bit integer vector by concatenating the two 128-bit
4810///    vectors.
4811///
4812/// \headerfile <x86intrin.h>
4813///
4814/// This intrinsic corresponds to load instructions followed by the
4815///   <c> VINSERTF128 </c> instruction.
4816///
4817/// \param __addr_hi
4818///    A pointer to a 128-bit memory location containing a 128-bit integer
4819///    vector. This vector is to be copied to bits[255:128] of the result. The
4820///    address of the memory location does not have to be aligned.
4821/// \param __addr_lo
4822///    A pointer to a 128-bit memory location containing a 128-bit integer
4823///    vector. This vector is to be copied to bits[127:0] of the result. The
4824///    address of the memory location does not have to be aligned.
4825/// \returns A 256-bit integer vector containing the concatenated result.
4826static __inline __m256i __DEFAULT_FN_ATTRS
4827_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
4828{
4829  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4830  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
4831}
4832
4833/* SIMD store ops (unaligned) */
4834/// Stores the upper and lower 128 bits of a 256-bit floating-point
4835///    vector of [8 x float] into two different unaligned memory locations.
4836///
4837/// \headerfile <x86intrin.h>
4838///
4839/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4840///   store instructions.
4841///
4842/// \param __addr_hi
4843///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4844///    copied to this memory location. The address of this memory location does
4845///    not have to be aligned.
4846/// \param __addr_lo
4847///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4848///    copied to this memory location. The address of this memory location does
4849///    not have to be aligned.
4850/// \param __a
4851///    A 256-bit floating-point vector of [8 x float].
4852static __inline void __DEFAULT_FN_ATTRS
4853_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4854{
4855  __m128 __v128;
4856
4857  __v128 = _mm256_castps256_ps128(__a);
4858  _mm_storeu_ps(__addr_lo, __v128);
4859  __v128 = _mm256_extractf128_ps(__a, 1);
4860  _mm_storeu_ps(__addr_hi, __v128);
4861}
4862
4863/// Stores the upper and lower 128 bits of a 256-bit floating-point
4864///    vector of [4 x double] into two different unaligned memory locations.
4865///
4866/// \headerfile <x86intrin.h>
4867///
4868/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4869///   store instructions.
4870///
4871/// \param __addr_hi
4872///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4873///    copied to this memory location. The address of this memory location does
4874///    not have to be aligned.
4875/// \param __addr_lo
4876///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4877///    copied to this memory location. The address of this memory location does
4878///    not have to be aligned.
4879/// \param __a
4880///    A 256-bit floating-point vector of [4 x double].
4881static __inline void __DEFAULT_FN_ATTRS
4882_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
4883{
4884  __m128d __v128;
4885
4886  __v128 = _mm256_castpd256_pd128(__a);
4887  _mm_storeu_pd(__addr_lo, __v128);
4888  __v128 = _mm256_extractf128_pd(__a, 1);
4889  _mm_storeu_pd(__addr_hi, __v128);
4890}
4891
4892/// Stores the upper and lower 128 bits of a 256-bit integer vector into
4893///    two different unaligned memory locations.
4894///
4895/// \headerfile <x86intrin.h>
4896///
4897/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4898///   store instructions.
4899///
4900/// \param __addr_hi
4901///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4902///    copied to this memory location. The address of this memory location does
4903///    not have to be aligned.
4904/// \param __addr_lo
4905///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4906///    copied to this memory location. The address of this memory location does
4907///    not have to be aligned.
4908/// \param __a
4909///    A 256-bit integer vector.
4910static __inline void __DEFAULT_FN_ATTRS
4911_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
4912{
4913  __m128i __v128;
4914
4915  __v128 = _mm256_castsi256_si128(__a);
4916  _mm_storeu_si128(__addr_lo, __v128);
4917  __v128 = _mm256_extractf128_si256(__a, 1);
4918  _mm_storeu_si128(__addr_hi, __v128);
4919}
4920
4921/// Constructs a 256-bit floating-point vector of [8 x float] by
4922///    concatenating two 128-bit floating-point vectors of [4 x float].
4923///
4924/// \headerfile <x86intrin.h>
4925///
4926/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4927///
4928/// \param __hi
4929///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
4930///    128 bits of the result.
4931/// \param __lo
4932///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
4933///    128 bits of the result.
4934/// \returns A 256-bit floating-point vector of [8 x float] containing the
4935///    concatenated result.
4936static __inline __m256 __DEFAULT_FN_ATTRS
4937_mm256_set_m128 (__m128 __hi, __m128 __lo)
4938{
4939  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4940}
4941
4942/// Constructs a 256-bit floating-point vector of [4 x double] by
4943///    concatenating two 128-bit floating-point vectors of [2 x double].
4944///
4945/// \headerfile <x86intrin.h>
4946///
4947/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4948///
4949/// \param __hi
4950///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
4951///    128 bits of the result.
4952/// \param __lo
4953///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
4954///    128 bits of the result.
4955/// \returns A 256-bit floating-point vector of [4 x double] containing the
4956///    concatenated result.
4957static __inline __m256d __DEFAULT_FN_ATTRS
4958_mm256_set_m128d (__m128d __hi, __m128d __lo)
4959{
4960  return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4961}
4962
4963/// Constructs a 256-bit integer vector by concatenating two 128-bit
4964///    integer vectors.
4965///
4966/// \headerfile <x86intrin.h>
4967///
4968/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4969///
4970/// \param __hi
4971///    A 128-bit integer vector to be copied to the upper 128 bits of the
4972///    result.
4973/// \param __lo
4974///    A 128-bit integer vector to be copied to the lower 128 bits of the
4975///    result.
4976/// \returns A 256-bit integer vector containing the concatenated result.
4977static __inline __m256i __DEFAULT_FN_ATTRS
4978_mm256_set_m128i (__m128i __hi, __m128i __lo)
4979{
4980  return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4981}
4982
4983/// Constructs a 256-bit floating-point vector of [8 x float] by
4984///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
4985///    similar to _mm256_set_m128, but the order of the input parameters is
4986///    swapped.
4987///
4988/// \headerfile <x86intrin.h>
4989///
4990/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4991///
4992/// \param __lo
4993///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
4994///    128 bits of the result.
4995/// \param __hi
4996///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
4997///    128 bits of the result.
4998/// \returns A 256-bit floating-point vector of [8 x float] containing the
4999///    concatenated result.
5000static __inline __m256 __DEFAULT_FN_ATTRS
5001_mm256_setr_m128 (__m128 __lo, __m128 __hi)
5002{
5003  return _mm256_set_m128(__hi, __lo);
5004}
5005
5006/// Constructs a 256-bit floating-point vector of [4 x double] by
5007///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
5008///    similar to _mm256_set_m128d, but the order of the input parameters is
5009///    swapped.
5010///
5011/// \headerfile <x86intrin.h>
5012///
5013/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5014///
5015/// \param __lo
5016///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
5017///    128 bits of the result.
5018/// \param __hi
5019///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
5020///    128 bits of the result.
5021/// \returns A 256-bit floating-point vector of [4 x double] containing the
5022///    concatenated result.
5023static __inline __m256d __DEFAULT_FN_ATTRS
5024_mm256_setr_m128d (__m128d __lo, __m128d __hi)
5025{
5026  return (__m256d)_mm256_set_m128d(__hi, __lo);
5027}
5028
5029/// Constructs a 256-bit integer vector by concatenating two 128-bit
5030///    integer vectors. This is similar to _mm256_set_m128i, but the order of
5031///    the input parameters is swapped.
5032///
5033/// \headerfile <x86intrin.h>
5034///
5035/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5036///
5037/// \param __lo
5038///    A 128-bit integer vector to be copied to the lower 128 bits of the
5039///    result.
5040/// \param __hi
5041///    A 128-bit integer vector to be copied to the upper 128 bits of the
5042///    result.
5043/// \returns A 256-bit integer vector containing the concatenated result.
5044static __inline __m256i __DEFAULT_FN_ATTRS
5045_mm256_setr_m128i (__m128i __lo, __m128i __hi)
5046{
5047  return (__m256i)_mm256_set_m128i(__hi, __lo);
5048}
5049
/* The default-attribute helper macros are meant only for the functions in
   this file; undefine them so they are no longer visible after this point. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
5052
5053#endif /* __AVXINTRIN_H */
5054