avxintrin.h revision 314564
195003Smux/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
295041Sru *
395003Smux * Permission is hereby granted, free of charge, to any person obtaining a copy
495003Smux * of this software and associated documentation files (the "Software"), to deal
595003Smux * in the Software without restriction, including without limitation the rights
695003Smux * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
795003Smux * copies of the Software, and to permit persons to whom the Software is
895003Smux * furnished to do so, subject to the following conditions:
995003Smux *
1095003Smux * The above copyright notice and this permission notice shall be included in
1195003Smux * all copies or substantial portions of the Software.
1295003Smux *
1395003Smux * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1495003Smux * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1595003Smux * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1695003Smux * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1795003Smux * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1895003Smux * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1995003Smux * THE SOFTWARE.
2095003Smux *
2195003Smux *===-----------------------------------------------------------------------===
2295003Smux */
2395003Smux
2495003Smux#ifndef __IMMINTRIN_H
2595003Smux#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
2695003Smux#endif
2795003Smux
2895003Smux#ifndef __AVXINTRIN_H
2995003Smux#define __AVXINTRIN_H
3095003Smux
/* Element-typed views of a 32-byte vector, used for the casts inside the
 * intrinsics below. */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Unsigned element types. */
typedef unsigned long long __v4du  __attribute__((__vector_size__(32)));
typedef unsigned int       __v8su  __attribute__((__vector_size__(32)));
typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));

/* An explicitly signed char variant is needed as well. It is not meant to
 * appear in the interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 32-byte vector types that appear in the signatures of the intrinsics. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Attributes applied to every function defined in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
5495003Smux
5595041Sru/* Arithmetic */
5695041Sru/// \brief Adds two 256-bit vectors of [4 x double].
5795003Smux///
5895003Smux/// \headerfile <x86intrin.h>
5995003Smux///
6095003Smux/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
6195003Smux///
6295003Smux/// \param __a
6395003Smux///    A 256-bit vector of [4 x double] containing one of the source operands.
6495003Smux/// \param __b
6595041Sru///    A 256-bit vector of [4 x double] containing one of the source operands.
6695003Smux/// \returns A 256-bit vector of [4 x double] containing the sums of both
6795003Smux///    operands.
6895003Smuxstatic __inline __m256d __DEFAULT_FN_ATTRS
6995041Sru_mm256_add_pd(__m256d __a, __m256d __b)
7095041Sru{
7195003Smux  return (__m256d)((__v4df)__a+(__v4df)__b);
7295003Smux}
7395003Smux
7495003Smux/// \brief Adds two 256-bit vectors of [8 x float].
7595003Smux///
7695003Smux/// \headerfile <x86intrin.h>
7795003Smux///
7895003Smux/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
7995003Smux///
8095003Smux/// \param __a
8195003Smux///    A 256-bit vector of [8 x float] containing one of the source operands.
8295003Smux/// \param __b
8395003Smux///    A 256-bit vector of [8 x float] containing one of the source operands.
8495003Smux/// \returns A 256-bit vector of [8 x float] containing the sums of both
8595003Smux///    operands.
8695003Smuxstatic __inline __m256 __DEFAULT_FN_ATTRS
8795003Smux_mm256_add_ps(__m256 __a, __m256 __b)
88108087Sru{
8995003Smux  return (__m256)((__v8sf)__a+(__v8sf)__b);
9095003Smux}
9195003Smux
9295003Smux/// \brief Subtracts two 256-bit vectors of [4 x double].
9395003Smux///
9495003Smux/// \headerfile <x86intrin.h>
95108087Sru///
9695003Smux/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
9795003Smux///
9895003Smux/// \param __a
9995003Smux///    A 256-bit vector of [4 x double] containing the minuend.
10095003Smux/// \param __b
10195003Smux///    A 256-bit vector of [4 x double] containing the subtrahend.
10295003Smux/// \returns A 256-bit vector of [4 x double] containing the differences between
10395003Smux///    both operands.
10495003Smuxstatic __inline __m256d __DEFAULT_FN_ATTRS
10595041Sru_mm256_sub_pd(__m256d __a, __m256d __b)
10695003Smux{
10795003Smux  return (__m256d)((__v4df)__a-(__v4df)__b);
10895003Smux}
10995003Smux
11095003Smux/// \brief Subtracts two 256-bit vectors of [8 x float].
11195003Smux///
11295003Smux/// \headerfile <x86intrin.h>
11395003Smux///
11495003Smux/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
11595003Smux///
116108087Sru/// \param __a
11795003Smux///    A 256-bit vector of [8 x float] containing the minuend.
11895003Smux/// \param __b
11995003Smux///    A 256-bit vector of [8 x float] containing the subtrahend.
12095003Smux/// \returns A 256-bit vector of [8 x float] containing the differences between
12195003Smux///    both operands.
122108028Srustatic __inline __m256 __DEFAULT_FN_ATTRS
12395003Smux_mm256_sub_ps(__m256 __a, __m256 __b)
12495003Smux{
12595041Sru  return (__m256)((__v8sf)__a-(__v8sf)__b);
12695003Smux}
12795003Smux
12895003Smux/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
12995003Smux///    two 256-bit vectors of [4 x double].
13095003Smux///
13195003Smux/// \headerfile <x86intrin.h>
13295041Sru///
13395041Sru/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
13495003Smux///
13595003Smux/// \param __a
13695003Smux///    A 256-bit vector of [4 x double] containing the left source operand.
137108028Sru/// \param __b
13895003Smux///    A 256-bit vector of [4 x double] containing the right source operand.
139108028Sru/// \returns A 256-bit vector of [4 x double] containing the alternating sums
14095003Smux///    and differences between both operands.
14195003Smuxstatic __inline __m256d __DEFAULT_FN_ATTRS
14295003Smux_mm256_addsub_pd(__m256d __a, __m256d __b)
143108087Sru{
14495003Smux  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
145108087Sru}
14695003Smux
14795003Smux/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
14895003Smux///    two 256-bit vectors of [8 x float].
14995003Smux///
15095003Smux/// \headerfile <x86intrin.h>
15195003Smux///
15295003Smux/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
15395003Smux///
15495003Smux/// \param __a
15595003Smux///    A 256-bit vector of [8 x float] containing the left source operand.
15695003Smux/// \param __b
15795003Smux///    A 256-bit vector of [8 x float] containing the right source operand.
15895041Sru/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
15995003Smux///    differences between both operands.
16095003Smuxstatic __inline __m256 __DEFAULT_FN_ATTRS
16195041Sru_mm256_addsub_ps(__m256 __a, __m256 __b)
16295003Smux{
16395003Smux  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
16495003Smux}
16595003Smux
16695003Smux/// \brief Divides two 256-bit vectors of [4 x double].
16795003Smux///
16895003Smux/// \headerfile <x86intrin.h>
16995003Smux///
17095003Smux/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
17195003Smux///
17295003Smux/// \param __a
173147700Shmp///    A 256-bit vector of [4 x double] containing the dividend.
17495003Smux/// \param __b
17595003Smux///    A 256-bit vector of [4 x double] containing the divisor.
17695041Sru/// \returns A 256-bit vector of [4 x double] containing the quotients of both
17795003Smux///    operands.
17895003Smuxstatic __inline __m256d __DEFAULT_FN_ATTRS
17995003Smux_mm256_div_pd(__m256d __a, __m256d __b)
180{
181  return (__m256d)((__v4df)__a/(__v4df)__b);
182}
183
184/// \brief Divides two 256-bit vectors of [8 x float].
185///
186/// \headerfile <x86intrin.h>
187///
188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
189///
190/// \param __a
191///    A 256-bit vector of [8 x float] containing the dividend.
192/// \param __b
193///    A 256-bit vector of [8 x float] containing the divisor.
194/// \returns A 256-bit vector of [8 x float] containing the quotients of both
195///    operands.
196static __inline __m256 __DEFAULT_FN_ATTRS
197_mm256_div_ps(__m256 __a, __m256 __b)
198{
199  return (__m256)((__v8sf)__a/(__v8sf)__b);
200}
201
202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
203///    of each pair of values.
204///
205/// \headerfile <x86intrin.h>
206///
207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
208///
209/// \param __a
210///    A 256-bit vector of [4 x double] containing one of the operands.
211/// \param __b
212///    A 256-bit vector of [4 x double] containing one of the operands.
213/// \returns A 256-bit vector of [4 x double] containing the maximum values
214///    between both operands.
215static __inline __m256d __DEFAULT_FN_ATTRS
216_mm256_max_pd(__m256d __a, __m256d __b)
217{
218  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
219}
220
221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
222///    of each pair of values.
223///
224/// \headerfile <x86intrin.h>
225///
226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
227///
228/// \param __a
229///    A 256-bit vector of [8 x float] containing one of the operands.
230/// \param __b
231///    A 256-bit vector of [8 x float] containing one of the operands.
232/// \returns A 256-bit vector of [8 x float] containing the maximum values
233///    between both operands.
234static __inline __m256 __DEFAULT_FN_ATTRS
235_mm256_max_ps(__m256 __a, __m256 __b)
236{
237  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
238}
239
240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
241///    of each pair of values.
242///
243/// \headerfile <x86intrin.h>
244///
245/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
246///
247/// \param __a
248///    A 256-bit vector of [4 x double] containing one of the operands.
249/// \param __b
250///    A 256-bit vector of [4 x double] containing one of the operands.
251/// \returns A 256-bit vector of [4 x double] containing the minimum values
252///    between both operands.
253static __inline __m256d __DEFAULT_FN_ATTRS
254_mm256_min_pd(__m256d __a, __m256d __b)
255{
256  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
257}
258
259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
260///    of each pair of values.
261///
262/// \headerfile <x86intrin.h>
263///
264/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
265///
266/// \param __a
267///    A 256-bit vector of [8 x float] containing one of the operands.
268/// \param __b
269///    A 256-bit vector of [8 x float] containing one of the operands.
270/// \returns A 256-bit vector of [8 x float] containing the minimum values
271///    between both operands.
272static __inline __m256 __DEFAULT_FN_ATTRS
273_mm256_min_ps(__m256 __a, __m256 __b)
274{
275  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
276}
277
278/// \brief Multiplies two 256-bit vectors of [4 x double].
279///
280/// \headerfile <x86intrin.h>
281///
282/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
283///
284/// \param __a
285///    A 256-bit vector of [4 x double] containing one of the operands.
286/// \param __b
287///    A 256-bit vector of [4 x double] containing one of the operands.
288/// \returns A 256-bit vector of [4 x double] containing the products of both
289///    operands.
290static __inline __m256d __DEFAULT_FN_ATTRS
291_mm256_mul_pd(__m256d __a, __m256d __b)
292{
293  return (__m256d)((__v4df)__a * (__v4df)__b);
294}
295
296/// \brief Multiplies two 256-bit vectors of [8 x float].
297///
298/// \headerfile <x86intrin.h>
299///
300/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
301///
302/// \param __a
303///    A 256-bit vector of [8 x float] containing one of the operands.
304/// \param __b
305///    A 256-bit vector of [8 x float] containing one of the operands.
306/// \returns A 256-bit vector of [8 x float] containing the products of both
307///    operands.
308static __inline __m256 __DEFAULT_FN_ATTRS
309_mm256_mul_ps(__m256 __a, __m256 __b)
310{
311  return (__m256)((__v8sf)__a * (__v8sf)__b);
312}
313
314/// \brief Calculates the square roots of the values in a 256-bit vector of
315///    [4 x double].
316///
317/// \headerfile <x86intrin.h>
318///
319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
320///
321/// \param __a
322///    A 256-bit vector of [4 x double].
323/// \returns A 256-bit vector of [4 x double] containing the square roots of the
324///    values in the operand.
325static __inline __m256d __DEFAULT_FN_ATTRS
326_mm256_sqrt_pd(__m256d __a)
327{
328  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
329}
330
331/// \brief Calculates the square roots of the values in a 256-bit vector of
332///    [8 x float].
333///
334/// \headerfile <x86intrin.h>
335///
336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
337///
338/// \param __a
339///    A 256-bit vector of [8 x float].
340/// \returns A 256-bit vector of [8 x float] containing the square roots of the
341///    values in the operand.
342static __inline __m256 __DEFAULT_FN_ATTRS
343_mm256_sqrt_ps(__m256 __a)
344{
345  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
346}
347
348/// \brief Calculates the reciprocal square roots of the values in a 256-bit
349///    vector of [8 x float].
350///
351/// \headerfile <x86intrin.h>
352///
353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
354///
355/// \param __a
356///    A 256-bit vector of [8 x float].
357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
358///    roots of the values in the operand.
359static __inline __m256 __DEFAULT_FN_ATTRS
360_mm256_rsqrt_ps(__m256 __a)
361{
362  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
363}
364
365/// \brief Calculates the reciprocals of the values in a 256-bit vector of
366///    [8 x float].
367///
368/// \headerfile <x86intrin.h>
369///
370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
371///
372/// \param __a
373///    A 256-bit vector of [8 x float].
374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
375///    values in the operand.
376static __inline __m256 __DEFAULT_FN_ATTRS
377_mm256_rcp_ps(__m256 __a)
378{
379  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
380}
381
/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Expands to one fully parenthesized expression rather than a GNU statement
 * expression, so it can be used anywhere an expression is allowed. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
413
/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Expands to one fully parenthesized expression rather than a GNU statement
 * expression, so it can be used anywhere an expression is allowed. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
445
/// \brief Rounds each value in a 256-bit vector of [4 x double] up to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
462
/// \brief Rounds each value in a 256-bit vector of [4 x double] down to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
480
/// \brief Rounds each value in a 256-bit vector of [8 x float] up to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
497
/// \brief Rounds each value in a 256-bit vector of [8 x float] down to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
514
515/* Logical */
516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
517///
518/// \headerfile <x86intrin.h>
519///
520/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
521///
522/// \param __a
523///    A 256-bit vector of [4 x double] containing one of the source operands.
524/// \param __b
525///    A 256-bit vector of [4 x double] containing one of the source operands.
526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
527///    values between both operands.
528static __inline __m256d __DEFAULT_FN_ATTRS
529_mm256_and_pd(__m256d __a, __m256d __b)
530{
531  return (__m256d)((__v4du)__a & (__v4du)__b);
532}
533
534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
535///
536/// \headerfile <x86intrin.h>
537///
538/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
539///
540/// \param __a
541///    A 256-bit vector of [8 x float] containing one of the source operands.
542/// \param __b
543///    A 256-bit vector of [8 x float] containing one of the source operands.
544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
545///    values between both operands.
546static __inline __m256 __DEFAULT_FN_ATTRS
547_mm256_and_ps(__m256 __a, __m256 __b)
548{
549  return (__m256)((__v8su)__a & (__v8su)__b);
550}
551
552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
553///    the one's complement of the values contained in the first source operand.
554///
555/// \headerfile <x86intrin.h>
556///
557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
558///
559/// \param __a
560///    A 256-bit vector of [4 x double] containing the left source operand. The
561///    one's complement of this value is used in the bitwise AND.
562/// \param __b
563///    A 256-bit vector of [4 x double] containing the right source operand.
564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
565///    values of the second operand and the one's complement of the first
566///    operand.
567static __inline __m256d __DEFAULT_FN_ATTRS
568_mm256_andnot_pd(__m256d __a, __m256d __b)
569{
570  return (__m256d)(~(__v4du)__a & (__v4du)__b);
571}
572
573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
574///    the one's complement of the values contained in the first source operand.
575///
576/// \headerfile <x86intrin.h>
577///
578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
579///
580/// \param __a
581///    A 256-bit vector of [8 x float] containing the left source operand. The
582///    one's complement of this value is used in the bitwise AND.
583/// \param __b
584///    A 256-bit vector of [8 x float] containing the right source operand.
585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
586///    values of the second operand and the one's complement of the first
587///    operand.
588static __inline __m256 __DEFAULT_FN_ATTRS
589_mm256_andnot_ps(__m256 __a, __m256 __b)
590{
591  return (__m256)(~(__v8su)__a & (__v8su)__b);
592}
593
594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
595///
596/// \headerfile <x86intrin.h>
597///
598/// This intrinsic corresponds to the <c> VORPD </c> instruction.
599///
600/// \param __a
601///    A 256-bit vector of [4 x double] containing one of the source operands.
602/// \param __b
603///    A 256-bit vector of [4 x double] containing one of the source operands.
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
605///    values between both operands.
606static __inline __m256d __DEFAULT_FN_ATTRS
607_mm256_or_pd(__m256d __a, __m256d __b)
608{
609  return (__m256d)((__v4du)__a | (__v4du)__b);
610}
611
612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
613///
614/// \headerfile <x86intrin.h>
615///
616/// This intrinsic corresponds to the <c> VORPS </c> instruction.
617///
618/// \param __a
619///    A 256-bit vector of [8 x float] containing one of the source operands.
620/// \param __b
621///    A 256-bit vector of [8 x float] containing one of the source operands.
622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
623///    values between both operands.
624static __inline __m256 __DEFAULT_FN_ATTRS
625_mm256_or_ps(__m256 __a, __m256 __b)
626{
627  return (__m256)((__v8su)__a | (__v8su)__b);
628}
629
630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
631///
632/// \headerfile <x86intrin.h>
633///
634/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
635///
636/// \param __a
637///    A 256-bit vector of [4 x double] containing one of the source operands.
638/// \param __b
639///    A 256-bit vector of [4 x double] containing one of the source operands.
640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
641///    values between both operands.
642static __inline __m256d __DEFAULT_FN_ATTRS
643_mm256_xor_pd(__m256d __a, __m256d __b)
644{
645  return (__m256d)((__v4du)__a ^ (__v4du)__b);
646}
647
648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
649///
650/// \headerfile <x86intrin.h>
651///
652/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
653///
654/// \param __a
655///    A 256-bit vector of [8 x float] containing one of the source operands.
656/// \param __b
657///    A 256-bit vector of [8 x float] containing one of the source operands.
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
659///    values between both operands.
660static __inline __m256 __DEFAULT_FN_ATTRS
661_mm256_xor_ps(__m256 __a, __m256 __b)
662{
663  return (__m256)((__v8su)__a ^ (__v8su)__b);
664}
665
666/* Horizontal arithmetic */
667/// \brief Horizontally adds the adjacent pairs of values contained in two
668///    256-bit vectors of [4 x double].
669///
670/// \headerfile <x86intrin.h>
671///
672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
673///
674/// \param __a
675///    A 256-bit vector of [4 x double] containing one of the source operands.
676///    The horizontal sums of the values are returned in the even-indexed
677///    elements of a vector of [4 x double].
678/// \param __b
679///    A 256-bit vector of [4 x double] containing one of the source operands.
680///    The horizontal sums of the values are returned in the odd-indexed
681///    elements of a vector of [4 x double].
682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
683///    both operands.
684static __inline __m256d __DEFAULT_FN_ATTRS
685_mm256_hadd_pd(__m256d __a, __m256d __b)
686{
687  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
688}
689
690/// \brief Horizontally adds the adjacent pairs of values contained in two
691///    256-bit vectors of [8 x float].
692///
693/// \headerfile <x86intrin.h>
694///
695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
696///
697/// \param __a
698///    A 256-bit vector of [8 x float] containing one of the source operands.
699///    The horizontal sums of the values are returned in the elements with
700///    index 0, 1, 4, 5 of a vector of [8 x float].
701/// \param __b
702///    A 256-bit vector of [8 x float] containing one of the source operands.
703///    The horizontal sums of the values are returned in the elements with
704///    index 2, 3, 6, 7 of a vector of [8 x float].
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
706///    both operands.
707static __inline __m256 __DEFAULT_FN_ATTRS
708_mm256_hadd_ps(__m256 __a, __m256 __b)
709{
710  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
711}
712
713/// \brief Horizontally subtracts the adjacent pairs of values contained in two
714///    256-bit vectors of [4 x double].
715///
716/// \headerfile <x86intrin.h>
717///
718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
719///
720/// \param __a
721///    A 256-bit vector of [4 x double] containing one of the source operands.
722///    The horizontal differences between the values are returned in the
723///    even-indexed elements of a vector of [4 x double].
724/// \param __b
725///    A 256-bit vector of [4 x double] containing one of the source operands.
726///    The horizontal differences between the values are returned in the
727///    odd-indexed elements of a vector of [4 x double].
728/// \returns A 256-bit vector of [4 x double] containing the horizontal
729///    differences of both operands.
730static __inline __m256d __DEFAULT_FN_ATTRS
731_mm256_hsub_pd(__m256d __a, __m256d __b)
732{
733  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
734}
735
736/// \brief Horizontally subtracts the adjacent pairs of values contained in two
737///    256-bit vectors of [8 x float].
738///
739/// \headerfile <x86intrin.h>
740///
741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
742///
743/// \param __a
744///    A 256-bit vector of [8 x float] containing one of the source operands.
745///    The horizontal differences between the values are returned in the
746///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
747/// \param __b
748///    A 256-bit vector of [8 x float] containing one of the source operands.
749///    The horizontal differences between the values are returned in the
750///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
751/// \returns A 256-bit vector of [8 x float] containing the horizontal
752///    differences of both operands.
753static __inline __m256 __DEFAULT_FN_ATTRS
754_mm256_hsub_ps(__m256 __a, __m256 __b)
755{
756  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
757}
758
759/* Vector permutations */
760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
761///    by the 128-bit integer vector operand.
762///
763/// \headerfile <x86intrin.h>
764///
765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
766///
767/// \param __a
768///    A 128-bit vector of [2 x double].
769/// \param __c
770///    A 128-bit integer vector operand specifying how the values are to be
771///    copied. \n
772///    Bit [1]: \n
773///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
774///         vector. \n
775///      1: Bits [127:64] of the source are copied to bits [63:0] of the
776///         returned vector. \n
777///    Bit [65]: \n
778///      0: Bits [63:0] of the source are copied to bits [127:64] of the
779///         returned vector. \n
780///      1: Bits [127:64] of the source are copied to bits [127:64] of the
781///         returned vector.
782/// \returns A 128-bit vector of [2 x double] containing the copied values.
783static __inline __m128d __DEFAULT_FN_ATTRS
784_mm_permutevar_pd(__m128d __a, __m128i __c)
785{
  /* The data operand is cast to __v2df and the run-time control vector to
   * __v2di (both typedefs are declared outside this part of the header) before
   * the vpermilvarpd builtin is called; the result is cast back to __m128d.
   * _mm_permute_pd is the counterpart that takes the control as an immediate. */
786  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
787}
788
789/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
790///    by the 256-bit integer vector operand.
791///
792/// \headerfile <x86intrin.h>
793///
794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
795///
796/// \param __a
797///    A 256-bit vector of [4 x double].
798/// \param __c
799///    A 256-bit integer vector operand specifying how the values are to be
800///    copied. \n
801///    Bit [1]: \n
802///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
803///         vector. \n
804///      1: Bits [127:64] of the source are copied to bits [63:0] of the
805///         returned vector. \n
806///    Bit [65]: \n
807///      0: Bits [63:0] of the source are copied to bits [127:64] of the
808///         returned vector. \n
809///      1: Bits [127:64] of the source are copied to bits [127:64] of the
810///         returned vector. \n
811///    Bit [129]: \n
812///      0: Bits [191:128] of the source are copied to bits [191:128] of the
813///         returned vector. \n
814///      1: Bits [255:192] of the source are copied to bits [191:128] of the
815///         returned vector. \n
816///    Bit [193]: \n
817///      0: Bits [191:128] of the source are copied to bits [255:192] of the
818///         returned vector. \n
819///      1: Bits [255:192] of the source are copied to bits [255:192] of the
820///         returned vector.
821/// \returns A 256-bit vector of [4 x double] containing the copied values.
822static __inline __m256d __DEFAULT_FN_ATTRS
823_mm256_permutevar_pd(__m256d __a, __m256i __c)
824{
  /* The data operand is cast to __v4df (double lanes) and the run-time control
   * vector to __v4di (long long lanes) before the vpermilvarpd256 builtin is
   * called; the result is cast back to __m256d.
   * _mm256_permute_pd is the counterpart that takes the control as an
   * immediate. */
825  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
826}
827
828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
829///    specified by the 128-bit integer vector operand.
///
830/// \headerfile <x86intrin.h>
831///
832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
833///
834/// \param __a
835///    A 128-bit vector of [4 x float].
836/// \param __c
837///    A 128-bit integer vector operand specifying how the values are to be
838///    copied. \n
839///    Bits [1:0]: \n
840///      00: Bits [31:0] of the source are copied to bits [31:0] of the
841///          returned vector. \n
842///      01: Bits [63:32] of the source are copied to bits [31:0] of the
843///          returned vector. \n
844///      10: Bits [95:64] of the source are copied to bits [31:0] of the
845///          returned vector. \n
846///      11: Bits [127:96] of the source are copied to bits [31:0] of the
847///          returned vector. \n
848///    Bits [33:32]: \n
849///      00: Bits [31:0] of the source are copied to bits [63:32] of the
850///          returned vector. \n
851///      01: Bits [63:32] of the source are copied to bits [63:32] of the
852///          returned vector. \n
853///      10: Bits [95:64] of the source are copied to bits [63:32] of the
854///          returned vector. \n
855///      11: Bits [127:96] of the source are copied to bits [63:32] of the
856///          returned vector. \n
857///    Bits [65:64]: \n
858///      00: Bits [31:0] of the source are copied to bits [95:64] of the
859///          returned vector. \n
860///      01: Bits [63:32] of the source are copied to bits [95:64] of the
861///          returned vector. \n
862///      10: Bits [95:64] of the source are copied to bits [95:64] of the
863///          returned vector. \n
864///      11: Bits [127:96] of the source are copied to bits [95:64] of the
865///          returned vector. \n
866///    Bits [97:96]: \n
867///      00: Bits [31:0] of the source are copied to bits [127:96] of the
868///          returned vector. \n
869///      01: Bits [63:32] of the source are copied to bits [127:96] of the
870///          returned vector. \n
871///      10: Bits [95:64] of the source are copied to bits [127:96] of the
872///          returned vector. \n
873///      11: Bits [127:96] of the source are copied to bits [127:96] of the
874///          returned vector.
875/// \returns A 128-bit vector of [4 x float] containing the copied values.
876static __inline __m128 __DEFAULT_FN_ATTRS
877_mm_permutevar_ps(__m128 __a, __m128i __c)
878{
  /* The data operand is cast to __v4sf and the run-time control vector to
   * __v4si (both typedefs are declared outside this part of the header) before
   * the vpermilvarps builtin is called; the result is cast back to __m128.
   * _mm_permute_ps is the counterpart that takes the control as an immediate. */
879  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
880}
881
882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
883///    specified by the 256-bit integer vector operand.
884///
885/// \headerfile <x86intrin.h>
886///
887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
888///
889/// \param __a
890///    A 256-bit vector of [8 x float].
891/// \param __c
892///    A 256-bit integer vector operand specifying how the values are to be
893///    copied. \n
894///    Bits [1:0]: \n
895///      00: Bits [31:0] of the source are copied to bits [31:0] of the
896///          returned vector. \n
897///      01: Bits [63:32] of the source are copied to bits [31:0] of the
898///          returned vector. \n
899///      10: Bits [95:64] of the source are copied to bits [31:0] of the
900///          returned vector. \n
901///      11: Bits [127:96] of the source are copied to bits [31:0] of the
902///          returned vector. \n
903///    Bits [33:32]: \n
904///      00: Bits [31:0] of the source are copied to bits [63:32] of the
905///          returned vector. \n
906///      01: Bits [63:32] of the source are copied to bits [63:32] of the
907///          returned vector. \n
908///      10: Bits [95:64] of the source are copied to bits [63:32] of the
909///          returned vector. \n
910///      11: Bits [127:96] of the source are copied to bits [63:32] of the
911///          returned vector. \n
912///    Bits [65:64]: \n
913///      00: Bits [31:0] of the source are copied to bits [95:64] of the
914///          returned vector. \n
915///      01: Bits [63:32] of the source are copied to bits [95:64] of the
916///          returned vector. \n
917///      10: Bits [95:64] of the source are copied to bits [95:64] of the
918///          returned vector. \n
919///      11: Bits [127:96] of the source are copied to bits [95:64] of the
920///          returned vector. \n
921///    Bits [97:96]: \n
922///      00: Bits [31:0] of the source are copied to bits [127:96] of the
923///          returned vector. \n
924///      01: Bits [63:32] of the source are copied to bits [127:96] of the
925///          returned vector. \n
926///      10: Bits [95:64] of the source are copied to bits [127:96] of the
927///          returned vector. \n
928///      11: Bits [127:96] of the source are copied to bits [127:96] of the
929///          returned vector. \n
930///    Bits [129:128]: \n
931///      00: Bits [159:128] of the source are copied to bits [159:128] of the
932///          returned vector. \n
933///      01: Bits [191:160] of the source are copied to bits [159:128] of the
934///          returned vector. \n
935///      10: Bits [223:192] of the source are copied to bits [159:128] of the
936///          returned vector. \n
937///      11: Bits [255:224] of the source are copied to bits [159:128] of the
938///          returned vector. \n
939///    Bits [161:160]: \n
940///      00: Bits [159:128] of the source are copied to bits [191:160] of the
941///          returned vector. \n
942///      01: Bits [191:160] of the source are copied to bits [191:160] of the
943///          returned vector. \n
944///      10: Bits [223:192] of the source are copied to bits [191:160] of the
945///          returned vector. \n
946///      11: Bits [255:224] of the source are copied to bits [191:160] of the
947///          returned vector. \n
948///    Bits [193:192]: \n
949///      00: Bits [159:128] of the source are copied to bits [223:192] of the
950///          returned vector. \n
951///      01: Bits [191:160] of the source are copied to bits [223:192] of the
952///          returned vector. \n
953///      10: Bits [223:192] of the source are copied to bits [223:192] of the
954///          returned vector. \n
955///      11: Bits [255:224] of the source are copied to bits [223:192] of the
956///          returned vector. \n
957///    Bits [225:224]: \n
958///      00: Bits [159:128] of the source are copied to bits [255:224] of the
959///          returned vector. \n
960///      01: Bits [191:160] of the source are copied to bits [255:224] of the
961///          returned vector. \n
962///      10: Bits [223:192] of the source are copied to bits [255:224] of the
963///          returned vector. \n
964///      11: Bits [255:224] of the source are copied to bits [255:224] of the
965///          returned vector.
966/// \returns A 256-bit vector of [8 x float] containing the copied values.
967static __inline __m256 __DEFAULT_FN_ATTRS
968_mm256_permutevar_ps(__m256 __a, __m256i __c)
969{
  /* The data operand is cast to __v8sf (float lanes) and the run-time control
   * vector to __v8si (int lanes) before the vpermilvarps256 builtin is called;
   * the result is cast back to __m256.
   * _mm256_permute_ps is the counterpart that takes the control as an
   * immediate. */
970  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
971}
972
973/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
974///    by the immediate integer operand.
975///
976/// \headerfile <x86intrin.h>
977///
978/// \code
979/// __m128d _mm_permute_pd(__m128d A, const int C);
980/// \endcode
981///
982/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
983///
984/// \param A
985///    A 128-bit vector of [2 x double].
986/// \param C
987///    An immediate integer operand specifying how the values are to be
988///    copied. \n
989///    Bit [0]: \n
990///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
991///         vector. \n
992///      1: Bits [127:64] of the source are copied to bits [63:0] of the
993///         returned vector. \n
994///    Bit [1]: \n
995///      0: Bits [63:0] of the source are copied to bits [127:64] of the
996///         returned vector. \n
997///      1: Bits [127:64] of the source are copied to bits [127:64] of the
998///         returned vector.
999/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Implementation note: bit 0 of C is used as the shuffle index of result
 * element 0 and bit 1 as the index of result element 1. Both indices are
 * masked to 0..1, which under __builtin_shufflevector's numbering address the
 * first operand (A) only, so the _mm_undefined_pd() operand is never selected. */
1000#define _mm_permute_pd(A, C) __extension__ ({ \
1001  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
1002                                   (__v2df)_mm_undefined_pd(), \
1003                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
1004
1005/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
1006///    the immediate integer operand.
1007///
1008/// \headerfile <x86intrin.h>
1009///
1010/// \code
1011/// __m256d _mm256_permute_pd(__m256d A, const int C);
1012/// \endcode
1013///
1014/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1015///
1016/// \param A
1017///    A 256-bit vector of [4 x double].
1018/// \param C
1019///    An immediate integer operand specifying how the values are to be
1020///    copied. \n
1021///    Bit [0]: \n
1022///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1023///         vector. \n
1024///      1: Bits [127:64] of the source are copied to bits [63:0] of the
1025///         returned vector. \n
1026///    Bit [1]: \n
1027///      0: Bits [63:0] of the source are copied to bits [127:64] of the
1028///         returned vector. \n
1029///      1: Bits [127:64] of the source are copied to bits [127:64] of the
1030///         returned vector. \n
1031///    Bit [2]: \n
1032///      0: Bits [191:128] of the source are copied to bits [191:128] of the
1033///         returned vector. \n
1034///      1: Bits [255:192] of the source are copied to bits [191:128] of the
1035///         returned vector. \n
1036///    Bit [3]: \n
1037///      0: Bits [191:128] of the source are copied to bits [255:192] of the
1038///         returned vector. \n
1039///      1: Bits [255:192] of the source are copied to bits [255:192] of the
1040///         returned vector.
1041/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Implementation note: result element i takes shuffle index base + bit i of C,
 * with base 0 for elements 0-1 and base 2 for elements 2-3, so each element is
 * chosen from within its own pair of A. All indices stay in 0..3, which under
 * __builtin_shufflevector's numbering address the first operand (A) only; the
 * _mm256_undefined_pd() operand is never selected. */
1042#define _mm256_permute_pd(A, C) __extension__ ({ \
1043  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
1044                                   (__v4df)_mm256_undefined_pd(), \
1045                                   0 + (((C) >> 0) & 0x1), \
1046                                   0 + (((C) >> 1) & 0x1), \
1047                                   2 + (((C) >> 2) & 0x1), \
1048                                   2 + (((C) >> 3) & 0x1)); })
1049
1050/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
1051///    the immediate integer operand.
1052///
1053/// \headerfile <x86intrin.h>
1054///
1055/// \code
1056/// __m128 _mm_permute_ps(__m128 A, const int C);
1057/// \endcode
1058///
1059/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1060///
1061/// \param A
1062///    A 128-bit vector of [4 x float].
1063/// \param C
1064///    An immediate integer operand specifying how the values are to be
1065///    copied. \n
1066///    Bits [1:0]: \n
1067///      00: Bits [31:0] of the source are copied to bits [31:0] of the
1068///          returned vector. \n
1069///      01: Bits [63:32] of the source are copied to bits [31:0] of the
1070///          returned vector. \n
1071///      10: Bits [95:64] of the source are copied to bits [31:0] of the
1072///          returned vector. \n
1073///      11: Bits [127:96] of the source are copied to bits [31:0] of the
1074///          returned vector. \n
1075///    Bits [3:2]: \n
1076///      00: Bits [31:0] of the source are copied to bits [63:32] of the
1077///          returned vector. \n
1078///      01: Bits [63:32] of the source are copied to bits [63:32] of the
1079///          returned vector. \n
1080///      10: Bits [95:64] of the source are copied to bits [63:32] of the
1081///          returned vector. \n
1082///      11: Bits [127:96] of the source are copied to bits [63:32] of the
1083///          returned vector. \n
1084///    Bits [5:4]: \n
1085///      00: Bits [31:0] of the source are copied to bits [95:64] of the
1086///          returned vector. \n
1087///      01: Bits [63:32] of the source are copied to bits [95:64] of the
1088///          returned vector. \n
1089///      10: Bits [95:64] of the source are copied to bits [95:64] of the
1090///          returned vector. \n
1091///      11: Bits [127:96] of the source are copied to bits [95:64] of the
1092///          returned vector. \n
1093///    Bits [7:6]: \n
1094///      00: Bits [31:0] of the source are copied to bits [127:96] of the
1095///          returned vector. \n
1096///      01: Bits [63:32] of the source are copied to bits [127:96] of the
1097///          returned vector. \n
1098///      10: Bits [95:64] of the source are copied to bits [127:96] of the
1099///          returned vector. \n
1100///      11: Bits [127:96] of the source are copied to bits [127:96] of the
1101///          returned vector.
1102/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Implementation note: the four 2-bit fields of C (bits [1:0], [3:2], [5:4]
 * and [7:6]) are used, in that order, as the shuffle indices of result
 * elements 0..3. Each index is masked to 0..3, which under
 * __builtin_shufflevector's numbering address the first operand (A) only; the
 * _mm_undefined_ps() operand is never selected. */
1103#define _mm_permute_ps(A, C) __extension__ ({ \
1104  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
1105                                  (__v4sf)_mm_undefined_ps(), \
1106                                  ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
1107                                  ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
1108
1109/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
1110///    the immediate integer operand.
1111///
1112/// \headerfile <x86intrin.h>
1113///
1114/// \code
1115/// __m256 _mm256_permute_ps(__m256 A, const int C);
1116/// \endcode
1117///
1118/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1119///
1120/// \param A
1121///    A 256-bit vector of [8 x float].
1122/// \param C
1123///    An immediate integer operand specifying how the values are to be
1124///    copied. \n
1125///    Bits [1:0]: \n
1126///      00: Bits [31:0] of the source are copied to bits [31:0] of the
1127///          returned vector. \n
1128///      01: Bits [63:32] of the source are copied to bits [31:0] of the
1129///          returned vector. \n
1130///      10: Bits [95:64] of the source are copied to bits [31:0] of the
1131///          returned vector. \n
1132///      11: Bits [127:96] of the source are copied to bits [31:0] of the
1133///          returned vector. \n
1134///    Bits [3:2]: \n
1135///      00: Bits [31:0] of the source are copied to bits [63:32] of the
1136///          returned vector. \n
1137///      01: Bits [63:32] of the source are copied to bits [63:32] of the
1138///          returned vector. \n
1139///      10: Bits [95:64] of the source are copied to bits [63:32] of the
1140///          returned vector. \n
1141///      11: Bits [127:96] of the source are copied to bits [63:32] of the
1142///          returned vector. \n
1143///    Bits [5:4]: \n
1144///      00: Bits [31:0] of the source are copied to bits [95:64] of the
1145///          returned vector. \n
1146///      01: Bits [63:32] of the source are copied to bits [95:64] of the
1147///          returned vector. \n
1148///      10: Bits [95:64] of the source are copied to bits [95:64] of the
1149///          returned vector. \n
1150///      11: Bits [127:96] of the source are copied to bits [95:64] of the
1151///          returned vector. \n
1152///    Bits [7:6]: \n
1153///      00: Bits [31:0] of the source are copied to bits [127:96] of the
1154///          returned vector. \n
1155///      01: Bits [63:32] of the source are copied to bits [127:96] of the
1156///          returned vector. \n
1157///      10: Bits [95:64] of the source are copied to bits [127:96] of the
1158///          returned vector. \n
1159///      11: Bits [127:96] of the source are copied to bits [127:96] of the
1160///          returned vector. \n
1161///    Bits [1:0]: \n
1162///      00: Bits [159:128] of the source are copied to bits [159:128] of the
1163///          returned vector. \n
1164///      01: Bits [191:160] of the source are copied to bits [159:128] of the
1165///          returned vector. \n
1166///      10: Bits [223:192] of the source are copied to bits [159:128] of the
1167///          returned vector. \n
1168///      11: Bits [255:224] of the source are copied to bits [159:128] of the
1169///          returned vector. \n
1170///    Bits [3:2]: \n
1171///      00: Bits [159:128] of the source are copied to bits [191:160] of the
1172///          returned vector. \n
1173///      01: Bits [191:160] of the source are copied to bits [191:160] of the
1174///          returned vector. \n
1175///      10: Bits [223:192] of the source are copied to bits [191:160] of the
1176///          returned vector. \n
1177///      11: Bits [255:224] of the source are copied to bits [191:160] of the
1178///          returned vector. \n
1179///    Bits [5:4]: \n
1180///      00: Bits [159:128] of the source are copied to bits [223:192] of the
1181///          returned vector. \n
1182///      01: Bits [191:160] of the source are copied to bits [223:192] of the
1183///          returned vector. \n
1184///      10: Bits [223:192] of the source are copied to bits [223:192] of the
1185///          returned vector. \n
1186///      11: Bits [255:224] of the source are copied to bits [223:192] of the
1187///          returned vector. \n
1188///    Bits [7:6]: \n
1189///      00: Bits [159:128] of the source are copied to bits [255:224] of the
1190///          returned vector. \n
1191///      01: Bits [191:160] of the source are copied to bits [255:224] of the
1192///          returned vector. \n
1193///      10: Bits [223:192] of the source are copied to bits [255:224] of the
1194///          returned vector. \n
1195///      11: Bits [255:224] of the source are copied to bits [255:224] of the
1196///          returned vector.
1197/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Implementation note: the same four 2-bit fields of C (bits [1:0], [3:2],
 * [5:4] and [7:6]) are applied twice: with base 0 they give the shuffle
 * indices of result elements 0..3, and with base 4 those of result elements
 * 4..7. All indices stay in 0..7, which under __builtin_shufflevector's
 * numbering address the first operand (A) only; the _mm256_undefined_ps()
 * operand is never selected. */
1198#define _mm256_permute_ps(A, C) __extension__ ({ \
1199  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
1200                                  (__v8sf)_mm256_undefined_ps(), \
1201                                  0 + (((C) >> 0) & 0x3), \
1202                                  0 + (((C) >> 2) & 0x3), \
1203                                  0 + (((C) >> 4) & 0x3), \
1204                                  0 + (((C) >> 6) & 0x3), \
1205                                  4 + (((C) >> 0) & 0x3), \
1206                                  4 + (((C) >> 2) & 0x3), \
1207                                  4 + (((C) >> 4) & 0x3), \
1208                                  4 + (((C) >> 6) & 0x3)); })
1209
1210/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
1211///    [4 x double], as specified by the immediate integer operand.
1212///
1213/// \headerfile <x86intrin.h>
1214///
1215/// \code
1216/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1217/// \endcode
1218///
1219/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1220///
1221/// \param V1
1222///    A 256-bit vector of [4 x double].
1223/// \param V2
1224///    A 256-bit vector of [4 x double].
1225/// \param M
1226///    An immediate integer operand specifying how the values are to be
1227///    permuted. \n
1228///    Bits [1:0]: \n
1229///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1230///          destination. \n
1231///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1232///          destination. \n
1233///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1234///          destination. \n
1235///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1236///          destination. \n
1237///    Bits [5:4]: \n
1238///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1239///          destination. \n
1240///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1241///          destination. \n
1242///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1243///          destination. \n
1244///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1245///          destination.
1246/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Implementation note: V1 and V2 are cast to __v4df and M is forwarded
 * unchanged to the vperm2f128_pd256 builtin. This is a macro rather than an
 * inline function, presumably so that M reaches the builtin as the immediate
 * the documentation requires (TODO confirm against the builtin's definition). */
1247#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
1248  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
1249                                           (__v4df)(__m256d)(V2), (M)); })
1250
1251/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
1252///    [8 x float], as specified by the immediate integer operand.
1253///
1254/// \headerfile <x86intrin.h>
1255///
1256/// \code
1257/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1258/// \endcode
1259///
1260/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1261///
1262/// \param V1
1263///    A 256-bit vector of [8 x float].
1264/// \param V2
1265///    A 256-bit vector of [8 x float].
1266/// \param M
1267///    An immediate integer operand specifying how the values are to be
1268///    permuted. \n
1269///    Bits [1:0]: \n
1270///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1271///    destination. \n
1272///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1273///    destination. \n
1274///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1275///    destination. \n
1276///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1277///    destination. \n
1278///    Bits [5:4]: \n
1279///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1280///    destination. \n
1281///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1282///    destination. \n
1283///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1284///    destination. \n
1285///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1286///    destination.
1287/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Implementation note: V1 and V2 are cast to __v8sf and M is forwarded
 * unchanged to the vperm2f128_ps256 builtin. This is a macro rather than an
 * inline function, presumably so that M reaches the builtin as the immediate
 * the documentation requires (TODO confirm against the builtin's definition). */
1288#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
1289  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
1290                                          (__v8sf)(__m256)(V2), (M)); })
1291
1292/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
1293///    as specified by the immediate integer operand.
1294///
1295/// \headerfile <x86intrin.h>
1296///
1297/// \code
1298/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1299/// \endcode
1300///
1301/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1302///
1303/// \param V1
1304///    A 256-bit integer vector.
1305/// \param V2
1306///    A 256-bit integer vector.
1307/// \param M
1308///    An immediate integer operand specifying how the values are to be copied. \n
1309///    Bits [1:0]: \n
1310///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1311///    destination. \n
1312///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1313///    destination. \n
1314///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1315///    destination. \n
1316///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1317///    destination. \n
1318///    Bits [5:4]: \n
1319///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1320///    destination. \n
1321///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1322///    destination. \n
1323///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1324///    destination. \n
1325///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1326///    destination.
1327/// \returns A 256-bit integer vector containing the copied values.
/* Implementation note: the integer vectors V1 and V2 are cast to __v8si (the
 * 32-byte int vector typedef at the top of this header) and M is forwarded
 * unchanged to the vperm2f128_si256 builtin. This is a macro rather than an
 * inline function, presumably so that M reaches the builtin as the immediate
 * the documentation requires (TODO confirm against the builtin's definition). */
1328#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
1329  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
1330                                           (__v8si)(__m256i)(V2), (M)); })
1331
1332/* Vector Blend */
1333/// \brief Merges 64-bit double-precision data values stored in either of the
1334///    two 256-bit vectors of [4 x double], as specified by the immediate
1335///    integer operand.
1336///
1337/// \headerfile <x86intrin.h>
1338///
1339/// \code
1340/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1341/// \endcode
1342///
1343/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1344///
1345/// \param V1
1346///    A 256-bit vector of [4 x double].
1347/// \param V2
1348///    A 256-bit vector of [4 x double].
1349/// \param M
1350///    An immediate integer operand, with mask bits [3:0] specifying how the
1351///    values are to be copied. The position of the mask bit corresponds to the
1352///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
1353///    element in operand \a V1 is copied to the same position in the
1354///    destination. When a mask bit is 1, the corresponding 64-bit element in
1355///    operand \a V2 is copied to the same position in the destination.
1356/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Implementation note: for result element i (0..3), mask bit i of M selects
 * the shuffle index: i (element i of V1) when the bit is clear, 4 + i
 * (element i of V2) when it is set. Bits of M above bit 3 are not examined. */
1357#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
1358  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
1359                                   (__v4df)(__m256d)(V2), \
1360                                   (((M) & 0x01) ? 4 : 0), \
1361                                   (((M) & 0x02) ? 5 : 1), \
1362                                   (((M) & 0x04) ? 6 : 2), \
1363                                   (((M) & 0x08) ? 7 : 3)); })
1364
1365/// \brief Merges 32-bit single-precision data values stored in either of the
1366///    two 256-bit vectors of [8 x float], as specified by the immediate
1367///    integer operand.
1368///
1369/// \headerfile <x86intrin.h>
1370///
1371/// \code
1372/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1373/// \endcode
1374///
1375/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1376///
1377/// \param V1
1378///    A 256-bit vector of [8 x float].
1379/// \param V2
1380///    A 256-bit vector of [8 x float].
1381/// \param M
1382///    An immediate integer operand, with mask bits [7:0] specifying how the
1383///    values are to be copied. The position of the mask bit corresponds to the
1384///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
1385///    element in operand \a V1 is copied to the same position in the
1386///    destination. When a mask bit is 1, the corresponding 32-bit element in
1387///    operand \a V2 is copied to the same position in the destination.
1388/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Implementation note: for result element i (0..7), mask bit i of M selects
 * the shuffle index: i (element i of V1) when the bit is clear, 8 + i
 * (element i of V2) when it is set. Bits of M above bit 7 are not examined. */
1389#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
1390  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
1391                                  (__v8sf)(__m256)(V2), \
1392                                  (((M) & 0x01) ?  8 : 0), \
1393                                  (((M) & 0x02) ?  9 : 1), \
1394                                  (((M) & 0x04) ? 10 : 2), \
1395                                  (((M) & 0x08) ? 11 : 3), \
1396                                  (((M) & 0x10) ? 12 : 4), \
1397                                  (((M) & 0x20) ? 13 : 5), \
1398                                  (((M) & 0x40) ? 14 : 6), \
1399                                  (((M) & 0x80) ? 15 : 7)); })
1400
1401/// \brief Merges 64-bit double-precision data values stored in either of the
1402///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1403///    operand.
1404///
1405/// \headerfile <x86intrin.h>
1406///
1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1408///
1409/// \param __a
1410///    A 256-bit vector of [4 x double].
1411/// \param __b
1412///    A 256-bit vector of [4 x double].
1413/// \param __c
1414///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1415///    how the values are to be copied. The position of the mask bit corresponds
1416///    to the most significant bit of a copied value. When a mask bit is 0, the
1417///    corresponding 64-bit element in operand \a __a is copied to the same
1418///    position in the destination. When a mask bit is 1, the corresponding
1419///    64-bit element in operand \a __b is copied to the same position in the
1420///    destination.
1421/// \returns A 256-bit vector of [4 x double] containing the copied values.
1422static __inline __m256d __DEFAULT_FN_ATTRS
1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1424{
  /* All three operands, the mask __c included, are cast to __v4df for the
   * blendvpd256 builtin; the builtin's result is cast back to __m256d.
   * Unlike _mm256_blend_pd, the selection mask is a run-time vector rather
   * than an immediate. */
1425  return (__m256d)__builtin_ia32_blendvpd256(
1426    (__v4df)__a, (__v4df)__b, (__v4df)__c);
1427}
1428
1429/// \brief Merges 32-bit single-precision data values stored in either of the
1430///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1431///    operand.
1432///
1433/// \headerfile <x86intrin.h>
1434///
1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1436///
1437/// \param __a
1438///    A 256-bit vector of [8 x float].
1439/// \param __b
1440///    A 256-bit vector of [8 x float].
1441/// \param __c
1442///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1443///    and 31 specifying how the values are to be copied. The position of the
1444///    mask bit corresponds to the most significant bit of a copied value. When
1445///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1446///    copied to the same position in the destination. When a mask bit is 1, the
1447///    corresponding 32-bit element in operand \a __b is copied to the same
1448///    position in the destination.
1449/// \returns A 256-bit vector of [8 x float] containing the copied values.
1450static __inline __m256 __DEFAULT_FN_ATTRS
1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1452{
  /* All three operands, the mask __c included, are cast to __v8sf for the
   * blendvps256 builtin; the builtin's result is cast back to __m256.
   * Unlike _mm256_blend_ps, the selection mask is a run-time vector rather
   * than an immediate. */
1453  return (__m256)__builtin_ia32_blendvps256(
1454    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1455}
1456
/* Vector Dot Product */
/// \brief Computes two independent dot products, one from the lower halves
///    and one from the upper halves of two [8 x float] vectors, and places
///    them in the lower and upper halves of the [8 x float] result. An
///    immediate integer operand selects which input elements take part in
///    each dot product and which result elements receive it. In general, for
///    each dot product, the four corresponding elements of the input vectors
///    are multiplied; the first two and second two products are summed, then
///    the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. \n
///    Bits [7:4] choose the inputs: bit [4] corresponds to the lowest element
///    and bit [7] to the highest element of each [4 x float] subvector. When
///    a bit is set, the corresponding elements of the two input vectors are
///    used as an input for the dot product; otherwise that input is treated
///    as zero. \n
///    Bits [3:0] choose the outputs: bit [0] corresponds to the lowest element
///    and bit [3] to the highest element of each [4 x float] subvector. When
///    a bit is set, the dot product is returned in the corresponding element;
///    otherwise that element is set to zero. \n
///    The same bitmask is applied to each of the two parallel dot product
///    computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_dpps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (M)); })
1497
/* Vector shuffle */
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand. Within each 128-bit half, two
///    elements are chosen from the first operand and two from the second: the
///    elements selected from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] of the destination, and the elements selected
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] of the destination. For example, if bits [7:0] of the
///    immediate operand contain a value of 0xFF, the 256-bit destination
///    vector would contain the following values: b[7], b[7], a[7], a[7], b[3],
///    b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b: \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    Each 2-bit field of the immediate controls one destination element in
///    each 128-bit half: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///    the destination. \n
///    Meaning of each 2-bit field value: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), \
      /* lower 128 bits: two picks from a (0..3), two from b (8..11) */ \
      ((mask) & 0x3), \
      (((mask) >> 2) & 0x3), \
      (((mask) >> 4) & 0x3) + 8, \
      (((mask) >> 6) & 0x3) + 8, \
      /* upper 128 bits: two picks from a (4..7), two from b (12..15) */ \
      ((mask) & 0x3) + 4, \
      (((mask) >> 2) & 0x3) + 4, \
      (((mask) >> 4) & 0x3) + 12, \
      (((mask) >> 6) & 0x3) + 12); })
1558
/// \brief Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand. Elements
///    chosen from the first 256-bit operand go to bits [63:0] and bits
///    [191:128] of the destination; elements chosen from the second 256-bit
///    operand go to bits [127:64] and bits [255:192] of the destination. For
///    example, if bits [3:0] of the immediate operand contain a value of 0xF,
///    the 256-bit destination vector would contain the following values: b[3],
///    a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), \
      /* indices 0..3 address a, indices 4..7 address b */ \
      ((mask) & 0x1), \
      (((mask) >> 1) & 0x1) + 4, \
      (((mask) >> 2) & 0x1) + 2, \
      (((mask) >> 3) & 0x1) + 6); })
1607
/* Compare */
/*
 * Comparison predicate encodings (values 0x00 through 0x1f). Each name gives
 * the relation, whether the predicate is ordered (O) or unordered (U), and
 * whether it is signaling (S) or non-signaling (Q).
 */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered,
                              signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered,
                              non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered,
                              non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
1641
/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. Returns a [2 x double] vector consisting of
///    two doubles corresponding to the two comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the _CMP_* constants defined
///    in this header; the two values listed on a line differ only in whether
///    the comparison is signaling or non-signaling: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })
1678
/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand. Returns a [4 x float] vector consisting of four floats
///    corresponding to the four comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the _CMP_* constants defined
///    in this header; the two values listed on a line differ only in whether
///    the comparison is signaling or non-signaling: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })
1715
/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand. Returns a [4 x double] vector consisting of
///    four doubles corresponding to the four comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the _CMP_* constants defined
///    in this header; the two values listed on a line differ only in whether
///    the comparison is signaling or non-signaling: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), (c)); })
1752
/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand. Returns a [8 x float] vector consisting of eight floats
///    corresponding to the eight comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the _CMP_* constants defined
///    in this header; the two values listed on a line differ only in whether
///    the comparison is signaling or non-signaling: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), (c)); })
1789
/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. If the result is true, all 64 bits of the
///    destination vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the _CMP_* constants defined
///    in this header; the two values listed on a line differ only in whether
///    the comparison is signaling or non-signaling: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })
1825
/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand. If the result is true, all 32 bits of the destination
///    vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the _CMP_* constants defined
///    in this header; the two values listed on a line differ only in whether
///    the comparison is signaling or non-signaling: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })
1861
1862/// \brief Takes a [8 x i32] vector and returns the vector element value
1863///    indexed by the immediate constant operand.
1864///
1865/// \headerfile <x86intrin.h>
1866///
1867/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1868///   instruction.
1869///
1870/// \param __a
1871///    A 256-bit vector of [8 x i32].
1872/// \param __imm
1873///    An immediate integer operand with bits [2:0] determining which vector
1874///    element is extracted and returned.
1875/// \returns A 32-bit integer containing the extracted 32 bits of extended
1876///    packed data.
1877static __inline int __DEFAULT_FN_ATTRS
1878_mm256_extract_epi32(__m256i __a, const int __imm)
1879{
1880  __v8si __b = (__v8si)__a;
1881  return __b[__imm & 7];
1882}
1883
1884/// \brief Takes a [16 x i16] vector and returns the vector element value
1885///    indexed by the immediate constant operand.
1886///
1887/// \headerfile <x86intrin.h>
1888///
1889/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1890///   instruction.
1891///
1892/// \param __a
1893///    A 256-bit integer vector of [16 x i16].
1894/// \param __imm
1895///    An immediate integer operand with bits [3:0] determining which vector
1896///    element is extracted and returned.
1897/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
1898///    packed data.
1899static __inline int __DEFAULT_FN_ATTRS
1900_mm256_extract_epi16(__m256i __a, const int __imm)
1901{
1902  __v16hi __b = (__v16hi)__a;
1903  return (unsigned short)__b[__imm & 15];
1904}
1905
1906/// \brief Takes a [32 x i8] vector and returns the vector element value
1907///    indexed by the immediate constant operand.
1908///
1909/// \headerfile <x86intrin.h>
1910///
1911/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1912///   instruction.
1913///
1914/// \param __a
1915///    A 256-bit integer vector of [32 x i8].
1916/// \param __imm
1917///    An immediate integer operand with bits [4:0] determining which vector
1918///    element is extracted and returned.
1919/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
1920///    packed data.
1921static __inline int __DEFAULT_FN_ATTRS
1922_mm256_extract_epi8(__m256i __a, const int __imm)
1923{
1924  __v32qi __b = (__v32qi)__a;
1925  return (unsigned char)__b[__imm & 31];
1926}
1927
1928#ifdef __x86_64__
1929/// \brief Takes a [4 x i64] vector and returns the vector element value
1930///    indexed by the immediate constant operand.
1931///
1932/// \headerfile <x86intrin.h>
1933///
1934/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1935///   instruction.
1936///
1937/// \param __a
1938///    A 256-bit integer vector of [4 x i64].
1939/// \param __imm
1940///    An immediate integer operand with bits [1:0] determining which vector
1941///    element is extracted and returned.
1942/// \returns A 64-bit integer containing the extracted 64 bits of extended
1943///    packed data.
1944static __inline long long  __DEFAULT_FN_ATTRS
1945_mm256_extract_epi64(__m256i __a, const int __imm)
1946{
1947  __v4di __b = (__v4di)__a;
1948  return __b[__imm & 3];
1949}
1950#endif
1951
1952/// \brief Takes a [8 x i32] vector and replaces the vector element value
1953///    indexed by the immediate constant operand by a new value. Returns the
1954///    modified vector.
1955///
1956/// \headerfile <x86intrin.h>
1957///
1958/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
1959///   instruction.
1960///
1961/// \param __a
1962///    A vector of [8 x i32] to be used by the insert operation.
1963/// \param __b
1964///    An integer value. The replacement value for the insert operation.
1965/// \param __imm
1966///    An immediate integer specifying the index of the vector element to be
1967///    replaced.
1968/// \returns A copy of vector \a __a, after replacing its element indexed by
1969///    \a __imm with \a __b.
1970static __inline __m256i __DEFAULT_FN_ATTRS
1971_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
1972{
1973  __v8si __c = (__v8si)__a;
1974  __c[__imm & 7] = __b;
1975  return (__m256i)__c;
1976}
1977
1978
1979/// \brief Takes a [16 x i16] vector and replaces the vector element value
1980///    indexed by the immediate constant operand with a new value. Returns the
1981///    modified vector.
1982///
1983/// \headerfile <x86intrin.h>
1984///
1985/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
1986///   instruction.
1987///
1988/// \param __a
1989///    A vector of [16 x i16] to be used by the insert operation.
1990/// \param __b
1991///    An i16 integer value. The replacement value for the insert operation.
1992/// \param __imm
1993///    An immediate integer specifying the index of the vector element to be
1994///    replaced.
1995/// \returns A copy of vector \a __a, after replacing its element indexed by
1996///    \a __imm with \a __b.
1997static __inline __m256i __DEFAULT_FN_ATTRS
1998_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
1999{
2000  __v16hi __c = (__v16hi)__a;
2001  __c[__imm & 15] = __b;
2002  return (__m256i)__c;
2003}
2004
2005/// \brief Takes a [32 x i8] vector and replaces the vector element value
2006///    indexed by the immediate constant operand with a new value. Returns the
2007///    modified vector.
2008///
2009/// \headerfile <x86intrin.h>
2010///
2011/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2012///   instruction.
2013///
2014/// \param __a
2015///    A vector of [32 x i8] to be used by the insert operation.
2016/// \param __b
2017///    An i8 integer value. The replacement value for the insert operation.
2018/// \param __imm
2019///    An immediate integer specifying the index of the vector element to be
2020///    replaced.
2021/// \returns A copy of vector \a __a, after replacing its element indexed by
2022///    \a __imm with \a __b.
2023static __inline __m256i __DEFAULT_FN_ATTRS
2024_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
2025{
2026  __v32qi __c = (__v32qi)__a;
2027  __c[__imm & 31] = __b;
2028  return (__m256i)__c;
2029}
2030
2031#ifdef __x86_64__
2032/// \brief Takes a [4 x i64] vector and replaces the vector element value
2033///    indexed by the immediate constant operand with a new value. Returns the
2034///    modified vector.
2035///
2036/// \headerfile <x86intrin.h>
2037///
2038/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2039///   instruction.
2040///
2041/// \param __a
2042///    A vector of [4 x i64] to be used by the insert operation.
2043/// \param __b
2044///    A 64-bit integer value. The replacement value for the insert operation.
2045/// \param __imm
2046///    An immediate integer specifying the index of the vector element to be
2047///    replaced.
2048/// \returns A copy of vector \a __a, after replacing its element indexed by
2049///     \a __imm with \a __b.
2050static __inline __m256i __DEFAULT_FN_ATTRS
2051_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
2052{
2053  __v4di __c = (__v4di)__a;
2054  __c[__imm & 3] = __b;
2055  return (__m256i)__c;
2056}
2057#endif
2058
2059/* Conversion */
2060/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
2061///
2062/// \headerfile <x86intrin.h>
2063///
2064/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2065///
2066/// \param __a
2067///    A 128-bit integer vector of [4 x i32].
2068/// \returns A 256-bit vector of [4 x double] containing the converted values.
2069static __inline __m256d __DEFAULT_FN_ATTRS
2070_mm256_cvtepi32_pd(__m128i __a)
2071{
2072  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2073}
2074
2075/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
2076///
2077/// \headerfile <x86intrin.h>
2078///
2079/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2080///
2081/// \param __a
2082///    A 256-bit integer vector.
2083/// \returns A 256-bit vector of [8 x float] containing the converted values.
2084static __inline __m256 __DEFAULT_FN_ATTRS
2085_mm256_cvtepi32_ps(__m256i __a)
2086{
2087  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
2088}
2089
2090/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2091///    [4 x float].
2092///
2093/// \headerfile <x86intrin.h>
2094///
2095/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2096///
2097/// \param __a
2098///    A 256-bit vector of [4 x double].
2099/// \returns A 128-bit vector of [4 x float] containing the converted values.
2100static __inline __m128 __DEFAULT_FN_ATTRS
2101_mm256_cvtpd_ps(__m256d __a)
2102{
2103  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2104}
2105
2106/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
2107///
2108/// \headerfile <x86intrin.h>
2109///
2110/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2111///
2112/// \param __a
2113///    A 256-bit vector of [8 x float].
2114/// \returns A 256-bit integer vector containing the converted values.
2115static __inline __m256i __DEFAULT_FN_ATTRS
2116_mm256_cvtps_epi32(__m256 __a)
2117{
2118  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2119}
2120
2121/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2122///    x double].
2123///
2124/// \headerfile <x86intrin.h>
2125///
2126/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2127///
2128/// \param __a
2129///    A 128-bit vector of [4 x float].
2130/// \returns A 256-bit vector of [4 x double] containing the converted values.
2131static __inline __m256d __DEFAULT_FN_ATTRS
2132_mm256_cvtps_pd(__m128 __a)
2133{
2134  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2135}
2136
2137/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2138///    x i32], truncating the result by rounding towards zero when it is
2139///    inexact.
2140///
2141/// \headerfile <x86intrin.h>
2142///
2143/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2144///
2145/// \param __a
2146///    A 256-bit vector of [4 x double].
2147/// \returns A 128-bit integer vector containing the converted values.
2148static __inline __m128i __DEFAULT_FN_ATTRS
2149_mm256_cvttpd_epi32(__m256d __a)
2150{
2151  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2152}
2153
2154/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2155///    x i32]. When a conversion is inexact, the value returned is rounded
2156///    according to the rounding control bits in the MXCSR register.
2157///
2158/// \headerfile <x86intrin.h>
2159///
2160/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2161///
2162/// \param __a
2163///    A 256-bit vector of [4 x double].
2164/// \returns A 128-bit integer vector containing the converted values.
2165static __inline __m128i __DEFAULT_FN_ATTRS
2166_mm256_cvtpd_epi32(__m256d __a)
2167{
2168  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2169}
2170
2171/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
2172///    truncating the result by rounding towards zero when it is inexact.
2173///
2174/// \headerfile <x86intrin.h>
2175///
2176/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2177///
2178/// \param __a
2179///    A 256-bit vector of [8 x float].
2180/// \returns A 256-bit integer vector containing the converted values.
2181static __inline __m256i __DEFAULT_FN_ATTRS
2182_mm256_cvttps_epi32(__m256 __a)
2183{
2184  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2185}
2186
2187static __inline double __DEFAULT_FN_ATTRS
2188_mm256_cvtsd_f64(__m256d __a)
2189{
2190 return __a[0];
2191}
2192
2193static __inline int __DEFAULT_FN_ATTRS
2194_mm256_cvtsi256_si32(__m256i __a)
2195{
2196 __v8si __b = (__v8si)__a;
2197 return __b[0];
2198}
2199
2200static __inline float __DEFAULT_FN_ATTRS
2201_mm256_cvtss_f32(__m256 __a)
2202{
2203 return __a[0];
2204}
2205
2206/* Vector replicate */
2207/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
2208///    vector of [8 x float] to float values in a 256-bit vector of
2209///    [8 x float].
2210///
2211/// \headerfile <x86intrin.h>
2212///
2213/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2214///
2215/// \param __a
2216///    A 256-bit vector of [8 x float]. \n
2217///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2218///    the return value. \n
2219///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2220///    the return value. \n
2221///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2222///    return value. \n
2223///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2224///    return value.
2225/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2226///    values.
2227static __inline __m256 __DEFAULT_FN_ATTRS
2228_mm256_movehdup_ps(__m256 __a)
2229{
2230  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2231}
2232
2233/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
2234///    vector of [8 x float] to float values in a 256-bit vector of [8 x float].
2235///
2236/// \headerfile <x86intrin.h>
2237///
2238/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2239///
2240/// \param __a
2241///    A 256-bit vector of [8 x float]. \n
2242///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2243///    the return value. \n
2244///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2245///    the return value. \n
2246///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2247///    return value. \n
2248///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2249///    return value.
2250/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2251///    values.
2252static __inline __m256 __DEFAULT_FN_ATTRS
2253_mm256_moveldup_ps(__m256 __a)
2254{
2255  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2256}
2257
2258/// \brief Moves and duplicates double-precision floating point values from a
2259///    256-bit vector of [4 x double] to double-precision values in a 256-bit
2260///    vector of [4 x double].
2261///
2262/// \headerfile <x86intrin.h>
2263///
2264/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2265///
2266/// \param __a
2267///    A 256-bit vector of [4 x double]. \n
2268///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2269///    return value. \n
2270///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2271///    the return value.
2272/// \returns A 256-bit vector of [4 x double] containing the moved and
2273///    duplicated values.
2274static __inline __m256d __DEFAULT_FN_ATTRS
2275_mm256_movedup_pd(__m256d __a)
2276{
2277  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2278}
2279
2280/* Unpack and Interleave */
2281/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
2282///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2283///
2284/// \headerfile <x86intrin.h>
2285///
2286/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2287///
2288/// \param __a
2289///    A 256-bit floating-point vector of [4 x double]. \n
2290///    Bits [127:64] are written to bits [63:0] of the return value. \n
2291///    Bits [255:192] are written to bits [191:128] of the return value. \n
2292/// \param __b
2293///    A 256-bit floating-point vector of [4 x double]. \n
2294///    Bits [127:64] are written to bits [127:64] of the return value. \n
2295///    Bits [255:192] are written to bits [255:192] of the return value. \n
2296/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2297static __inline __m256d __DEFAULT_FN_ATTRS
2298_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2299{
2300  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2301}
2302
2303/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
2304///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2305///
2306/// \headerfile <x86intrin.h>
2307///
2308/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2309///
2310/// \param __a
2311///    A 256-bit floating-point vector of [4 x double]. \n
2312///    Bits [63:0] are written to bits [63:0] of the return value. \n
2313///    Bits [191:128] are written to bits [191:128] of the return value.
2314/// \param __b
2315///    A 256-bit floating-point vector of [4 x double]. \n
2316///    Bits [63:0] are written to bits [127:64] of the return value. \n
2317///    Bits [191:128] are written to bits [255:192] of the return value. \n
2318/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2319static __inline __m256d __DEFAULT_FN_ATTRS
2320_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2321{
2322  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2323}
2324
2325/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2326///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2327///    vector of [8 x float].
2328///
2329/// \headerfile <x86intrin.h>
2330///
2331/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2332///
2333/// \param __a
2334///    A 256-bit vector of [8 x float]. \n
2335///    Bits [95:64] are written to bits [31:0] of the return value. \n
2336///    Bits [127:96] are written to bits [95:64] of the return value. \n
2337///    Bits [223:192] are written to bits [159:128] of the return value. \n
2338///    Bits [255:224] are written to bits [223:192] of the return value.
2339/// \param __b
2340///    A 256-bit vector of [8 x float]. \n
2341///    Bits [95:64] are written to bits [63:32] of the return value. \n
2342///    Bits [127:96] are written to bits [127:96] of the return value. \n
2343///    Bits [223:192] are written to bits [191:160] of the return value. \n
2344///    Bits [255:224] are written to bits [255:224] of the return value.
2345/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2346static __inline __m256 __DEFAULT_FN_ATTRS
2347_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2348{
2349  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2350}
2351
2352/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2353///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2354///    vector of [8 x float].
2355///
2356/// \headerfile <x86intrin.h>
2357///
2358/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2359///
2360/// \param __a
2361///    A 256-bit vector of [8 x float]. \n
2362///    Bits [31:0] are written to bits [31:0] of the return value. \n
2363///    Bits [63:32] are written to bits [95:64] of the return value. \n
2364///    Bits [159:128] are written to bits [159:128] of the return value. \n
2365///    Bits [191:160] are written to bits [223:192] of the return value.
2366/// \param __b
2367///    A 256-bit vector of [8 x float]. \n
2368///    Bits [31:0] are written to bits [63:32] of the return value. \n
2369///    Bits [63:32] are written to bits [127:96] of the return value. \n
2370///    Bits [159:128] are written to bits [191:160] of the return value. \n
2371///    Bits [191:160] are written to bits [255:224] of the return value.
2372/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2373static __inline __m256 __DEFAULT_FN_ATTRS
2374_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2375{
2376  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2377}
2378
2379/* Bit Test */
2380/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2381///    element-by-element comparison of the double-precision element in the
2382///    first source vector and the corresponding element in the second source
2383///    vector. The EFLAGS register is updated as follows: \n
2384///    If there is at least one pair of double-precision elements where the
2385///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2386///    ZF flag is set to 1. \n
2387///    If there is at least one pair of double-precision elements where the
2388///    sign-bit of the first element is 0 and the sign-bit of the second element
2389///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2390///    This intrinsic returns the value of the ZF flag.
2391///
2392/// \headerfile <x86intrin.h>
2393///
2394/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2395///
2396/// \param __a
2397///    A 128-bit vector of [2 x double].
2398/// \param __b
2399///    A 128-bit vector of [2 x double].
2400/// \returns the ZF flag in the EFLAGS register.
2401static __inline int __DEFAULT_FN_ATTRS
2402_mm_testz_pd(__m128d __a, __m128d __b)
2403{
2404  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2405}
2406
2407/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2408///    element-by-element comparison of the double-precision element in the
2409///    first source vector and the corresponding element in the second source
2410///    vector. The EFLAGS register is updated as follows: \n
2411///    If there is at least one pair of double-precision elements where the
2412///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2413///    ZF flag is set to 1. \n
2414///    If there is at least one pair of double-precision elements where the
2415///    sign-bit of the first element is 0 and the sign-bit of the second element
2416///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2417///    This intrinsic returns the value of the CF flag.
2418///
2419/// \headerfile <x86intrin.h>
2420///
2421/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2422///
2423/// \param __a
2424///    A 128-bit vector of [2 x double].
2425/// \param __b
2426///    A 128-bit vector of [2 x double].
2427/// \returns the CF flag in the EFLAGS register.
2428static __inline int __DEFAULT_FN_ATTRS
2429_mm_testc_pd(__m128d __a, __m128d __b)
2430{
2431  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2432}
2433
2434/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2435///    element-by-element comparison of the double-precision element in the
2436///    first source vector and the corresponding element in the second source
2437///    vector. The EFLAGS register is updated as follows: \n
2438///    If there is at least one pair of double-precision elements where the
2439///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2440///    ZF flag is set to 1. \n
2441///    If there is at least one pair of double-precision elements where the
2442///    sign-bit of the first element is 0 and the sign-bit of the second element
2443///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2444///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2445///    otherwise it returns 0.
2446///
2447/// \headerfile <x86intrin.h>
2448///
2449/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2450///
2451/// \param __a
2452///    A 128-bit vector of [2 x double].
2453/// \param __b
2454///    A 128-bit vector of [2 x double].
2455/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2456static __inline int __DEFAULT_FN_ATTRS
2457_mm_testnzc_pd(__m128d __a, __m128d __b)
2458{
2459  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2460}
2461
2462/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2463///    element-by-element comparison of the single-precision element in the
2464///    first source vector and the corresponding element in the second source
2465///    vector. The EFLAGS register is updated as follows: \n
2466///    If there is at least one pair of single-precision elements where the
2467///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2468///    ZF flag is set to 1. \n
2469///    If there is at least one pair of single-precision elements where the
2470///    sign-bit of the first element is 0 and the sign-bit of the second element
2471///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2472///    This intrinsic returns the value of the ZF flag.
2473///
2474/// \headerfile <x86intrin.h>
2475///
2476/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2477///
2478/// \param __a
2479///    A 128-bit vector of [4 x float].
2480/// \param __b
2481///    A 128-bit vector of [4 x float].
2482/// \returns the ZF flag.
2483static __inline int __DEFAULT_FN_ATTRS
2484_mm_testz_ps(__m128 __a, __m128 __b)
2485{
2486  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2487}
2488
2489/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2490///    element-by-element comparison of the single-precision element in the
2491///    first source vector and the corresponding element in the second source
2492///    vector. The EFLAGS register is updated as follows: \n
2493///    If there is at least one pair of single-precision elements where the
2494///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2495///    ZF flag is set to 1. \n
2496///    If there is at least one pair of single-precision elements where the
2497///    sign-bit of the first element is 0 and the sign-bit of the second element
2498///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2499///    This intrinsic returns the value of the CF flag.
2500///
2501/// \headerfile <x86intrin.h>
2502///
2503/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2504///
2505/// \param __a
2506///    A 128-bit vector of [4 x float].
2507/// \param __b
2508///    A 128-bit vector of [4 x float].
2509/// \returns the CF flag.
2510static __inline int __DEFAULT_FN_ATTRS
2511_mm_testc_ps(__m128 __a, __m128 __b)
2512{
2513  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2514}
2515
2516/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2517///    element-by-element comparison of the single-precision element in the
2518///    first source vector and the corresponding element in the second source
2519///    vector. The EFLAGS register is updated as follows: \n
2520///    If there is at least one pair of single-precision elements where the
2521///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2522///    ZF flag is set to 1. \n
2523///    If there is at least one pair of single-precision elements where the
2524///    sign-bit of the first element is 0 and the sign-bit of the second element
2525///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2526///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2527///    otherwise it returns 0.
2528///
2529/// \headerfile <x86intrin.h>
2530///
2531/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2532///
2533/// \param __a
2534///    A 128-bit vector of [4 x float].
2535/// \param __b
2536///    A 128-bit vector of [4 x float].
2537/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2538static __inline int __DEFAULT_FN_ATTRS
2539_mm_testnzc_ps(__m128 __a, __m128 __b)
2540{
2541  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2542}
2543
2544/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2545///    element-by-element comparison of the double-precision elements in the
2546///    first source vector and the corresponding elements in the second source
2547///    vector. The EFLAGS register is updated as follows: \n
2548///    If there is at least one pair of double-precision elements where the
2549///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2550///    ZF flag is set to 1. \n
2551///    If there is at least one pair of double-precision elements where the
2552///    sign-bit of the first element is 0 and the sign-bit of the second element
2553///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2554///    This intrinsic returns the value of the ZF flag.
2555///
2556/// \headerfile <x86intrin.h>
2557///
2558/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2559///
2560/// \param __a
2561///    A 256-bit vector of [4 x double].
2562/// \param __b
2563///    A 256-bit vector of [4 x double].
2564/// \returns the ZF flag.
2565static __inline int __DEFAULT_FN_ATTRS
2566_mm256_testz_pd(__m256d __a, __m256d __b)
2567{
2568  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2569}
2570
2571/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2572///    element-by-element comparison of the double-precision elements in the
2573///    first source vector and the corresponding elements in the second source
2574///    vector. The EFLAGS register is updated as follows: \n
2575///    If there is at least one pair of double-precision elements where the
2576///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2577///    ZF flag is set to 1. \n
2578///    If there is at least one pair of double-precision elements where the
2579///    sign-bit of the first element is 0 and the sign-bit of the second element
2580///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2581///    This intrinsic returns the value of the CF flag.
2582///
2583/// \headerfile <x86intrin.h>
2584///
2585/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2586///
2587/// \param __a
2588///    A 256-bit vector of [4 x double].
2589/// \param __b
2590///    A 256-bit vector of [4 x double].
2591/// \returns the CF flag.
2592static __inline int __DEFAULT_FN_ATTRS
2593_mm256_testc_pd(__m256d __a, __m256d __b)
2594{
2595  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2596}
2597
2598/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2599///    element-by-element comparison of the double-precision elements in the
2600///    first source vector and the corresponding elements in the second source
2601///    vector. The EFLAGS register is updated as follows: \n
2602///    If there is at least one pair of double-precision elements where the
2603///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2604///    ZF flag is set to 1. \n
2605///    If there is at least one pair of double-precision elements where the
2606///    sign-bit of the first element is 0 and the sign-bit of the second element
2607///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2608///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2609///    otherwise it returns 0.
2610///
2611/// \headerfile <x86intrin.h>
2612///
2613/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2614///
2615/// \param __a
2616///    A 256-bit vector of [4 x double].
2617/// \param __b
2618///    A 256-bit vector of [4 x double].
2619/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2620static __inline int __DEFAULT_FN_ATTRS
2621_mm256_testnzc_pd(__m256d __a, __m256d __b)
2622{
2623  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2624}
2625
2626/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2627///    element-by-element comparison of the single-precision element in the
2628///    first source vector and the corresponding element in the second source
2629///    vector. The EFLAGS register is updated as follows: \n
2630///    If there is at least one pair of single-precision elements where the
2631///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2632///    ZF flag is set to 1. \n
2633///    If there is at least one pair of single-precision elements where the
2634///    sign-bit of the first element is 0 and the sign-bit of the second element
2635///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2636///    This intrinsic returns the value of the ZF flag.
2637///
2638/// \headerfile <x86intrin.h>
2639///
2640/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2641///
2642/// \param __a
2643///    A 256-bit vector of [8 x float].
2644/// \param __b
2645///    A 256-bit vector of [8 x float].
2646/// \returns the ZF flag.
2647static __inline int __DEFAULT_FN_ATTRS
2648_mm256_testz_ps(__m256 __a, __m256 __b)
2649{
2650  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2651}
2652
2653/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2654///    element-by-element comparison of the single-precision element in the
2655///    first source vector and the corresponding element in the second source
2656///    vector. The EFLAGS register is updated as follows: \n
2657///    If there is at least one pair of single-precision elements where the
2658///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2659///    ZF flag is set to 1. \n
2660///    If there is at least one pair of single-precision elements where the
2661///    sign-bit of the first element is 0 and the sign-bit of the second element
2662///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2663///    This intrinsic returns the value of the CF flag.
2664///
2665/// \headerfile <x86intrin.h>
2666///
2667/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2668///
2669/// \param __a
2670///    A 256-bit vector of [8 x float].
2671/// \param __b
2672///    A 256-bit vector of [8 x float].
2673/// \returns the CF flag.
2674static __inline int __DEFAULT_FN_ATTRS
2675_mm256_testc_ps(__m256 __a, __m256 __b)
2676{
2677  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2678}
2679
2680/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2681///    element-by-element comparison of the single-precision elements in the
2682///    first source vector and the corresponding elements in the second source
2683///    vector. The EFLAGS register is updated as follows: \n
2684///    If there is at least one pair of single-precision elements where the
2685///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2686///    ZF flag is set to 1. \n
2687///    If there is at least one pair of single-precision elements where the
2688///    sign-bit of the first element is 0 and the sign-bit of the second element
2689///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2690///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2691///    otherwise it returns 0.
2692///
2693/// \headerfile <x86intrin.h>
2694///
2695/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2696///
2697/// \param __a
2698///    A 256-bit vector of [8 x float].
2699/// \param __b
2700///    A 256-bit vector of [8 x float].
2701/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2702static __inline int __DEFAULT_FN_ATTRS
2703_mm256_testnzc_ps(__m256 __a, __m256 __b)
2704{
2705  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2706}
2707
2708/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2709///    of the two source vectors and update the EFLAGS register as follows: \n
2710///    If there is at least one pair of bits where both bits are 1, the ZF flag
2711///    is set to 0. Otherwise the ZF flag is set to 1. \n
2712///    If there is at least one pair of bits where the bit from the first source
2713///    vector is 0 and the bit from the second source vector is 1, the CF flag
2714///    is set to 0. Otherwise the CF flag is set to 1. \n
2715///    This intrinsic returns the value of the ZF flag.
2716///
2717/// \headerfile <x86intrin.h>
2718///
2719/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2720///
2721/// \param __a
2722///    A 256-bit integer vector.
2723/// \param __b
2724///    A 256-bit integer vector.
2725/// \returns the ZF flag.
2726static __inline int __DEFAULT_FN_ATTRS
2727_mm256_testz_si256(__m256i __a, __m256i __b)
2728{
2729  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2730}
2731
2732/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2733///    of the two source vectors and update the EFLAGS register as follows: \n
2734///    If there is at least one pair of bits where both bits are 1, the ZF flag
2735///    is set to 0. Otherwise the ZF flag is set to 1. \n
2736///    If there is at least one pair of bits where the bit from the first source
2737///    vector is 0 and the bit from the second source vector is 1, the CF flag
2738///    is set to 0. Otherwise the CF flag is set to 1. \n
2739///    This intrinsic returns the value of the CF flag.
2740///
2741/// \headerfile <x86intrin.h>
2742///
2743/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2744///
2745/// \param __a
2746///    A 256-bit integer vector.
2747/// \param __b
2748///    A 256-bit integer vector.
2749/// \returns the CF flag.
2750static __inline int __DEFAULT_FN_ATTRS
2751_mm256_testc_si256(__m256i __a, __m256i __b)
2752{
2753  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2754}
2755
2756/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2757///    of the two source vectors and update the EFLAGS register as follows: \n
2758///    If there is at least one pair of bits where both bits are 1, the ZF flag
2759///    is set to 0. Otherwise the ZF flag is set to 1. \n
2760///    If there is at least one pair of bits where the bit from the first source
2761///    vector is 0 and the bit from the second source vector is 1, the CF flag
2762///    is set to 0. Otherwise the CF flag is set to 1. \n
2763///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2764///    otherwise it returns 0.
2765///
2766/// \headerfile <x86intrin.h>
2767///
2768/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2769///
2770/// \param __a
2771///    A 256-bit integer vector.
2772/// \param __b
2773///    A 256-bit integer vector.
2774/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2775static __inline int __DEFAULT_FN_ATTRS
2776_mm256_testnzc_si256(__m256i __a, __m256i __b)
2777{
2778  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2779}
2780
2781/* Vector extract sign mask */
2782/// \brief Extracts the sign bits of double-precision floating point elements
2783///    in a 256-bit vector of [4 x double] and writes them to the lower order
2784///    bits of the return value.
2785///
2786/// \headerfile <x86intrin.h>
2787///
2788/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2789///
2790/// \param __a
2791///    A 256-bit vector of [4 x double] containing the double-precision
2792///    floating point values with sign bits to be extracted.
2793/// \returns The sign bits from the operand, written to bits [3:0].
2794static __inline int __DEFAULT_FN_ATTRS
2795_mm256_movemask_pd(__m256d __a)
2796{
2797  return __builtin_ia32_movmskpd256((__v4df)__a);
2798}
2799
2800/// \brief Extracts the sign bits of double-precision floating point elements
2801///    in a 256-bit vector of [8 x float] and writes them to the lower order
2802///    bits of the return value.
2803///
2804/// \headerfile <x86intrin.h>
2805///
2806/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2807///
2808/// \param __a
2809///    A 256-bit vector of [8 x float] containing the double-precision floating
2810///    point values with sign bits to be extracted.
2811/// \returns The sign bits from the operand, written to bits [7:0].
2812static __inline int __DEFAULT_FN_ATTRS
2813_mm256_movemask_ps(__m256 __a)
2814{
2815  return __builtin_ia32_movmskps256((__v8sf)__a);
2816}
2817
2818/* Vector __zero */
2819/// \brief Zeroes the contents of all XMM or YMM registers.
2820///
2821/// \headerfile <x86intrin.h>
2822///
2823/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
2824static __inline void __DEFAULT_FN_ATTRS
2825_mm256_zeroall(void)
2826{
2827  __builtin_ia32_vzeroall();
2828}
2829
2830/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
2831///
2832/// \headerfile <x86intrin.h>
2833///
2834/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
2835static __inline void __DEFAULT_FN_ATTRS
2836_mm256_zeroupper(void)
2837{
2838  __builtin_ia32_vzeroupper();
2839}
2840
2841/* Vector load with broadcast */
2842/// \brief Loads a scalar single-precision floating point value from the
2843///    specified address pointed to by \a __a and broadcasts it to the elements
2844///    of a [4 x float] vector.
2845///
2846/// \headerfile <x86intrin.h>
2847///
2848/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2849///
2850/// \param __a
2851///    The single-precision floating point value to be broadcast.
2852/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2853///    equal to the broadcast value.
2854static __inline __m128 __DEFAULT_FN_ATTRS
2855_mm_broadcast_ss(float const *__a)
2856{
2857  float __f = *__a;
2858  return (__m128)(__v4sf){ __f, __f, __f, __f };
2859}
2860
2861/// \brief Loads a scalar double-precision floating point value from the
2862///    specified address pointed to by \a __a and broadcasts it to the elements
2863///    of a [4 x double] vector.
2864///
2865/// \headerfile <x86intrin.h>
2866///
2867/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
2868///
2869/// \param __a
2870///    The double-precision floating point value to be broadcast.
2871/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
2872///    equal to the broadcast value.
2873static __inline __m256d __DEFAULT_FN_ATTRS
2874_mm256_broadcast_sd(double const *__a)
2875{
2876  double __d = *__a;
2877  return (__m256d)(__v4df){ __d, __d, __d, __d };
2878}
2879
2880/// \brief Loads a scalar single-precision floating point value from the
2881///    specified address pointed to by \a __a and broadcasts it to the elements
2882///    of a [8 x float] vector.
2883///
2884/// \headerfile <x86intrin.h>
2885///
2886/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2887///
2888/// \param __a
2889///    The single-precision floating point value to be broadcast.
2890/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
2891///    equal to the broadcast value.
2892static __inline __m256 __DEFAULT_FN_ATTRS
2893_mm256_broadcast_ss(float const *__a)
2894{
2895  float __f = *__a;
2896  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
2897}
2898
2899/// \brief Loads the data from a 128-bit vector of [2 x double] from the
2900///    specified address pointed to by \a __a and broadcasts it to 128-bit
2901///    elements in a 256-bit vector of [4 x double].
2902///
2903/// \headerfile <x86intrin.h>
2904///
2905/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
2906///
2907/// \param __a
2908///    The 128-bit vector of [2 x double] to be broadcast.
2909/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
2910///    equal to the broadcast value.
2911static __inline __m256d __DEFAULT_FN_ATTRS
2912_mm256_broadcast_pd(__m128d const *__a)
2913{
2914  return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
2915}
2916
2917/// \brief Loads the data from a 128-bit vector of [4 x float] from the
2918///    specified address pointed to by \a __a and broadcasts it to 128-bit
2919///    elements in a 256-bit vector of [8 x float].
2920///
2921/// \headerfile <x86intrin.h>
2922///
2923/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
2924///
2925/// \param __a
2926///    The 128-bit vector of [4 x float] to be broadcast.
2927/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
2928///    equal to the broadcast value.
2929static __inline __m256 __DEFAULT_FN_ATTRS
2930_mm256_broadcast_ps(__m128 const *__a)
2931{
2932  return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
2933}
2934
2935/* SIMD load ops */
2936/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
2937///    memory location pointed to by \a __p into a vector of [4 x double].
2938///
2939/// \headerfile <x86intrin.h>
2940///
2941/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
2942///
2943/// \param __p
2944///    A 32-byte aligned pointer to a memory location containing
2945///    double-precision floating point values.
2946/// \returns A 256-bit vector of [4 x double] containing the moved values.
2947static __inline __m256d __DEFAULT_FN_ATTRS
2948_mm256_load_pd(double const *__p)
2949{
2950  return *(__m256d *)__p;
2951}
2952
2953/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
2954///    memory location pointed to by \a __p into a vector of [8 x float].
2955///
2956/// \headerfile <x86intrin.h>
2957///
2958/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
2959///
2960/// \param __p
2961///    A 32-byte aligned pointer to a memory location containing float values.
2962/// \returns A 256-bit vector of [8 x float] containing the moved values.
2963static __inline __m256 __DEFAULT_FN_ATTRS
2964_mm256_load_ps(float const *__p)
2965{
2966  return *(__m256 *)__p;
2967}
2968
2969/// \brief Loads 4 double-precision floating point values from an unaligned
2970///    memory location pointed to by \a __p into a vector of [4 x double].
2971///
2972/// \headerfile <x86intrin.h>
2973///
2974/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
2975///
2976/// \param __p
2977///    A pointer to a memory location containing double-precision floating
2978///    point values.
2979/// \returns A 256-bit vector of [4 x double] containing the moved values.
2980static __inline __m256d __DEFAULT_FN_ATTRS
2981_mm256_loadu_pd(double const *__p)
2982{
2983  struct __loadu_pd {
2984    __m256d __v;
2985  } __attribute__((__packed__, __may_alias__));
2986  return ((struct __loadu_pd*)__p)->__v;
2987}
2988
2989/// \brief Loads 8 single-precision floating point values from an unaligned
2990///    memory location pointed to by \a __p into a vector of [8 x float].
2991///
2992/// \headerfile <x86intrin.h>
2993///
2994/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
2995///
2996/// \param __p
2997///    A pointer to a memory location containing single-precision floating
2998///    point values.
2999/// \returns A 256-bit vector of [8 x float] containing the moved values.
3000static __inline __m256 __DEFAULT_FN_ATTRS
3001_mm256_loadu_ps(float const *__p)
3002{
3003  struct __loadu_ps {
3004    __m256 __v;
3005  } __attribute__((__packed__, __may_alias__));
3006  return ((struct __loadu_ps*)__p)->__v;
3007}
3008
3009/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
3010///    location pointed to by \a __p into elements of a 256-bit integer vector.
3011///
3012/// \headerfile <x86intrin.h>
3013///
3014/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3015///
3016/// \param __p
3017///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
3018///    values.
3019/// \returns A 256-bit integer vector containing the moved values.
3020static __inline __m256i __DEFAULT_FN_ATTRS
3021_mm256_load_si256(__m256i const *__p)
3022{
3023  return *__p;
3024}
3025
3026/// \brief Loads 256 bits of integer data from an unaligned memory location
3027///    pointed to by \a __p into a 256-bit integer vector.
3028///
3029/// \headerfile <x86intrin.h>
3030///
3031/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3032///
3033/// \param __p
3034///    A pointer to a 256-bit integer vector containing integer values.
3035/// \returns A 256-bit integer vector containing the moved values.
3036static __inline __m256i __DEFAULT_FN_ATTRS
3037_mm256_loadu_si256(__m256i const *__p)
3038{
3039  struct __loadu_si256 {
3040    __m256i __v;
3041  } __attribute__((__packed__, __may_alias__));
3042  return ((struct __loadu_si256*)__p)->__v;
3043}
3044
3045/// \brief Loads 256 bits of integer data from an unaligned memory location
3046///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3047///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
3048///    line boundary.
3049///
3050/// \headerfile <x86intrin.h>
3051///
3052/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3053///
3054/// \param __p
3055///    A pointer to a 256-bit integer vector containing integer values.
3056/// \returns A 256-bit integer vector containing the moved values.
3057static __inline __m256i __DEFAULT_FN_ATTRS
3058_mm256_lddqu_si256(__m256i const *__p)
3059{
3060  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3061}
3062
3063/* SIMD store ops */
3064/// \brief Stores double-precision floating point values from a 256-bit vector
3065///    of [4 x double] to a 32-byte aligned memory location pointed to by
3066///    \a __p.
3067///
3068/// \headerfile <x86intrin.h>
3069///
3070/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3071///
3072/// \param __p
3073///    A 32-byte aligned pointer to a memory location that will receive the
3074///    double-precision floaing point values.
3075/// \param __a
3076///    A 256-bit vector of [4 x double] containing the values to be moved.
3077static __inline void __DEFAULT_FN_ATTRS
3078_mm256_store_pd(double *__p, __m256d __a)
3079{
3080  *(__m256d *)__p = __a;
3081}
3082
3083/// \brief Stores single-precision floating point values from a 256-bit vector
3084///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3085///
3086/// \headerfile <x86intrin.h>
3087///
3088/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3089///
3090/// \param __p
3091///    A 32-byte aligned pointer to a memory location that will receive the
3092///    float values.
3093/// \param __a
3094///    A 256-bit vector of [8 x float] containing the values to be moved.
3095static __inline void __DEFAULT_FN_ATTRS
3096_mm256_store_ps(float *__p, __m256 __a)
3097{
3098  *(__m256 *)__p = __a;
3099}
3100
3101/// \brief Stores double-precision floating point values from a 256-bit vector
3102///    of [4 x double] to an unaligned memory location pointed to by \a __p.
3103///
3104/// \headerfile <x86intrin.h>
3105///
3106/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3107///
3108/// \param __p
3109///    A pointer to a memory location that will receive the double-precision
3110///    floating point values.
3111/// \param __a
3112///    A 256-bit vector of [4 x double] containing the values to be moved.
3113static __inline void __DEFAULT_FN_ATTRS
3114_mm256_storeu_pd(double *__p, __m256d __a)
3115{
3116  struct __storeu_pd {
3117    __m256d __v;
3118  } __attribute__((__packed__, __may_alias__));
3119  ((struct __storeu_pd*)__p)->__v = __a;
3120}
3121
3122/// \brief Stores single-precision floating point values from a 256-bit vector
3123///    of [8 x float] to an unaligned memory location pointed to by \a __p.
3124///
3125/// \headerfile <x86intrin.h>
3126///
3127/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3128///
3129/// \param __p
3130///    A pointer to a memory location that will receive the float values.
3131/// \param __a
3132///    A 256-bit vector of [8 x float] containing the values to be moved.
3133static __inline void __DEFAULT_FN_ATTRS
3134_mm256_storeu_ps(float *__p, __m256 __a)
3135{
3136  struct __storeu_ps {
3137    __m256 __v;
3138  } __attribute__((__packed__, __may_alias__));
3139  ((struct __storeu_ps*)__p)->__v = __a;
3140}
3141
3142/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
3143///    aligned memory location pointed to by \a __p.
3144///
3145/// \headerfile <x86intrin.h>
3146///
3147/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3148///
3149/// \param __p
3150///    A 32-byte aligned pointer to a memory location that will receive the
3151///    integer values.
3152/// \param __a
3153///    A 256-bit integer vector containing the values to be moved.
3154static __inline void __DEFAULT_FN_ATTRS
3155_mm256_store_si256(__m256i *__p, __m256i __a)
3156{
3157  *__p = __a;
3158}
3159
3160/// \brief Stores integer values from a 256-bit integer vector to an unaligned
3161///    memory location pointed to by \a __p.
3162///
3163/// \headerfile <x86intrin.h>
3164///
3165/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3166///
3167/// \param __p
3168///    A pointer to a memory location that will receive the integer values.
3169/// \param __a
3170///    A 256-bit integer vector containing the values to be moved.
3171static __inline void __DEFAULT_FN_ATTRS
3172_mm256_storeu_si256(__m256i *__p, __m256i __a)
3173{
3174  struct __storeu_si256 {
3175    __m256i __v;
3176  } __attribute__((__packed__, __may_alias__));
3177  ((struct __storeu_si256*)__p)->__v = __a;
3178}
3179
3180/* Conditional load ops */
3181/// \brief Conditionally loads double-precision floating point elements from a
3182///    memory location pointed to by \a __p into a 128-bit vector of
3183///    [2 x double], depending on the mask bits associated with each data
3184///    element.
3185///
3186/// \headerfile <x86intrin.h>
3187///
3188/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3189///
3190/// \param __p
3191///    A pointer to a memory location that contains the double-precision
3192///    floating point values.
3193/// \param __m
3194///    A 128-bit integer vector containing the mask. The most significant bit of
3195///    each data element represents the mask bits. If a mask bit is zero, the
3196///    corresponding value in the memory location is not loaded and the
3197///    corresponding field in the return value is set to zero.
3198/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3199static __inline __m128d __DEFAULT_FN_ATTRS
3200_mm_maskload_pd(double const *__p, __m128i __m)
3201{
3202  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3203}
3204
3205/// \brief Conditionally loads double-precision floating point elements from a
3206///    memory location pointed to by \a __p into a 256-bit vector of
3207///    [4 x double], depending on the mask bits associated with each data
3208///    element.
3209///
3210/// \headerfile <x86intrin.h>
3211///
3212/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3213///
3214/// \param __p
3215///    A pointer to a memory location that contains the double-precision
3216///    floating point values.
3217/// \param __m
3218///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3219///    significant bit of each quadword element represents the mask bits. If a
3220///    mask bit is zero, the corresponding value in the memory location is not
3221///    loaded and the corresponding field in the return value is set to zero.
3222/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3223static __inline __m256d __DEFAULT_FN_ATTRS
3224_mm256_maskload_pd(double const *__p, __m256i __m)
3225{
3226  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3227                                               (__v4di)__m);
3228}
3229
3230/// \brief Conditionally loads single-precision floating point elements from a
3231///    memory location pointed to by \a __p into a 128-bit vector of
3232///    [4 x float], depending on the mask bits associated with each data
3233///    element.
3234///
3235/// \headerfile <x86intrin.h>
3236///
3237/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3238///
3239/// \param __p
3240///    A pointer to a memory location that contains the single-precision
3241///    floating point values.
3242/// \param __m
3243///    A 128-bit integer vector containing the mask. The most significant bit of
3244///    each data element represents the mask bits. If a mask bit is zero, the
3245///    corresponding value in the memory location is not loaded and the
3246///    corresponding field in the return value is set to zero.
3247/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3248static __inline __m128 __DEFAULT_FN_ATTRS
3249_mm_maskload_ps(float const *__p, __m128i __m)
3250{
3251  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3252}
3253
3254/// \brief Conditionally loads single-precision floating point elements from a
3255///    memory location pointed to by \a __p into a 256-bit vector of
3256///    [8 x float], depending on the mask bits associated with each data
3257///    element.
3258///
3259/// \headerfile <x86intrin.h>
3260///
3261/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3262///
3263/// \param __p
3264///    A pointer to a memory location that contains the single-precision
3265///    floating point values.
3266/// \param __m
3267///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3268///    significant bit of each dword element represents the mask bits. If a mask
3269///    bit is zero, the corresponding value in the memory location is not loaded
3270///    and the corresponding field in the return value is set to zero.
3271/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3272static __inline __m256 __DEFAULT_FN_ATTRS
3273_mm256_maskload_ps(float const *__p, __m256i __m)
3274{
3275  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3276}
3277
3278/* Conditional store ops */
3279/// \brief Moves single-precision floating point values from a 256-bit vector
3280///    of [8 x float] to a memory location pointed to by \a __p, according to
3281///    the specified mask.
3282///
3283/// \headerfile <x86intrin.h>
3284///
3285/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3286///
3287/// \param __p
3288///    A pointer to a memory location that will receive the float values.
3289/// \param __m
3290///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3291///    significant bit of each dword element in the mask vector represents the
3292///    mask bits. If a mask bit is zero, the corresponding value from vector
3293///    \a __a is not stored and the corresponding field in the memory location
3294///    pointed to by \a __p is not changed.
3295/// \param __a
3296///    A 256-bit vector of [8 x float] containing the values to be stored.
3297static __inline void __DEFAULT_FN_ATTRS
3298_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3299{
3300  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3301}
3302
3303/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
3304///    to a memory location pointed to by \a __p, according to the specified
3305///    mask.
3306///
3307/// \headerfile <x86intrin.h>
3308///
3309/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3310///
3311/// \param __p
3312///    A pointer to a memory location that will receive the float values.
3313/// \param __m
3314///    A 128-bit integer vector containing the mask. The most significant bit of
3315///    each field in the mask vector represents the mask bits. If a mask bit is
3316///    zero, the corresponding value from vector \a __a is not stored and the
3317///    corresponding field in the memory location pointed to by \a __p is not
3318///    changed.
3319/// \param __a
3320///    A 128-bit vector of [2 x double] containing the values to be stored.
3321static __inline void __DEFAULT_FN_ATTRS
3322_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3323{
3324  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3325}
3326
3327/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3328///    to a memory location pointed to by \a __p, according to the specified
3329///    mask.
3330///
3331/// \headerfile <x86intrin.h>
3332///
3333/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3334///
3335/// \param __p
3336///    A pointer to a memory location that will receive the float values.
3337/// \param __m
3338///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3339///    significant bit of each quadword element in the mask vector represents
3340///    the mask bits. If a mask bit is zero, the corresponding value from vector
3341///    __a is not stored and the corresponding field in the memory location
3342///    pointed to by \a __p is not changed.
3343/// \param __a
3344///    A 256-bit vector of [4 x double] containing the values to be stored.
3345static __inline void __DEFAULT_FN_ATTRS
3346_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3347{
3348  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3349}
3350
3351/// \brief Moves single-precision floating point values from a 128-bit vector
3352///    of [4 x float] to a memory location pointed to by \a __p, according to
3353///    the specified mask.
3354///
3355/// \headerfile <x86intrin.h>
3356///
3357/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3358///
3359/// \param __p
3360///    A pointer to a memory location that will receive the float values.
3361/// \param __m
3362///    A 128-bit integer vector containing the mask. The most significant bit of
3363///    each field in the mask vector represents the mask bits. If a mask bit is
3364///    zero, the corresponding value from vector __a is not stored and the
3365///    corresponding field in the memory location pointed to by \a __p is not
3366///    changed.
3367/// \param __a
3368///    A 128-bit vector of [4 x float] containing the values to be stored.
3369static __inline void __DEFAULT_FN_ATTRS
3370_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3371{
3372  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3373}
3374
3375/* Cacheability support ops */
3376/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
3377///    aligned memory location. To minimize caching, the data is flagged as
3378///    non-temporal (unlikely to be used again soon).
3379///
3380/// \headerfile <x86intrin.h>
3381///
3382/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3383///
3384/// \param __a
3385///    A pointer to a 32-byte aligned memory location that will receive the
3386///    integer values.
3387/// \param __b
3388///    A 256-bit integer vector containing the values to be moved.
3389static __inline void __DEFAULT_FN_ATTRS
3390_mm256_stream_si256(__m256i *__a, __m256i __b)
3391{
3392  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
3393}
3394
3395/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3396///    to a 32-byte aligned memory location. To minimize caching, the data is
3397///    flagged as non-temporal (unlikely to be used again soon).
3398///
3399/// \headerfile <x86intrin.h>
3400///
3401/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3402///
3403/// \param __a
3404///    A pointer to a 32-byte aligned memory location that will receive the
3405///    integer values.
3406/// \param __b
3407///    A 256-bit vector of [4 x double] containing the values to be moved.
3408static __inline void __DEFAULT_FN_ATTRS
3409_mm256_stream_pd(double *__a, __m256d __b)
3410{
3411  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
3412}
3413
3414/// \brief Moves single-precision floating point values from a 256-bit vector
3415///    of [8 x float] to a 32-byte aligned memory location. To minimize
3416///    caching, the data is flagged as non-temporal (unlikely to be used again
3417///    soon).
3418///
3419/// \headerfile <x86intrin.h>
3420///
3421/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3422///
3423/// \param __p
3424///    A pointer to a 32-byte aligned memory location that will receive the
3425///    single-precision floating point values.
3426/// \param __a
3427///    A 256-bit vector of [8 x float] containing the values to be moved.
3428static __inline void __DEFAULT_FN_ATTRS
3429_mm256_stream_ps(float *__p, __m256 __a)
3430{
3431  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
3432}
3433
3434/* Create vectors */
3435/// \brief Create a 256-bit vector of [4 x double] with undefined values.
3436///
3437/// \headerfile <x86intrin.h>
3438///
3439/// This intrinsic has no corresponding instruction.
3440///
3441/// \returns A 256-bit vector of [4 x double] containing undefined values.
3442static __inline__ __m256d __DEFAULT_FN_ATTRS
3443_mm256_undefined_pd(void)
3444{
3445  return (__m256d)__builtin_ia32_undef256();
3446}
3447
3448/// \brief Create a 256-bit vector of [8 x float] with undefined values.
3449///
3450/// \headerfile <x86intrin.h>
3451///
3452/// This intrinsic has no corresponding instruction.
3453///
3454/// \returns A 256-bit vector of [8 x float] containing undefined values.
3455static __inline__ __m256 __DEFAULT_FN_ATTRS
3456_mm256_undefined_ps(void)
3457{
3458  return (__m256)__builtin_ia32_undef256();
3459}
3460
3461/// \brief Create a 256-bit integer vector with undefined values.
3462///
3463/// \headerfile <x86intrin.h>
3464///
3465/// This intrinsic has no corresponding instruction.
3466///
3467/// \returns A 256-bit integer vector containing undefined values.
3468static __inline__ __m256i __DEFAULT_FN_ATTRS
3469_mm256_undefined_si256(void)
3470{
3471  return (__m256i)__builtin_ia32_undef256();
3472}
3473
3474/// \brief Constructs a 256-bit floating-point vector of [4 x double]
3475///    initialized with the specified double-precision floating-point values.
3476///
3477/// \headerfile <x86intrin.h>
3478///
3479/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3480///   instruction.
3481///
3482/// \param __a
3483///    A double-precision floating-point value used to initialize bits [255:192]
3484///    of the result.
3485/// \param __b
3486///    A double-precision floating-point value used to initialize bits [191:128]
3487///    of the result.
3488/// \param __c
3489///    A double-precision floating-point value used to initialize bits [127:64]
3490///    of the result.
3491/// \param __d
3492///    A double-precision floating-point value used to initialize bits [63:0]
3493///    of the result.
3494/// \returns An initialized 256-bit floating-point vector of [4 x double].
3495static __inline __m256d __DEFAULT_FN_ATTRS
3496_mm256_set_pd(double __a, double __b, double __c, double __d)
3497{
3498  return (__m256d){ __d, __c, __b, __a };
3499}
3500
3501/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
3502///    with the specified single-precision floating-point values.
3503///
3504/// \headerfile <x86intrin.h>
3505///
3506/// This intrinsic is a utility function and does not correspond to a specific
3507///   instruction.
3508///
3509/// \param __a
3510///    A single-precision floating-point value used to initialize bits [255:224]
3511///    of the result.
3512/// \param __b
3513///    A single-precision floating-point value used to initialize bits [223:192]
3514///    of the result.
3515/// \param __c
3516///    A single-precision floating-point value used to initialize bits [191:160]
3517///    of the result.
3518/// \param __d
3519///    A single-precision floating-point value used to initialize bits [159:128]
3520///    of the result.
3521/// \param __e
3522///    A single-precision floating-point value used to initialize bits [127:96]
3523///    of the result.
3524/// \param __f
3525///    A single-precision floating-point value used to initialize bits [95:64]
3526///    of the result.
3527/// \param __g
3528///    A single-precision floating-point value used to initialize bits [63:32]
3529///    of the result.
3530/// \param __h
3531///    A single-precision floating-point value used to initialize bits [31:0]
3532///    of the result.
3533/// \returns An initialized 256-bit floating-point vector of [8 x float].
3534static __inline __m256 __DEFAULT_FN_ATTRS
3535_mm256_set_ps(float __a, float __b, float __c, float __d,
3536              float __e, float __f, float __g, float __h)
3537{
3538  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3539}
3540
3541/// \brief Constructs a 256-bit integer vector initialized with the specified
3542///    32-bit integral values.
3543///
3544/// \headerfile <x86intrin.h>
3545///
3546/// This intrinsic is a utility function and does not correspond to a specific
3547///   instruction.
3548///
3549/// \param __i0
3550///    A 32-bit integral value used to initialize bits [255:224] of the result.
3551/// \param __i1
3552///    A 32-bit integral value used to initialize bits [223:192] of the result.
3553/// \param __i2
3554///    A 32-bit integral value used to initialize bits [191:160] of the result.
3555/// \param __i3
3556///    A 32-bit integral value used to initialize bits [159:128] of the result.
3557/// \param __i4
3558///    A 32-bit integral value used to initialize bits [127:96] of the result.
3559/// \param __i5
3560///    A 32-bit integral value used to initialize bits [95:64] of the result.
3561/// \param __i6
3562///    A 32-bit integral value used to initialize bits [63:32] of the result.
3563/// \param __i7
3564///    A 32-bit integral value used to initialize bits [31:0] of the result.
3565/// \returns An initialized 256-bit integer vector.
3566static __inline __m256i __DEFAULT_FN_ATTRS
3567_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3568                 int __i4, int __i5, int __i6, int __i7)
3569{
3570  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3571}
3572
3573/// \brief Constructs a 256-bit integer vector initialized with the specified
3574///    16-bit integral values.
3575///
3576/// \headerfile <x86intrin.h>
3577///
3578/// This intrinsic is a utility function and does not correspond to a specific
3579///   instruction.
3580///
3581/// \param __w15
3582///    A 16-bit integral value used to initialize bits [255:240] of the result.
3583/// \param __w14
3584///    A 16-bit integral value used to initialize bits [239:224] of the result.
3585/// \param __w13
3586///    A 16-bit integral value used to initialize bits [223:208] of the result.
3587/// \param __w12
3588///    A 16-bit integral value used to initialize bits [207:192] of the result.
3589/// \param __w11
3590///    A 16-bit integral value used to initialize bits [191:176] of the result.
3591/// \param __w10
3592///    A 16-bit integral value used to initialize bits [175:160] of the result.
3593/// \param __w09
3594///    A 16-bit integral value used to initialize bits [159:144] of the result.
3595/// \param __w08
3596///    A 16-bit integral value used to initialize bits [143:128] of the result.
3597/// \param __w07
3598///    A 16-bit integral value used to initialize bits [127:112] of the result.
3599/// \param __w06
3600///    A 16-bit integral value used to initialize bits [111:96] of the result.
3601/// \param __w05
3602///    A 16-bit integral value used to initialize bits [95:80] of the result.
3603/// \param __w04
3604///    A 16-bit integral value used to initialize bits [79:64] of the result.
3605/// \param __w03
3606///    A 16-bit integral value used to initialize bits [63:48] of the result.
3607/// \param __w02
3608///    A 16-bit integral value used to initialize bits [47:32] of the result.
3609/// \param __w01
3610///    A 16-bit integral value used to initialize bits [31:16] of the result.
3611/// \param __w00
3612///    A 16-bit integral value used to initialize bits [15:0] of the result.
3613/// \returns An initialized 256-bit integer vector.
3614static __inline __m256i __DEFAULT_FN_ATTRS
3615_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3616                 short __w11, short __w10, short __w09, short __w08,
3617                 short __w07, short __w06, short __w05, short __w04,
3618                 short __w03, short __w02, short __w01, short __w00)
3619{
3620  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3621    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3622}
3623
3624/// \brief Constructs a 256-bit integer vector initialized with the specified
3625///    8-bit integral values.
3626///
3627/// \headerfile <x86intrin.h>
3628///
3629/// This intrinsic is a utility function and does not correspond to a specific
3630///   instruction.
3631///
3632/// \param __b31
3633///    An 8-bit integral value used to initialize bits [255:248] of the result.
3634/// \param __b30
3635///    An 8-bit integral value used to initialize bits [247:240] of the result.
3636/// \param __b29
3637///    An 8-bit integral value used to initialize bits [239:232] of the result.
3638/// \param __b28
3639///    An 8-bit integral value used to initialize bits [231:224] of the result.
3640/// \param __b27
3641///    An 8-bit integral value used to initialize bits [223:216] of the result.
3642/// \param __b26
3643///    An 8-bit integral value used to initialize bits [215:208] of the result.
3644/// \param __b25
3645///    An 8-bit integral value used to initialize bits [207:200] of the result.
3646/// \param __b24
3647///    An 8-bit integral value used to initialize bits [199:192] of the result.
3648/// \param __b23
3649///    An 8-bit integral value used to initialize bits [191:184] of the result.
3650/// \param __b22
3651///    An 8-bit integral value used to initialize bits [183:176] of the result.
3652/// \param __b21
3653///    An 8-bit integral value used to initialize bits [175:168] of the result.
3654/// \param __b20
3655///    An 8-bit integral value used to initialize bits [167:160] of the result.
3656/// \param __b19
3657///    An 8-bit integral value used to initialize bits [159:152] of the result.
3658/// \param __b18
3659///    An 8-bit integral value used to initialize bits [151:144] of the result.
3660/// \param __b17
3661///    An 8-bit integral value used to initialize bits [143:136] of the result.
3662/// \param __b16
3663///    An 8-bit integral value used to initialize bits [135:128] of the result.
3664/// \param __b15
3665///    An 8-bit integral value used to initialize bits [127:120] of the result.
3666/// \param __b14
3667///    An 8-bit integral value used to initialize bits [119:112] of the result.
3668/// \param __b13
3669///    An 8-bit integral value used to initialize bits [111:104] of the result.
3670/// \param __b12
3671///    An 8-bit integral value used to initialize bits [103:96] of the result.
3672/// \param __b11
3673///    An 8-bit integral value used to initialize bits [95:88] of the result.
3674/// \param __b10
3675///    An 8-bit integral value used to initialize bits [87:80] of the result.
3676/// \param __b09
3677///    An 8-bit integral value used to initialize bits [79:72] of the result.
3678/// \param __b08
3679///    An 8-bit integral value used to initialize bits [71:64] of the result.
3680/// \param __b07
3681///    An 8-bit integral value used to initialize bits [63:56] of the result.
3682/// \param __b06
3683///    An 8-bit integral value used to initialize bits [55:48] of the result.
3684/// \param __b05
3685///    An 8-bit integral value used to initialize bits [47:40] of the result.
3686/// \param __b04
3687///    An 8-bit integral value used to initialize bits [39:32] of the result.
3688/// \param __b03
3689///    An 8-bit integral value used to initialize bits [31:24] of the result.
3690/// \param __b02
3691///    An 8-bit integral value used to initialize bits [23:16] of the result.
3692/// \param __b01
3693///    An 8-bit integral value used to initialize bits [15:8] of the result.
3694/// \param __b00
3695///    An 8-bit integral value used to initialize bits [7:0] of the result.
3696/// \returns An initialized 256-bit integer vector.
3697static __inline __m256i __DEFAULT_FN_ATTRS
3698_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3699                char __b27, char __b26, char __b25, char __b24,
3700                char __b23, char __b22, char __b21, char __b20,
3701                char __b19, char __b18, char __b17, char __b16,
3702                char __b15, char __b14, char __b13, char __b12,
3703                char __b11, char __b10, char __b09, char __b08,
3704                char __b07, char __b06, char __b05, char __b04,
3705                char __b03, char __b02, char __b01, char __b00)
3706{
3707  return (__m256i)(__v32qi){
3708    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3709    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3710    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3711    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3712  };
3713}
3714
3715/// \brief Constructs a 256-bit integer vector initialized with the specified
3716///    64-bit integral values.
3717///
3718/// \headerfile <x86intrin.h>
3719///
3720/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3721///   instruction.
3722///
3723/// \param __a
3724///    A 64-bit integral value used to initialize bits [255:192] of the result.
3725/// \param __b
3726///    A 64-bit integral value used to initialize bits [191:128] of the result.
3727/// \param __c
3728///    A 64-bit integral value used to initialize bits [127:64] of the result.
3729/// \param __d
3730///    A 64-bit integral value used to initialize bits [63:0] of the result.
3731/// \returns An initialized 256-bit integer vector.
3732static __inline __m256i __DEFAULT_FN_ATTRS
3733_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3734{
3735  return (__m256i)(__v4di){ __d, __c, __b, __a };
3736}
3737
3738/* Create vectors with elements in reverse order */
3739/// \brief Constructs a 256-bit floating-point vector of [4 x double],
3740///    initialized in reverse order with the specified double-precision
3741///    floating-point values.
3742///
3743/// \headerfile <x86intrin.h>
3744///
3745/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3746///   instruction.
3747///
3748/// \param __a
3749///    A double-precision floating-point value used to initialize bits [63:0]
3750///    of the result.
3751/// \param __b
3752///    A double-precision floating-point value used to initialize bits [127:64]
3753///    of the result.
3754/// \param __c
3755///    A double-precision floating-point value used to initialize bits [191:128]
3756///    of the result.
3757/// \param __d
3758///    A double-precision floating-point value used to initialize bits [255:192]
3759///    of the result.
3760/// \returns An initialized 256-bit floating-point vector of [4 x double].
3761static __inline __m256d __DEFAULT_FN_ATTRS
3762_mm256_setr_pd(double __a, double __b, double __c, double __d)
3763{
3764  return (__m256d){ __a, __b, __c, __d };
3765}
3766
3767/// \brief Constructs a 256-bit floating-point vector of [8 x float],
3768///    initialized in reverse order with the specified single-precision
3769///    float-point values.
3770///
3771/// \headerfile <x86intrin.h>
3772///
3773/// This intrinsic is a utility function and does not correspond to a specific
3774///   instruction.
3775///
3776/// \param __a
3777///    A single-precision floating-point value used to initialize bits [31:0]
3778///    of the result.
3779/// \param __b
3780///    A single-precision floating-point value used to initialize bits [63:32]
3781///    of the result.
3782/// \param __c
3783///    A single-precision floating-point value used to initialize bits [95:64]
3784///    of the result.
3785/// \param __d
3786///    A single-precision floating-point value used to initialize bits [127:96]
3787///    of the result.
3788/// \param __e
3789///    A single-precision floating-point value used to initialize bits [159:128]
3790///    of the result.
3791/// \param __f
3792///    A single-precision floating-point value used to initialize bits [191:160]
3793///    of the result.
3794/// \param __g
3795///    A single-precision floating-point value used to initialize bits [223:192]
3796///    of the result.
3797/// \param __h
3798///    A single-precision floating-point value used to initialize bits [255:224]
3799///    of the result.
3800/// \returns An initialized 256-bit floating-point vector of [8 x float].
3801static __inline __m256 __DEFAULT_FN_ATTRS
3802_mm256_setr_ps(float __a, float __b, float __c, float __d,
3803               float __e, float __f, float __g, float __h)
3804{
3805  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
3806}
3807
3808/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3809///    with the specified 32-bit integral values.
3810///
3811/// \headerfile <x86intrin.h>
3812///
3813/// This intrinsic is a utility function and does not correspond to a specific
3814///   instruction.
3815///
3816/// \param __i0
3817///    A 32-bit integral value used to initialize bits [31:0] of the result.
3818/// \param __i1
3819///    A 32-bit integral value used to initialize bits [63:32] of the result.
3820/// \param __i2
3821///    A 32-bit integral value used to initialize bits [95:64] of the result.
3822/// \param __i3
3823///    A 32-bit integral value used to initialize bits [127:96] of the result.
3824/// \param __i4
3825///    A 32-bit integral value used to initialize bits [159:128] of the result.
3826/// \param __i5
3827///    A 32-bit integral value used to initialize bits [191:160] of the result.
3828/// \param __i6
3829///    A 32-bit integral value used to initialize bits [223:192] of the result.
3830/// \param __i7
3831///    A 32-bit integral value used to initialize bits [255:224] of the result.
3832/// \returns An initialized 256-bit integer vector.
3833static __inline __m256i __DEFAULT_FN_ATTRS
3834_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3835                  int __i4, int __i5, int __i6, int __i7)
3836{
3837  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
3838}
3839
3840/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3841///    with the specified 16-bit integral values.
3842///
3843/// \headerfile <x86intrin.h>
3844///
3845/// This intrinsic is a utility function and does not correspond to a specific
3846///   instruction.
3847///
3848/// \param __w15
3849///    A 16-bit integral value used to initialize bits [15:0] of the result.
3850/// \param __w14
3851///    A 16-bit integral value used to initialize bits [31:16] of the result.
3852/// \param __w13
3853///    A 16-bit integral value used to initialize bits [47:32] of the result.
3854/// \param __w12
3855///    A 16-bit integral value used to initialize bits [63:48] of the result.
3856/// \param __w11
3857///    A 16-bit integral value used to initialize bits [79:64] of the result.
3858/// \param __w10
3859///    A 16-bit integral value used to initialize bits [95:80] of the result.
3860/// \param __w09
3861///    A 16-bit integral value used to initialize bits [111:96] of the result.
3862/// \param __w08
3863///    A 16-bit integral value used to initialize bits [127:112] of the result.
3864/// \param __w07
3865///    A 16-bit integral value used to initialize bits [143:128] of the result.
3866/// \param __w06
3867///    A 16-bit integral value used to initialize bits [159:144] of the result.
3868/// \param __w05
3869///    A 16-bit integral value used to initialize bits [175:160] of the result.
3870/// \param __w04
3871///    A 16-bit integral value used to initialize bits [191:176] of the result.
3872/// \param __w03
3873///    A 16-bit integral value used to initialize bits [207:192] of the result.
3874/// \param __w02
3875///    A 16-bit integral value used to initialize bits [223:208] of the result.
3876/// \param __w01
3877///    A 16-bit integral value used to initialize bits [239:224] of the result.
3878/// \param __w00
3879///    A 16-bit integral value used to initialize bits [255:240] of the result.
3880/// \returns An initialized 256-bit integer vector.
3881static __inline __m256i __DEFAULT_FN_ATTRS
3882_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
3883       short __w11, short __w10, short __w09, short __w08,
3884       short __w07, short __w06, short __w05, short __w04,
3885       short __w03, short __w02, short __w01, short __w00)
3886{
3887  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
3888    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
3889}
3890
3891/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3892///    with the specified 8-bit integral values.
3893///
3894/// \headerfile <x86intrin.h>
3895///
3896/// This intrinsic is a utility function and does not correspond to a specific
3897///   instruction.
3898///
3899/// \param __b31
3900///    An 8-bit integral value used to initialize bits [7:0] of the result.
3901/// \param __b30
3902///    An 8-bit integral value used to initialize bits [15:8] of the result.
3903/// \param __b29
3904///    An 8-bit integral value used to initialize bits [23:16] of the result.
3905/// \param __b28
3906///    An 8-bit integral value used to initialize bits [31:24] of the result.
3907/// \param __b27
3908///    An 8-bit integral value used to initialize bits [39:32] of the result.
3909/// \param __b26
3910///    An 8-bit integral value used to initialize bits [47:40] of the result.
3911/// \param __b25
3912///    An 8-bit integral value used to initialize bits [55:48] of the result.
3913/// \param __b24
3914///    An 8-bit integral value used to initialize bits [63:56] of the result.
3915/// \param __b23
3916///    An 8-bit integral value used to initialize bits [71:64] of the result.
3917/// \param __b22
3918///    An 8-bit integral value used to initialize bits [79:72] of the result.
3919/// \param __b21
3920///    An 8-bit integral value used to initialize bits [87:80] of the result.
3921/// \param __b20
3922///    An 8-bit integral value used to initialize bits [95:88] of the result.
3923/// \param __b19
3924///    An 8-bit integral value used to initialize bits [103:96] of the result.
3925/// \param __b18
3926///    An 8-bit integral value used to initialize bits [111:104] of the result.
3927/// \param __b17
3928///    An 8-bit integral value used to initialize bits [119:112] of the result.
3929/// \param __b16
3930///    An 8-bit integral value used to initialize bits [127:120] of the result.
3931/// \param __b15
3932///    An 8-bit integral value used to initialize bits [135:128] of the result.
3933/// \param __b14
3934///    An 8-bit integral value used to initialize bits [143:136] of the result.
3935/// \param __b13
3936///    An 8-bit integral value used to initialize bits [151:144] of the result.
3937/// \param __b12
3938///    An 8-bit integral value used to initialize bits [159:152] of the result.
3939/// \param __b11
3940///    An 8-bit integral value used to initialize bits [167:160] of the result.
3941/// \param __b10
3942///    An 8-bit integral value used to initialize bits [175:168] of the result.
3943/// \param __b09
3944///    An 8-bit integral value used to initialize bits [183:176] of the result.
3945/// \param __b08
3946///    An 8-bit integral value used to initialize bits [191:184] of the result.
3947/// \param __b07
3948///    An 8-bit integral value used to initialize bits [199:192] of the result.
3949/// \param __b06
3950///    An 8-bit integral value used to initialize bits [207:200] of the result.
3951/// \param __b05
3952///    An 8-bit integral value used to initialize bits [215:208] of the result.
3953/// \param __b04
3954///    An 8-bit integral value used to initialize bits [223:216] of the result.
3955/// \param __b03
3956///    An 8-bit integral value used to initialize bits [231:224] of the result.
3957/// \param __b02
3958///    An 8-bit integral value used to initialize bits [239:232] of the result.
3959/// \param __b01
3960///    An 8-bit integral value used to initialize bits [247:240] of the result.
3961/// \param __b00
3962///    An 8-bit integral value used to initialize bits [255:248] of the result.
3963/// \returns An initialized 256-bit integer vector.
3964static __inline __m256i __DEFAULT_FN_ATTRS
3965_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
3966                 char __b27, char __b26, char __b25, char __b24,
3967                 char __b23, char __b22, char __b21, char __b20,
3968                 char __b19, char __b18, char __b17, char __b16,
3969                 char __b15, char __b14, char __b13, char __b12,
3970                 char __b11, char __b10, char __b09, char __b08,
3971                 char __b07, char __b06, char __b05, char __b04,
3972                 char __b03, char __b02, char __b01, char __b00)
3973{
3974  return (__m256i)(__v32qi){
3975    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
3976    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
3977    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
3978    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
3979}
3980
3981/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3982///    with the specified 64-bit integral values.
3983///
3984/// \headerfile <x86intrin.h>
3985///
3986/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3987///   instruction.
3988///
3989/// \param __a
3990///    A 64-bit integral value used to initialize bits [63:0] of the result.
3991/// \param __b
3992///    A 64-bit integral value used to initialize bits [127:64] of the result.
3993/// \param __c
3994///    A 64-bit integral value used to initialize bits [191:128] of the result.
3995/// \param __d
3996///    A 64-bit integral value used to initialize bits [255:192] of the result.
3997/// \returns An initialized 256-bit integer vector.
3998static __inline __m256i __DEFAULT_FN_ATTRS
3999_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4000{
4001  return (__m256i)(__v4di){ __a, __b, __c, __d };
4002}
4003
4004/* Create vectors with repeated elements */
4005/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
4006///    of the four double-precision floating-point vector elements set to the
4007///    specified double-precision floating-point value.
4008///
4009/// \headerfile <x86intrin.h>
4010///
4011/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4012///
4013/// \param __w
4014///    A double-precision floating-point value used to initialize each vector
4015///    element of the result.
4016/// \returns An initialized 256-bit floating-point vector of [4 x double].
4017static __inline __m256d __DEFAULT_FN_ATTRS
4018_mm256_set1_pd(double __w)
4019{
4020  return (__m256d){ __w, __w, __w, __w };
4021}
4022
4023/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
4024///    of the eight single-precision floating-point vector elements set to the
4025///    specified single-precision floating-point value.
4026///
4027/// \headerfile <x86intrin.h>
4028///
4029/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4030///   instruction.
4031///
4032/// \param __w
4033///    A single-precision floating-point value used to initialize each vector
4034///    element of the result.
4035/// \returns An initialized 256-bit floating-point vector of [8 x float].
4036static __inline __m256 __DEFAULT_FN_ATTRS
4037_mm256_set1_ps(float __w)
4038{
4039  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
4040}
4041
4042/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
4043///    32-bit integral vector elements set to the specified 32-bit integral
4044///    value.
4045///
4046/// \headerfile <x86intrin.h>
4047///
4048/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4049///   instruction.
4050///
4051/// \param __i
4052///    A 32-bit integral value used to initialize each vector element of the
4053///    result.
4054/// \returns An initialized 256-bit integer vector of [8 x i32].
4055static __inline __m256i __DEFAULT_FN_ATTRS
4056_mm256_set1_epi32(int __i)
4057{
4058  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
4059}
4060
4061/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
4062///    16-bit integral vector elements set to the specified 16-bit integral
4063///    value.
4064///
4065/// \headerfile <x86intrin.h>
4066///
4067/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4068///
4069/// \param __w
4070///    A 16-bit integral value used to initialize each vector element of the
4071///    result.
4072/// \returns An initialized 256-bit integer vector of [16 x i16].
4073static __inline __m256i __DEFAULT_FN_ATTRS
4074_mm256_set1_epi16(short __w)
4075{
4076  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
4077    __w, __w, __w, __w, __w, __w };
4078}
4079
4080/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
4081///    8-bit integral vector elements set to the specified 8-bit integral value.
4082///
4083/// \headerfile <x86intrin.h>
4084///
4085/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4086///
4087/// \param __b
4088///    An 8-bit integral value used to initialize each vector element of the
4089///    result.
4090/// \returns An initialized 256-bit integer vector of [32 x i8].
4091static __inline __m256i __DEFAULT_FN_ATTRS
4092_mm256_set1_epi8(char __b)
4093{
4094  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4095    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4096    __b, __b, __b, __b, __b, __b, __b };
4097}
4098
4099/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
4100///    64-bit integral vector elements set to the specified 64-bit integral
4101///    value.
4102///
4103/// \headerfile <x86intrin.h>
4104///
4105/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4106///
4107/// \param __q
4108///    A 64-bit integral value used to initialize each vector element of the
4109///    result.
4110/// \returns An initialized 256-bit integer vector of [4 x i64].
4111static __inline __m256i __DEFAULT_FN_ATTRS
4112_mm256_set1_epi64x(long long __q)
4113{
4114  return (__m256i)(__v4di){ __q, __q, __q, __q };
4115}
4116
/* Create zeroed vectors */
4118/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
4119///    vector elements initialized to zero.
4120///
4121/// \headerfile <x86intrin.h>
4122///
4123/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4124///
4125/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4126static __inline __m256d __DEFAULT_FN_ATTRS
4127_mm256_setzero_pd(void)
4128{
4129  return (__m256d){ 0, 0, 0, 0 };
4130}
4131
4132/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
4133///    vector elements initialized to zero.
4134///
4135/// \headerfile <x86intrin.h>
4136///
4137/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4138///
4139/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4140static __inline __m256 __DEFAULT_FN_ATTRS
4141_mm256_setzero_ps(void)
4142{
4143  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
4144}
4145
4146/// \brief Constructs a 256-bit integer vector initialized to zero.
4147///
4148/// \headerfile <x86intrin.h>
4149///
4150/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4151///
4152/// \returns A 256-bit integer vector initialized to zero.
4153static __inline __m256i __DEFAULT_FN_ATTRS
4154_mm256_setzero_si256(void)
4155{
4156  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
4157}
4158
4159/* Cast between vector types */
4160/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4161///    floating-point vector of [8 x float].
4162///
4163/// \headerfile <x86intrin.h>
4164///
4165/// This intrinsic has no corresponding instruction.
4166///
4167/// \param __a
4168///    A 256-bit floating-point vector of [4 x double].
4169/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4170///    bitwise pattern as the parameter.
4171static __inline __m256 __DEFAULT_FN_ATTRS
4172_mm256_castpd_ps(__m256d __a)
4173{
4174  return (__m256)__a;
4175}
4176
4177/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4178///    integer vector.
4179///
4180/// \headerfile <x86intrin.h>
4181///
4182/// This intrinsic has no corresponding instruction.
4183///
4184/// \param __a
4185///    A 256-bit floating-point vector of [4 x double].
4186/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4187///    parameter.
4188static __inline __m256i __DEFAULT_FN_ATTRS
4189_mm256_castpd_si256(__m256d __a)
4190{
4191  return (__m256i)__a;
4192}
4193
4194/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4195///    floating-point vector of [4 x double].
4196///
4197/// \headerfile <x86intrin.h>
4198///
4199/// This intrinsic has no corresponding instruction.
4200///
4201/// \param __a
4202///    A 256-bit floating-point vector of [8 x float].
4203/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4204///    bitwise pattern as the parameter.
4205static __inline __m256d __DEFAULT_FN_ATTRS
4206_mm256_castps_pd(__m256 __a)
4207{
4208  return (__m256d)__a;
4209}
4210
4211/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4212///    integer vector.
4213///
4214/// \headerfile <x86intrin.h>
4215///
4216/// This intrinsic has no corresponding instruction.
4217///
4218/// \param __a
4219///    A 256-bit floating-point vector of [8 x float].
4220/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4221///    parameter.
4222static __inline __m256i __DEFAULT_FN_ATTRS
4223_mm256_castps_si256(__m256 __a)
4224{
4225  return (__m256i)__a;
4226}
4227
4228/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4229///    of [8 x float].
4230///
4231/// \headerfile <x86intrin.h>
4232///
4233/// This intrinsic has no corresponding instruction.
4234///
4235/// \param __a
4236///    A 256-bit integer vector.
4237/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4238///    bitwise pattern as the parameter.
4239static __inline __m256 __DEFAULT_FN_ATTRS
4240_mm256_castsi256_ps(__m256i __a)
4241{
4242  return (__m256)__a;
4243}
4244
4245/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4246///    of [4 x double].
4247///
4248/// \headerfile <x86intrin.h>
4249///
4250/// This intrinsic has no corresponding instruction.
4251///
4252/// \param __a
4253///    A 256-bit integer vector.
4254/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4255///    bitwise pattern as the parameter.
4256static __inline __m256d __DEFAULT_FN_ATTRS
4257_mm256_castsi256_pd(__m256i __a)
4258{
4259  return (__m256d)__a;
4260}
4261
4262/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4263///    [4 x double] as a 128-bit floating-point vector of [2 x double].
4264///
4265/// \headerfile <x86intrin.h>
4266///
4267/// This intrinsic has no corresponding instruction.
4268///
4269/// \param __a
4270///    A 256-bit floating-point vector of [4 x double].
4271/// \returns A 128-bit floating-point vector of [2 x double] containing the
4272///    lower 128 bits of the parameter.
4273static __inline __m128d __DEFAULT_FN_ATTRS
4274_mm256_castpd256_pd128(__m256d __a)
4275{
4276  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4277}
4278
4279/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4280///    [8 x float] as a 128-bit floating-point vector of [4 x float].
4281///
4282/// \headerfile <x86intrin.h>
4283///
4284/// This intrinsic has no corresponding instruction.
4285///
4286/// \param __a
4287///    A 256-bit floating-point vector of [8 x float].
4288/// \returns A 128-bit floating-point vector of [4 x float] containing the
4289///    lower 128 bits of the parameter.
4290static __inline __m128 __DEFAULT_FN_ATTRS
4291_mm256_castps256_ps128(__m256 __a)
4292{
4293  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4294}
4295
4296/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
4297///
4298/// \headerfile <x86intrin.h>
4299///
4300/// This intrinsic has no corresponding instruction.
4301///
4302/// \param __a
4303///    A 256-bit integer vector.
4304/// \returns A 128-bit integer vector containing the lower 128 bits of the
4305///    parameter.
4306static __inline __m128i __DEFAULT_FN_ATTRS
4307_mm256_castsi256_si128(__m256i __a)
4308{
4309  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4310}
4311
4312/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
4313///    128-bit floating-point vector of [2 x double]. The lower 128 bits
4314///    contain the value of the source vector. The contents of the upper 128
4315///    bits are undefined.
4316///
4317/// \headerfile <x86intrin.h>
4318///
4319/// This intrinsic has no corresponding instruction.
4320///
4321/// \param __a
4322///    A 128-bit vector of [2 x double].
4323/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4324///    contain the value of the parameter. The contents of the upper 128 bits
4325///    are undefined.
4326static __inline __m256d __DEFAULT_FN_ATTRS
4327_mm256_castpd128_pd256(__m128d __a)
4328{
4329  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
4330}
4331
4332/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
4333///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4334///    the value of the source vector. The contents of the upper 128 bits are
4335///    undefined.
4336///
4337/// \headerfile <x86intrin.h>
4338///
4339/// This intrinsic has no corresponding instruction.
4340///
4341/// \param __a
4342///    A 128-bit vector of [4 x float].
4343/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4344///    contain the value of the parameter. The contents of the upper 128 bits
4345///    are undefined.
4346static __inline __m256 __DEFAULT_FN_ATTRS
4347_mm256_castps128_ps256(__m128 __a)
4348{
4349  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
4350}
4351
4352/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
4353///    The lower 128 bits contain the value of the source vector. The contents
4354///    of the upper 128 bits are undefined.
4355///
4356/// \headerfile <x86intrin.h>
4357///
4358/// This intrinsic has no corresponding instruction.
4359///
4360/// \param __a
4361///    A 128-bit integer vector.
4362/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4363///    the parameter. The contents of the upper 128 bits are undefined.
4364static __inline __m256i __DEFAULT_FN_ATTRS
4365_mm256_castsi128_si256(__m128i __a)
4366{
4367  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
4368}
4369
4370/*
4371   Vector insert.
4372   We use macros rather than inlines because we only want to accept
4373   invocations where the immediate M is a constant expression.
4374*/
/// \brief Produces a 256-bit vector of [8 x float] that is a copy of the
///    256-bit vector of [8 x float] in the first parameter with one 128-bit
///    half replaced by the 128-bit vector of [4 x float] in the second
///    parameter. The immediate integer parameter selects whether the upper or
///    the lower 128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It supplies the 128-bit half of the
///    result that is not overwritten by \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. Its contents are written to either the
///    upper or the lower 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    ((((M) & 1) == 0) ?  8 :  0), \
    ((((M) & 1) == 0) ?  9 :  1), \
    ((((M) & 1) == 0) ? 10 :  2), \
    ((((M) & 1) == 0) ? 11 :  3), \
    ((((M) & 1) == 0) ?  4 :  8), \
    ((((M) & 1) == 0) ?  5 :  9), \
    ((((M) & 1) == 0) ?  6 : 10), \
    ((((M) & 1) == 0) ?  7 : 11) );})
4419
/// \brief Builds a 256-bit vector of [4 x double] from a copy of the 256-bit
///    vector of [4 x double] in the first parameter in which one 128-bit half
///    has been overwritten with the 128-bit vector of [2 x double] in the
///    second parameter. The immediate integer parameter chooses which half is
///    overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the 128-bit half of the
///    result that is not overwritten by \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. It is written to either the lower or
///    the upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only bit [0] is examined: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined halves.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    /* result elements 0-1: widened V2 (4-5) when bit 0 of M is clear */ \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    /* result elements 2-3: widened V2 (4-5) when bit 0 of M is set */ \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})
4460
/// \brief Builds a 256-bit integer vector from a copy of the 256-bit integer
///    vector in the first parameter in which one 128-bit half has been
///    overwritten with the 128-bit integer vector in the second parameter.
///    The immediate integer parameter chooses which half is overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the 128-bit half of the result
///    that is not overwritten by \a V2.
/// \param V2
///    A 128-bit integer vector. It is written to either the lower or the
///    upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only bit [0] is examined: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined halves.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    /* result elements 0-1: widened V2 (4-5) when bit 0 of M is clear */ \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    /* result elements 2-3: widened V2 (4-5) when bit 0 of M is set */ \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})
4501
/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Returns one 128-bit half of a 256-bit vector of [8 x float] as a
///    128-bit vector of [4 x float]. The immediate integer parameter selects
///    whether the lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. Only bit [0] is examined: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    /* all indices are below 8, so every element is taken from V */ \
    ((((M) & 1) == 0) ? 0 : 4), \
    ((((M) & 1) == 0) ? 1 : 5), \
    ((((M) & 1) == 0) ? 2 : 6), \
    ((((M) & 1) == 0) ? 3 : 7) );})
4536
/// \brief Returns one 128-bit half of a 256-bit vector of [4 x double] as a
///    128-bit vector of [2 x double]. The immediate integer parameter selects
///    whether the lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. Only bit [0] is examined: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    /* both indices are below 4, so every element is taken from V */ \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})
4564
/// \brief Returns one 128-bit half of a 256-bit integer vector as a 128-bit
///    integer vector. The immediate integer parameter selects whether the
///    lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. Only bit [0] is examined: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    /* both indices are below 4, so every element is taken from V */ \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})
4592
/* SIMD load ops (unaligned) */
4594/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
4595///    unaligned memory locations and constructs a 256-bit floating-point vector
4596///    of [8 x float] by concatenating the two 128-bit vectors.
4597///
4598/// \headerfile <x86intrin.h>
4599///
4600/// This intrinsic corresponds to load instructions followed by the
4601///   <c> VINSERTF128 </c> instruction.
4602///
4603/// \param __addr_hi
4604///    A pointer to a 128-bit memory location containing 4 consecutive
4605///    single-precision floating-point values. These values are to be copied to
4606///    bits[255:128] of the result. The address of the memory location does not
4607///    have to be aligned.
4608/// \param __addr_lo
4609///    A pointer to a 128-bit memory location containing 4 consecutive
4610///    single-precision floating-point values. These values are to be copied to
4611///    bits[127:0] of the result. The address of the memory location does not
4612///    have to be aligned.
4613/// \returns A 256-bit floating-point vector of [8 x float] containing the
4614///    concatenated result.
4615static __inline __m256 __DEFAULT_FN_ATTRS
4616_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4617{
4618  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4619  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
4620}
4621
4622/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
4623///    unaligned memory locations and constructs a 256-bit floating-point vector
4624///    of [4 x double] by concatenating the two 128-bit vectors.
4625///
4626/// \headerfile <x86intrin.h>
4627///
4628/// This intrinsic corresponds to load instructions followed by the
4629///   <c> VINSERTF128 </c> instruction.
4630///
4631/// \param __addr_hi
4632///    A pointer to a 128-bit memory location containing two consecutive
4633///    double-precision floating-point values. These values are to be copied to
4634///    bits[255:128] of the result. The address of the memory location does not
4635///    have to be aligned.
4636/// \param __addr_lo
4637///    A pointer to a 128-bit memory location containing two consecutive
4638///    double-precision floating-point values. These values are to be copied to
4639///    bits[127:0] of the result. The address of the memory location does not
4640///    have to be aligned.
4641/// \returns A 256-bit floating-point vector of [4 x double] containing the
4642///    concatenated result.
4643static __inline __m256d __DEFAULT_FN_ATTRS
4644_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4645{
4646  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4647  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
4648}
4649
4650/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
4651///    constructs a 256-bit integer vector by concatenating the two 128-bit
4652///    vectors.
4653///
4654/// \headerfile <x86intrin.h>
4655///
4656/// This intrinsic corresponds to load instructions followed by the
4657///   <c> VINSERTF128 </c> instruction.
4658///
4659/// \param __addr_hi
4660///    A pointer to a 128-bit memory location containing a 128-bit integer
4661///    vector. This vector is to be copied to bits[255:128] of the result. The
4662///    address of the memory location does not have to be aligned.
4663/// \param __addr_lo
4664///    A pointer to a 128-bit memory location containing a 128-bit integer
4665///    vector. This vector is to be copied to bits[127:0] of the result. The
4666///    address of the memory location does not have to be aligned.
4667/// \returns A 256-bit integer vector containing the concatenated result.
4668static __inline __m256i __DEFAULT_FN_ATTRS
4669_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
4670{
4671  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4672  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
4673}
4674
/* SIMD store ops (unaligned) */
4676/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4677///    vector of [8 x float] into two different unaligned memory locations.
4678///
4679/// \headerfile <x86intrin.h>
4680///
4681/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4682///   store instructions.
4683///
4684/// \param __addr_hi
4685///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4686///    copied to this memory location. The address of this memory location does
4687///    not have to be aligned.
4688/// \param __addr_lo
4689///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4690///    copied to this memory location. The address of this memory location does
4691///    not have to be aligned.
4692/// \param __a
4693///    A 256-bit floating-point vector of [8 x float].
4694static __inline void __DEFAULT_FN_ATTRS
4695_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4696{
4697  __m128 __v128;
4698
4699  __v128 = _mm256_castps256_ps128(__a);
4700  _mm_storeu_ps(__addr_lo, __v128);
4701  __v128 = _mm256_extractf128_ps(__a, 1);
4702  _mm_storeu_ps(__addr_hi, __v128);
4703}
4704
4705/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4706///    vector of [4 x double] into two different unaligned memory locations.
4707///
4708/// \headerfile <x86intrin.h>
4709///
4710/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4711///   store instructions.
4712///
4713/// \param __addr_hi
4714///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4715///    copied to this memory location. The address of this memory location does
4716///    not have to be aligned.
4717/// \param __addr_lo
4718///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4719///    copied to this memory location. The address of this memory location does
4720///    not have to be aligned.
4721/// \param __a
4722///    A 256-bit floating-point vector of [4 x double].
4723static __inline void __DEFAULT_FN_ATTRS
4724_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
4725{
4726  __m128d __v128;
4727
4728  __v128 = _mm256_castpd256_pd128(__a);
4729  _mm_storeu_pd(__addr_lo, __v128);
4730  __v128 = _mm256_extractf128_pd(__a, 1);
4731  _mm_storeu_pd(__addr_hi, __v128);
4732}
4733
4734/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
4735///    two different unaligned memory locations.
4736///
4737/// \headerfile <x86intrin.h>
4738///
4739/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4740///   store instructions.
4741///
4742/// \param __addr_hi
4743///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4744///    copied to this memory location. The address of this memory location does
4745///    not have to be aligned.
4746/// \param __addr_lo
4747///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4748///    copied to this memory location. The address of this memory location does
4749///    not have to be aligned.
4750/// \param __a
4751///    A 256-bit integer vector.
4752static __inline void __DEFAULT_FN_ATTRS
4753_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
4754{
4755  __m128i __v128;
4756
4757  __v128 = _mm256_castsi256_si128(__a);
4758  _mm_storeu_si128(__addr_lo, __v128);
4759  __v128 = _mm256_extractf128_si256(__a, 1);
4760  _mm_storeu_si128(__addr_hi, __v128);
4761}
4762
4763/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
4764///    concatenating two 128-bit floating-point vectors of [4 x float].
4765///
4766/// \headerfile <x86intrin.h>
4767///
4768/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4769///
4770/// \param __hi
4771///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
4772///    128 bits of the result.
4773/// \param __lo
4774///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
4775///    128 bits of the result.
4776/// \returns A 256-bit floating-point vector of [8 x float] containing the
4777///    concatenated result.
4778static __inline __m256 __DEFAULT_FN_ATTRS
4779_mm256_set_m128 (__m128 __hi, __m128 __lo)
4780{
4781  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4782}
4783
4784/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
4785///    concatenating two 128-bit floating-point vectors of [2 x double].
4786///
4787/// \headerfile <x86intrin.h>
4788///
4789/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4790///
4791/// \param __hi
4792///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
4793///    128 bits of the result.
4794/// \param __lo
4795///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
4796///    128 bits of the result.
4797/// \returns A 256-bit floating-point vector of [4 x double] containing the
4798///    concatenated result.
4799static __inline __m256d __DEFAULT_FN_ATTRS
4800_mm256_set_m128d (__m128d __hi, __m128d __lo)
4801{
4802  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4803}
4804
4805/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
4806///    integer vectors.
4807///
4808/// \headerfile <x86intrin.h>
4809///
4810/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4811///
4812/// \param __hi
4813///    A 128-bit integer vector to be copied to the upper 128 bits of the
4814///    result.
4815/// \param __lo
4816///    A 128-bit integer vector to be copied to the lower 128 bits of the
4817///    result.
4818/// \returns A 256-bit integer vector containing the concatenated result.
4819static __inline __m256i __DEFAULT_FN_ATTRS
4820_mm256_set_m128i (__m128i __hi, __m128i __lo)
4821{
4822  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4823}
4824
4825/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
4826///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
4827///    similar to _mm256_set_m128, but the order of the input parameters is
4828///    swapped.
4829///
4830/// \headerfile <x86intrin.h>
4831///
4832/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4833///
4834/// \param __lo
4835///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
4836///    128 bits of the result.
4837/// \param __hi
4838///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
4839///    128 bits of the result.
4840/// \returns A 256-bit floating-point vector of [8 x float] containing the
4841///    concatenated result.
4842static __inline __m256 __DEFAULT_FN_ATTRS
4843_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4844{
4845  return _mm256_set_m128(__hi, __lo);
4846}
4847
4848/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
4849///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
4850///    similar to _mm256_set_m128d, but the order of the input parameters is
4851///    swapped.
4852///
4853/// \headerfile <x86intrin.h>
4854///
4855/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4856///
4857/// \param __lo
4858///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
4859///    128 bits of the result.
4860/// \param __hi
4861///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
4862///    128 bits of the result.
4863/// \returns A 256-bit floating-point vector of [4 x double] containing the
4864///    concatenated result.
4865static __inline __m256d __DEFAULT_FN_ATTRS
4866_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4867{
4868  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4869}
4870
4871/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
4872///    integer vectors. This is similar to _mm256_set_m128i, but the order of
4873///    the input parameters is swapped.
4874///
4875/// \headerfile <x86intrin.h>
4876///
4877/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4878///
4879/// \param __lo
4880///    A 128-bit integer vector to be copied to the lower 128 bits of the
4881///    result.
4882/// \param __hi
4883///    A 128-bit integer vector to be copied to the upper 128 bits of the
4884///    result.
4885/// \returns A 256-bit integer vector containing the concatenated result.
4886static __inline __m256i __DEFAULT_FN_ATTRS
4887_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4888{
4889  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4890}
4891
4892#undef __DEFAULT_FN_ATTRS
4893
4894#endif /* __AVXINTRIN_H */
4895