1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __EMMINTRIN_H
11#define __EMMINTRIN_H
12
13#include <xmmintrin.h>
14
/* 16-byte vector types declared with 16-byte alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* The same two layouts declared with an alignment of 1. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Element views of a 16-byte vector (no alignment attribute attached). */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned element views. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* Plain char does not pin down signedness, so an explicitly signed variant is
 * needed as well. It is not supposed to show up in the interface. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

/* Attribute sets attached to the functions defined in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),          \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),      \
                 __min_vector_width__(64)))
39
40/// Adds lower double-precision values in both operands and returns the
41///    sum in the lower 64 bits of the result. The upper 64 bits of the result
42///    are copied from the upper double-precision value of the first operand.
43///
44/// \headerfile <x86intrin.h>
45///
46/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
47///
48/// \param __a
49///    A 128-bit vector of [2 x double] containing one of the source operands.
50/// \param __b
51///    A 128-bit vector of [2 x double] containing one of the source operands.
52/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
53///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
54///    from the upper 64 bits of the first source operand.
55static __inline__ __m128d __DEFAULT_FN_ATTRS
56_mm_add_sd(__m128d __a, __m128d __b)
57{
58  __a[0] += __b[0];
59  return __a;
60}
61
62/// Adds two 128-bit vectors of [2 x double].
63///
64/// \headerfile <x86intrin.h>
65///
66/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
67///
68/// \param __a
69///    A 128-bit vector of [2 x double] containing one of the source operands.
70/// \param __b
71///    A 128-bit vector of [2 x double] containing one of the source operands.
72/// \returns A 128-bit vector of [2 x double] containing the sums of both
73///    operands.
74static __inline__ __m128d __DEFAULT_FN_ATTRS
75_mm_add_pd(__m128d __a, __m128d __b)
76{
77  return (__m128d)((__v2df)__a + (__v2df)__b);
78}
79
80/// Subtracts the lower double-precision value of the second operand
81///    from the lower double-precision value of the first operand and returns
82///    the difference in the lower 64 bits of the result. The upper 64 bits of
83///    the result are copied from the upper double-precision value of the first
84///    operand.
85///
86/// \headerfile <x86intrin.h>
87///
88/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
89///
90/// \param __a
91///    A 128-bit vector of [2 x double] containing the minuend.
92/// \param __b
93///    A 128-bit vector of [2 x double] containing the subtrahend.
94/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
95///    difference of the lower 64 bits of both operands. The upper 64 bits are
96///    copied from the upper 64 bits of the first source operand.
97static __inline__ __m128d __DEFAULT_FN_ATTRS
98_mm_sub_sd(__m128d __a, __m128d __b)
99{
100  __a[0] -= __b[0];
101  return __a;
102}
103
104/// Subtracts two 128-bit vectors of [2 x double].
105///
106/// \headerfile <x86intrin.h>
107///
108/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
109///
110/// \param __a
111///    A 128-bit vector of [2 x double] containing the minuend.
112/// \param __b
113///    A 128-bit vector of [2 x double] containing the subtrahend.
114/// \returns A 128-bit vector of [2 x double] containing the differences between
115///    both operands.
116static __inline__ __m128d __DEFAULT_FN_ATTRS
117_mm_sub_pd(__m128d __a, __m128d __b)
118{
119  return (__m128d)((__v2df)__a - (__v2df)__b);
120}
121
122/// Multiplies lower double-precision values in both operands and returns
123///    the product in the lower 64 bits of the result. The upper 64 bits of the
124///    result are copied from the upper double-precision value of the first
125///    operand.
126///
127/// \headerfile <x86intrin.h>
128///
129/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
130///
131/// \param __a
132///    A 128-bit vector of [2 x double] containing one of the source operands.
133/// \param __b
134///    A 128-bit vector of [2 x double] containing one of the source operands.
135/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
136///    product of the lower 64 bits of both operands. The upper 64 bits are
137///    copied from the upper 64 bits of the first source operand.
138static __inline__ __m128d __DEFAULT_FN_ATTRS
139_mm_mul_sd(__m128d __a, __m128d __b)
140{
141  __a[0] *= __b[0];
142  return __a;
143}
144
145/// Multiplies two 128-bit vectors of [2 x double].
146///
147/// \headerfile <x86intrin.h>
148///
149/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
150///
151/// \param __a
152///    A 128-bit vector of [2 x double] containing one of the operands.
153/// \param __b
154///    A 128-bit vector of [2 x double] containing one of the operands.
155/// \returns A 128-bit vector of [2 x double] containing the products of both
156///    operands.
157static __inline__ __m128d __DEFAULT_FN_ATTRS
158_mm_mul_pd(__m128d __a, __m128d __b)
159{
160  return (__m128d)((__v2df)__a * (__v2df)__b);
161}
162
163/// Divides the lower double-precision value of the first operand by the
164///    lower double-precision value of the second operand and returns the
165///    quotient in the lower 64 bits of the result. The upper 64 bits of the
166///    result are copied from the upper double-precision value of the first
167///    operand.
168///
169/// \headerfile <x86intrin.h>
170///
171/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
172///
173/// \param __a
174///    A 128-bit vector of [2 x double] containing the dividend.
175/// \param __b
///    A 128-bit vector of [2 x double] containing the divisor.
177/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
178///    quotient of the lower 64 bits of both operands. The upper 64 bits are
179///    copied from the upper 64 bits of the first source operand.
180static __inline__ __m128d __DEFAULT_FN_ATTRS
181_mm_div_sd(__m128d __a, __m128d __b)
182{
183  __a[0] /= __b[0];
184  return __a;
185}
186
187/// Performs an element-by-element division of two 128-bit vectors of
188///    [2 x double].
189///
190/// \headerfile <x86intrin.h>
191///
192/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
193///
194/// \param __a
195///    A 128-bit vector of [2 x double] containing the dividend.
196/// \param __b
197///    A 128-bit vector of [2 x double] containing the divisor.
198/// \returns A 128-bit vector of [2 x double] containing the quotients of both
199///    operands.
200static __inline__ __m128d __DEFAULT_FN_ATTRS
201_mm_div_pd(__m128d __a, __m128d __b)
202{
203  return (__m128d)((__v2df)__a / (__v2df)__b);
204}
205
206/// Calculates the square root of the lower double-precision value of
207///    the second operand and returns it in the lower 64 bits of the result.
208///    The upper 64 bits of the result are copied from the upper
209///    double-precision value of the first operand.
210///
211/// \headerfile <x86intrin.h>
212///
213/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
214///
215/// \param __a
216///    A 128-bit vector of [2 x double] containing one of the operands. The
217///    upper 64 bits of this operand are copied to the upper 64 bits of the
218///    result.
219/// \param __b
220///    A 128-bit vector of [2 x double] containing one of the operands. The
221///    square root is calculated using the lower 64 bits of this operand.
222/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
223///    square root of the lower 64 bits of operand \a __b, and whose upper 64
224///    bits are copied from the upper 64 bits of operand \a __a.
225static __inline__ __m128d __DEFAULT_FN_ATTRS
226_mm_sqrt_sd(__m128d __a, __m128d __b)
227{
228  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
229  return __extension__ (__m128d) { __c[0], __a[1] };
230}
231
/// Calculates the square root of each of the two values stored in a
233///    128-bit vector of [2 x double].
234///
235/// \headerfile <x86intrin.h>
236///
237/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
238///
239/// \param __a
240///    A 128-bit vector of [2 x double].
241/// \returns A 128-bit vector of [2 x double] containing the square roots of the
242///    values in the operand.
243static __inline__ __m128d __DEFAULT_FN_ATTRS
244_mm_sqrt_pd(__m128d __a)
245{
246  return __builtin_ia32_sqrtpd((__v2df)__a);
247}
248
249/// Compares lower 64-bit double-precision values of both operands, and
250///    returns the lesser of the pair of values in the lower 64-bits of the
251///    result. The upper 64 bits of the result are copied from the upper
252///    double-precision value of the first operand.
253///
254/// \headerfile <x86intrin.h>
255///
256/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
257///
258/// \param __a
259///    A 128-bit vector of [2 x double] containing one of the operands. The
260///    lower 64 bits of this operand are used in the comparison.
261/// \param __b
262///    A 128-bit vector of [2 x double] containing one of the operands. The
263///    lower 64 bits of this operand are used in the comparison.
264/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
265///    minimum value between both operands. The upper 64 bits are copied from
266///    the upper 64 bits of the first source operand.
267static __inline__ __m128d __DEFAULT_FN_ATTRS
268_mm_min_sd(__m128d __a, __m128d __b)
269{
270  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
271}
272
273/// Performs element-by-element comparison of the two 128-bit vectors of
274///    [2 x double] and returns the vector containing the lesser of each pair of
275///    values.
276///
277/// \headerfile <x86intrin.h>
278///
279/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
280///
281/// \param __a
282///    A 128-bit vector of [2 x double] containing one of the operands.
283/// \param __b
284///    A 128-bit vector of [2 x double] containing one of the operands.
285/// \returns A 128-bit vector of [2 x double] containing the minimum values
286///    between both operands.
287static __inline__ __m128d __DEFAULT_FN_ATTRS
288_mm_min_pd(__m128d __a, __m128d __b)
289{
290  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
291}
292
293/// Compares lower 64-bit double-precision values of both operands, and
294///    returns the greater of the pair of values in the lower 64-bits of the
295///    result. The upper 64 bits of the result are copied from the upper
296///    double-precision value of the first operand.
297///
298/// \headerfile <x86intrin.h>
299///
300/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
301///
302/// \param __a
303///    A 128-bit vector of [2 x double] containing one of the operands. The
304///    lower 64 bits of this operand are used in the comparison.
305/// \param __b
306///    A 128-bit vector of [2 x double] containing one of the operands. The
307///    lower 64 bits of this operand are used in the comparison.
308/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
309///    maximum value between both operands. The upper 64 bits are copied from
310///    the upper 64 bits of the first source operand.
311static __inline__ __m128d __DEFAULT_FN_ATTRS
312_mm_max_sd(__m128d __a, __m128d __b)
313{
314  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
315}
316
317/// Performs element-by-element comparison of the two 128-bit vectors of
318///    [2 x double] and returns the vector containing the greater of each pair
319///    of values.
320///
321/// \headerfile <x86intrin.h>
322///
323/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
324///
325/// \param __a
326///    A 128-bit vector of [2 x double] containing one of the operands.
327/// \param __b
328///    A 128-bit vector of [2 x double] containing one of the operands.
329/// \returns A 128-bit vector of [2 x double] containing the maximum values
330///    between both operands.
331static __inline__ __m128d __DEFAULT_FN_ATTRS
332_mm_max_pd(__m128d __a, __m128d __b)
333{
334  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
335}
336
337/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
338///
339/// \headerfile <x86intrin.h>
340///
341/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
342///
343/// \param __a
344///    A 128-bit vector of [2 x double] containing one of the source operands.
345/// \param __b
346///    A 128-bit vector of [2 x double] containing one of the source operands.
347/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
348///    values between both operands.
349static __inline__ __m128d __DEFAULT_FN_ATTRS
350_mm_and_pd(__m128d __a, __m128d __b)
351{
352  return (__m128d)((__v2du)__a & (__v2du)__b);
353}
354
355/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
356///    the one's complement of the values contained in the first source operand.
357///
358/// \headerfile <x86intrin.h>
359///
360/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
361///
362/// \param __a
363///    A 128-bit vector of [2 x double] containing the left source operand. The
364///    one's complement of this value is used in the bitwise AND.
365/// \param __b
366///    A 128-bit vector of [2 x double] containing the right source operand.
367/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
368///    values in the second operand and the one's complement of the first
369///    operand.
370static __inline__ __m128d __DEFAULT_FN_ATTRS
371_mm_andnot_pd(__m128d __a, __m128d __b)
372{
373  return (__m128d)(~(__v2du)__a & (__v2du)__b);
374}
375
376/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
377///
378/// \headerfile <x86intrin.h>
379///
380/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
381///
382/// \param __a
383///    A 128-bit vector of [2 x double] containing one of the source operands.
384/// \param __b
385///    A 128-bit vector of [2 x double] containing one of the source operands.
386/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
387///    values between both operands.
388static __inline__ __m128d __DEFAULT_FN_ATTRS
389_mm_or_pd(__m128d __a, __m128d __b)
390{
391  return (__m128d)((__v2du)__a | (__v2du)__b);
392}
393
394/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
395///
396/// \headerfile <x86intrin.h>
397///
398/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
399///
400/// \param __a
401///    A 128-bit vector of [2 x double] containing one of the source operands.
402/// \param __b
403///    A 128-bit vector of [2 x double] containing one of the source operands.
404/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
405///    values between both operands.
406static __inline__ __m128d __DEFAULT_FN_ATTRS
407_mm_xor_pd(__m128d __a, __m128d __b)
408{
409  return (__m128d)((__v2du)__a ^ (__v2du)__b);
410}
411
412/// Compares each of the corresponding double-precision values of the
413///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
414///    for false, 0xFFFFFFFFFFFFFFFF for true.
415///
416/// \headerfile <x86intrin.h>
417///
418/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
419///
420/// \param __a
421///    A 128-bit vector of [2 x double].
422/// \param __b
423///    A 128-bit vector of [2 x double].
424/// \returns A 128-bit vector containing the comparison results.
425static __inline__ __m128d __DEFAULT_FN_ATTRS
426_mm_cmpeq_pd(__m128d __a, __m128d __b)
427{
428  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
429}
430
431/// Compares each of the corresponding double-precision values of the
432///    128-bit vectors of [2 x double] to determine if the values in the first
433///    operand are less than those in the second operand. Each comparison
434///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
435///
436/// \headerfile <x86intrin.h>
437///
438/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
439///
440/// \param __a
441///    A 128-bit vector of [2 x double].
442/// \param __b
443///    A 128-bit vector of [2 x double].
444/// \returns A 128-bit vector containing the comparison results.
445static __inline__ __m128d __DEFAULT_FN_ATTRS
446_mm_cmplt_pd(__m128d __a, __m128d __b)
447{
448  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
449}
450
451/// Compares each of the corresponding double-precision values of the
452///    128-bit vectors of [2 x double] to determine if the values in the first
453///    operand are less than or equal to those in the second operand.
454///
455///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
456///
457/// \headerfile <x86intrin.h>
458///
459/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
460///
461/// \param __a
462///    A 128-bit vector of [2 x double].
463/// \param __b
464///    A 128-bit vector of [2 x double].
465/// \returns A 128-bit vector containing the comparison results.
466static __inline__ __m128d __DEFAULT_FN_ATTRS
467_mm_cmple_pd(__m128d __a, __m128d __b)
468{
469  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
470}
471
472/// Compares each of the corresponding double-precision values of the
473///    128-bit vectors of [2 x double] to determine if the values in the first
474///    operand are greater than those in the second operand.
475///
476///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
477///
478/// \headerfile <x86intrin.h>
479///
480/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
481///
482/// \param __a
483///    A 128-bit vector of [2 x double].
484/// \param __b
485///    A 128-bit vector of [2 x double].
486/// \returns A 128-bit vector containing the comparison results.
487static __inline__ __m128d __DEFAULT_FN_ATTRS
488_mm_cmpgt_pd(__m128d __a, __m128d __b)
489{
490  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
491}
492
493/// Compares each of the corresponding double-precision values of the
494///    128-bit vectors of [2 x double] to determine if the values in the first
495///    operand are greater than or equal to those in the second operand.
496///
497///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
498///
499/// \headerfile <x86intrin.h>
500///
501/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
502///
503/// \param __a
504///    A 128-bit vector of [2 x double].
505/// \param __b
506///    A 128-bit vector of [2 x double].
507/// \returns A 128-bit vector containing the comparison results.
508static __inline__ __m128d __DEFAULT_FN_ATTRS
509_mm_cmpge_pd(__m128d __a, __m128d __b)
510{
511  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
512}
513
514/// Compares each of the corresponding double-precision values of the
515///    128-bit vectors of [2 x double] to determine if the values in the first
516///    operand are ordered with respect to those in the second operand.
517///
518///    A pair of double-precision values are "ordered" with respect to each
519///    other if neither value is a NaN. Each comparison yields 0x0 for false,
520///    0xFFFFFFFFFFFFFFFF for true.
521///
522/// \headerfile <x86intrin.h>
523///
524/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
525///
526/// \param __a
527///    A 128-bit vector of [2 x double].
528/// \param __b
529///    A 128-bit vector of [2 x double].
530/// \returns A 128-bit vector containing the comparison results.
531static __inline__ __m128d __DEFAULT_FN_ATTRS
532_mm_cmpord_pd(__m128d __a, __m128d __b)
533{
534  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
535}
536
537/// Compares each of the corresponding double-precision values of the
538///    128-bit vectors of [2 x double] to determine if the values in the first
539///    operand are unordered with respect to those in the second operand.
540///
541///    A pair of double-precision values are "unordered" with respect to each
542///    other if one or both values are NaN. Each comparison yields 0x0 for
543///    false, 0xFFFFFFFFFFFFFFFF for true.
544///
545/// \headerfile <x86intrin.h>
546///
547/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
548///   instruction.
549///
550/// \param __a
551///    A 128-bit vector of [2 x double].
552/// \param __b
553///    A 128-bit vector of [2 x double].
554/// \returns A 128-bit vector containing the comparison results.
555static __inline__ __m128d __DEFAULT_FN_ATTRS
556_mm_cmpunord_pd(__m128d __a, __m128d __b)
557{
558  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
559}
560
561/// Compares each of the corresponding double-precision values of the
562///    128-bit vectors of [2 x double] to determine if the values in the first
563///    operand are unequal to those in the second operand.
564///
565///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
566///
567/// \headerfile <x86intrin.h>
568///
569/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
570///
571/// \param __a
572///    A 128-bit vector of [2 x double].
573/// \param __b
574///    A 128-bit vector of [2 x double].
575/// \returns A 128-bit vector containing the comparison results.
576static __inline__ __m128d __DEFAULT_FN_ATTRS
577_mm_cmpneq_pd(__m128d __a, __m128d __b)
578{
579  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
580}
581
582/// Compares each of the corresponding double-precision values of the
583///    128-bit vectors of [2 x double] to determine if the values in the first
584///    operand are not less than those in the second operand.
585///
586///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
587///
588/// \headerfile <x86intrin.h>
589///
590/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
591///
592/// \param __a
593///    A 128-bit vector of [2 x double].
594/// \param __b
595///    A 128-bit vector of [2 x double].
596/// \returns A 128-bit vector containing the comparison results.
597static __inline__ __m128d __DEFAULT_FN_ATTRS
598_mm_cmpnlt_pd(__m128d __a, __m128d __b)
599{
600  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
601}
602
603/// Compares each of the corresponding double-precision values of the
604///    128-bit vectors of [2 x double] to determine if the values in the first
605///    operand are not less than or equal to those in the second operand.
606///
607///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
608///
609/// \headerfile <x86intrin.h>
610///
611/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
612///
613/// \param __a
614///    A 128-bit vector of [2 x double].
615/// \param __b
616///    A 128-bit vector of [2 x double].
617/// \returns A 128-bit vector containing the comparison results.
618static __inline__ __m128d __DEFAULT_FN_ATTRS
619_mm_cmpnle_pd(__m128d __a, __m128d __b)
620{
621  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
622}
623
624/// Compares each of the corresponding double-precision values of the
625///    128-bit vectors of [2 x double] to determine if the values in the first
626///    operand are not greater than those in the second operand.
627///
628///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
629///
630/// \headerfile <x86intrin.h>
631///
632/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
633///
634/// \param __a
635///    A 128-bit vector of [2 x double].
636/// \param __b
637///    A 128-bit vector of [2 x double].
638/// \returns A 128-bit vector containing the comparison results.
639static __inline__ __m128d __DEFAULT_FN_ATTRS
640_mm_cmpngt_pd(__m128d __a, __m128d __b)
641{
642  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
643}
644
645/// Compares each of the corresponding double-precision values of the
646///    128-bit vectors of [2 x double] to determine if the values in the first
647///    operand are not greater than or equal to those in the second operand.
648///
649///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
650///
651/// \headerfile <x86intrin.h>
652///
653/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
654///
655/// \param __a
656///    A 128-bit vector of [2 x double].
657/// \param __b
658///    A 128-bit vector of [2 x double].
659/// \returns A 128-bit vector containing the comparison results.
660static __inline__ __m128d __DEFAULT_FN_ATTRS
661_mm_cmpnge_pd(__m128d __a, __m128d __b)
662{
663  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
664}
665
666/// Compares the lower double-precision floating-point values in each of
667///    the two 128-bit floating-point vectors of [2 x double] for equality.
668///
669///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
670///
671/// \headerfile <x86intrin.h>
672///
673/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
674///
675/// \param __a
676///    A 128-bit vector of [2 x double]. The lower double-precision value is
677///    compared to the lower double-precision value of \a __b.
678/// \param __b
679///    A 128-bit vector of [2 x double]. The lower double-precision value is
680///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
682///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
683static __inline__ __m128d __DEFAULT_FN_ATTRS
684_mm_cmpeq_sd(__m128d __a, __m128d __b)
685{
686  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
687}
688
689/// Compares the lower double-precision floating-point values in each of
690///    the two 128-bit floating-point vectors of [2 x double] to determine if
691///    the value in the first parameter is less than the corresponding value in
692///    the second parameter.
693///
694///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
695///
696/// \headerfile <x86intrin.h>
697///
698/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
699///
700/// \param __a
701///    A 128-bit vector of [2 x double]. The lower double-precision value is
702///    compared to the lower double-precision value of \a __b.
703/// \param __b
704///    A 128-bit vector of [2 x double]. The lower double-precision value is
705///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
707///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
708static __inline__ __m128d __DEFAULT_FN_ATTRS
709_mm_cmplt_sd(__m128d __a, __m128d __b)
710{
711  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
712}
713
714/// Compares the lower double-precision floating-point values in each of
715///    the two 128-bit floating-point vectors of [2 x double] to determine if
716///    the value in the first parameter is less than or equal to the
717///    corresponding value in the second parameter.
718///
719///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
720///
721/// \headerfile <x86intrin.h>
722///
723/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
724///
725/// \param __a
726///    A 128-bit vector of [2 x double]. The lower double-precision value is
727///    compared to the lower double-precision value of \a __b.
728/// \param __b
729///    A 128-bit vector of [2 x double]. The lower double-precision value is
730///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
732///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
733static __inline__ __m128d __DEFAULT_FN_ATTRS
734_mm_cmple_sd(__m128d __a, __m128d __b)
735{
736  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
737}
738
739/// Compares the lower double-precision floating-point values in each of
740///    the two 128-bit floating-point vectors of [2 x double] to determine if
741///    the value in the first parameter is greater than the corresponding value
742///    in the second parameter.
743///
744///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
745///
746/// \headerfile <x86intrin.h>
747///
748/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
749///
750/// \param __a
751///     A 128-bit vector of [2 x double]. The lower double-precision value is
752///     compared to the lower double-precision value of \a __b.
753/// \param __b
754///     A 128-bit vector of [2 x double]. The lower double-precision value is
755///     compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
757///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
758static __inline__ __m128d __DEFAULT_FN_ATTRS
759_mm_cmpgt_sd(__m128d __a, __m128d __b)
760{
761  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
762  return __extension__ (__m128d) { __c[0], __a[1] };
763}
764
765/// Compares the lower double-precision floating-point values in each of
766///    the two 128-bit floating-point vectors of [2 x double] to determine if
767///    the value in the first parameter is greater than or equal to the
768///    corresponding value in the second parameter.
769///
770///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
771///
772/// \headerfile <x86intrin.h>
773///
774/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
775///
776/// \param __a
777///    A 128-bit vector of [2 x double]. The lower double-precision value is
778///    compared to the lower double-precision value of \a __b.
779/// \param __b
780///    A 128-bit vector of [2 x double]. The lower double-precision value is
781///    compared to the lower double-precision value of \a __a.
782/// \returns A 128-bit vector. The lower 64 bits contains the comparison
783///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
784static __inline__ __m128d __DEFAULT_FN_ATTRS
785_mm_cmpge_sd(__m128d __a, __m128d __b)
786{
787  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
788  return __extension__ (__m128d) { __c[0], __a[1] };
789}
790
791/// Compares the lower double-precision floating-point values in each of
792///    the two 128-bit floating-point vectors of [2 x double] to determine if
793///    the value in the first parameter is "ordered" with respect to the
794///    corresponding value in the second parameter.
795///
796///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
797///    of double-precision values are "ordered" with respect to each other if
798///    neither value is a NaN.
799///
800/// \headerfile <x86intrin.h>
801///
802/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
803///
804/// \param __a
805///    A 128-bit vector of [2 x double]. The lower double-precision value is
806///    compared to the lower double-precision value of \a __b.
807/// \param __b
808///    A 128-bit vector of [2 x double]. The lower double-precision value is
809///    compared to the lower double-precision value of \a __a.
810/// \returns A 128-bit vector. The lower 64 bits contains the comparison
811///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
812static __inline__ __m128d __DEFAULT_FN_ATTRS
813_mm_cmpord_sd(__m128d __a, __m128d __b)
814{
815  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
816}
817
818/// Compares the lower double-precision floating-point values in each of
819///    the two 128-bit floating-point vectors of [2 x double] to determine if
820///    the value in the first parameter is "unordered" with respect to the
821///    corresponding value in the second parameter.
822///
823///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
824///    of double-precision values are "unordered" with respect to each other if
825///    one or both values are NaN.
826///
827/// \headerfile <x86intrin.h>
828///
829/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
830///   instruction.
831///
832/// \param __a
833///    A 128-bit vector of [2 x double]. The lower double-precision value is
834///    compared to the lower double-precision value of \a __b.
835/// \param __b
836///    A 128-bit vector of [2 x double]. The lower double-precision value is
837///    compared to the lower double-precision value of \a __a.
838/// \returns A 128-bit vector. The lower 64 bits contains the comparison
839///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
840static __inline__ __m128d __DEFAULT_FN_ATTRS
841_mm_cmpunord_sd(__m128d __a, __m128d __b)
842{
843  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
844}
845
846/// Compares the lower double-precision floating-point values in each of
847///    the two 128-bit floating-point vectors of [2 x double] to determine if
848///    the value in the first parameter is unequal to the corresponding value in
849///    the second parameter.
850///
851///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
852///
853/// \headerfile <x86intrin.h>
854///
855/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
856///
857/// \param __a
858///    A 128-bit vector of [2 x double]. The lower double-precision value is
859///    compared to the lower double-precision value of \a __b.
860/// \param __b
861///    A 128-bit vector of [2 x double]. The lower double-precision value is
862///    compared to the lower double-precision value of \a __a.
863/// \returns A 128-bit vector. The lower 64 bits contains the comparison
864///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
865static __inline__ __m128d __DEFAULT_FN_ATTRS
866_mm_cmpneq_sd(__m128d __a, __m128d __b)
867{
868  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
869}
870
871/// Compares the lower double-precision floating-point values in each of
872///    the two 128-bit floating-point vectors of [2 x double] to determine if
873///    the value in the first parameter is not less than the corresponding
874///    value in the second parameter.
875///
876///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
877///
878/// \headerfile <x86intrin.h>
879///
880/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
881///
882/// \param __a
883///    A 128-bit vector of [2 x double]. The lower double-precision value is
884///    compared to the lower double-precision value of \a __b.
885/// \param __b
886///    A 128-bit vector of [2 x double]. The lower double-precision value is
887///    compared to the lower double-precision value of \a __a.
888/// \returns A 128-bit vector. The lower 64 bits contains the comparison
889///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
890static __inline__ __m128d __DEFAULT_FN_ATTRS
891_mm_cmpnlt_sd(__m128d __a, __m128d __b)
892{
893  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
894}
895
896/// Compares the lower double-precision floating-point values in each of
897///    the two 128-bit floating-point vectors of [2 x double] to determine if
898///    the value in the first parameter is not less than or equal to the
899///    corresponding value in the second parameter.
900///
901///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
902///
903/// \headerfile <x86intrin.h>
904///
905/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
906///
907/// \param __a
908///    A 128-bit vector of [2 x double]. The lower double-precision value is
909///    compared to the lower double-precision value of \a __b.
910/// \param __b
911///    A 128-bit vector of [2 x double]. The lower double-precision value is
912///    compared to the lower double-precision value of \a __a.
913/// \returns  A 128-bit vector. The lower 64 bits contains the comparison
914///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
915static __inline__ __m128d __DEFAULT_FN_ATTRS
916_mm_cmpnle_sd(__m128d __a, __m128d __b)
917{
918  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
919}
920
921/// Compares the lower double-precision floating-point values in each of
922///    the two 128-bit floating-point vectors of [2 x double] to determine if
923///    the value in the first parameter is not greater than the corresponding
924///    value in the second parameter.
925///
926///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
927///
928/// \headerfile <x86intrin.h>
929///
930/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
931///
932/// \param __a
933///    A 128-bit vector of [2 x double]. The lower double-precision value is
934///    compared to the lower double-precision value of \a __b.
935/// \param __b
936///    A 128-bit vector of [2 x double]. The lower double-precision value is
937///    compared to the lower double-precision value of \a __a.
938/// \returns A 128-bit vector. The lower 64 bits contains the comparison
939///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
940static __inline__ __m128d __DEFAULT_FN_ATTRS
941_mm_cmpngt_sd(__m128d __a, __m128d __b)
942{
943  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
944  return __extension__ (__m128d) { __c[0], __a[1] };
945}
946
947/// Compares the lower double-precision floating-point values in each of
948///    the two 128-bit floating-point vectors of [2 x double] to determine if
949///    the value in the first parameter is not greater than or equal to the
950///    corresponding value in the second parameter.
951///
952///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
953///
954/// \headerfile <x86intrin.h>
955///
956/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
957///
958/// \param __a
959///    A 128-bit vector of [2 x double]. The lower double-precision value is
960///    compared to the lower double-precision value of \a __b.
961/// \param __b
962///    A 128-bit vector of [2 x double]. The lower double-precision value is
963///    compared to the lower double-precision value of \a __a.
964/// \returns A 128-bit vector. The lower 64 bits contains the comparison
965///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
966static __inline__ __m128d __DEFAULT_FN_ATTRS
967_mm_cmpnge_sd(__m128d __a, __m128d __b)
968{
969  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
970  return __extension__ (__m128d) { __c[0], __a[1] };
971}
972
973/// Compares the lower double-precision floating-point values in each of
974///    the two 128-bit floating-point vectors of [2 x double] for equality.
975///
976///    The comparison yields 0 for false, 1 for true. If either of the two
977///    lower double-precision values is NaN, 0 is returned.
978///
979/// \headerfile <x86intrin.h>
980///
981/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
982///
983/// \param __a
984///    A 128-bit vector of [2 x double]. The lower double-precision value is
985///    compared to the lower double-precision value of \a __b.
986/// \param __b
987///    A 128-bit vector of [2 x double]. The lower double-precision value is
988///    compared to the lower double-precision value of \a __a.
989/// \returns An integer containing the comparison results. If either of the two
990///    lower double-precision values is NaN, 0 is returned.
991static __inline__ int __DEFAULT_FN_ATTRS
992_mm_comieq_sd(__m128d __a, __m128d __b)
993{
994  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
995}
996
997/// Compares the lower double-precision floating-point values in each of
998///    the two 128-bit floating-point vectors of [2 x double] to determine if
999///    the value in the first parameter is less than the corresponding value in
1000///    the second parameter.
1001///
1002///    The comparison yields 0 for false, 1 for true. If either of the two
1003///    lower double-precision values is NaN, 0 is returned.
1004///
1005/// \headerfile <x86intrin.h>
1006///
1007/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1008///
1009/// \param __a
1010///    A 128-bit vector of [2 x double]. The lower double-precision value is
1011///    compared to the lower double-precision value of \a __b.
1012/// \param __b
1013///    A 128-bit vector of [2 x double]. The lower double-precision value is
1014///    compared to the lower double-precision value of \a __a.
1015/// \returns An integer containing the comparison results. If either of the two
1016///     lower double-precision values is NaN, 0 is returned.
1017static __inline__ int __DEFAULT_FN_ATTRS
1018_mm_comilt_sd(__m128d __a, __m128d __b)
1019{
1020  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1021}
1022
1023/// Compares the lower double-precision floating-point values in each of
1024///    the two 128-bit floating-point vectors of [2 x double] to determine if
1025///    the value in the first parameter is less than or equal to the
1026///    corresponding value in the second parameter.
1027///
1028///    The comparison yields 0 for false, 1 for true. If either of the two
1029///    lower double-precision values is NaN, 0 is returned.
1030///
1031/// \headerfile <x86intrin.h>
1032///
1033/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1034///
1035/// \param __a
1036///    A 128-bit vector of [2 x double]. The lower double-precision value is
1037///    compared to the lower double-precision value of \a __b.
1038/// \param __b
1039///     A 128-bit vector of [2 x double]. The lower double-precision value is
1040///     compared to the lower double-precision value of \a __a.
1041/// \returns An integer containing the comparison results. If either of the two
1042///     lower double-precision values is NaN, 0 is returned.
1043static __inline__ int __DEFAULT_FN_ATTRS
1044_mm_comile_sd(__m128d __a, __m128d __b)
1045{
1046  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1047}
1048
1049/// Compares the lower double-precision floating-point values in each of
1050///    the two 128-bit floating-point vectors of [2 x double] to determine if
1051///    the value in the first parameter is greater than the corresponding value
1052///    in the second parameter.
1053///
1054///    The comparison yields 0 for false, 1 for true. If either of the two
1055///    lower double-precision values is NaN, 0 is returned.
1056///
1057/// \headerfile <x86intrin.h>
1058///
1059/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1060///
1061/// \param __a
1062///    A 128-bit vector of [2 x double]. The lower double-precision value is
1063///    compared to the lower double-precision value of \a __b.
1064/// \param __b
1065///    A 128-bit vector of [2 x double]. The lower double-precision value is
1066///    compared to the lower double-precision value of \a __a.
1067/// \returns An integer containing the comparison results. If either of the two
1068///     lower double-precision values is NaN, 0 is returned.
1069static __inline__ int __DEFAULT_FN_ATTRS
1070_mm_comigt_sd(__m128d __a, __m128d __b)
1071{
1072  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1073}
1074
1075/// Compares the lower double-precision floating-point values in each of
1076///    the two 128-bit floating-point vectors of [2 x double] to determine if
1077///    the value in the first parameter is greater than or equal to the
1078///    corresponding value in the second parameter.
1079///
1080///    The comparison yields 0 for false, 1 for true. If either of the two
1081///    lower double-precision values is NaN, 0 is returned.
1082///
1083/// \headerfile <x86intrin.h>
1084///
1085/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1086///
1087/// \param __a
1088///    A 128-bit vector of [2 x double]. The lower double-precision value is
1089///    compared to the lower double-precision value of \a __b.
1090/// \param __b
1091///    A 128-bit vector of [2 x double]. The lower double-precision value is
1092///    compared to the lower double-precision value of \a __a.
1093/// \returns An integer containing the comparison results. If either of the two
1094///    lower double-precision values is NaN, 0 is returned.
1095static __inline__ int __DEFAULT_FN_ATTRS
1096_mm_comige_sd(__m128d __a, __m128d __b)
1097{
1098  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1099}
1100
1101/// Compares the lower double-precision floating-point values in each of
1102///    the two 128-bit floating-point vectors of [2 x double] to determine if
1103///    the value in the first parameter is unequal to the corresponding value in
1104///    the second parameter.
1105///
1106///    The comparison yields 0 for false, 1 for true. If either of the two
1107///    lower double-precision values is NaN, 1 is returned.
1108///
1109/// \headerfile <x86intrin.h>
1110///
1111/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1112///
1113/// \param __a
1114///    A 128-bit vector of [2 x double]. The lower double-precision value is
1115///    compared to the lower double-precision value of \a __b.
1116/// \param __b
1117///    A 128-bit vector of [2 x double]. The lower double-precision value is
1118///    compared to the lower double-precision value of \a __a.
1119/// \returns An integer containing the comparison results. If either of the two
1120///     lower double-precision values is NaN, 1 is returned.
1121static __inline__ int __DEFAULT_FN_ATTRS
1122_mm_comineq_sd(__m128d __a, __m128d __b)
1123{
1124  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1125}
1126
1127/// Compares the lower double-precision floating-point values in each of
1128///    the two 128-bit floating-point vectors of [2 x double] for equality. The
1129///    comparison yields 0 for false, 1 for true.
1130///
1131///    If either of the two lower double-precision values is NaN, 0 is returned.
1132///
1133/// \headerfile <x86intrin.h>
1134///
1135/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1136///
1137/// \param __a
1138///    A 128-bit vector of [2 x double]. The lower double-precision value is
1139///    compared to the lower double-precision value of \a __b.
1140/// \param __b
1141///    A 128-bit vector of [2 x double]. The lower double-precision value is
1142///    compared to the lower double-precision value of \a __a.
1143/// \returns An integer containing the comparison results. If either of the two
1144///    lower double-precision values is NaN, 0 is returned.
1145static __inline__ int __DEFAULT_FN_ATTRS
1146_mm_ucomieq_sd(__m128d __a, __m128d __b)
1147{
1148  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1149}
1150
1151/// Compares the lower double-precision floating-point values in each of
1152///    the two 128-bit floating-point vectors of [2 x double] to determine if
1153///    the value in the first parameter is less than the corresponding value in
1154///    the second parameter.
1155///
1156///    The comparison yields 0 for false, 1 for true. If either of the two lower
1157///    double-precision values is NaN, 0 is returned.
1158///
1159/// \headerfile <x86intrin.h>
1160///
1161/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1162///
1163/// \param __a
1164///    A 128-bit vector of [2 x double]. The lower double-precision value is
1165///    compared to the lower double-precision value of \a __b.
1166/// \param __b
1167///    A 128-bit vector of [2 x double]. The lower double-precision value is
1168///    compared to the lower double-precision value of \a __a.
1169/// \returns An integer containing the comparison results. If either of the two
1170///    lower double-precision values is NaN, 0 is returned.
1171static __inline__ int __DEFAULT_FN_ATTRS
1172_mm_ucomilt_sd(__m128d __a, __m128d __b)
1173{
1174  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1175}
1176
1177/// Compares the lower double-precision floating-point values in each of
1178///    the two 128-bit floating-point vectors of [2 x double] to determine if
1179///    the value in the first parameter is less than or equal to the
1180///    corresponding value in the second parameter.
1181///
1182///    The comparison yields 0 for false, 1 for true. If either of the two lower
1183///    double-precision values is NaN, 0 is returned.
1184///
1185/// \headerfile <x86intrin.h>
1186///
1187/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1188///
1189/// \param __a
1190///    A 128-bit vector of [2 x double]. The lower double-precision value is
1191///    compared to the lower double-precision value of \a __b.
1192/// \param __b
1193///     A 128-bit vector of [2 x double]. The lower double-precision value is
1194///     compared to the lower double-precision value of \a __a.
1195/// \returns An integer containing the comparison results. If either of the two
1196///     lower double-precision values is NaN, 0 is returned.
1197static __inline__ int __DEFAULT_FN_ATTRS
1198_mm_ucomile_sd(__m128d __a, __m128d __b)
1199{
1200  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1201}
1202
1203/// Compares the lower double-precision floating-point values in each of
1204///    the two 128-bit floating-point vectors of [2 x double] to determine if
1205///    the value in the first parameter is greater than the corresponding value
1206///    in the second parameter.
1207///
1208///    The comparison yields 0 for false, 1 for true. If either of the two lower
1209///    double-precision values is NaN, 0 is returned.
1210///
1211/// \headerfile <x86intrin.h>
1212///
1213/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1214///
1215/// \param __a
1216///    A 128-bit vector of [2 x double]. The lower double-precision value is
1217///    compared to the lower double-precision value of \a __b.
1218/// \param __b
1219///     A 128-bit vector of [2 x double]. The lower double-precision value is
1220///     compared to the lower double-precision value of \a __a.
1221/// \returns An integer containing the comparison results. If either of the two
1222///     lower double-precision values is NaN, 0 is returned.
1223static __inline__ int __DEFAULT_FN_ATTRS
1224_mm_ucomigt_sd(__m128d __a, __m128d __b)
1225{
1226  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1227}
1228
1229/// Compares the lower double-precision floating-point values in each of
1230///    the two 128-bit floating-point vectors of [2 x double] to determine if
1231///    the value in the first parameter is greater than or equal to the
1232///    corresponding value in the second parameter.
1233///
1234///    The comparison yields 0 for false, 1 for true.  If either of the two
1235///    lower double-precision values is NaN, 0 is returned.
1236///
1237/// \headerfile <x86intrin.h>
1238///
1239/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1240///
1241/// \param __a
1242///    A 128-bit vector of [2 x double]. The lower double-precision value is
1243///    compared to the lower double-precision value of \a __b.
1244/// \param __b
1245///    A 128-bit vector of [2 x double]. The lower double-precision value is
1246///    compared to the lower double-precision value of \a __a.
1247/// \returns An integer containing the comparison results. If either of the two
1248///    lower double-precision values is NaN, 0 is returned.
1249static __inline__ int __DEFAULT_FN_ATTRS
1250_mm_ucomige_sd(__m128d __a, __m128d __b)
1251{
1252  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1253}
1254
1255/// Compares the lower double-precision floating-point values in each of
1256///    the two 128-bit floating-point vectors of [2 x double] to determine if
1257///    the value in the first parameter is unequal to the corresponding value in
1258///    the second parameter.
1259///
1260///    The comparison yields 0 for false, 1 for true. If either of the two lower
1261///    double-precision values is NaN, 1 is returned.
1262///
1263/// \headerfile <x86intrin.h>
1264///
1265/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1266///
1267/// \param __a
1268///    A 128-bit vector of [2 x double]. The lower double-precision value is
1269///    compared to the lower double-precision value of \a __b.
1270/// \param __b
1271///    A 128-bit vector of [2 x double]. The lower double-precision value is
1272///    compared to the lower double-precision value of \a __a.
1273/// \returns An integer containing the comparison result. If either of the two
1274///    lower double-precision values is NaN, 1 is returned.
1275static __inline__ int __DEFAULT_FN_ATTRS
1276_mm_ucomineq_sd(__m128d __a, __m128d __b)
1277{
1278  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1279}
1280
1281/// Converts the two double-precision floating-point elements of a
1282///    128-bit vector of [2 x double] into two single-precision floating-point
1283///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1284///    The upper 64 bits of the result vector are set to zero.
1285///
1286/// \headerfile <x86intrin.h>
1287///
1288/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1289///
1290/// \param __a
1291///    A 128-bit vector of [2 x double].
1292/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1293///    converted values. The upper 64 bits are set to zero.
1294static __inline__ __m128 __DEFAULT_FN_ATTRS
1295_mm_cvtpd_ps(__m128d __a)
1296{
1297  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1298}
1299
1300/// Converts the lower two single-precision floating-point elements of a
1301///    128-bit vector of [4 x float] into two double-precision floating-point
1302///    values, returned in a 128-bit vector of [2 x double]. The upper two
1303///    elements of the input vector are unused.
1304///
1305/// \headerfile <x86intrin.h>
1306///
1307/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1308///
1309/// \param __a
1310///    A 128-bit vector of [4 x float]. The lower two single-precision
1311///    floating-point elements are converted to double-precision values. The
1312///    upper two elements are unused.
1313/// \returns A 128-bit vector of [2 x double] containing the converted values.
1314static __inline__ __m128d __DEFAULT_FN_ATTRS
1315_mm_cvtps_pd(__m128 __a)
1316{
1317  return (__m128d) __builtin_convertvector(
1318      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1319}
1320
1321/// Converts the lower two integer elements of a 128-bit vector of
1322///    [4 x i32] into two double-precision floating-point values, returned in a
1323///    128-bit vector of [2 x double].
1324///
1325///    The upper two elements of the input vector are unused.
1326///
1327/// \headerfile <x86intrin.h>
1328///
1329/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1330///
1331/// \param __a
1332///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1333///    converted to double-precision values.
1334///
1335///    The upper two elements are unused.
1336/// \returns A 128-bit vector of [2 x double] containing the converted values.
1337static __inline__ __m128d __DEFAULT_FN_ATTRS
1338_mm_cvtepi32_pd(__m128i __a)
1339{
1340  return (__m128d) __builtin_convertvector(
1341      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1342}
1343
1344/// Converts the two double-precision floating-point elements of a
1345///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1346///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1347///    64 bits of the result vector are set to zero.
1348///
1349/// \headerfile <x86intrin.h>
1350///
1351/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1352///
1353/// \param __a
1354///    A 128-bit vector of [2 x double].
1355/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1356///    converted values. The upper 64 bits are set to zero.
1357static __inline__ __m128i __DEFAULT_FN_ATTRS
1358_mm_cvtpd_epi32(__m128d __a)
1359{
1360  return __builtin_ia32_cvtpd2dq((__v2df)__a);
1361}
1362
1363/// Converts the low-order element of a 128-bit vector of [2 x double]
1364///    into a 32-bit signed integer value.
1365///
1366/// \headerfile <x86intrin.h>
1367///
1368/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1369///
1370/// \param __a
1371///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1372///    conversion.
1373/// \returns A 32-bit signed integer containing the converted value.
1374static __inline__ int __DEFAULT_FN_ATTRS
1375_mm_cvtsd_si32(__m128d __a)
1376{
1377  return __builtin_ia32_cvtsd2si((__v2df)__a);
1378}
1379
1380/// Converts the lower double-precision floating-point element of a
1381///    128-bit vector of [2 x double], in the second parameter, into a
1382///    single-precision floating-point value, returned in the lower 32 bits of a
1383///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1384///    copied from the upper 96 bits of the first parameter.
1385///
1386/// \headerfile <x86intrin.h>
1387///
1388/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1389///
1390/// \param __a
1391///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1392///    copied to the upper 96 bits of the result.
1393/// \param __b
1394///    A 128-bit vector of [2 x double]. The lower double-precision
1395///    floating-point element is used in the conversion.
1396/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1397///    converted value from the second parameter. The upper 96 bits are copied
1398///    from the upper 96 bits of the first parameter.
1399static __inline__ __m128 __DEFAULT_FN_ATTRS
1400_mm_cvtsd_ss(__m128 __a, __m128d __b)
1401{
1402  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1403}
1404
1405/// Converts a 32-bit signed integer value, in the second parameter, into
1406///    a double-precision floating-point value, returned in the lower 64 bits of
1407///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1408///    are copied from the upper 64 bits of the first parameter.
1409///
1410/// \headerfile <x86intrin.h>
1411///
1412/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1413///
1414/// \param __a
1415///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1416///    copied to the upper 64 bits of the result.
1417/// \param __b
1418///    A 32-bit signed integer containing the value to be converted.
1419/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1420///    converted value from the second parameter. The upper 64 bits are copied
1421///    from the upper 64 bits of the first parameter.
1422static __inline__ __m128d __DEFAULT_FN_ATTRS
1423_mm_cvtsi32_sd(__m128d __a, int __b)
1424{
1425  __a[0] = __b;
1426  return __a;
1427}
1428
1429/// Converts the lower single-precision floating-point element of a
1430///    128-bit vector of [4 x float], in the second parameter, into a
1431///    double-precision floating-point value, returned in the lower 64 bits of
1432///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1433///    are copied from the upper 64 bits of the first parameter.
1434///
1435/// \headerfile <x86intrin.h>
1436///
1437/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1438///
1439/// \param __a
1440///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1441///    copied to the upper 64 bits of the result.
1442/// \param __b
1443///    A 128-bit vector of [4 x float]. The lower single-precision
1444///    floating-point element is used in the conversion.
1445/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1446///    converted value from the second parameter. The upper 64 bits are copied
1447///    from the upper 64 bits of the first parameter.
1448static __inline__ __m128d __DEFAULT_FN_ATTRS
1449_mm_cvtss_sd(__m128d __a, __m128 __b)
1450{
1451  __a[0] = __b[0];
1452  return __a;
1453}
1454
1455/// Converts the two double-precision floating-point elements of a
1456///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1457///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1458///
1459///    If the result of either conversion is inexact, the result is truncated
1460///    (rounded towards zero) regardless of the current MXCSR setting. The upper
1461///    64 bits of the result vector are set to zero.
1462///
1463/// \headerfile <x86intrin.h>
1464///
1465/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1466///   instruction.
1467///
1468/// \param __a
1469///    A 128-bit vector of [2 x double].
1470/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1471///    converted values. The upper 64 bits are set to zero.
1472static __inline__ __m128i __DEFAULT_FN_ATTRS
1473_mm_cvttpd_epi32(__m128d __a)
1474{
1475  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1476}
1477
1478/// Converts the low-order element of a [2 x double] vector into a 32-bit
1479///    signed integer value, truncating the result when it is inexact.
1480///
1481/// \headerfile <x86intrin.h>
1482///
1483/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1484///   instruction.
1485///
1486/// \param __a
1487///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1488///    conversion.
1489/// \returns A 32-bit signed integer containing the converted value.
1490static __inline__ int __DEFAULT_FN_ATTRS
1491_mm_cvttsd_si32(__m128d __a)
1492{
1493  return __builtin_ia32_cvttsd2si((__v2df)__a);
1494}
1495
1496/// Converts the two double-precision floating-point elements of a
1497///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1498///    returned in a 64-bit vector of [2 x i32].
1499///
1500/// \headerfile <x86intrin.h>
1501///
1502/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1503///
1504/// \param __a
1505///    A 128-bit vector of [2 x double].
1506/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1507static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1508_mm_cvtpd_pi32(__m128d __a)
1509{
1510  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1511}
1512
1513/// Converts the two double-precision floating-point elements of a
1514///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1515///    returned in a 64-bit vector of [2 x i32].
1516///
1517///    If the result of either conversion is inexact, the result is truncated
1518///    (rounded towards zero) regardless of the current MXCSR setting.
1519///
1520/// \headerfile <x86intrin.h>
1521///
1522/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1523///
1524/// \param __a
1525///    A 128-bit vector of [2 x double].
1526/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1527static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1528_mm_cvttpd_pi32(__m128d __a)
1529{
1530  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1531}
1532
1533/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1534///    [2 x i32] into two double-precision floating-point values, returned in a
1535///    128-bit vector of [2 x double].
1536///
1537/// \headerfile <x86intrin.h>
1538///
1539/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1540///
1541/// \param __a
1542///    A 64-bit vector of [2 x i32].
1543/// \returns A 128-bit vector of [2 x double] containing the converted values.
1544static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
1545_mm_cvtpi32_pd(__m64 __a)
1546{
1547  return __builtin_ia32_cvtpi2pd((__v2si)__a);
1548}
1549
1550/// Returns the low-order element of a 128-bit vector of [2 x double] as
1551///    a double-precision floating-point value.
1552///
1553/// \headerfile <x86intrin.h>
1554///
1555/// This intrinsic has no corresponding instruction.
1556///
1557/// \param __a
1558///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1559/// \returns A double-precision floating-point value copied from the lower 64
1560///    bits of \a __a.
1561static __inline__ double __DEFAULT_FN_ATTRS
1562_mm_cvtsd_f64(__m128d __a)
1563{
1564  return __a[0];
1565}
1566
1567/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1568///    memory location.
1569///
1570/// \headerfile <x86intrin.h>
1571///
1572/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1573///
1574/// \param __dp
1575///    A pointer to a 128-bit memory location. The address of the memory
1576///    location has to be 16-byte aligned.
1577/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1578static __inline__ __m128d __DEFAULT_FN_ATTRS
1579_mm_load_pd(double const *__dp)
1580{
1581  return *(const __m128d*)__dp;
1582}
1583
1584/// Loads a double-precision floating-point value from a specified memory
1585///    location and duplicates it to both vector elements of a 128-bit vector of
1586///    [2 x double].
1587///
1588/// \headerfile <x86intrin.h>
1589///
1590/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1591///
1592/// \param __dp
1593///    A pointer to a memory location containing a double-precision value.
1594/// \returns A 128-bit vector of [2 x double] containing the loaded and
1595///    duplicated values.
1596static __inline__ __m128d __DEFAULT_FN_ATTRS
1597_mm_load1_pd(double const *__dp)
1598{
1599  struct __mm_load1_pd_struct {
1600    double __u;
1601  } __attribute__((__packed__, __may_alias__));
1602  double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u;
1603  return __extension__ (__m128d){ __u, __u };
1604}
1605
/* _mm_load_pd1 is an alternate spelling that expands to _mm_load1_pd. */
#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
1607
1608/// Loads two double-precision values, in reverse order, from an aligned
1609///    memory location into a 128-bit vector of [2 x double].
1610///
1611/// \headerfile <x86intrin.h>
1612///
1613/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1614/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1615/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1616///
1617/// \param __dp
1618///    A 16-byte aligned pointer to an array of double-precision values to be
1619///    loaded in reverse order.
1620/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1621///    values.
1622static __inline__ __m128d __DEFAULT_FN_ATTRS
1623_mm_loadr_pd(double const *__dp)
1624{
1625  __m128d __u = *(const __m128d*)__dp;
1626  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1627}
1628
1629/// Loads a 128-bit floating-point vector of [2 x double] from an
1630///    unaligned memory location.
1631///
1632/// \headerfile <x86intrin.h>
1633///
1634/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1635///
1636/// \param __dp
1637///    A pointer to a 128-bit memory location. The address of the memory
1638///    location does not have to be aligned.
1639/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1640static __inline__ __m128d __DEFAULT_FN_ATTRS
1641_mm_loadu_pd(double const *__dp)
1642{
1643  struct __loadu_pd {
1644    __m128d_u __v;
1645  } __attribute__((__packed__, __may_alias__));
1646  return ((const struct __loadu_pd*)__dp)->__v;
1647}
1648
1649/// Loads a 64-bit integer value to the low element of a 128-bit integer
1650///    vector and clears the upper element.
1651///
1652/// \headerfile <x86intrin.h>
1653///
1654/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1655///
1656/// \param __a
1657///    A pointer to a 64-bit memory location. The address of the memory
1658///    location does not have to be aligned.
1659/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1660static __inline__ __m128i __DEFAULT_FN_ATTRS
1661_mm_loadu_si64(void const *__a)
1662{
1663  struct __loadu_si64 {
1664    long long __v;
1665  } __attribute__((__packed__, __may_alias__));
1666  long long __u = ((const struct __loadu_si64*)__a)->__v;
1667  return __extension__ (__m128i)(__v2di){__u, 0LL};
1668}
1669
/// Loads a 32-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
1672///
1673/// \headerfile <x86intrin.h>
1674///
1675/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1676///
1677/// \param __a
1678///    A pointer to a 32-bit memory location. The address of the memory
1679///    location does not have to be aligned.
1680/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1681static __inline__ __m128i __DEFAULT_FN_ATTRS
1682_mm_loadu_si32(void const *__a)
1683{
1684  struct __loadu_si32 {
1685    int __v;
1686  } __attribute__((__packed__, __may_alias__));
1687  int __u = ((const struct __loadu_si32*)__a)->__v;
1688  return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
1689}
1690
/// Loads a 16-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
1693///
1694/// \headerfile <x86intrin.h>
1695///
1696/// This intrinsic does not correspond to a specific instruction.
1697///
1698/// \param __a
1699///    A pointer to a 16-bit memory location. The address of the memory
1700///    location does not have to be aligned.
1701/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1702static __inline__ __m128i __DEFAULT_FN_ATTRS
1703_mm_loadu_si16(void const *__a)
1704{
1705  struct __loadu_si16 {
1706    short __v;
1707  } __attribute__((__packed__, __may_alias__));
1708  short __u = ((const struct __loadu_si16*)__a)->__v;
1709  return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1710}
1711
/// Loads a 64-bit double-precision value to the low element of a
///    128-bit vector of [2 x double] and clears the upper element.
1714///
1715/// \headerfile <x86intrin.h>
1716///
1717/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1718///
1719/// \param __dp
1720///    A pointer to a memory location containing a double-precision value.
1721///    The address of the memory location does not have to be aligned.
1722/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1723static __inline__ __m128d __DEFAULT_FN_ATTRS
1724_mm_load_sd(double const *__dp)
1725{
1726  struct __mm_load_sd_struct {
1727    double __u;
1728  } __attribute__((__packed__, __may_alias__));
1729  double __u = ((const struct __mm_load_sd_struct*)__dp)->__u;
1730  return __extension__ (__m128d){ __u, 0 };
1731}
1732
1733/// Loads a double-precision value into the high-order bits of a 128-bit
1734///    vector of [2 x double]. The low-order bits are copied from the low-order
1735///    bits of the first operand.
1736///
1737/// \headerfile <x86intrin.h>
1738///
1739/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1740///
1741/// \param __a
1742///    A 128-bit vector of [2 x double]. \n
1743///    Bits [63:0] are written to bits [63:0] of the result.
1744/// \param __dp
1745///    A pointer to a 64-bit memory location containing a double-precision
1746///    floating-point value that is loaded. The loaded value is written to bits
1747///    [127:64] of the result. The address of the memory location does not have
1748///    to be aligned.
1749/// \returns A 128-bit vector of [2 x double] containing the moved values.
1750static __inline__ __m128d __DEFAULT_FN_ATTRS
1751_mm_loadh_pd(__m128d __a, double const *__dp)
1752{
1753  struct __mm_loadh_pd_struct {
1754    double __u;
1755  } __attribute__((__packed__, __may_alias__));
1756  double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u;
1757  return __extension__ (__m128d){ __a[0], __u };
1758}
1759
1760/// Loads a double-precision value into the low-order bits of a 128-bit
1761///    vector of [2 x double]. The high-order bits are copied from the
1762///    high-order bits of the first operand.
1763///
1764/// \headerfile <x86intrin.h>
1765///
1766/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1767///
1768/// \param __a
1769///    A 128-bit vector of [2 x double]. \n
1770///    Bits [127:64] are written to bits [127:64] of the result.
1771/// \param __dp
1772///    A pointer to a 64-bit memory location containing a double-precision
1773///    floating-point value that is loaded. The loaded value is written to bits
1774///    [63:0] of the result. The address of the memory location does not have to
1775///    be aligned.
1776/// \returns A 128-bit vector of [2 x double] containing the moved values.
1777static __inline__ __m128d __DEFAULT_FN_ATTRS
1778_mm_loadl_pd(__m128d __a, double const *__dp)
1779{
1780  struct __mm_loadl_pd_struct {
1781    double __u;
1782  } __attribute__((__packed__, __may_alias__));
1783  double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u;
1784  return __extension__ (__m128d){ __u, __a[1] };
1785}
1786
1787/// Constructs a 128-bit floating-point vector of [2 x double] with
1788///    unspecified content. This could be used as an argument to another
1789///    intrinsic function where the argument is required but the value is not
1790///    actually used.
1791///
1792/// \headerfile <x86intrin.h>
1793///
1794/// This intrinsic has no corresponding instruction.
1795///
1796/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1797///    content.
1798static __inline__ __m128d __DEFAULT_FN_ATTRS
1799_mm_undefined_pd(void)
1800{
1801  return (__m128d)__builtin_ia32_undef128();
1802}
1803
1804/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1805///    64 bits of the vector are initialized with the specified double-precision
1806///    floating-point value. The upper 64 bits are set to zero.
1807///
1808/// \headerfile <x86intrin.h>
1809///
1810/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1811///
1812/// \param __w
1813///    A double-precision floating-point value used to initialize the lower 64
1814///    bits of the result.
1815/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1816///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1817///    set to zero.
1818static __inline__ __m128d __DEFAULT_FN_ATTRS
1819_mm_set_sd(double __w)
1820{
1821  return __extension__ (__m128d){ __w, 0 };
1822}
1823
1824/// Constructs a 128-bit floating-point vector of [2 x double], with each
1825///    of the two double-precision floating-point vector elements set to the
1826///    specified double-precision floating-point value.
1827///
1828/// \headerfile <x86intrin.h>
1829///
1830/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1831///
1832/// \param __w
1833///    A double-precision floating-point value used to initialize each vector
1834///    element of the result.
1835/// \returns An initialized 128-bit floating-point vector of [2 x double].
1836static __inline__ __m128d __DEFAULT_FN_ATTRS
1837_mm_set1_pd(double __w)
1838{
1839  return __extension__ (__m128d){ __w, __w };
1840}
1841
1842/// Constructs a 128-bit floating-point vector of [2 x double], with each
1843///    of the two double-precision floating-point vector elements set to the
1844///    specified double-precision floating-point value.
1845///
1846/// \headerfile <x86intrin.h>
1847///
1848/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1849///
1850/// \param __w
1851///    A double-precision floating-point value used to initialize each vector
1852///    element of the result.
1853/// \returns An initialized 128-bit floating-point vector of [2 x double].
1854static __inline__ __m128d __DEFAULT_FN_ATTRS
1855_mm_set_pd1(double __w)
1856{
1857  return _mm_set1_pd(__w);
1858}
1859
1860/// Constructs a 128-bit floating-point vector of [2 x double]
1861///    initialized with the specified double-precision floating-point values.
1862///
1863/// \headerfile <x86intrin.h>
1864///
1865/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1866///
1867/// \param __w
1868///    A double-precision floating-point value used to initialize the upper 64
1869///    bits of the result.
1870/// \param __x
1871///    A double-precision floating-point value used to initialize the lower 64
1872///    bits of the result.
1873/// \returns An initialized 128-bit floating-point vector of [2 x double].
1874static __inline__ __m128d __DEFAULT_FN_ATTRS
1875_mm_set_pd(double __w, double __x)
1876{
1877  return __extension__ (__m128d){ __x, __w };
1878}
1879
1880/// Constructs a 128-bit floating-point vector of [2 x double],
1881///    initialized in reverse order with the specified double-precision
1882///    floating-point values.
1883///
1884/// \headerfile <x86intrin.h>
1885///
1886/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1887///
1888/// \param __w
1889///    A double-precision floating-point value used to initialize the lower 64
1890///    bits of the result.
1891/// \param __x
1892///    A double-precision floating-point value used to initialize the upper 64
1893///    bits of the result.
1894/// \returns An initialized 128-bit floating-point vector of [2 x double].
1895static __inline__ __m128d __DEFAULT_FN_ATTRS
1896_mm_setr_pd(double __w, double __x)
1897{
1898  return __extension__ (__m128d){ __w, __x };
1899}
1900
1901/// Constructs a 128-bit floating-point vector of [2 x double]
1902///    initialized to zero.
1903///
1904/// \headerfile <x86intrin.h>
1905///
1906/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1907///
1908/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1909///    all elements set to zero.
1910static __inline__ __m128d __DEFAULT_FN_ATTRS
1911_mm_setzero_pd(void)
1912{
1913  return __extension__ (__m128d){ 0, 0 };
1914}
1915
1916/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1917///    64 bits are set to the lower 64 bits of the second parameter. The upper
1918///    64 bits are set to the upper 64 bits of the first parameter.
1919///
1920/// \headerfile <x86intrin.h>
1921///
1922/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1923///
1924/// \param __a
1925///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1926///    upper 64 bits of the result.
1927/// \param __b
1928///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1929///    lower 64 bits of the result.
1930/// \returns A 128-bit vector of [2 x double] containing the moved values.
1931static __inline__ __m128d __DEFAULT_FN_ATTRS
1932_mm_move_sd(__m128d __a, __m128d __b)
1933{
1934  __a[0] = __b[0];
1935  return __a;
1936}
1937
1938/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1939///    memory location.
1940///
1941/// \headerfile <x86intrin.h>
1942///
1943/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1944///
1945/// \param __dp
1946///    A pointer to a 64-bit memory location.
1947/// \param __a
1948///    A 128-bit vector of [2 x double] containing the value to be stored.
1949static __inline__ void __DEFAULT_FN_ATTRS
1950_mm_store_sd(double *__dp, __m128d __a)
1951{
1952  struct __mm_store_sd_struct {
1953    double __u;
1954  } __attribute__((__packed__, __may_alias__));
1955  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
1956}
1957
1958/// Moves packed double-precision values from a 128-bit vector of
1959///    [2 x double] to a memory location.
1960///
1961/// \headerfile <x86intrin.h>
1962///
1963/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1964///
1965/// \param __dp
1966///    A pointer to an aligned memory location that can store two
1967///    double-precision values.
1968/// \param __a
1969///    A packed 128-bit vector of [2 x double] containing the values to be
1970///    moved.
1971static __inline__ void __DEFAULT_FN_ATTRS
1972_mm_store_pd(double *__dp, __m128d __a)
1973{
1974  *(__m128d*)__dp = __a;
1975}
1976
1977/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1978///    the upper and lower 64 bits of a memory location.
1979///
1980/// \headerfile <x86intrin.h>
1981///
1982/// This intrinsic corresponds to the
1983///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1984///
1985/// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
1988/// \param __a
1989///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1990///    of the values in \a __dp.
1991static __inline__ void __DEFAULT_FN_ATTRS
1992_mm_store1_pd(double *__dp, __m128d __a)
1993{
1994  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1995  _mm_store_pd(__dp, __a);
1996}
1997
1998/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1999///    the upper and lower 64 bits of a memory location.
2000///
2001/// \headerfile <x86intrin.h>
2002///
2003/// This intrinsic corresponds to the
2004///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
2005///
2006/// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
2009/// \param __a
2010///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
2011///    of the values in \a __dp.
2012static __inline__ void __DEFAULT_FN_ATTRS
2013_mm_store_pd1(double *__dp, __m128d __a)
2014{
2015  _mm_store1_pd(__dp, __a);
2016}
2017
2018/// Stores a 128-bit vector of [2 x double] into an unaligned memory
2019///    location.
2020///
2021/// \headerfile <x86intrin.h>
2022///
2023/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
2024///
2025/// \param __dp
2026///    A pointer to a 128-bit memory location. The address of the memory
2027///    location does not have to be aligned.
2028/// \param __a
2029///    A 128-bit vector of [2 x double] containing the values to be stored.
2030static __inline__ void __DEFAULT_FN_ATTRS
2031_mm_storeu_pd(double *__dp, __m128d __a)
2032{
2033  struct __storeu_pd {
2034    __m128d_u __v;
2035  } __attribute__((__packed__, __may_alias__));
2036  ((struct __storeu_pd*)__dp)->__v = __a;
2037}
2038
2039/// Stores two double-precision values, in reverse order, from a 128-bit
2040///    vector of [2 x double] to a 16-byte aligned memory location.
2041///
2042/// \headerfile <x86intrin.h>
2043///
2044/// This intrinsic corresponds to a shuffling instruction followed by a
2045/// <c> VMOVAPD / MOVAPD </c> instruction.
2046///
2047/// \param __dp
2048///    A pointer to a 16-byte aligned memory location that can store two
2049///    double-precision values.
2050/// \param __a
2051///    A 128-bit vector of [2 x double] containing the values to be reversed and
2052///    stored.
2053static __inline__ void __DEFAULT_FN_ATTRS
2054_mm_storer_pd(double *__dp, __m128d __a)
2055{
2056  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2057  *(__m128d *)__dp = __a;
2058}
2059
2060/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2061///    memory location.
2062///
2063/// \headerfile <x86intrin.h>
2064///
2065/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2066///
2067/// \param __dp
2068///    A pointer to a 64-bit memory location.
2069/// \param __a
2070///    A 128-bit vector of [2 x double] containing the value to be stored.
2071static __inline__ void __DEFAULT_FN_ATTRS
2072_mm_storeh_pd(double *__dp, __m128d __a)
2073{
2074  struct __mm_storeh_pd_struct {
2075    double __u;
2076  } __attribute__((__packed__, __may_alias__));
2077  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
2078}
2079
2080/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2081///    memory location.
2082///
2083/// \headerfile <x86intrin.h>
2084///
2085/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2086///
2087/// \param __dp
2088///    A pointer to a 64-bit memory location.
2089/// \param __a
2090///    A 128-bit vector of [2 x double] containing the value to be stored.
2091static __inline__ void __DEFAULT_FN_ATTRS
2092_mm_storel_pd(double *__dp, __m128d __a)
2093{
2094  struct __mm_storeh_pd_struct {
2095    double __u;
2096  } __attribute__((__packed__, __may_alias__));
2097  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
2098}
2099
2100/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2101///    saving the lower 8 bits of each sum in the corresponding element of a
2102///    128-bit result vector of [16 x i8].
2103///
2104///    The integer elements of both parameters can be either signed or unsigned.
2105///
2106/// \headerfile <x86intrin.h>
2107///
2108/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2109///
2110/// \param __a
2111///    A 128-bit vector of [16 x i8].
2112/// \param __b
2113///    A 128-bit vector of [16 x i8].
2114/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2115///    parameters.
2116static __inline__ __m128i __DEFAULT_FN_ATTRS
2117_mm_add_epi8(__m128i __a, __m128i __b)
2118{
2119  return (__m128i)((__v16qu)__a + (__v16qu)__b);
2120}
2121
2122/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2123///    saving the lower 16 bits of each sum in the corresponding element of a
2124///    128-bit result vector of [8 x i16].
2125///
2126///    The integer elements of both parameters can be either signed or unsigned.
2127///
2128/// \headerfile <x86intrin.h>
2129///
2130/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2131///
2132/// \param __a
2133///    A 128-bit vector of [8 x i16].
2134/// \param __b
2135///    A 128-bit vector of [8 x i16].
2136/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2137///    parameters.
2138static __inline__ __m128i __DEFAULT_FN_ATTRS
2139_mm_add_epi16(__m128i __a, __m128i __b)
2140{
2141  return (__m128i)((__v8hu)__a + (__v8hu)__b);
2142}
2143
2144/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2145///    saving the lower 32 bits of each sum in the corresponding element of a
2146///    128-bit result vector of [4 x i32].
2147///
2148///    The integer elements of both parameters can be either signed or unsigned.
2149///
2150/// \headerfile <x86intrin.h>
2151///
2152/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2153///
2154/// \param __a
2155///    A 128-bit vector of [4 x i32].
2156/// \param __b
2157///    A 128-bit vector of [4 x i32].
2158/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2159///    parameters.
2160static __inline__ __m128i __DEFAULT_FN_ATTRS
2161_mm_add_epi32(__m128i __a, __m128i __b)
2162{
2163  return (__m128i)((__v4su)__a + (__v4su)__b);
2164}
2165
2166/// Adds two signed or unsigned 64-bit integer values, returning the
2167///    lower 64 bits of the sum.
2168///
2169/// \headerfile <x86intrin.h>
2170///
2171/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2172///
2173/// \param __a
2174///    A 64-bit integer.
2175/// \param __b
2176///    A 64-bit integer.
2177/// \returns A 64-bit integer containing the sum of both parameters.
2178static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2179_mm_add_si64(__m64 __a, __m64 __b)
2180{
2181  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2182}
2183
2184/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2185///    saving the lower 64 bits of each sum in the corresponding element of a
2186///    128-bit result vector of [2 x i64].
2187///
2188///    The integer elements of both parameters can be either signed or unsigned.
2189///
2190/// \headerfile <x86intrin.h>
2191///
2192/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2193///
2194/// \param __a
2195///    A 128-bit vector of [2 x i64].
2196/// \param __b
2197///    A 128-bit vector of [2 x i64].
2198/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2199///    parameters.
2200static __inline__ __m128i __DEFAULT_FN_ATTRS
2201_mm_add_epi64(__m128i __a, __m128i __b)
2202{
2203  return (__m128i)((__v2du)__a + (__v2du)__b);
2204}
2205
2206/// Adds, with saturation, the corresponding elements of two 128-bit
2207///    signed [16 x i8] vectors, saving each sum in the corresponding element of
2208///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2209///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2210///
2211/// \headerfile <x86intrin.h>
2212///
2213/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2214///
2215/// \param __a
2216///    A 128-bit signed [16 x i8] vector.
2217/// \param __b
2218///    A 128-bit signed [16 x i8] vector.
2219/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2220///    both parameters.
2221static __inline__ __m128i __DEFAULT_FN_ATTRS
2222_mm_adds_epi8(__m128i __a, __m128i __b)
2223{
2224  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
2225}
2226
2227/// Adds, with saturation, the corresponding elements of two 128-bit
2228///    signed [8 x i16] vectors, saving each sum in the corresponding element of
2229///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2230///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2231///    0x8000.
2232///
2233/// \headerfile <x86intrin.h>
2234///
2235/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2236///
2237/// \param __a
2238///    A 128-bit signed [8 x i16] vector.
2239/// \param __b
2240///    A 128-bit signed [8 x i16] vector.
2241/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2242///    both parameters.
2243static __inline__ __m128i __DEFAULT_FN_ATTRS
2244_mm_adds_epi16(__m128i __a, __m128i __b)
2245{
2246  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
2247}
2248
2249/// Adds, with saturation, the corresponding elements of two 128-bit
2250///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
2251///    of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2252///    are saturated to 0xFF. Negative sums are saturated to 0x00.
2253///
2254/// \headerfile <x86intrin.h>
2255///
2256/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2257///
2258/// \param __a
2259///    A 128-bit unsigned [16 x i8] vector.
2260/// \param __b
2261///    A 128-bit unsigned [16 x i8] vector.
2262/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2263///    of both parameters.
2264static __inline__ __m128i __DEFAULT_FN_ATTRS
2265_mm_adds_epu8(__m128i __a, __m128i __b)
2266{
2267  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
2268}
2269
2270/// Adds, with saturation, the corresponding elements of two 128-bit
2271///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
2272///    of a 128-bit result vector of [8 x i16]. Positive sums greater than
2273///    0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2274///
2275/// \headerfile <x86intrin.h>
2276///
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2278///
2279/// \param __a
2280///    A 128-bit unsigned [8 x i16] vector.
2281/// \param __b
2282///    A 128-bit unsigned [8 x i16] vector.
2283/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2284///    of both parameters.
2285static __inline__ __m128i __DEFAULT_FN_ATTRS
2286_mm_adds_epu16(__m128i __a, __m128i __b)
2287{
2288  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
2289}
2290
2291/// Computes the rounded averages of corresponding elements of two
2292///    128-bit unsigned [16 x i8] vectors, saving each result in the
2293///    corresponding element of a 128-bit result vector of [16 x i8].
2294///
2295/// \headerfile <x86intrin.h>
2296///
2297/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2298///
2299/// \param __a
2300///    A 128-bit unsigned [16 x i8] vector.
2301/// \param __b
2302///    A 128-bit unsigned [16 x i8] vector.
2303/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2304///    averages of both parameters.
2305static __inline__ __m128i __DEFAULT_FN_ATTRS
2306_mm_avg_epu8(__m128i __a, __m128i __b)
2307{
2308  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2309}
2310
2311/// Computes the rounded averages of corresponding elements of two
2312///    128-bit unsigned [8 x i16] vectors, saving each result in the
2313///    corresponding element of a 128-bit result vector of [8 x i16].
2314///
2315/// \headerfile <x86intrin.h>
2316///
2317/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2318///
2319/// \param __a
2320///    A 128-bit unsigned [8 x i16] vector.
2321/// \param __b
2322///    A 128-bit unsigned [8 x i16] vector.
2323/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2324///    averages of both parameters.
2325static __inline__ __m128i __DEFAULT_FN_ATTRS
2326_mm_avg_epu16(__m128i __a, __m128i __b)
2327{
2328  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2329}
2330
2331/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2332///    vectors, producing eight intermediate 32-bit signed integer products, and
2333///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2334///    [4 x i32] vector.
2335///
2336///    For example, bits [15:0] of both parameters are multiplied producing a
2337///    32-bit product, bits [31:16] of both parameters are multiplied producing
2338///    a 32-bit product, and the sum of those two products becomes bits [31:0]
2339///    of the result.
2340///
2341/// \headerfile <x86intrin.h>
2342///
2343/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2344///
2345/// \param __a
2346///    A 128-bit signed [8 x i16] vector.
2347/// \param __b
2348///    A 128-bit signed [8 x i16] vector.
2349/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2350///    of both parameters.
2351static __inline__ __m128i __DEFAULT_FN_ATTRS
2352_mm_madd_epi16(__m128i __a, __m128i __b)
2353{
2354  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2355}
2356
2357/// Compares corresponding elements of two 128-bit signed [8 x i16]
2358///    vectors, saving the greater value from each comparison in the
2359///    corresponding element of a 128-bit result vector of [8 x i16].
2360///
2361/// \headerfile <x86intrin.h>
2362///
2363/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2364///
2365/// \param __a
2366///    A 128-bit signed [8 x i16] vector.
2367/// \param __b
2368///    A 128-bit signed [8 x i16] vector.
2369/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2370///    each comparison.
2371static __inline__ __m128i __DEFAULT_FN_ATTRS
2372_mm_max_epi16(__m128i __a, __m128i __b)
2373{
2374  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
2375}
2376
2377/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2378///    vectors, saving the greater value from each comparison in the
2379///    corresponding element of a 128-bit result vector of [16 x i8].
2380///
2381/// \headerfile <x86intrin.h>
2382///
2383/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2384///
2385/// \param __a
2386///    A 128-bit unsigned [16 x i8] vector.
2387/// \param __b
2388///    A 128-bit unsigned [16 x i8] vector.
2389/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2390///    each comparison.
2391static __inline__ __m128i __DEFAULT_FN_ATTRS
2392_mm_max_epu8(__m128i __a, __m128i __b)
2393{
2394  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
2395}
2396
2397/// Compares corresponding elements of two 128-bit signed [8 x i16]
2398///    vectors, saving the smaller value from each comparison in the
2399///    corresponding element of a 128-bit result vector of [8 x i16].
2400///
2401/// \headerfile <x86intrin.h>
2402///
2403/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2404///
2405/// \param __a
2406///    A 128-bit signed [8 x i16] vector.
2407/// \param __b
2408///    A 128-bit signed [8 x i16] vector.
2409/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2410///    each comparison.
2411static __inline__ __m128i __DEFAULT_FN_ATTRS
2412_mm_min_epi16(__m128i __a, __m128i __b)
2413{
2414  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
2415}
2416
2417/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2418///    vectors, saving the smaller value from each comparison in the
2419///    corresponding element of a 128-bit result vector of [16 x i8].
2420///
2421/// \headerfile <x86intrin.h>
2422///
2423/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2424///
2425/// \param __a
2426///    A 128-bit unsigned [16 x i8] vector.
2427/// \param __b
2428///    A 128-bit unsigned [16 x i8] vector.
2429/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2430///    each comparison.
2431static __inline__ __m128i __DEFAULT_FN_ATTRS
2432_mm_min_epu8(__m128i __a, __m128i __b)
2433{
2434  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
2435}
2436
2437/// Multiplies the corresponding elements of two signed [8 x i16]
2438///    vectors, saving the upper 16 bits of each 32-bit product in the
2439///    corresponding element of a 128-bit signed [8 x i16] result vector.
2440///
2441/// \headerfile <x86intrin.h>
2442///
2443/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2444///
2445/// \param __a
2446///    A 128-bit signed [8 x i16] vector.
2447/// \param __b
2448///    A 128-bit signed [8 x i16] vector.
2449/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2450///    each of the eight 32-bit products.
2451static __inline__ __m128i __DEFAULT_FN_ATTRS
2452_mm_mulhi_epi16(__m128i __a, __m128i __b)
2453{
2454  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2455}
2456
2457/// Multiplies the corresponding elements of two unsigned [8 x i16]
2458///    vectors, saving the upper 16 bits of each 32-bit product in the
2459///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2460///
2461/// \headerfile <x86intrin.h>
2462///
2463/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2464///
2465/// \param __a
2466///    A 128-bit unsigned [8 x i16] vector.
2467/// \param __b
2468///    A 128-bit unsigned [8 x i16] vector.
2469/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2470///    of each of the eight 32-bit products.
2471static __inline__ __m128i __DEFAULT_FN_ATTRS
2472_mm_mulhi_epu16(__m128i __a, __m128i __b)
2473{
2474  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2475}
2476
2477/// Multiplies the corresponding elements of two signed [8 x i16]
2478///    vectors, saving the lower 16 bits of each 32-bit product in the
2479///    corresponding element of a 128-bit signed [8 x i16] result vector.
2480///
2481/// \headerfile <x86intrin.h>
2482///
2483/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2484///
2485/// \param __a
2486///    A 128-bit signed [8 x i16] vector.
2487/// \param __b
2488///    A 128-bit signed [8 x i16] vector.
2489/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2490///    each of the eight 32-bit products.
2491static __inline__ __m128i __DEFAULT_FN_ATTRS
2492_mm_mullo_epi16(__m128i __a, __m128i __b)
2493{
2494  return (__m128i)((__v8hu)__a * (__v8hu)__b);
2495}
2496
2497/// Multiplies 32-bit unsigned integer values contained in the lower bits
2498///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2499///    product.
2500///
2501/// \headerfile <x86intrin.h>
2502///
2503/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2504///
2505/// \param __a
2506///    A 64-bit integer containing one of the source operands.
2507/// \param __b
2508///    A 64-bit integer containing one of the source operands.
2509/// \returns A 64-bit integer vector containing the product of both operands.
2510static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2511_mm_mul_su32(__m64 __a, __m64 __b)
2512{
2513  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2514}
2515
2516/// Multiplies 32-bit unsigned integer values contained in the lower
2517///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2518///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2519///
2520/// \headerfile <x86intrin.h>
2521///
2522/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2523///
2524/// \param __a
2525///    A [2 x i64] vector containing one of the source operands.
2526/// \param __b
2527///    A [2 x i64] vector containing one of the source operands.
2528/// \returns A [2 x i64] vector containing the product of both operands.
2529static __inline__ __m128i __DEFAULT_FN_ATTRS
2530_mm_mul_epu32(__m128i __a, __m128i __b)
2531{
2532  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2533}
2534
2535/// Computes the absolute differences of corresponding 8-bit integer
2536///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2537///    separately sums the second 8 absolute differences. Packs these two
2538///    unsigned 16-bit integer sums into the upper and lower elements of a
2539///    [2 x i64] vector.
2540///
2541/// \headerfile <x86intrin.h>
2542///
2543/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2544///
2545/// \param __a
2546///    A 128-bit integer vector containing one of the source operands.
2547/// \param __b
2548///    A 128-bit integer vector containing one of the source operands.
2549/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2550///    differences between both operands.
2551static __inline__ __m128i __DEFAULT_FN_ATTRS
2552_mm_sad_epu8(__m128i __a, __m128i __b)
2553{
2554  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2555}
2556
2557/// Subtracts the corresponding 8-bit integer values in the operands.
2558///
2559/// \headerfile <x86intrin.h>
2560///
2561/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2562///
2563/// \param __a
2564///    A 128-bit integer vector containing the minuends.
2565/// \param __b
2566///    A 128-bit integer vector containing the subtrahends.
2567/// \returns A 128-bit integer vector containing the differences of the values
2568///    in the operands.
2569static __inline__ __m128i __DEFAULT_FN_ATTRS
2570_mm_sub_epi8(__m128i __a, __m128i __b)
2571{
2572  return (__m128i)((__v16qu)__a - (__v16qu)__b);
2573}
2574
2575/// Subtracts the corresponding 16-bit integer values in the operands.
2576///
2577/// \headerfile <x86intrin.h>
2578///
2579/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2580///
2581/// \param __a
2582///    A 128-bit integer vector containing the minuends.
2583/// \param __b
2584///    A 128-bit integer vector containing the subtrahends.
2585/// \returns A 128-bit integer vector containing the differences of the values
2586///    in the operands.
2587static __inline__ __m128i __DEFAULT_FN_ATTRS
2588_mm_sub_epi16(__m128i __a, __m128i __b)
2589{
2590  return (__m128i)((__v8hu)__a - (__v8hu)__b);
2591}
2592
2593/// Subtracts the corresponding 32-bit integer values in the operands.
2594///
2595/// \headerfile <x86intrin.h>
2596///
2597/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2598///
2599/// \param __a
2600///    A 128-bit integer vector containing the minuends.
2601/// \param __b
2602///    A 128-bit integer vector containing the subtrahends.
2603/// \returns A 128-bit integer vector containing the differences of the values
2604///    in the operands.
2605static __inline__ __m128i __DEFAULT_FN_ATTRS
2606_mm_sub_epi32(__m128i __a, __m128i __b)
2607{
2608  return (__m128i)((__v4su)__a - (__v4su)__b);
2609}
2610
2611/// Subtracts signed or unsigned 64-bit integer values and writes the
2612///    difference to the corresponding bits in the destination.
2613///
2614/// \headerfile <x86intrin.h>
2615///
2616/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2617///
2618/// \param __a
2619///    A 64-bit integer vector containing the minuend.
2620/// \param __b
2621///    A 64-bit integer vector containing the subtrahend.
2622/// \returns A 64-bit integer vector containing the difference of the values in
2623///    the operands.
2624static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2625_mm_sub_si64(__m64 __a, __m64 __b)
2626{
2627  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2628}
2629
2630/// Subtracts the corresponding elements of two [2 x i64] vectors.
2631///
2632/// \headerfile <x86intrin.h>
2633///
2634/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2635///
2636/// \param __a
2637///    A 128-bit integer vector containing the minuends.
2638/// \param __b
2639///    A 128-bit integer vector containing the subtrahends.
2640/// \returns A 128-bit integer vector containing the differences of the values
2641///    in the operands.
2642static __inline__ __m128i __DEFAULT_FN_ATTRS
2643_mm_sub_epi64(__m128i __a, __m128i __b)
2644{
2645  return (__m128i)((__v2du)__a - (__v2du)__b);
2646}
2647
2648/// Subtracts corresponding 8-bit signed integer values in the input and
2649///    returns the differences in the corresponding bytes in the destination.
2650///    Differences greater than 0x7F are saturated to 0x7F, and differences less
2651///    than 0x80 are saturated to 0x80.
2652///
2653/// \headerfile <x86intrin.h>
2654///
2655/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2656///
2657/// \param __a
2658///    A 128-bit integer vector containing the minuends.
2659/// \param __b
2660///    A 128-bit integer vector containing the subtrahends.
2661/// \returns A 128-bit integer vector containing the differences of the values
2662///    in the operands.
2663static __inline__ __m128i __DEFAULT_FN_ATTRS
2664_mm_subs_epi8(__m128i __a, __m128i __b)
2665{
2666  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
2667}
2668
2669/// Subtracts corresponding 16-bit signed integer values in the input and
///    returns the differences in the corresponding 16-bit elements of the
///    destination. Differences greater than 0x7FFF are saturated to 0x7FFF,
///    and differences less than 0x8000 are saturated to 0x8000.
2673///
2674/// \headerfile <x86intrin.h>
2675///
2676/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2677///
2678/// \param __a
2679///    A 128-bit integer vector containing the minuends.
2680/// \param __b
2681///    A 128-bit integer vector containing the subtrahends.
2682/// \returns A 128-bit integer vector containing the differences of the values
2683///    in the operands.
2684static __inline__ __m128i __DEFAULT_FN_ATTRS
2685_mm_subs_epi16(__m128i __a, __m128i __b)
2686{
2687  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
2688}
2689
2690/// Subtracts corresponding 8-bit unsigned integer values in the input
2691///    and returns the differences in the corresponding bytes in the
2692///    destination. Differences less than 0x00 are saturated to 0x00.
2693///
2694/// \headerfile <x86intrin.h>
2695///
2696/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2697///
2698/// \param __a
2699///    A 128-bit integer vector containing the minuends.
2700/// \param __b
2701///    A 128-bit integer vector containing the subtrahends.
2702/// \returns A 128-bit integer vector containing the unsigned integer
2703///    differences of the values in the operands.
2704static __inline__ __m128i __DEFAULT_FN_ATTRS
2705_mm_subs_epu8(__m128i __a, __m128i __b)
2706{
2707  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
2708}
2709
2710/// Subtracts corresponding 16-bit unsigned integer values in the input
///    and returns the differences in the corresponding 16-bit elements of the
///    destination. Differences less than 0x0000 are saturated to 0x0000.
2713///
2714/// \headerfile <x86intrin.h>
2715///
2716/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2717///
2718/// \param __a
2719///    A 128-bit integer vector containing the minuends.
2720/// \param __b
2721///    A 128-bit integer vector containing the subtrahends.
2722/// \returns A 128-bit integer vector containing the unsigned integer
2723///    differences of the values in the operands.
2724static __inline__ __m128i __DEFAULT_FN_ATTRS
2725_mm_subs_epu16(__m128i __a, __m128i __b)
2726{
2727  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
2728}
2729
2730/// Performs a bitwise AND of two 128-bit integer vectors.
2731///
2732/// \headerfile <x86intrin.h>
2733///
2734/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2735///
2736/// \param __a
2737///    A 128-bit integer vector containing one of the source operands.
2738/// \param __b
2739///    A 128-bit integer vector containing one of the source operands.
2740/// \returns A 128-bit integer vector containing the bitwise AND of the values
2741///    in both operands.
2742static __inline__ __m128i __DEFAULT_FN_ATTRS
2743_mm_and_si128(__m128i __a, __m128i __b)
2744{
2745  return (__m128i)((__v2du)__a & (__v2du)__b);
2746}
2747
2748/// Performs a bitwise AND of two 128-bit integer vectors, using the
2749///    one's complement of the values contained in the first source operand.
2750///
2751/// \headerfile <x86intrin.h>
2752///
2753/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2754///
2755/// \param __a
2756///    A 128-bit vector containing the left source operand. The one's complement
2757///    of this value is used in the bitwise AND.
2758/// \param __b
2759///    A 128-bit vector containing the right source operand.
2760/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2761///    complement of the first operand and the values in the second operand.
2762static __inline__ __m128i __DEFAULT_FN_ATTRS
2763_mm_andnot_si128(__m128i __a, __m128i __b)
2764{
2765  return (__m128i)(~(__v2du)__a & (__v2du)__b);
2766}
2767/// Performs a bitwise OR of two 128-bit integer vectors.
2768///
2769/// \headerfile <x86intrin.h>
2770///
2771/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2772///
2773/// \param __a
2774///    A 128-bit integer vector containing one of the source operands.
2775/// \param __b
2776///    A 128-bit integer vector containing one of the source operands.
2777/// \returns A 128-bit integer vector containing the bitwise OR of the values
2778///    in both operands.
2779static __inline__ __m128i __DEFAULT_FN_ATTRS
2780_mm_or_si128(__m128i __a, __m128i __b)
2781{
2782  return (__m128i)((__v2du)__a | (__v2du)__b);
2783}
2784
2785/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2786///
2787/// \headerfile <x86intrin.h>
2788///
2789/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2790///
2791/// \param __a
2792///    A 128-bit integer vector containing one of the source operands.
2793/// \param __b
2794///    A 128-bit integer vector containing one of the source operands.
2795/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2796///    values in both operands.
2797static __inline__ __m128i __DEFAULT_FN_ATTRS
2798_mm_xor_si128(__m128i __a, __m128i __b)
2799{
2800  return (__m128i)((__v2du)__a ^ (__v2du)__b);
2801}
2802
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
/* The expansion is wrapped in parentheses so that the cast result, not the
 * builtin call, is what any surrounding postfix operator applies to. */
#define _mm_slli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
2822
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. This macro expands to the same builtin call as
///    _mm_slli_si128.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
/* The expansion is wrapped in parentheses so that the cast result, not the
 * builtin call, is what any surrounding postfix operator applies to. */
#define _mm_bslli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
2825
2826/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2827///    by the specified number of bits. Low-order bits are cleared.
2828///
2829/// \headerfile <x86intrin.h>
2830///
2831/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2832///
2833/// \param __a
2834///    A 128-bit integer vector containing the source operand.
2835/// \param __count
2836///    An integer value specifying the number of bits to left-shift each value
2837///    in operand \a __a.
2838/// \returns A 128-bit integer vector containing the left-shifted values.
2839static __inline__ __m128i __DEFAULT_FN_ATTRS
2840_mm_slli_epi16(__m128i __a, int __count)
2841{
2842  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2843}
2844
2845/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2846///    by the specified number of bits. Low-order bits are cleared.
2847///
2848/// \headerfile <x86intrin.h>
2849///
2850/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2851///
2852/// \param __a
2853///    A 128-bit integer vector containing the source operand.
2854/// \param __count
2855///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2856///    to left-shift each value in operand \a __a.
2857/// \returns A 128-bit integer vector containing the left-shifted values.
2858static __inline__ __m128i __DEFAULT_FN_ATTRS
2859_mm_sll_epi16(__m128i __a, __m128i __count)
2860{
2861  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2862}
2863
2864/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2865///    by the specified number of bits. Low-order bits are cleared.
2866///
2867/// \headerfile <x86intrin.h>
2868///
2869/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2870///
2871/// \param __a
2872///    A 128-bit integer vector containing the source operand.
2873/// \param __count
2874///    An integer value specifying the number of bits to left-shift each value
2875///    in operand \a __a.
2876/// \returns A 128-bit integer vector containing the left-shifted values.
2877static __inline__ __m128i __DEFAULT_FN_ATTRS
2878_mm_slli_epi32(__m128i __a, int __count)
2879{
2880  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2881}
2882
2883/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2884///    by the specified number of bits. Low-order bits are cleared.
2885///
2886/// \headerfile <x86intrin.h>
2887///
2888/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2889///
2890/// \param __a
2891///    A 128-bit integer vector containing the source operand.
2892/// \param __count
2893///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2894///    to left-shift each value in operand \a __a.
2895/// \returns A 128-bit integer vector containing the left-shifted values.
2896static __inline__ __m128i __DEFAULT_FN_ATTRS
2897_mm_sll_epi32(__m128i __a, __m128i __count)
2898{
2899  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2900}
2901
2902/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2903///    by the specified number of bits. Low-order bits are cleared.
2904///
2905/// \headerfile <x86intrin.h>
2906///
2907/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2908///
2909/// \param __a
2910///    A 128-bit integer vector containing the source operand.
2911/// \param __count
2912///    An integer value specifying the number of bits to left-shift each value
2913///    in operand \a __a.
2914/// \returns A 128-bit integer vector containing the left-shifted values.
2915static __inline__ __m128i __DEFAULT_FN_ATTRS
2916_mm_slli_epi64(__m128i __a, int __count)
2917{
2918  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2919}
2920
2921/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2922///    by the specified number of bits. Low-order bits are cleared.
2923///
2924/// \headerfile <x86intrin.h>
2925///
2926/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2927///
2928/// \param __a
2929///    A 128-bit integer vector containing the source operand.
2930/// \param __count
2931///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2932///    to left-shift each value in operand \a __a.
2933/// \returns A 128-bit integer vector containing the left-shifted values.
2934static __inline__ __m128i __DEFAULT_FN_ATTRS
2935_mm_sll_epi64(__m128i __a, __m128i __count)
2936{
2937  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2938}
2939
2940/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2941///    by the specified number of bits. High-order bits are filled with the sign
2942///    bit of the initial value.
2943///
2944/// \headerfile <x86intrin.h>
2945///
2946/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2947///
2948/// \param __a
2949///    A 128-bit integer vector containing the source operand.
2950/// \param __count
2951///    An integer value specifying the number of bits to right-shift each value
2952///    in operand \a __a.
2953/// \returns A 128-bit integer vector containing the right-shifted values.
2954static __inline__ __m128i __DEFAULT_FN_ATTRS
2955_mm_srai_epi16(__m128i __a, int __count)
2956{
2957  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2958}
2959
2960/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2961///    by the specified number of bits. High-order bits are filled with the sign
2962///    bit of the initial value.
2963///
2964/// \headerfile <x86intrin.h>
2965///
2966/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2967///
2968/// \param __a
2969///    A 128-bit integer vector containing the source operand.
2970/// \param __count
2971///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2972///    to right-shift each value in operand \a __a.
2973/// \returns A 128-bit integer vector containing the right-shifted values.
2974static __inline__ __m128i __DEFAULT_FN_ATTRS
2975_mm_sra_epi16(__m128i __a, __m128i __count)
2976{
2977  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2978}
2979
2980/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2981///    by the specified number of bits. High-order bits are filled with the sign
2982///    bit of the initial value.
2983///
2984/// \headerfile <x86intrin.h>
2985///
2986/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2987///
2988/// \param __a
2989///    A 128-bit integer vector containing the source operand.
2990/// \param __count
2991///    An integer value specifying the number of bits to right-shift each value
2992///    in operand \a __a.
2993/// \returns A 128-bit integer vector containing the right-shifted values.
2994static __inline__ __m128i __DEFAULT_FN_ATTRS
2995_mm_srai_epi32(__m128i __a, int __count)
2996{
2997  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2998}
2999
3000/// Right-shifts each 32-bit value in the 128-bit integer vector operand
3001///    by the specified number of bits. High-order bits are filled with the sign
3002///    bit of the initial value.
3003///
3004/// \headerfile <x86intrin.h>
3005///
3006/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
3007///
3008/// \param __a
3009///    A 128-bit integer vector containing the source operand.
3010/// \param __count
3011///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3012///    to right-shift each value in operand \a __a.
3013/// \returns A 128-bit integer vector containing the right-shifted values.
3014static __inline__ __m128i __DEFAULT_FN_ATTRS
3015_mm_sra_epi32(__m128i __a, __m128i __count)
3016{
3017  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
3018}
3019
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
/* The expansion is wrapped in parentheses so that the cast result, not the
 * builtin call, is what any surrounding postfix operator applies to. */
#define _mm_srli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
3039
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. This macro expands to the same builtin call as
///    _mm_srli_si128.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
/* The expansion is wrapped in parentheses so that the cast result, not the
 * builtin call, is what any surrounding postfix operator applies to. */
#define _mm_bsrli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
3042
3043/// Right-shifts each of 16-bit values in the 128-bit integer vector
3044///    operand by the specified number of bits. High-order bits are cleared.
3045///
3046/// \headerfile <x86intrin.h>
3047///
3048/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3049///
3050/// \param __a
3051///    A 128-bit integer vector containing the source operand.
3052/// \param __count
3053///    An integer value specifying the number of bits to right-shift each value
3054///    in operand \a __a.
3055/// \returns A 128-bit integer vector containing the right-shifted values.
3056static __inline__ __m128i __DEFAULT_FN_ATTRS
3057_mm_srli_epi16(__m128i __a, int __count)
3058{
3059  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
3060}
3061
3062/// Right-shifts each of 16-bit values in the 128-bit integer vector
3063///    operand by the specified number of bits. High-order bits are cleared.
3064///
3065/// \headerfile <x86intrin.h>
3066///
3067/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3068///
3069/// \param __a
3070///    A 128-bit integer vector containing the source operand.
3071/// \param __count
3072///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3073///    to right-shift each value in operand \a __a.
3074/// \returns A 128-bit integer vector containing the right-shifted values.
3075static __inline__ __m128i __DEFAULT_FN_ATTRS
3076_mm_srl_epi16(__m128i __a, __m128i __count)
3077{
3078  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
3079}
3080
3081/// Right-shifts each of 32-bit values in the 128-bit integer vector
3082///    operand by the specified number of bits. High-order bits are cleared.
3083///
3084/// \headerfile <x86intrin.h>
3085///
3086/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3087///
3088/// \param __a
3089///    A 128-bit integer vector containing the source operand.
3090/// \param __count
3091///    An integer value specifying the number of bits to right-shift each value
3092///    in operand \a __a.
3093/// \returns A 128-bit integer vector containing the right-shifted values.
3094static __inline__ __m128i __DEFAULT_FN_ATTRS
3095_mm_srli_epi32(__m128i __a, int __count)
3096{
3097  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3098}
3099
3100/// Right-shifts each of 32-bit values in the 128-bit integer vector
3101///    operand by the specified number of bits. High-order bits are cleared.
3102///
3103/// \headerfile <x86intrin.h>
3104///
3105/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3106///
3107/// \param __a
3108///    A 128-bit integer vector containing the source operand.
3109/// \param __count
3110///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3111///    to right-shift each value in operand \a __a.
3112/// \returns A 128-bit integer vector containing the right-shifted values.
3113static __inline__ __m128i __DEFAULT_FN_ATTRS
3114_mm_srl_epi32(__m128i __a, __m128i __count)
3115{
3116  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3117}
3118
3119/// Right-shifts each of 64-bit values in the 128-bit integer vector
3120///    operand by the specified number of bits. High-order bits are cleared.
3121///
3122/// \headerfile <x86intrin.h>
3123///
3124/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3125///
3126/// \param __a
3127///    A 128-bit integer vector containing the source operand.
3128/// \param __count
3129///    An integer value specifying the number of bits to right-shift each value
3130///    in operand \a __a.
3131/// \returns A 128-bit integer vector containing the right-shifted values.
3132static __inline__ __m128i __DEFAULT_FN_ATTRS
3133_mm_srli_epi64(__m128i __a, int __count)
3134{
3135  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3136}
3137
3138/// Right-shifts each of 64-bit values in the 128-bit integer vector
3139///    operand by the specified number of bits. High-order bits are cleared.
3140///
3141/// \headerfile <x86intrin.h>
3142///
3143/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3144///
3145/// \param __a
3146///    A 128-bit integer vector containing the source operand.
3147/// \param __count
3148///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3149///    to right-shift each value in operand \a __a.
3150/// \returns A 128-bit integer vector containing the right-shifted values.
3151static __inline__ __m128i __DEFAULT_FN_ATTRS
3152_mm_srl_epi64(__m128i __a, __m128i __count)
3153{
3154  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3155}
3156
3157/// Compares each of the corresponding 8-bit values of the 128-bit
3158///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3159///    for true.
3160///
3161/// \headerfile <x86intrin.h>
3162///
3163/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3164///
3165/// \param __a
3166///    A 128-bit integer vector.
3167/// \param __b
3168///    A 128-bit integer vector.
3169/// \returns A 128-bit integer vector containing the comparison results.
3170static __inline__ __m128i __DEFAULT_FN_ATTRS
3171_mm_cmpeq_epi8(__m128i __a, __m128i __b)
3172{
3173  return (__m128i)((__v16qi)__a == (__v16qi)__b);
3174}
3175
3176/// Compares each of the corresponding 16-bit values of the 128-bit
3177///    integer vectors for equality. Each comparison yields 0x0 for false,
3178///    0xFFFF for true.
3179///
3180/// \headerfile <x86intrin.h>
3181///
3182/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3183///
3184/// \param __a
3185///    A 128-bit integer vector.
3186/// \param __b
3187///    A 128-bit integer vector.
3188/// \returns A 128-bit integer vector containing the comparison results.
3189static __inline__ __m128i __DEFAULT_FN_ATTRS
3190_mm_cmpeq_epi16(__m128i __a, __m128i __b)
3191{
3192  return (__m128i)((__v8hi)__a == (__v8hi)__b);
3193}
3194
3195/// Compares each of the corresponding 32-bit values of the 128-bit
3196///    integer vectors for equality. Each comparison yields 0x0 for false,
3197///    0xFFFFFFFF for true.
3198///
3199/// \headerfile <x86intrin.h>
3200///
3201/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3202///
3203/// \param __a
3204///    A 128-bit integer vector.
3205/// \param __b
3206///    A 128-bit integer vector.
3207/// \returns A 128-bit integer vector containing the comparison results.
3208static __inline__ __m128i __DEFAULT_FN_ATTRS
3209_mm_cmpeq_epi32(__m128i __a, __m128i __b)
3210{
3211  return (__m128i)((__v4si)__a == (__v4si)__b);
3212}
3213
3214/// Compares each of the corresponding signed 8-bit values of the 128-bit
3215///    integer vectors to determine if the values in the first operand are
3216///    greater than those in the second operand. Each comparison yields 0x0 for
3217///    false, 0xFF for true.
3218///
3219/// \headerfile <x86intrin.h>
3220///
3221/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3222///
3223/// \param __a
3224///    A 128-bit integer vector.
3225/// \param __b
3226///    A 128-bit integer vector.
3227/// \returns A 128-bit integer vector containing the comparison results.
3228static __inline__ __m128i __DEFAULT_FN_ATTRS
3229_mm_cmpgt_epi8(__m128i __a, __m128i __b)
3230{
3231  /* This function always performs a signed comparison, but __v16qi is a char
3232     which may be signed or unsigned, so use __v16qs. */
3233  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3234}
3235
3236/// Compares each of the corresponding signed 16-bit values of the
3237///    128-bit integer vectors to determine if the values in the first operand
3238///    are greater than those in the second operand.
3239///
3240///    Each comparison yields 0x0 for false, 0xFFFF for true.
3241///
3242/// \headerfile <x86intrin.h>
3243///
3244/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3245///
3246/// \param __a
3247///    A 128-bit integer vector.
3248/// \param __b
3249///    A 128-bit integer vector.
3250/// \returns A 128-bit integer vector containing the comparison results.
3251static __inline__ __m128i __DEFAULT_FN_ATTRS
3252_mm_cmpgt_epi16(__m128i __a, __m128i __b)
3253{
3254  return (__m128i)((__v8hi)__a > (__v8hi)__b);
3255}
3256
3257/// Compares each of the corresponding signed 32-bit values of the
3258///    128-bit integer vectors to determine if the values in the first operand
3259///    are greater than those in the second operand.
3260///
3261///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3262///
3263/// \headerfile <x86intrin.h>
3264///
3265/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3266///
3267/// \param __a
3268///    A 128-bit integer vector.
3269/// \param __b
3270///    A 128-bit integer vector.
3271/// \returns A 128-bit integer vector containing the comparison results.
3272static __inline__ __m128i __DEFAULT_FN_ATTRS
3273_mm_cmpgt_epi32(__m128i __a, __m128i __b)
3274{
3275  return (__m128i)((__v4si)__a > (__v4si)__b);
3276}
3277
3278/// Compares each of the corresponding signed 8-bit values of the 128-bit
3279///    integer vectors to determine if the values in the first operand are less
3280///    than those in the second operand.
3281///
3282///    Each comparison yields 0x0 for false, 0xFF for true.
3283///
3284/// \headerfile <x86intrin.h>
3285///
3286/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3287///
3288/// \param __a
3289///    A 128-bit integer vector.
3290/// \param __b
3291///    A 128-bit integer vector.
3292/// \returns A 128-bit integer vector containing the comparison results.
3293static __inline__ __m128i __DEFAULT_FN_ATTRS
3294_mm_cmplt_epi8(__m128i __a, __m128i __b)
3295{
3296  return _mm_cmpgt_epi8(__b, __a);
3297}
3298
3299/// Compares each of the corresponding signed 16-bit values of the
3300///    128-bit integer vectors to determine if the values in the first operand
3301///    are less than those in the second operand.
3302///
3303///    Each comparison yields 0x0 for false, 0xFFFF for true.
3304///
3305/// \headerfile <x86intrin.h>
3306///
3307/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3308///
3309/// \param __a
3310///    A 128-bit integer vector.
3311/// \param __b
3312///    A 128-bit integer vector.
3313/// \returns A 128-bit integer vector containing the comparison results.
3314static __inline__ __m128i __DEFAULT_FN_ATTRS
3315_mm_cmplt_epi16(__m128i __a, __m128i __b)
3316{
3317  return _mm_cmpgt_epi16(__b, __a);
3318}
3319
3320/// Compares each of the corresponding signed 32-bit values of the
3321///    128-bit integer vectors to determine if the values in the first operand
3322///    are less than those in the second operand.
3323///
3324///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3325///
3326/// \headerfile <x86intrin.h>
3327///
3328/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3329///
3330/// \param __a
3331///    A 128-bit integer vector.
3332/// \param __b
3333///    A 128-bit integer vector.
3334/// \returns A 128-bit integer vector containing the comparison results.
3335static __inline__ __m128i __DEFAULT_FN_ATTRS
3336_mm_cmplt_epi32(__m128i __a, __m128i __b)
3337{
3338  return _mm_cmpgt_epi32(__b, __a);
3339}
3340
3341#ifdef __x86_64__
3342/// Converts a 64-bit signed integer value from the second operand into a
3343///    double-precision value and returns it in the lower element of a [2 x
3344///    double] vector; the upper element of the returned vector is copied from
3345///    the upper element of the first operand.
3346///
3347/// \headerfile <x86intrin.h>
3348///
3349/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3350///
3351/// \param __a
3352///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3353///    copied to the upper 64 bits of the destination.
3354/// \param __b
3355///    A 64-bit signed integer operand containing the value to be converted.
3356/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3357///    converted value of the second operand. The upper 64 bits are copied from
3358///    the upper 64 bits of the first operand.
3359static __inline__ __m128d __DEFAULT_FN_ATTRS
3360_mm_cvtsi64_sd(__m128d __a, long long __b)
3361{
3362  __a[0] = __b;
3363  return __a;
3364}
3365
3366/// Converts the first (lower) element of a vector of [2 x double] into a
3367///    64-bit signed integer value, according to the current rounding mode.
3368///
3369/// \headerfile <x86intrin.h>
3370///
3371/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3372///
3373/// \param __a
3374///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3375///    conversion.
3376/// \returns A 64-bit signed integer containing the converted value.
3377static __inline__ long long __DEFAULT_FN_ATTRS
3378_mm_cvtsd_si64(__m128d __a)
3379{
3380  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3381}
3382
3383/// Converts the first (lower) element of a vector of [2 x double] into a
3384///    64-bit signed integer value, truncating the result when it is inexact.
3385///
3386/// \headerfile <x86intrin.h>
3387///
3388/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3389///   instruction.
3390///
3391/// \param __a
3392///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3393///    conversion.
3394/// \returns A 64-bit signed integer containing the converted value.
3395static __inline__ long long __DEFAULT_FN_ATTRS
3396_mm_cvttsd_si64(__m128d __a)
3397{
3398  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3399}
3400#endif
3401
3402/// Converts a vector of [4 x i32] into a vector of [4 x float].
3403///
3404/// \headerfile <x86intrin.h>
3405///
3406/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3407///
3408/// \param __a
3409///    A 128-bit integer vector.
3410/// \returns A 128-bit vector of [4 x float] containing the converted values.
3411static __inline__ __m128 __DEFAULT_FN_ATTRS
3412_mm_cvtepi32_ps(__m128i __a)
3413{
3414  return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
3415}
3416
3417/// Converts a vector of [4 x float] into a vector of [4 x i32].
3418///
3419/// \headerfile <x86intrin.h>
3420///
3421/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3422///
3423/// \param __a
3424///    A 128-bit vector of [4 x float].
3425/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3426///    values.
3427static __inline__ __m128i __DEFAULT_FN_ATTRS
3428_mm_cvtps_epi32(__m128 __a)
3429{
3430  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3431}
3432
3433/// Converts a vector of [4 x float] into a vector of [4 x i32],
3434///    truncating the result when it is inexact.
3435///
3436/// \headerfile <x86intrin.h>
3437///
3438/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3439///   instruction.
3440///
3441/// \param __a
3442///    A 128-bit vector of [4 x float].
3443/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3444static __inline__ __m128i __DEFAULT_FN_ATTRS
3445_mm_cvttps_epi32(__m128 __a)
3446{
3447  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3448}
3449
3450/// Returns a vector of [4 x i32] where the lowest element is the input
3451///    operand and the remaining elements are zero.
3452///
3453/// \headerfile <x86intrin.h>
3454///
3455/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3456///
3457/// \param __a
3458///    A 32-bit signed integer operand.
3459/// \returns A 128-bit vector of [4 x i32].
3460static __inline__ __m128i __DEFAULT_FN_ATTRS
3461_mm_cvtsi32_si128(int __a)
3462{
3463  return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
3464}
3465
3466#ifdef __x86_64__
3467/// Returns a vector of [2 x i64] where the lower element is the input
3468///    operand and the upper element is zero.
3469///
3470/// \headerfile <x86intrin.h>
3471///
3472/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3473///
3474/// \param __a
3475///    A 64-bit signed integer operand containing the value to be converted.
3476/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3477static __inline__ __m128i __DEFAULT_FN_ATTRS
3478_mm_cvtsi64_si128(long long __a)
3479{
3480  return __extension__ (__m128i)(__v2di){ __a, 0 };
3481}
3482#endif
3483
3484/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3485///    32-bit signed integer value.
3486///
3487/// \headerfile <x86intrin.h>
3488///
3489/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3490///
3491/// \param __a
3492///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3493///    destination.
3494/// \returns A 32-bit signed integer containing the moved value.
3495static __inline__ int __DEFAULT_FN_ATTRS
3496_mm_cvtsi128_si32(__m128i __a)
3497{
3498  __v4si __b = (__v4si)__a;
3499  return __b[0];
3500}
3501
3502#ifdef __x86_64__
3503/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3504///    64-bit signed integer value.
3505///
3506/// \headerfile <x86intrin.h>
3507///
3508/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3509///
3510/// \param __a
3511///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3512///    destination.
3513/// \returns A 64-bit signed integer containing the moved value.
3514static __inline__ long long __DEFAULT_FN_ATTRS
3515_mm_cvtsi128_si64(__m128i __a)
3516{
3517  return __a[0];
3518}
3519#endif
3520
3521/// Moves packed integer values from an aligned 128-bit memory location
3522///    to elements in a 128-bit integer vector.
3523///
3524/// \headerfile <x86intrin.h>
3525///
3526/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3527///
3528/// \param __p
3529///    An aligned pointer to a memory location containing integer values.
3530/// \returns A 128-bit integer vector containing the moved values.
3531static __inline__ __m128i __DEFAULT_FN_ATTRS
3532_mm_load_si128(__m128i const *__p)
3533{
3534  return *__p;
3535}
3536
3537/// Moves packed integer values from an unaligned 128-bit memory location
3538///    to elements in a 128-bit integer vector.
3539///
3540/// \headerfile <x86intrin.h>
3541///
3542/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3543///
3544/// \param __p
3545///    A pointer to a memory location containing integer values.
3546/// \returns A 128-bit integer vector containing the moved values.
3547static __inline__ __m128i __DEFAULT_FN_ATTRS
3548_mm_loadu_si128(__m128i_u const *__p)
3549{
3550  struct __loadu_si128 {
3551    __m128i_u __v;
3552  } __attribute__((__packed__, __may_alias__));
3553  return ((const struct __loadu_si128*)__p)->__v;
3554}
3555
3556/// Returns a vector of [2 x i64] where the lower element is taken from
3557///    the lower element of the operand, and the upper element is zero.
3558///
3559/// \headerfile <x86intrin.h>
3560///
3561/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3562///
3563/// \param __p
3564///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3565///    the destination.
3566/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3567///    moved value. The higher order bits are cleared.
3568static __inline__ __m128i __DEFAULT_FN_ATTRS
3569_mm_loadl_epi64(__m128i_u const *__p)
3570{
3571  struct __mm_loadl_epi64_struct {
3572    long long __u;
3573  } __attribute__((__packed__, __may_alias__));
3574  return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0};
3575}
3576
3577/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3578///    This could be used as an argument to another intrinsic function where the
3579///    argument is required but the value is not actually used.
3580///
3581/// \headerfile <x86intrin.h>
3582///
3583/// This intrinsic has no corresponding instruction.
3584///
3585/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3586static __inline__ __m128i __DEFAULT_FN_ATTRS
3587_mm_undefined_si128(void)
3588{
3589  return (__m128i)__builtin_ia32_undef128();
3590}
3591
3592/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3593///    the specified 64-bit integer values.
3594///
3595/// \headerfile <x86intrin.h>
3596///
3597/// This intrinsic is a utility function and does not correspond to a specific
3598///    instruction.
3599///
3600/// \param __q1
3601///    A 64-bit integer value used to initialize the upper 64 bits of the
3602///    destination vector of [2 x i64].
3603/// \param __q0
3604///    A 64-bit integer value used to initialize the lower 64 bits of the
3605///    destination vector of [2 x i64].
3606/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3607///    provided in the operands.
3608static __inline__ __m128i __DEFAULT_FN_ATTRS
3609_mm_set_epi64x(long long __q1, long long __q0)
3610{
3611  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
3612}
3613
3614/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3615///    the specified 64-bit integer values.
3616///
3617/// \headerfile <x86intrin.h>
3618///
3619/// This intrinsic is a utility function and does not correspond to a specific
3620///    instruction.
3621///
3622/// \param __q1
3623///    A 64-bit integer value used to initialize the upper 64 bits of the
3624///    destination vector of [2 x i64].
3625/// \param __q0
3626///    A 64-bit integer value used to initialize the lower 64 bits of the
3627///    destination vector of [2 x i64].
3628/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3629///    provided in the operands.
3630static __inline__ __m128i __DEFAULT_FN_ATTRS
3631_mm_set_epi64(__m64 __q1, __m64 __q0)
3632{
3633  return _mm_set_epi64x((long long)__q1, (long long)__q0);
3634}
3635
3636/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3637///    the specified 32-bit integer values.
3638///
3639/// \headerfile <x86intrin.h>
3640///
3641/// This intrinsic is a utility function and does not correspond to a specific
3642///    instruction.
3643///
3644/// \param __i3
3645///    A 32-bit integer value used to initialize bits [127:96] of the
3646///    destination vector.
3647/// \param __i2
3648///    A 32-bit integer value used to initialize bits [95:64] of the destination
3649///    vector.
3650/// \param __i1
3651///    A 32-bit integer value used to initialize bits [63:32] of the destination
3652///    vector.
3653/// \param __i0
3654///    A 32-bit integer value used to initialize bits [31:0] of the destination
3655///    vector.
3656/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3657///    provided in the operands.
3658static __inline__ __m128i __DEFAULT_FN_ATTRS
3659_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
3660{
3661  return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3662}
3663
3664/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3665///    the specified 16-bit integer values.
3666///
3667/// \headerfile <x86intrin.h>
3668///
3669/// This intrinsic is a utility function and does not correspond to a specific
3670///    instruction.
3671///
3672/// \param __w7
3673///    A 16-bit integer value used to initialize bits [127:112] of the
3674///    destination vector.
3675/// \param __w6
3676///    A 16-bit integer value used to initialize bits [111:96] of the
3677///    destination vector.
3678/// \param __w5
3679///    A 16-bit integer value used to initialize bits [95:80] of the destination
3680///    vector.
3681/// \param __w4
3682///    A 16-bit integer value used to initialize bits [79:64] of the destination
3683///    vector.
3684/// \param __w3
3685///    A 16-bit integer value used to initialize bits [63:48] of the destination
3686///    vector.
3687/// \param __w2
3688///    A 16-bit integer value used to initialize bits [47:32] of the destination
3689///    vector.
3690/// \param __w1
3691///    A 16-bit integer value used to initialize bits [31:16] of the destination
3692///    vector.
3693/// \param __w0
3694///    A 16-bit integer value used to initialize bits [15:0] of the destination
3695///    vector.
3696/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3697///    provided in the operands.
3698static __inline__ __m128i __DEFAULT_FN_ATTRS
3699_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
3700{
3701  return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3702}
3703
3704/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3705///    the specified 8-bit integer values.
3706///
3707/// \headerfile <x86intrin.h>
3708///
3709/// This intrinsic is a utility function and does not correspond to a specific
3710///    instruction.
3711///
3712/// \param __b15
3713///    Initializes bits [127:120] of the destination vector.
3714/// \param __b14
3715///    Initializes bits [119:112] of the destination vector.
3716/// \param __b13
3717///    Initializes bits [111:104] of the destination vector.
3718/// \param __b12
3719///    Initializes bits [103:96] of the destination vector.
3720/// \param __b11
3721///    Initializes bits [95:88] of the destination vector.
3722/// \param __b10
3723///    Initializes bits [87:80] of the destination vector.
3724/// \param __b9
3725///    Initializes bits [79:72] of the destination vector.
3726/// \param __b8
3727///    Initializes bits [71:64] of the destination vector.
3728/// \param __b7
3729///    Initializes bits [63:56] of the destination vector.
3730/// \param __b6
3731///    Initializes bits [55:48] of the destination vector.
3732/// \param __b5
3733///    Initializes bits [47:40] of the destination vector.
3734/// \param __b4
3735///    Initializes bits [39:32] of the destination vector.
3736/// \param __b3
3737///    Initializes bits [31:24] of the destination vector.
3738/// \param __b2
3739///    Initializes bits [23:16] of the destination vector.
3740/// \param __b1
3741///    Initializes bits [15:8] of the destination vector.
3742/// \param __b0
3743///    Initializes bits [7:0] of the destination vector.
3744/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3745///    provided in the operands.
3746static __inline__ __m128i __DEFAULT_FN_ATTRS
3747_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
3748{
3749  return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3750}
3751
3752/// Initializes both values in a 128-bit integer vector with the
3753///    specified 64-bit integer value.
3754///
3755/// \headerfile <x86intrin.h>
3756///
3757/// This intrinsic is a utility function and does not correspond to a specific
3758///    instruction.
3759///
3760/// \param __q
3761///    Integer value used to initialize the elements of the destination integer
3762///    vector.
3763/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3764///    elements containing the value provided in the operand.
3765static __inline__ __m128i __DEFAULT_FN_ATTRS
3766_mm_set1_epi64x(long long __q)
3767{
3768  return _mm_set_epi64x(__q, __q);
3769}
3770
3771/// Initializes both values in a 128-bit vector of [2 x i64] with the
3772///    specified 64-bit value.
3773///
3774/// \headerfile <x86intrin.h>
3775///
3776/// This intrinsic is a utility function and does not correspond to a specific
3777///    instruction.
3778///
3779/// \param __q
3780///    A 64-bit value used to initialize the elements of the destination integer
3781///    vector.
3782/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3783///    containing the value provided in the operand.
3784static __inline__ __m128i __DEFAULT_FN_ATTRS
3785_mm_set1_epi64(__m64 __q)
3786{
3787  return _mm_set_epi64(__q, __q);
3788}
3789
3790/// Initializes all values in a 128-bit vector of [4 x i32] with the
3791///    specified 32-bit value.
3792///
3793/// \headerfile <x86intrin.h>
3794///
3795/// This intrinsic is a utility function and does not correspond to a specific
3796///    instruction.
3797///
3798/// \param __i
3799///    A 32-bit value used to initialize the elements of the destination integer
3800///    vector.
3801/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3802///    containing the value provided in the operand.
3803static __inline__ __m128i __DEFAULT_FN_ATTRS
3804_mm_set1_epi32(int __i)
3805{
3806  return _mm_set_epi32(__i, __i, __i, __i);
3807}
3808
3809/// Initializes all values in a 128-bit vector of [8 x i16] with the
3810///    specified 16-bit value.
3811///
3812/// \headerfile <x86intrin.h>
3813///
3814/// This intrinsic is a utility function and does not correspond to a specific
3815///    instruction.
3816///
3817/// \param __w
3818///    A 16-bit value used to initialize the elements of the destination integer
3819///    vector.
3820/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3821///    containing the value provided in the operand.
3822static __inline__ __m128i __DEFAULT_FN_ATTRS
3823_mm_set1_epi16(short __w)
3824{
3825  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3826}
3827
3828/// Initializes all values in a 128-bit vector of [16 x i8] with the
3829///    specified 8-bit value.
3830///
3831/// \headerfile <x86intrin.h>
3832///
3833/// This intrinsic is a utility function and does not correspond to a specific
3834///    instruction.
3835///
3836/// \param __b
3837///    An 8-bit value used to initialize the elements of the destination integer
3838///    vector.
3839/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3840///    containing the value provided in the operand.
3841static __inline__ __m128i __DEFAULT_FN_ATTRS
3842_mm_set1_epi8(char __b)
3843{
3844  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
3845}
3846
3847/// Constructs a 128-bit integer vector, initialized in reverse order
3848///     with the specified 64-bit integral values.
3849///
3850/// \headerfile <x86intrin.h>
3851///
3852/// This intrinsic does not correspond to a specific instruction.
3853///
3854/// \param __q0
3855///    A 64-bit integral value used to initialize the lower 64 bits of the
3856///    result.
3857/// \param __q1
3858///    A 64-bit integral value used to initialize the upper 64 bits of the
3859///    result.
3860/// \returns An initialized 128-bit integer vector.
3861static __inline__ __m128i __DEFAULT_FN_ATTRS
3862_mm_setr_epi64(__m64 __q0, __m64 __q1)
3863{
3864  return _mm_set_epi64(__q1, __q0);
3865}
3866
3867/// Constructs a 128-bit integer vector, initialized in reverse order
3868///     with the specified 32-bit integral values.
3869///
3870/// \headerfile <x86intrin.h>
3871///
3872/// This intrinsic is a utility function and does not correspond to a specific
3873///    instruction.
3874///
3875/// \param __i0
3876///    A 32-bit integral value used to initialize bits [31:0] of the result.
3877/// \param __i1
3878///    A 32-bit integral value used to initialize bits [63:32] of the result.
3879/// \param __i2
3880///    A 32-bit integral value used to initialize bits [95:64] of the result.
3881/// \param __i3
3882///    A 32-bit integral value used to initialize bits [127:96] of the result.
3883/// \returns An initialized 128-bit integer vector.
3884static __inline__ __m128i __DEFAULT_FN_ATTRS
3885_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
3886{
3887  return _mm_set_epi32(__i3, __i2, __i1, __i0);
3888}
3889
3890/// Constructs a 128-bit integer vector, initialized in reverse order
3891///     with the specified 16-bit integral values.
3892///
3893/// \headerfile <x86intrin.h>
3894///
3895/// This intrinsic is a utility function and does not correspond to a specific
3896///    instruction.
3897///
3898/// \param __w0
3899///    A 16-bit integral value used to initialize bits [15:0] of the result.
3900/// \param __w1
3901///    A 16-bit integral value used to initialize bits [31:16] of the result.
3902/// \param __w2
3903///    A 16-bit integral value used to initialize bits [47:32] of the result.
3904/// \param __w3
3905///    A 16-bit integral value used to initialize bits [63:48] of the result.
3906/// \param __w4
3907///    A 16-bit integral value used to initialize bits [79:64] of the result.
3908/// \param __w5
3909///    A 16-bit integral value used to initialize bits [95:80] of the result.
3910/// \param __w6
3911///    A 16-bit integral value used to initialize bits [111:96] of the result.
3912/// \param __w7
3913///    A 16-bit integral value used to initialize bits [127:112] of the result.
3914/// \returns An initialized 128-bit integer vector.
3915static __inline__ __m128i __DEFAULT_FN_ATTRS
3916_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
3917{
3918  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3919}
3920
3921/// Constructs a 128-bit integer vector, initialized in reverse order
3922///     with the specified 8-bit integral values.
3923///
3924/// \headerfile <x86intrin.h>
3925///
3926/// This intrinsic is a utility function and does not correspond to a specific
3927///    instruction.
3928///
3929/// \param __b0
3930///    An 8-bit integral value used to initialize bits [7:0] of the result.
3931/// \param __b1
3932///    An 8-bit integral value used to initialize bits [15:8] of the result.
3933/// \param __b2
3934///    An 8-bit integral value used to initialize bits [23:16] of the result.
3935/// \param __b3
3936///    An 8-bit integral value used to initialize bits [31:24] of the result.
3937/// \param __b4
3938///    An 8-bit integral value used to initialize bits [39:32] of the result.
3939/// \param __b5
3940///    An 8-bit integral value used to initialize bits [47:40] of the result.
3941/// \param __b6
3942///    An 8-bit integral value used to initialize bits [55:48] of the result.
3943/// \param __b7
3944///    An 8-bit integral value used to initialize bits [63:56] of the result.
3945/// \param __b8
3946///    An 8-bit integral value used to initialize bits [71:64] of the result.
3947/// \param __b9
3948///    An 8-bit integral value used to initialize bits [79:72] of the result.
3949/// \param __b10
3950///    An 8-bit integral value used to initialize bits [87:80] of the result.
3951/// \param __b11
3952///    An 8-bit integral value used to initialize bits [95:88] of the result.
3953/// \param __b12
3954///    An 8-bit integral value used to initialize bits [103:96] of the result.
3955/// \param __b13
3956///    An 8-bit integral value used to initialize bits [111:104] of the result.
3957/// \param __b14
3958///    An 8-bit integral value used to initialize bits [119:112] of the result.
3959/// \param __b15
3960///    An 8-bit integral value used to initialize bits [127:120] of the result.
3961/// \returns An initialized 128-bit integer vector.
3962static __inline__ __m128i __DEFAULT_FN_ATTRS
3963_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
3964{
3965  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3966}
3967
3968/// Creates a 128-bit integer vector initialized to zero.
3969///
3970/// \headerfile <x86intrin.h>
3971///
3972/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3973///
3974/// \returns An initialized 128-bit integer vector with all elements set to
3975///    zero.
3976static __inline__ __m128i __DEFAULT_FN_ATTRS
3977_mm_setzero_si128(void)
3978{
3979  return __extension__ (__m128i)(__v2di){ 0LL, 0LL };
3980}
3981
3982/// Stores a 128-bit integer vector to a memory location aligned on a
3983///    128-bit boundary.
3984///
3985/// \headerfile <x86intrin.h>
3986///
3987/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3988///
3989/// \param __p
3990///    A pointer to an aligned memory location that will receive the integer
3991///    values.
3992/// \param __b
3993///    A 128-bit integer vector containing the values to be moved.
3994static __inline__ void __DEFAULT_FN_ATTRS
3995_mm_store_si128(__m128i *__p, __m128i __b)
3996{
3997  *__p = __b;
3998}
3999
4000/// Stores a 128-bit integer vector to an unaligned memory location.
4001///
4002/// \headerfile <x86intrin.h>
4003///
4004/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
4005///
4006/// \param __p
4007///    A pointer to a memory location that will receive the integer values.
4008/// \param __b
4009///    A 128-bit integer vector containing the values to be moved.
4010static __inline__ void __DEFAULT_FN_ATTRS
4011_mm_storeu_si128(__m128i_u *__p, __m128i __b)
4012{
4013  struct __storeu_si128 {
4014    __m128i_u __v;
4015  } __attribute__((__packed__, __may_alias__));
4016  ((struct __storeu_si128*)__p)->__v = __b;
4017}
4018
4019/// Stores a 64-bit integer value from the low element of a 128-bit integer
4020///    vector.
4021///
4022/// \headerfile <x86intrin.h>
4023///
4024/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4025///
4026/// \param __p
4027///    A pointer to a 64-bit memory location. The address of the memory
///    location does not have to be aligned.
4029/// \param __b
4030///    A 128-bit integer vector containing the value to be stored.
4031static __inline__ void __DEFAULT_FN_ATTRS
4032_mm_storeu_si64(void *__p, __m128i __b)
4033{
4034  struct __storeu_si64 {
4035    long long __v;
4036  } __attribute__((__packed__, __may_alias__));
4037  ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0];
4038}
4039
4040/// Stores a 32-bit integer value from the low element of a 128-bit integer
4041///    vector.
4042///
4043/// \headerfile <x86intrin.h>
4044///
4045/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
4046///
4047/// \param __p
4048///    A pointer to a 32-bit memory location. The address of the memory
4049///    location does not have to be aligned.
4050/// \param __b
4051///    A 128-bit integer vector containing the value to be stored.
4052static __inline__ void __DEFAULT_FN_ATTRS
4053_mm_storeu_si32(void *__p, __m128i __b)
4054{
4055  struct __storeu_si32 {
4056    int __v;
4057  } __attribute__((__packed__, __may_alias__));
4058  ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0];
4059}
4060
4061/// Stores a 16-bit integer value from the low element of a 128-bit integer
4062///    vector.
4063///
4064/// \headerfile <x86intrin.h>
4065///
4066/// This intrinsic does not correspond to a specific instruction.
4067///
4068/// \param __p
4069///    A pointer to a 16-bit memory location. The address of the memory
4070///    location does not have to be aligned.
4071/// \param __b
4072///    A 128-bit integer vector containing the value to be stored.
4073static __inline__ void __DEFAULT_FN_ATTRS
4074_mm_storeu_si16(void *__p, __m128i __b)
4075{
4076  struct __storeu_si16 {
4077    short __v;
4078  } __attribute__((__packed__, __may_alias__));
4079  ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0];
4080}
4081
4082/// Moves bytes selected by the mask from the first operand to the
4083///    specified unaligned memory location. When a mask bit is 1, the
4084///    corresponding byte is written, otherwise it is not written.
4085///
4086///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4087///    used again soon). Exception and trap behavior for elements not selected
4088///    for storage to memory are implementation dependent.
4089///
4090/// \headerfile <x86intrin.h>
4091///
4092/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
4093///   instruction.
4094///
4095/// \param __d
4096///    A 128-bit integer vector containing the values to be moved.
4097/// \param __n
///    A 128-bit integer vector containing the mask. The most significant bit of
///    each byte is the mask bit for that byte.
4100/// \param __p
4101///    A pointer to an unaligned 128-bit memory location where the specified
4102///    values are moved.
4103static __inline__ void __DEFAULT_FN_ATTRS
4104_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
4105{
4106  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
4107}
4108
4109/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
4110///    a memory location.
4111///
4112/// \headerfile <x86intrin.h>
4113///
4114/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
4115///
4116/// \param __p
4117///    A pointer to a 64-bit memory location that will receive the lower 64 bits
4118///    of the integer vector parameter.
4119/// \param __a
4120///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4121///    value to be stored.
4122static __inline__ void __DEFAULT_FN_ATTRS
4123_mm_storel_epi64(__m128i_u *__p, __m128i __a)
4124{
4125  struct __mm_storel_epi64_struct {
4126    long long __u;
4127  } __attribute__((__packed__, __may_alias__));
4128  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
4129}
4130
4131/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4132///    aligned memory location.
4133///
4134///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4135///    used again soon).
4136///
4137/// \headerfile <x86intrin.h>
4138///
4139/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4140///
4141/// \param __p
4142///    A pointer to the 128-bit aligned memory location used to store the value.
4143/// \param __a
4144///    A vector of [2 x double] containing the 64-bit values to be stored.
4145static __inline__ void __DEFAULT_FN_ATTRS
4146_mm_stream_pd(double *__p, __m128d __a)
4147{
4148  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
4149}
4150
4151/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4152///
4153///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4154///    used again soon).
4155///
4156/// \headerfile <x86intrin.h>
4157///
4158/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4159///
4160/// \param __p
4161///    A pointer to the 128-bit aligned memory location used to store the value.
4162/// \param __a
4163///    A 128-bit integer vector containing the values to be stored.
4164static __inline__ void __DEFAULT_FN_ATTRS
4165_mm_stream_si128(__m128i *__p, __m128i __a)
4166{
4167  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
4168}
4169
4170/// Stores a 32-bit integer value in the specified memory location.
4171///
4172///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4173///    used again soon).
4174///
4175/// \headerfile <x86intrin.h>
4176///
4177/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4178///
4179/// \param __p
4180///    A pointer to the 32-bit memory location used to store the value.
4181/// \param __a
4182///    A 32-bit integer containing the value to be stored.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si32(int *__p, int __a)
{
  /* Hand the destination and the 32-bit value straight to the MOVNTI
     builtin. */
  int *const __dst = __p;
  const int __val = __a;
  __builtin_ia32_movnti(__dst, __val);
}
4188
4189#ifdef __x86_64__
4190/// Stores a 64-bit integer value in the specified memory location.
4191///
4192///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4193///    used again soon).
4194///
4195/// \headerfile <x86intrin.h>
4196///
4197/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4198///
4199/// \param __p
4200///    A pointer to the 64-bit memory location used to store the value.
4201/// \param __a
4202///    A 64-bit integer containing the value to be stored.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si64(long long *__p, long long __a)
{
  /* Hand the destination and the 64-bit value straight to the 64-bit
     MOVNTI builtin. */
  long long *const __dst = __p;
  const long long __val = __a;
  __builtin_ia32_movnti64(__dst, __val);
}
4208#endif
4209
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates, from every cache in the coherency domain, the
///    cache line that contains \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between the load and store
///    instructions that precede this instruction and the load and store
///    instructions that follow it, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
4251
4252/// Converts 16-bit signed integers from both 128-bit integer vector
4253///    operands into 8-bit signed integers, and packs the results into the
4254///    destination. Positive values greater than 0x7F are saturated to 0x7F.
4255///    Negative values less than 0x80 are saturated to 0x80.
4256///
4257/// \headerfile <x86intrin.h>
4258///
4259/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4260///
4261/// \param __a
4262///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4263///   a signed integer and is converted to a 8-bit signed integer with
4264///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4265///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4266///   written to the lower 64 bits of the result.
4267/// \param __b
4268///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4269///   a signed integer and is converted to a 8-bit signed integer with
4270///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4271///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4272///   written to the higher 64 bits of the result.
4273/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4274static __inline__ __m128i __DEFAULT_FN_ATTRS
4275_mm_packs_epi16(__m128i __a, __m128i __b)
4276{
4277  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4278}
4279
4280/// Converts 32-bit signed integers from both 128-bit integer vector
4281///    operands into 16-bit signed integers, and packs the results into the
4282///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4283///    Negative values less than 0x8000 are saturated to 0x8000.
4284///
4285/// \headerfile <x86intrin.h>
4286///
4287/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4288///
4289/// \param __a
4290///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4291///    a signed integer and is converted to a 16-bit signed integer with
4292///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4293///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4294///    are written to the lower 64 bits of the result.
4295/// \param __b
4296///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4297///    a signed integer and is converted to a 16-bit signed integer with
4298///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4299///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4300///    are written to the higher 64 bits of the result.
4301/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4302static __inline__ __m128i __DEFAULT_FN_ATTRS
4303_mm_packs_epi32(__m128i __a, __m128i __b)
4304{
4305  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4306}
4307
4308/// Converts 16-bit signed integers from both 128-bit integer vector
4309///    operands into 8-bit unsigned integers, and packs the results into the
4310///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
4311///    than 0x00 are saturated to 0x00.
4312///
4313/// \headerfile <x86intrin.h>
4314///
4315/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4316///
4317/// \param __a
4318///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4319///    a signed integer and is converted to an 8-bit unsigned integer with
4320///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4321///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4322///    written to the lower 64 bits of the result.
4323/// \param __b
4324///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4325///    a signed integer and is converted to an 8-bit unsigned integer with
4326///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4327///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4328///    written to the higher 64 bits of the result.
4329/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4330static __inline__ __m128i __DEFAULT_FN_ATTRS
4331_mm_packus_epi16(__m128i __a, __m128i __b)
4332{
4333  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4334}
4335
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select the value from \a a to be
///    assigned to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
/* The expansion is parenthesized as a whole so that operators applied to the
   macro invocation (e.g. sizeof) bind to the complete cast expression. */
#define _mm_extract_epi16(a, imm) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                    (int)(imm)))
4361
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into the element selected by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    result element selected by \a imm.
/// \param imm
///    An immediate value specifying the index of the 16-bit element in the
///    result that is replaced by the lower 16 bits of \a b.
/// \returns A 128-bit integer vector containing the constructed values.
/* The expansion is parenthesized as a whole so that postfix operators applied
   to the macro invocation (e.g. a vector subscript) bind to the __m128i
   result rather than to the builtin call inside the cast. */
#define _mm_insert_epi16(a, b, imm) \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                        (int)(imm)))
4385
4386/// Copies the values of the most significant bits from each 8-bit
4387///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4388///    value, zero-extends the value, and writes it to the destination.
4389///
4390/// \headerfile <x86intrin.h>
4391///
4392/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4393///
4394/// \param __a
4395///    A 128-bit integer vector containing the values with bits to be extracted.
4396/// \returns The most significant bits from each 8-bit element in \a __a,
4397///    written to bits [15:0]. The other bits are assigned zeros.
4398static __inline__ int __DEFAULT_FN_ATTRS
4399_mm_movemask_epi8(__m128i __a)
4400{
4401  return __builtin_ia32_pmovmskb128((__v16qi)__a);
4402}
4403
/// Constructs a 128-bit integer vector by shuffling four 32-bit
///    elements of a 128-bit integer vector parameter, using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The expansion is parenthesized as a whole so that postfix operators applied
   to the macro invocation (e.g. a vector subscript) bind to the __m128i
   result rather than to the builtin call inside the cast. */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4434
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits [5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits [7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The expansion is parenthesized as a whole so that postfix operators applied
   to the macro invocation (e.g. a vector subscript) bind to the __m128i
   result rather than to the builtin call inside the cast. */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4464
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits [3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits [5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The expansion is parenthesized as a whole so that postfix operators applied
   to the macro invocation (e.g. a vector subscript) bind to the __m128i
   result rather than to the builtin call inside the cast. */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4494
4495/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4496///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4497///
4498/// \headerfile <x86intrin.h>
4499///
4500/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4501///   instruction.
4502///
4503/// \param __a
4504///    A 128-bit vector of [16 x i8].
4505///    Bits [71:64] are written to bits [7:0] of the result. \n
4506///    Bits [79:72] are written to bits [23:16] of the result. \n
4507///    Bits [87:80] are written to bits [39:32] of the result. \n
4508///    Bits [95:88] are written to bits [55:48] of the result. \n
4509///    Bits [103:96] are written to bits [71:64] of the result. \n
4510///    Bits [111:104] are written to bits [87:80] of the result. \n
4511///    Bits [119:112] are written to bits [103:96] of the result. \n
4512///    Bits [127:120] are written to bits [119:112] of the result.
4513/// \param __b
4514///    A 128-bit vector of [16 x i8]. \n
4515///    Bits [71:64] are written to bits [15:8] of the result. \n
4516///    Bits [79:72] are written to bits [31:24] of the result. \n
4517///    Bits [87:80] are written to bits [47:40] of the result. \n
4518///    Bits [95:88] are written to bits [63:56] of the result. \n
4519///    Bits [103:96] are written to bits [79:72] of the result. \n
4520///    Bits [111:104] are written to bits [95:88] of the result. \n
4521///    Bits [119:112] are written to bits [111:104] of the result. \n
4522///    Bits [127:120] are written to bits [127:120] of the result.
4523/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4524static __inline__ __m128i __DEFAULT_FN_ATTRS
4525_mm_unpackhi_epi8(__m128i __a, __m128i __b)
4526{
4527  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
4528}
4529
4530/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4531///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4532///
4533/// \headerfile <x86intrin.h>
4534///
4535/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4536///   instruction.
4537///
4538/// \param __a
4539///    A 128-bit vector of [8 x i16].
4540///    Bits [79:64] are written to bits [15:0] of the result. \n
4541///    Bits [95:80] are written to bits [47:32] of the result. \n
4542///    Bits [111:96] are written to bits [79:64] of the result. \n
4543///    Bits [127:112] are written to bits [111:96] of the result.
4544/// \param __b
4545///    A 128-bit vector of [8 x i16].
4546///    Bits [79:64] are written to bits [31:16] of the result. \n
4547///    Bits [95:80] are written to bits [63:48] of the result. \n
4548///    Bits [111:96] are written to bits [95:80] of the result. \n
4549///    Bits [127:112] are written to bits [127:112] of the result.
4550/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4551static __inline__ __m128i __DEFAULT_FN_ATTRS
4552_mm_unpackhi_epi16(__m128i __a, __m128i __b)
4553{
4554  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
4555}
4556
4557/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4558///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4559///
4560/// \headerfile <x86intrin.h>
4561///
4562/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4563///   instruction.
4564///
4565/// \param __a
4566///    A 128-bit vector of [4 x i32]. \n
4567///    Bits [95:64] are written to bits [31:0] of the destination. \n
4568///    Bits [127:96] are written to bits [95:64] of the destination.
4569/// \param __b
4570///    A 128-bit vector of [4 x i32]. \n
///    Bits [95:64] are written to bits [63:32] of the destination. \n
4572///    Bits [127:96] are written to bits [127:96] of the destination.
4573/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4574static __inline__ __m128i __DEFAULT_FN_ATTRS
4575_mm_unpackhi_epi32(__m128i __a, __m128i __b)
4576{
4577  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
4578}
4579
4580/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4581///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4582///
4583/// \headerfile <x86intrin.h>
4584///
4585/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4586///   instruction.
4587///
4588/// \param __a
4589///    A 128-bit vector of [2 x i64]. \n
4590///    Bits [127:64] are written to bits [63:0] of the destination.
4591/// \param __b
4592///    A 128-bit vector of [2 x i64]. \n
4593///    Bits [127:64] are written to bits [127:64] of the destination.
4594/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4595static __inline__ __m128i __DEFAULT_FN_ATTRS
4596_mm_unpackhi_epi64(__m128i __a, __m128i __b)
4597{
4598  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
4599}
4600
4601/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4602///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4603///
4604/// \headerfile <x86intrin.h>
4605///
4606/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4607///   instruction.
4608///
4609/// \param __a
4610///    A 128-bit vector of [16 x i8]. \n
4611///    Bits [7:0] are written to bits [7:0] of the result. \n
4612///    Bits [15:8] are written to bits [23:16] of the result. \n
4613///    Bits [23:16] are written to bits [39:32] of the result. \n
4614///    Bits [31:24] are written to bits [55:48] of the result. \n
4615///    Bits [39:32] are written to bits [71:64] of the result. \n
4616///    Bits [47:40] are written to bits [87:80] of the result. \n
4617///    Bits [55:48] are written to bits [103:96] of the result. \n
4618///    Bits [63:56] are written to bits [119:112] of the result.
4619/// \param __b
4620///    A 128-bit vector of [16 x i8].
4621///    Bits [7:0] are written to bits [15:8] of the result. \n
4622///    Bits [15:8] are written to bits [31:24] of the result. \n
4623///    Bits [23:16] are written to bits [47:40] of the result. \n
4624///    Bits [31:24] are written to bits [63:56] of the result. \n
4625///    Bits [39:32] are written to bits [79:72] of the result. \n
4626///    Bits [47:40] are written to bits [95:88] of the result. \n
4627///    Bits [55:48] are written to bits [111:104] of the result. \n
4628///    Bits [63:56] are written to bits [127:120] of the result.
4629/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4630static __inline__ __m128i __DEFAULT_FN_ATTRS
4631_mm_unpacklo_epi8(__m128i __a, __m128i __b)
4632{
4633  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
4634}
4635
4636/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4637///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4638///    [8 x i16].
4639///
4640/// \headerfile <x86intrin.h>
4641///
4642/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4643///   instruction.
4644///
4645/// \param __a
4646///    A 128-bit vector of [8 x i16].
4647///    Bits [15:0] are written to bits [15:0] of the result. \n
4648///    Bits [31:16] are written to bits [47:32] of the result. \n
4649///    Bits [47:32] are written to bits [79:64] of the result. \n
4650///    Bits [63:48] are written to bits [111:96] of the result.
4651/// \param __b
4652///    A 128-bit vector of [8 x i16].
4653///    Bits [15:0] are written to bits [31:16] of the result. \n
4654///    Bits [31:16] are written to bits [63:48] of the result. \n
4655///    Bits [47:32] are written to bits [95:80] of the result. \n
4656///    Bits [63:48] are written to bits [127:112] of the result.
4657/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4658static __inline__ __m128i __DEFAULT_FN_ATTRS
4659_mm_unpacklo_epi16(__m128i __a, __m128i __b)
4660{
4661  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
4662}
4663
4664/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4665///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4666///
4667/// \headerfile <x86intrin.h>
4668///
4669/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4670///   instruction.
4671///
4672/// \param __a
4673///    A 128-bit vector of [4 x i32]. \n
4674///    Bits [31:0] are written to bits [31:0] of the destination. \n
4675///    Bits [63:32] are written to bits [95:64] of the destination.
4676/// \param __b
4677///    A 128-bit vector of [4 x i32]. \n
4678///    Bits [31:0] are written to bits [64:32] of the destination. \n
4679///    Bits [63:32] are written to bits [127:96] of the destination.
4680/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4681static __inline__ __m128i __DEFAULT_FN_ATTRS
4682_mm_unpacklo_epi32(__m128i __a, __m128i __b)
4683{
4684  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
4685}
4686
4687/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4688///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4689///
4690/// \headerfile <x86intrin.h>
4691///
4692/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4693///   instruction.
4694///
4695/// \param __a
4696///    A 128-bit vector of [2 x i64]. \n
4697///    Bits [63:0] are written to bits [63:0] of the destination. \n
4698/// \param __b
4699///    A 128-bit vector of [2 x i64]. \n
4700///    Bits [63:0] are written to bits [127:64] of the destination. \n
4701/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4702static __inline__ __m128i __DEFAULT_FN_ATTRS
4703_mm_unpacklo_epi64(__m128i __a, __m128i __b)
4704{
4705  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
4706}
4707
4708/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4709///    integer.
4710///
4711/// \headerfile <x86intrin.h>
4712///
4713/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4714///
4715/// \param __a
4716///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4717///    destination.
4718/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4719static __inline__ __m64 __DEFAULT_FN_ATTRS
4720_mm_movepi64_pi64(__m128i __a)
4721{
4722  return (__m64)__a[0];
4723}
4724
4725/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4726///    upper bits.
4727///
4728/// \headerfile <x86intrin.h>
4729///
4730/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4731///
4732/// \param __a
4733///    A 64-bit value.
4734/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4735///    the operand. The upper 64 bits are assigned zeros.
4736static __inline__ __m128i __DEFAULT_FN_ATTRS
4737_mm_movpi64_epi64(__m64 __a)
4738{
4739  return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
4740}
4741
4742/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4743///    integer vector, zeroing the upper bits.
4744///
4745/// \headerfile <x86intrin.h>
4746///
4747/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4748///
4749/// \param __a
4750///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4751///    destination.
4752/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4753///    the operand. The upper 64 bits are assigned zeros.
4754static __inline__ __m128i __DEFAULT_FN_ATTRS
4755_mm_move_epi64(__m128i __a)
4756{
4757  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4758}
4759
4760/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4761///    [2 x double] and interleaves them into a 128-bit vector of [2 x
4762///    double].
4763///
4764/// \headerfile <x86intrin.h>
4765///
4766/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4767///
4768/// \param __a
4769///    A 128-bit vector of [2 x double]. \n
4770///    Bits [127:64] are written to bits [63:0] of the destination.
4771/// \param __b
4772///    A 128-bit vector of [2 x double]. \n
4773///    Bits [127:64] are written to bits [127:64] of the destination.
4774/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4775static __inline__ __m128d __DEFAULT_FN_ATTRS
4776_mm_unpackhi_pd(__m128d __a, __m128d __b)
4777{
4778  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
4779}
4780
4781/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4782///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4783///    double].
4784///
4785/// \headerfile <x86intrin.h>
4786///
4787/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4788///
4789/// \param __a
4790///    A 128-bit vector of [2 x double]. \n
4791///    Bits [63:0] are written to bits [63:0] of the destination.
4792/// \param __b
4793///    A 128-bit vector of [2 x double]. \n
4794///    Bits [63:0] are written to bits [127:64] of the destination.
4795/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4796static __inline__ __m128d __DEFAULT_FN_ATTRS
4797_mm_unpacklo_pd(__m128d __a, __m128d __b)
4798{
4799  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
4800}
4801
4802/// Extracts the sign bits of the double-precision values in the 128-bit
4803///    vector of [2 x double], zero-extends the value, and writes it to the
4804///    low-order bits of the destination.
4805///
4806/// \headerfile <x86intrin.h>
4807///
4808/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4809///
4810/// \param __a
4811///    A 128-bit vector of [2 x double] containing the values with sign bits to
4812///    be extracted.
4813/// \returns The sign bits from each of the double-precision elements in \a __a,
4814///    written to bits [1:0]. The remaining bits are assigned values of zero.
4815static __inline__ int __DEFAULT_FN_ATTRS
4816_mm_movemask_pd(__m128d __a)
4817{
4818  return __builtin_ia32_movmskpd((__v2df)__a);
4819}
4820
4821
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
// The whole expansion is parenthesized so that operators applied to the macro
// result (for example a subscript) act on the cast value rather than binding
// to the builtin call ahead of the cast.
#define _mm_shuffle_pd(a, b, i) \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                  (int)(i)))
4849
4850/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4851///    floating-point vector of [4 x float].
4852///
4853/// \headerfile <x86intrin.h>
4854///
4855/// This intrinsic has no corresponding instruction.
4856///
4857/// \param __a
4858///    A 128-bit floating-point vector of [2 x double].
4859/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4860///    bitwise pattern as the parameter.
4861static __inline__ __m128 __DEFAULT_FN_ATTRS
4862_mm_castpd_ps(__m128d __a)
4863{
4864  return (__m128)__a;
4865}
4866
4867/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4868///    integer vector.
4869///
4870/// \headerfile <x86intrin.h>
4871///
4872/// This intrinsic has no corresponding instruction.
4873///
4874/// \param __a
4875///    A 128-bit floating-point vector of [2 x double].
4876/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4877///    parameter.
4878static __inline__ __m128i __DEFAULT_FN_ATTRS
4879_mm_castpd_si128(__m128d __a)
4880{
4881  return (__m128i)__a;
4882}
4883
4884/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4885///    floating-point vector of [2 x double].
4886///
4887/// \headerfile <x86intrin.h>
4888///
4889/// This intrinsic has no corresponding instruction.
4890///
4891/// \param __a
4892///    A 128-bit floating-point vector of [4 x float].
4893/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4894///    bitwise pattern as the parameter.
4895static __inline__ __m128d __DEFAULT_FN_ATTRS
4896_mm_castps_pd(__m128 __a)
4897{
4898  return (__m128d)__a;
4899}
4900
4901/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4902///    integer vector.
4903///
4904/// \headerfile <x86intrin.h>
4905///
4906/// This intrinsic has no corresponding instruction.
4907///
4908/// \param __a
4909///    A 128-bit floating-point vector of [4 x float].
4910/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4911///    parameter.
4912static __inline__ __m128i __DEFAULT_FN_ATTRS
4913_mm_castps_si128(__m128 __a)
4914{
4915  return (__m128i)__a;
4916}
4917
4918/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4919///    of [4 x float].
4920///
4921/// \headerfile <x86intrin.h>
4922///
4923/// This intrinsic has no corresponding instruction.
4924///
4925/// \param __a
4926///    A 128-bit integer vector.
4927/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4928///    bitwise pattern as the parameter.
4929static __inline__ __m128 __DEFAULT_FN_ATTRS
4930_mm_castsi128_ps(__m128i __a)
4931{
4932  return (__m128)__a;
4933}
4934
4935/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4936///    of [2 x double].
4937///
4938/// \headerfile <x86intrin.h>
4939///
4940/// This intrinsic has no corresponding instruction.
4941///
4942/// \param __a
4943///    A 128-bit integer vector.
4944/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4945///    bitwise pattern as the parameter.
4946static __inline__ __m128d __DEFAULT_FN_ATTRS
4947_mm_castsi128_pd(__m128i __a)
4948{
4949  return (__m128d)__a;
4950}
4951
#ifdef __cplusplus
extern "C" {
#endif

/// Signals that the caller is executing a spin loop, for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
/* The attribute helper macros are only meant for use inside this header. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

/* Builds a two-bit selector with x in bit 1 and y in bit 0, suitable as the
   immediate operand of _mm_shuffle_pd. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Values for the denormals-zero mode bit of the control/status word that is
   read with _mm_getcsr() and written with _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)

/* Mask that isolates the denormals-zero mode bit. */
#define _MM_DENORMALS_ZERO_MASK (0x0040)

/* Reads the current denormals-zero mode; the result is one of the
   _MM_DENORMALS_ZERO_ON / _MM_DENORMALS_ZERO_OFF values. */
#define _MM_GET_DENORMALS_ZERO_MODE() \
  (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)

/* Replaces the denormals-zero mode bit with x and leaves every other bit of
   the control/status word as it was. */
#define _MM_SET_DENORMALS_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4980
4981#endif /* __EMMINTRIN_H */
4982