xmmintrin.h revision 360784
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10126282Sdes#ifndef __XMMINTRIN_H
11126282Sdes#define __XMMINTRIN_H
12261320Sdes
13181111Sdes#include <mmintrin.h>
14137018Sdes
/* 16-byte vector of int elements. */
typedef int __v4si __attribute__((__vector_size__(16)));
/* 16-byte vector of float elements. */
typedef float __v4sf __attribute__((__vector_size__(16)));
/* 16-byte vector of float elements with 16-byte alignment. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same layout as __m128 but with an alignment requirement of 1. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* Define the default attributes for the functions in this file.
 *
 * __DEFAULT_FN_ATTRS:     always-inline, no debug info, target feature "sse",
 *                         minimum vector width 128.
 * __DEFAULT_FN_ATTRS_MMX: same, but with target features "mmx,sse" and a
 *                         minimum vector width of 64.
 */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))

34255460Sdes/// Adds the 32-bit float values in the low-order bits of the operands.
35255460Sdes///
36255460Sdes/// \headerfile <x86intrin.h>
37255460Sdes///
38255460Sdes/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
39255460Sdes///
40106538Sobrien/// \param __a
41103960Smarkm///    A 128-bit vector of [4 x float] containing one of the source operands.
42158519Sdes///    The lower 32 bits of this operand are used in the calculation.
43124250Sru/// \param __b
44156813Sru///    A 128-bit vector of [4 x float] containing one of the source operands.
45255829Sdes///    The lower 32 bits of this operand are used in the calculation.
46178828Sdfr/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
47178828Sdfr///    of the lower 32 bits of both operands. The upper 96 bits are copied from
48106132Sdes///    the upper 96 bits of the first source operand.
49106132Sdesstatic __inline__ __m128 __DEFAULT_FN_ATTRS
50245527Sbz_mm_add_ss(__m128 __a, __m128 __b)
51245527Sbz{
52245527Sbz  __a[0] += __b[0];
53245527Sbz  return __a;
54139106Sru}
5557434Smarkm
56255460Sdes/// Adds two 128-bit vectors of [4 x float], and returns the results of
57255460Sdes///    the addition.
5890405Sru///
5957434Smarkm/// \headerfile <x86intrin.h>
6074818Sru///
61106538Sobrien/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
62158529Sdes///
63158529Sdes/// \param __a
64255829Sdes///    A 128-bit vector of [4 x float] containing one of the source operands.
65255829Sdes/// \param __b
66255829Sdes///    A 128-bit vector of [4 x float] containing one of the source operands.
67/// \returns A 128-bit vector of [4 x float] containing the sums of both
68///    operands.
69static __inline__ __m128 __DEFAULT_FN_ATTRS
70_mm_add_ps(__m128 __a, __m128 __b)
71{
72  return (__m128)((__v4sf)__a + (__v4sf)__b);
73}
74
75/// Subtracts the 32-bit float value in the low-order bits of the second
76///    operand from the corresponding value in the first operand.
77///
78/// \headerfile <x86intrin.h>
79///
80/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
81///
82/// \param __a
83///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
84///    of this operand are used in the calculation.
85/// \param __b
86///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
87///    bits of this operand are used in the calculation.
88/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
89///    difference of the lower 32 bits of both operands. The upper 96 bits are
90///    copied from the upper 96 bits of the first source operand.
91static __inline__ __m128 __DEFAULT_FN_ATTRS
92_mm_sub_ss(__m128 __a, __m128 __b)
93{
94  __a[0] -= __b[0];
95  return __a;
96}
97
98/// Subtracts each of the values of the second operand from the first
99///    operand, both of which are 128-bit vectors of [4 x float] and returns
100///    the results of the subtraction.
101///
102/// \headerfile <x86intrin.h>
103///
104/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
105///
106/// \param __a
107///    A 128-bit vector of [4 x float] containing the minuend.
108/// \param __b
109///    A 128-bit vector of [4 x float] containing the subtrahend.
110/// \returns A 128-bit vector of [4 x float] containing the differences between
111///    both operands.
112static __inline__ __m128 __DEFAULT_FN_ATTRS
113_mm_sub_ps(__m128 __a, __m128 __b)
114{
115  return (__m128)((__v4sf)__a - (__v4sf)__b);
116}
117
118/// Multiplies two 32-bit float values in the low-order bits of the
119///    operands.
120///
121/// \headerfile <x86intrin.h>
122///
123/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
124///
125/// \param __a
126///    A 128-bit vector of [4 x float] containing one of the source operands.
127///    The lower 32 bits of this operand are used in the calculation.
128/// \param __b
129///    A 128-bit vector of [4 x float] containing one of the source operands.
130///    The lower 32 bits of this operand are used in the calculation.
131/// \returns A 128-bit vector of [4 x float] containing the product of the lower
132///    32 bits of both operands. The upper 96 bits are copied from the upper 96
133///    bits of the first source operand.
134static __inline__ __m128 __DEFAULT_FN_ATTRS
135_mm_mul_ss(__m128 __a, __m128 __b)
136{
137  __a[0] *= __b[0];
138  return __a;
139}
140
141/// Multiplies two 128-bit vectors of [4 x float] and returns the
142///    results of the multiplication.
143///
144/// \headerfile <x86intrin.h>
145///
146/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
147///
148/// \param __a
149///    A 128-bit vector of [4 x float] containing one of the source operands.
150/// \param __b
151///    A 128-bit vector of [4 x float] containing one of the source operands.
152/// \returns A 128-bit vector of [4 x float] containing the products of both
153///    operands.
154static __inline__ __m128 __DEFAULT_FN_ATTRS
155_mm_mul_ps(__m128 __a, __m128 __b)
156{
157  return (__m128)((__v4sf)__a * (__v4sf)__b);
158}
159
160/// Divides the value in the low-order 32 bits of the first operand by
161///    the corresponding value in the second operand.
162///
163/// \headerfile <x86intrin.h>
164///
165/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
166///
167/// \param __a
168///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
169///    bits of this operand are used in the calculation.
170/// \param __b
171///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
172///    of this operand are used in the calculation.
173/// \returns A 128-bit vector of [4 x float] containing the quotients of the
174///    lower 32 bits of both operands. The upper 96 bits are copied from the
175///    upper 96 bits of the first source operand.
176static __inline__ __m128 __DEFAULT_FN_ATTRS
177_mm_div_ss(__m128 __a, __m128 __b)
178{
179  __a[0] /= __b[0];
180  return __a;
181}
182
183/// Divides two 128-bit vectors of [4 x float].
184///
185/// \headerfile <x86intrin.h>
186///
187/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
188///
189/// \param __a
190///    A 128-bit vector of [4 x float] containing the dividend.
191/// \param __b
192///    A 128-bit vector of [4 x float] containing the divisor.
193/// \returns A 128-bit vector of [4 x float] containing the quotients of both
194///    operands.
195static __inline__ __m128 __DEFAULT_FN_ATTRS
196_mm_div_ps(__m128 __a, __m128 __b)
197{
198  return (__m128)((__v4sf)__a / (__v4sf)__b);
199}
200
201/// Calculates the square root of the value stored in the low-order bits
202///    of a 128-bit vector of [4 x float].
203///
204/// \headerfile <x86intrin.h>
205///
206/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
207///
208/// \param __a
209///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
210///    used in the calculation.
211/// \returns A 128-bit vector of [4 x float] containing the square root of the
212///    value in the low-order bits of the operand.
213static __inline__ __m128 __DEFAULT_FN_ATTRS
214_mm_sqrt_ss(__m128 __a)
215{
216  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
217}
218
219/// Calculates the square roots of the values stored in a 128-bit vector
220///    of [4 x float].
221///
222/// \headerfile <x86intrin.h>
223///
224/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
225///
226/// \param __a
227///    A 128-bit vector of [4 x float].
228/// \returns A 128-bit vector of [4 x float] containing the square roots of the
229///    values in the operand.
230static __inline__ __m128 __DEFAULT_FN_ATTRS
231_mm_sqrt_ps(__m128 __a)
232{
233  return __builtin_ia32_sqrtps((__v4sf)__a);
234}
235
236/// Calculates the approximate reciprocal of the value stored in the
237///    low-order bits of a 128-bit vector of [4 x float].
238///
239/// \headerfile <x86intrin.h>
240///
241/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
242///
243/// \param __a
244///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
245///    used in the calculation.
246/// \returns A 128-bit vector of [4 x float] containing the approximate
247///    reciprocal of the value in the low-order bits of the operand.
248static __inline__ __m128 __DEFAULT_FN_ATTRS
249_mm_rcp_ss(__m128 __a)
250{
251  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
252}
253
254/// Calculates the approximate reciprocals of the values stored in a
255///    128-bit vector of [4 x float].
256///
257/// \headerfile <x86intrin.h>
258///
259/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
260///
261/// \param __a
262///    A 128-bit vector of [4 x float].
263/// \returns A 128-bit vector of [4 x float] containing the approximate
264///    reciprocals of the values in the operand.
265static __inline__ __m128 __DEFAULT_FN_ATTRS
266_mm_rcp_ps(__m128 __a)
267{
268  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
269}
270
271/// Calculates the approximate reciprocal of the square root of the value
272///    stored in the low-order bits of a 128-bit vector of [4 x float].
273///
274/// \headerfile <x86intrin.h>
275///
276/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
277///
278/// \param __a
279///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
280///    used in the calculation.
281/// \returns A 128-bit vector of [4 x float] containing the approximate
282///    reciprocal of the square root of the value in the low-order bits of the
283///    operand.
284static __inline__ __m128 __DEFAULT_FN_ATTRS
285_mm_rsqrt_ss(__m128 __a)
286{
287  return __builtin_ia32_rsqrtss((__v4sf)__a);
288}
289
290/// Calculates the approximate reciprocals of the square roots of the
291///    values stored in a 128-bit vector of [4 x float].
292///
293/// \headerfile <x86intrin.h>
294///
295/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
296///
297/// \param __a
298///    A 128-bit vector of [4 x float].
299/// \returns A 128-bit vector of [4 x float] containing the approximate
300///    reciprocals of the square roots of the values in the operand.
301static __inline__ __m128 __DEFAULT_FN_ATTRS
302_mm_rsqrt_ps(__m128 __a)
303{
304  return __builtin_ia32_rsqrtps((__v4sf)__a);
305}
306
307/// Compares two 32-bit float values in the low-order bits of both
308///    operands and returns the lesser value in the low-order bits of the
309///    vector of [4 x float].
310///
311/// \headerfile <x86intrin.h>
312///
313/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
314///
315/// \param __a
316///    A 128-bit vector of [4 x float] containing one of the operands. The lower
317///    32 bits of this operand are used in the comparison.
318/// \param __b
319///    A 128-bit vector of [4 x float] containing one of the operands. The lower
320///    32 bits of this operand are used in the comparison.
321/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
322///    minimum value between both operands. The upper 96 bits are copied from
323///    the upper 96 bits of the first source operand.
324static __inline__ __m128 __DEFAULT_FN_ATTRS
325_mm_min_ss(__m128 __a, __m128 __b)
326{
327  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
328}
329
330/// Compares two 128-bit vectors of [4 x float] and returns the lesser
331///    of each pair of values.
332///
333/// \headerfile <x86intrin.h>
334///
335/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
336///
337/// \param __a
338///    A 128-bit vector of [4 x float] containing one of the operands.
339/// \param __b
340///    A 128-bit vector of [4 x float] containing one of the operands.
341/// \returns A 128-bit vector of [4 x float] containing the minimum values
342///    between both operands.
343static __inline__ __m128 __DEFAULT_FN_ATTRS
344_mm_min_ps(__m128 __a, __m128 __b)
345{
346  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
347}
348
349/// Compares two 32-bit float values in the low-order bits of both
350///    operands and returns the greater value in the low-order bits of a 128-bit
351///    vector of [4 x float].
352///
353/// \headerfile <x86intrin.h>
354///
355/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
356///
357/// \param __a
358///    A 128-bit vector of [4 x float] containing one of the operands. The lower
359///    32 bits of this operand are used in the comparison.
360/// \param __b
361///    A 128-bit vector of [4 x float] containing one of the operands. The lower
362///    32 bits of this operand are used in the comparison.
363/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
364///    maximum value between both operands. The upper 96 bits are copied from
365///    the upper 96 bits of the first source operand.
366static __inline__ __m128 __DEFAULT_FN_ATTRS
367_mm_max_ss(__m128 __a, __m128 __b)
368{
369  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
370}
371
372/// Compares two 128-bit vectors of [4 x float] and returns the greater
373///    of each pair of values.
374///
375/// \headerfile <x86intrin.h>
376///
377/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
378///
379/// \param __a
380///    A 128-bit vector of [4 x float] containing one of the operands.
381/// \param __b
382///    A 128-bit vector of [4 x float] containing one of the operands.
383/// \returns A 128-bit vector of [4 x float] containing the maximum values
384///    between both operands.
385static __inline__ __m128 __DEFAULT_FN_ATTRS
386_mm_max_ps(__m128 __a, __m128 __b)
387{
388  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
389}
390
391/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
392///
393/// \headerfile <x86intrin.h>
394///
395/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
396///
397/// \param __a
398///    A 128-bit vector containing one of the source operands.
399/// \param __b
400///    A 128-bit vector containing one of the source operands.
401/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
402///    values between both operands.
403static __inline__ __m128 __DEFAULT_FN_ATTRS
404_mm_and_ps(__m128 __a, __m128 __b)
405{
406  return (__m128)((__v4su)__a & (__v4su)__b);
407}
408
409/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
410///    the one's complement of the values contained in the first source
411///    operand.
412///
413/// \headerfile <x86intrin.h>
414///
415/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
416///
417/// \param __a
418///    A 128-bit vector of [4 x float] containing the first source operand. The
419///    one's complement of this value is used in the bitwise AND.
420/// \param __b
421///    A 128-bit vector of [4 x float] containing the second source operand.
422/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
423///    one's complement of the first operand and the values in the second
424///    operand.
425static __inline__ __m128 __DEFAULT_FN_ATTRS
426_mm_andnot_ps(__m128 __a, __m128 __b)
427{
428  return (__m128)(~(__v4su)__a & (__v4su)__b);
429}
430
431/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
432///
433/// \headerfile <x86intrin.h>
434///
435/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
436///
437/// \param __a
438///    A 128-bit vector of [4 x float] containing one of the source operands.
439/// \param __b
440///    A 128-bit vector of [4 x float] containing one of the source operands.
441/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
442///    values between both operands.
443static __inline__ __m128 __DEFAULT_FN_ATTRS
444_mm_or_ps(__m128 __a, __m128 __b)
445{
446  return (__m128)((__v4su)__a | (__v4su)__b);
447}
448
449/// Performs a bitwise exclusive OR of two 128-bit vectors of
450///    [4 x float].
451///
452/// \headerfile <x86intrin.h>
453///
454/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
455///
456/// \param __a
457///    A 128-bit vector of [4 x float] containing one of the source operands.
458/// \param __b
459///    A 128-bit vector of [4 x float] containing one of the source operands.
460/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
461///    of the values between both operands.
462static __inline__ __m128 __DEFAULT_FN_ATTRS
463_mm_xor_ps(__m128 __a, __m128 __b)
464{
465  return (__m128)((__v4su)__a ^ (__v4su)__b);
466}
467
468/// Compares two 32-bit float values in the low-order bits of both
469///    operands for equality and returns the result of the comparison in the
470///    low-order bits of a vector [4 x float].
471///
472/// \headerfile <x86intrin.h>
473///
474/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
475///
476/// \param __a
477///    A 128-bit vector of [4 x float] containing one of the operands. The lower
478///    32 bits of this operand are used in the comparison.
479/// \param __b
480///    A 128-bit vector of [4 x float] containing one of the operands. The lower
481///    32 bits of this operand are used in the comparison.
482/// \returns A 128-bit vector of [4 x float] containing the comparison results
483///    in the low-order bits.
484static __inline__ __m128 __DEFAULT_FN_ATTRS
485_mm_cmpeq_ss(__m128 __a, __m128 __b)
486{
487  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
488}
489
490/// Compares each of the corresponding 32-bit float values of the
491///    128-bit vectors of [4 x float] for equality.
492///
493/// \headerfile <x86intrin.h>
494///
495/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
496///
497/// \param __a
498///    A 128-bit vector of [4 x float].
499/// \param __b
500///    A 128-bit vector of [4 x float].
501/// \returns A 128-bit vector of [4 x float] containing the comparison results.
502static __inline__ __m128 __DEFAULT_FN_ATTRS
503_mm_cmpeq_ps(__m128 __a, __m128 __b)
504{
505  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
506}
507
508/// Compares two 32-bit float values in the low-order bits of both
509///    operands to determine if the value in the first operand is less than the
510///    corresponding value in the second operand and returns the result of the
511///    comparison in the low-order bits of a vector of [4 x float].
512///
513/// \headerfile <x86intrin.h>
514///
515/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
516///
517/// \param __a
518///    A 128-bit vector of [4 x float] containing one of the operands. The lower
519///    32 bits of this operand are used in the comparison.
520/// \param __b
521///    A 128-bit vector of [4 x float] containing one of the operands. The lower
522///    32 bits of this operand are used in the comparison.
523/// \returns A 128-bit vector of [4 x float] containing the comparison results
524///    in the low-order bits.
525static __inline__ __m128 __DEFAULT_FN_ATTRS
526_mm_cmplt_ss(__m128 __a, __m128 __b)
527{
528  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
529}
530
531/// Compares each of the corresponding 32-bit float values of the
532///    128-bit vectors of [4 x float] to determine if the values in the first
533///    operand are less than those in the second operand.
534///
535/// \headerfile <x86intrin.h>
536///
537/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
538///
539/// \param __a
540///    A 128-bit vector of [4 x float].
541/// \param __b
542///    A 128-bit vector of [4 x float].
543/// \returns A 128-bit vector of [4 x float] containing the comparison results.
544static __inline__ __m128 __DEFAULT_FN_ATTRS
545_mm_cmplt_ps(__m128 __a, __m128 __b)
546{
547  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
548}
549
550/// Compares two 32-bit float values in the low-order bits of both
551///    operands to determine if the value in the first operand is less than or
552///    equal to the corresponding value in the second operand and returns the
553///    result of the comparison in the low-order bits of a vector of
554///    [4 x float].
555///
556/// \headerfile <x86intrin.h>
557///
558/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
559///
560/// \param __a
561///    A 128-bit vector of [4 x float] containing one of the operands. The lower
562///    32 bits of this operand are used in the comparison.
563/// \param __b
564///    A 128-bit vector of [4 x float] containing one of the operands. The lower
565///    32 bits of this operand are used in the comparison.
566/// \returns A 128-bit vector of [4 x float] containing the comparison results
567///    in the low-order bits.
568static __inline__ __m128 __DEFAULT_FN_ATTRS
569_mm_cmple_ss(__m128 __a, __m128 __b)
570{
571  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
572}
573
574/// Compares each of the corresponding 32-bit float values of the
575///    128-bit vectors of [4 x float] to determine if the values in the first
576///    operand are less than or equal to those in the second operand.
577///
578/// \headerfile <x86intrin.h>
579///
580/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
581///
582/// \param __a
583///    A 128-bit vector of [4 x float].
584/// \param __b
585///    A 128-bit vector of [4 x float].
586/// \returns A 128-bit vector of [4 x float] containing the comparison results.
587static __inline__ __m128 __DEFAULT_FN_ATTRS
588_mm_cmple_ps(__m128 __a, __m128 __b)
589{
590  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
591}
592
593/// Compares two 32-bit float values in the low-order bits of both
594///    operands to determine if the value in the first operand is greater than
595///    the corresponding value in the second operand and returns the result of
596///    the comparison in the low-order bits of a vector of [4 x float].
597///
598/// \headerfile <x86intrin.h>
599///
600/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
601///
602/// \param __a
603///    A 128-bit vector of [4 x float] containing one of the operands. The lower
604///    32 bits of this operand are used in the comparison.
605/// \param __b
606///    A 128-bit vector of [4 x float] containing one of the operands. The lower
607///    32 bits of this operand are used in the comparison.
608/// \returns A 128-bit vector of [4 x float] containing the comparison results
609///    in the low-order bits.
610static __inline__ __m128 __DEFAULT_FN_ATTRS
611_mm_cmpgt_ss(__m128 __a, __m128 __b)
612{
613  return (__m128)__builtin_shufflevector((__v4sf)__a,
614                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
615                                         4, 1, 2, 3);
616}
617
618/// Compares each of the corresponding 32-bit float values of the
619///    128-bit vectors of [4 x float] to determine if the values in the first
620///    operand are greater than those in the second operand.
621///
622/// \headerfile <x86intrin.h>
623///
624/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
625///
626/// \param __a
627///    A 128-bit vector of [4 x float].
628/// \param __b
629///    A 128-bit vector of [4 x float].
630/// \returns A 128-bit vector of [4 x float] containing the comparison results.
631static __inline__ __m128 __DEFAULT_FN_ATTRS
632_mm_cmpgt_ps(__m128 __a, __m128 __b)
633{
634  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
635}
636
637/// Compares two 32-bit float values in the low-order bits of both
638///    operands to determine if the value in the first operand is greater than
639///    or equal to the corresponding value in the second operand and returns
640///    the result of the comparison in the low-order bits of a vector of
641///    [4 x float].
642///
643/// \headerfile <x86intrin.h>
644///
645/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
646///
647/// \param __a
648///    A 128-bit vector of [4 x float] containing one of the operands. The lower
649///    32 bits of this operand are used in the comparison.
650/// \param __b
651///    A 128-bit vector of [4 x float] containing one of the operands. The lower
652///    32 bits of this operand are used in the comparison.
653/// \returns A 128-bit vector of [4 x float] containing the comparison results
654///    in the low-order bits.
655static __inline__ __m128 __DEFAULT_FN_ATTRS
656_mm_cmpge_ss(__m128 __a, __m128 __b)
657{
658  return (__m128)__builtin_shufflevector((__v4sf)__a,
659                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
660                                         4, 1, 2, 3);
661}
662
663/// Compares each of the corresponding 32-bit float values of the
664///    128-bit vectors of [4 x float] to determine if the values in the first
665///    operand are greater than or equal to those in the second operand.
666///
667/// \headerfile <x86intrin.h>
668///
669/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
670///
671/// \param __a
672///    A 128-bit vector of [4 x float].
673/// \param __b
674///    A 128-bit vector of [4 x float].
675/// \returns A 128-bit vector of [4 x float] containing the comparison results.
676static __inline__ __m128 __DEFAULT_FN_ATTRS
677_mm_cmpge_ps(__m128 __a, __m128 __b)
678{
679  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
680}
681
682/// Compares two 32-bit float values in the low-order bits of both
683///    operands for inequality and returns the result of the comparison in the
684///    low-order bits of a vector of [4 x float].
685///
686/// \headerfile <x86intrin.h>
687///
688/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
689///   instructions.
690///
691/// \param __a
692///    A 128-bit vector of [4 x float] containing one of the operands. The lower
693///    32 bits of this operand are used in the comparison.
694/// \param __b
695///    A 128-bit vector of [4 x float] containing one of the operands. The lower
696///    32 bits of this operand are used in the comparison.
697/// \returns A 128-bit vector of [4 x float] containing the comparison results
698///    in the low-order bits.
699static __inline__ __m128 __DEFAULT_FN_ATTRS
700_mm_cmpneq_ss(__m128 __a, __m128 __b)
701{
702  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
703}
704
705/// Compares each of the corresponding 32-bit float values of the
706///    128-bit vectors of [4 x float] for inequality.
707///
708/// \headerfile <x86intrin.h>
709///
710/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
711///   instructions.
712///
713/// \param __a
714///    A 128-bit vector of [4 x float].
715/// \param __b
716///    A 128-bit vector of [4 x float].
717/// \returns A 128-bit vector of [4 x float] containing the comparison results.
718static __inline__ __m128 __DEFAULT_FN_ATTRS
719_mm_cmpneq_ps(__m128 __a, __m128 __b)
720{
721  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
722}
723
724/// Compares two 32-bit float values in the low-order bits of both
725///    operands to determine if the value in the first operand is not less than
726///    the corresponding value in the second operand and returns the result of
727///    the comparison in the low-order bits of a vector of [4 x float].
728///
729/// \headerfile <x86intrin.h>
730///
731/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
732///   instructions.
733///
734/// \param __a
735///    A 128-bit vector of [4 x float] containing one of the operands. The lower
736///    32 bits of this operand are used in the comparison.
737/// \param __b
738///    A 128-bit vector of [4 x float] containing one of the operands. The lower
739///    32 bits of this operand are used in the comparison.
740/// \returns A 128-bit vector of [4 x float] containing the comparison results
741///    in the low-order bits.
742static __inline__ __m128 __DEFAULT_FN_ATTRS
743_mm_cmpnlt_ss(__m128 __a, __m128 __b)
744{
745  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
746}
747
748/// Compares each of the corresponding 32-bit float values of the
749///    128-bit vectors of [4 x float] to determine if the values in the first
750///    operand are not less than those in the second operand.
751///
752/// \headerfile <x86intrin.h>
753///
754/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
755///   instructions.
756///
757/// \param __a
758///    A 128-bit vector of [4 x float].
759/// \param __b
760///    A 128-bit vector of [4 x float].
761/// \returns A 128-bit vector of [4 x float] containing the comparison results.
762static __inline__ __m128 __DEFAULT_FN_ATTRS
763_mm_cmpnlt_ps(__m128 __a, __m128 __b)
764{
765  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
766}
767
768/// Compares two 32-bit float values in the low-order bits of both
769///    operands to determine if the value in the first operand is not less than
770///    or equal to the corresponding value in the second operand and returns
771///    the result of the comparison in the low-order bits of a vector of
772///    [4 x float].
773///
774/// \headerfile <x86intrin.h>
775///
776/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
777///   instructions.
778///
779/// \param __a
780///    A 128-bit vector of [4 x float] containing one of the operands. The lower
781///    32 bits of this operand are used in the comparison.
782/// \param __b
783///    A 128-bit vector of [4 x float] containing one of the operands. The lower
784///    32 bits of this operand are used in the comparison.
785/// \returns A 128-bit vector of [4 x float] containing the comparison results
786///    in the low-order bits.
787static __inline__ __m128 __DEFAULT_FN_ATTRS
788_mm_cmpnle_ss(__m128 __a, __m128 __b)
789{
790  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
791}
792
793/// Compares each of the corresponding 32-bit float values of the
794///    128-bit vectors of [4 x float] to determine if the values in the first
795///    operand are not less than or equal to those in the second operand.
796///
797/// \headerfile <x86intrin.h>
798///
799/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
800///   instructions.
801///
802/// \param __a
803///    A 128-bit vector of [4 x float].
804/// \param __b
805///    A 128-bit vector of [4 x float].
806/// \returns A 128-bit vector of [4 x float] containing the comparison results.
807static __inline__ __m128 __DEFAULT_FN_ATTRS
808_mm_cmpnle_ps(__m128 __a, __m128 __b)
809{
810  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
811}
812
813/// Compares two 32-bit float values in the low-order bits of both
814///    operands to determine if the value in the first operand is not greater
815///    than the corresponding value in the second operand and returns the
816///    result of the comparison in the low-order bits of a vector of
817///    [4 x float].
818///
819/// \headerfile <x86intrin.h>
820///
821/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
822///   instructions.
823///
824/// \param __a
825///    A 128-bit vector of [4 x float] containing one of the operands. The lower
826///    32 bits of this operand are used in the comparison.
827/// \param __b
828///    A 128-bit vector of [4 x float] containing one of the operands. The lower
829///    32 bits of this operand are used in the comparison.
830/// \returns A 128-bit vector of [4 x float] containing the comparison results
831///    in the low-order bits.
832static __inline__ __m128 __DEFAULT_FN_ATTRS
833_mm_cmpngt_ss(__m128 __a, __m128 __b)
834{
835  return (__m128)__builtin_shufflevector((__v4sf)__a,
836                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
837                                         4, 1, 2, 3);
838}
839
840/// Compares each of the corresponding 32-bit float values of the
841///    128-bit vectors of [4 x float] to determine if the values in the first
842///    operand are not greater than those in the second operand.
843///
844/// \headerfile <x86intrin.h>
845///
846/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
847///   instructions.
848///
849/// \param __a
850///    A 128-bit vector of [4 x float].
851/// \param __b
852///    A 128-bit vector of [4 x float].
853/// \returns A 128-bit vector of [4 x float] containing the comparison results.
854static __inline__ __m128 __DEFAULT_FN_ATTRS
855_mm_cmpngt_ps(__m128 __a, __m128 __b)
856{
857  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
858}
859
860/// Compares two 32-bit float values in the low-order bits of both
861///    operands to determine if the value in the first operand is not greater
862///    than or equal to the corresponding value in the second operand and
863///    returns the result of the comparison in the low-order bits of a vector
864///    of [4 x float].
865///
866/// \headerfile <x86intrin.h>
867///
868/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
869///   instructions.
870///
871/// \param __a
872///    A 128-bit vector of [4 x float] containing one of the operands. The lower
873///    32 bits of this operand are used in the comparison.
874/// \param __b
875///    A 128-bit vector of [4 x float] containing one of the operands. The lower
876///    32 bits of this operand are used in the comparison.
877/// \returns A 128-bit vector of [4 x float] containing the comparison results
878///    in the low-order bits.
879static __inline__ __m128 __DEFAULT_FN_ATTRS
880_mm_cmpnge_ss(__m128 __a, __m128 __b)
881{
882  return (__m128)__builtin_shufflevector((__v4sf)__a,
883                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
884                                         4, 1, 2, 3);
885}
886
887/// Compares each of the corresponding 32-bit float values of the
888///    128-bit vectors of [4 x float] to determine if the values in the first
889///    operand are not greater than or equal to those in the second operand.
890///
891/// \headerfile <x86intrin.h>
892///
893/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
894///   instructions.
895///
896/// \param __a
897///    A 128-bit vector of [4 x float].
898/// \param __b
899///    A 128-bit vector of [4 x float].
900/// \returns A 128-bit vector of [4 x float] containing the comparison results.
901static __inline__ __m128 __DEFAULT_FN_ATTRS
902_mm_cmpnge_ps(__m128 __a, __m128 __b)
903{
904  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
905}
906
907/// Compares two 32-bit float values in the low-order bits of both
908///    operands to determine if the value in the first operand is ordered with
909///    respect to the corresponding value in the second operand and returns the
910///    result of the comparison in the low-order bits of a vector of
911///    [4 x float].
912///
913/// \headerfile <x86intrin.h>
914///
915/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
916///   instructions.
917///
918/// \param __a
919///    A 128-bit vector of [4 x float] containing one of the operands. The lower
920///    32 bits of this operand are used in the comparison.
921/// \param __b
922///    A 128-bit vector of [4 x float] containing one of the operands. The lower
923///    32 bits of this operand are used in the comparison.
924/// \returns A 128-bit vector of [4 x float] containing the comparison results
925///    in the low-order bits.
926static __inline__ __m128 __DEFAULT_FN_ATTRS
927_mm_cmpord_ss(__m128 __a, __m128 __b)
928{
929  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
930}
931
932/// Compares each of the corresponding 32-bit float values of the
933///    128-bit vectors of [4 x float] to determine if the values in the first
934///    operand are ordered with respect to those in the second operand.
935///
936/// \headerfile <x86intrin.h>
937///
938/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
939///   instructions.
940///
941/// \param __a
942///    A 128-bit vector of [4 x float].
943/// \param __b
944///    A 128-bit vector of [4 x float].
945/// \returns A 128-bit vector of [4 x float] containing the comparison results.
946static __inline__ __m128 __DEFAULT_FN_ATTRS
947_mm_cmpord_ps(__m128 __a, __m128 __b)
948{
949  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
950}
951
952/// Compares two 32-bit float values in the low-order bits of both
953///    operands to determine if the value in the first operand is unordered
954///    with respect to the corresponding value in the second operand and
955///    returns the result of the comparison in the low-order bits of a vector
956///    of [4 x float].
957///
958/// \headerfile <x86intrin.h>
959///
960/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
961///   instructions.
962///
963/// \param __a
964///    A 128-bit vector of [4 x float] containing one of the operands. The lower
965///    32 bits of this operand are used in the comparison.
966/// \param __b
967///    A 128-bit vector of [4 x float] containing one of the operands. The lower
968///    32 bits of this operand are used in the comparison.
969/// \returns A 128-bit vector of [4 x float] containing the comparison results
970///    in the low-order bits.
971static __inline__ __m128 __DEFAULT_FN_ATTRS
972_mm_cmpunord_ss(__m128 __a, __m128 __b)
973{
974  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
975}
976
977/// Compares each of the corresponding 32-bit float values of the
978///    128-bit vectors of [4 x float] to determine if the values in the first
979///    operand are unordered with respect to those in the second operand.
980///
981/// \headerfile <x86intrin.h>
982///
983/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
984///   instructions.
985///
986/// \param __a
987///    A 128-bit vector of [4 x float].
988/// \param __b
989///    A 128-bit vector of [4 x float].
990/// \returns A 128-bit vector of [4 x float] containing the comparison results.
991static __inline__ __m128 __DEFAULT_FN_ATTRS
992_mm_cmpunord_ps(__m128 __a, __m128 __b)
993{
994  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
995}
996
997/// Compares two 32-bit float values in the low-order bits of both
998///    operands for equality and returns the result of the comparison.
999///
1000///    If either of the two lower 32-bit values is NaN, 0 is returned.
1001///
1002/// \headerfile <x86intrin.h>
1003///
1004/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1005///   instructions.
1006///
1007/// \param __a
1008///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009///    used in the comparison.
1010/// \param __b
1011///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1012///    used in the comparison.
1013/// \returns An integer containing the comparison results. If either of the
1014///    two lower 32-bit values is NaN, 0 is returned.
1015static __inline__ int __DEFAULT_FN_ATTRS
1016_mm_comieq_ss(__m128 __a, __m128 __b)
1017{
1018  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1019}
1020
1021/// Compares two 32-bit float values in the low-order bits of both
1022///    operands to determine if the first operand is less than the second
1023///    operand and returns the result of the comparison.
1024///
1025///    If either of the two lower 32-bit values is NaN, 0 is returned.
1026///
1027/// \headerfile <x86intrin.h>
1028///
1029/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1030///   instructions.
1031///
1032/// \param __a
1033///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1034///    used in the comparison.
1035/// \param __b
1036///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1037///    used in the comparison.
1038/// \returns An integer containing the comparison results. If either of the two
1039///     lower 32-bit values is NaN, 0 is returned.
1040static __inline__ int __DEFAULT_FN_ATTRS
1041_mm_comilt_ss(__m128 __a, __m128 __b)
1042{
1043  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1044}
1045
1046/// Compares two 32-bit float values in the low-order bits of both
1047///    operands to determine if the first operand is less than or equal to the
1048///    second operand and returns the result of the comparison.
1049///
1050///    If either of the two lower 32-bit values is NaN, 0 is returned.
1051///
1052/// \headerfile <x86intrin.h>
1053///
1054/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1055///
1056/// \param __a
1057///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1058///    used in the comparison.
1059/// \param __b
1060///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1061///    used in the comparison.
1062/// \returns An integer containing the comparison results. If either of the two
1063///     lower 32-bit values is NaN, 0 is returned.
1064static __inline__ int __DEFAULT_FN_ATTRS
1065_mm_comile_ss(__m128 __a, __m128 __b)
1066{
1067  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1068}
1069
1070/// Compares two 32-bit float values in the low-order bits of both
1071///    operands to determine if the first operand is greater than the second
1072///    operand and returns the result of the comparison.
1073///
1074///    If either of the two lower 32-bit values is NaN, 0 is returned.
1075///
1076/// \headerfile <x86intrin.h>
1077///
1078/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1079///
1080/// \param __a
1081///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1082///    used in the comparison.
1083/// \param __b
1084///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1085///    used in the comparison.
1086/// \returns An integer containing the comparison results. If either of the
1087///     two lower 32-bit values is NaN, 0 is returned.
1088static __inline__ int __DEFAULT_FN_ATTRS
1089_mm_comigt_ss(__m128 __a, __m128 __b)
1090{
1091  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1092}
1093
1094/// Compares two 32-bit float values in the low-order bits of both
1095///    operands to determine if the first operand is greater than or equal to
1096///    the second operand and returns the result of the comparison.
1097///
1098///    If either of the two lower 32-bit values is NaN, 0 is returned.
1099///
1100/// \headerfile <x86intrin.h>
1101///
1102/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1103///
1104/// \param __a
1105///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1106///    used in the comparison.
1107/// \param __b
1108///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1109///    used in the comparison.
1110/// \returns An integer containing the comparison results. If either of the two
1111///    lower 32-bit values is NaN, 0 is returned.
1112static __inline__ int __DEFAULT_FN_ATTRS
1113_mm_comige_ss(__m128 __a, __m128 __b)
1114{
1115  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1116}
1117
1118/// Compares two 32-bit float values in the low-order bits of both
1119///    operands to determine if the first operand is not equal to the second
1120///    operand and returns the result of the comparison.
1121///
1122///    If either of the two lower 32-bit values is NaN, 1 is returned.
1123///
1124/// \headerfile <x86intrin.h>
1125///
1126/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1127///
1128/// \param __a
1129///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1130///    used in the comparison.
1131/// \param __b
1132///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1133///    used in the comparison.
1134/// \returns An integer containing the comparison results. If either of the
1135///     two lower 32-bit values is NaN, 1 is returned.
1136static __inline__ int __DEFAULT_FN_ATTRS
1137_mm_comineq_ss(__m128 __a, __m128 __b)
1138{
1139  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1140}
1141
1142/// Performs an unordered comparison of two 32-bit float values using
1143///    the low-order bits of both operands to determine equality and returns
1144///    the result of the comparison.
1145///
1146///    If either of the two lower 32-bit values is NaN, 0 is returned.
1147///
1148/// \headerfile <x86intrin.h>
1149///
1150/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1151///
1152/// \param __a
1153///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1154///    used in the comparison.
1155/// \param __b
1156///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1157///    used in the comparison.
1158/// \returns An integer containing the comparison results. If either of the two
1159///     lower 32-bit values is NaN, 0 is returned.
1160static __inline__ int __DEFAULT_FN_ATTRS
1161_mm_ucomieq_ss(__m128 __a, __m128 __b)
1162{
1163  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1164}
1165
1166/// Performs an unordered comparison of two 32-bit float values using
1167///    the low-order bits of both operands to determine if the first operand is
1168///    less than the second operand and returns the result of the comparison.
1169///
1170///    If either of the two lower 32-bit values is NaN, 0 is returned.
1171///
1172/// \headerfile <x86intrin.h>
1173///
1174/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1175///
1176/// \param __a
1177///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178///    used in the comparison.
1179/// \param __b
1180///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1181///    used in the comparison.
1182/// \returns An integer containing the comparison results. If either of the two
1183///    lower 32-bit values is NaN, 0 is returned.
1184static __inline__ int __DEFAULT_FN_ATTRS
1185_mm_ucomilt_ss(__m128 __a, __m128 __b)
1186{
1187  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1188}
1189
1190/// Performs an unordered comparison of two 32-bit float values using
1191///    the low-order bits of both operands to determine if the first operand is
1192///    less than or equal to the second operand and returns the result of the
1193///    comparison.
1194///
1195///    If either of the two lower 32-bit values is NaN, 0 is returned.
1196///
1197/// \headerfile <x86intrin.h>
1198///
1199/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1200///
1201/// \param __a
1202///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1203///    used in the comparison.
1204/// \param __b
1205///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1206///    used in the comparison.
1207/// \returns An integer containing the comparison results. If either of the two
1208///     lower 32-bit values is NaN, 0 is returned.
1209static __inline__ int __DEFAULT_FN_ATTRS
1210_mm_ucomile_ss(__m128 __a, __m128 __b)
1211{
1212  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1213}
1214
1215/// Performs an unordered comparison of two 32-bit float values using
1216///    the low-order bits of both operands to determine if the first operand is
1217///    greater than the second operand and returns the result of the
1218///    comparison.
1219///
1220///    If either of the two lower 32-bit values is NaN, 0 is returned.
1221///
1222/// \headerfile <x86intrin.h>
1223///
1224/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1225///
1226/// \param __a
1227///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1228///    used in the comparison.
1229/// \param __b
1230///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1231///    used in the comparison.
1232/// \returns An integer containing the comparison results. If either of the two
1233///     lower 32-bit values is NaN, 0 is returned.
1234static __inline__ int __DEFAULT_FN_ATTRS
1235_mm_ucomigt_ss(__m128 __a, __m128 __b)
1236{
1237  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1238}
1239
1240/// Performs an unordered comparison of two 32-bit float values using
1241///    the low-order bits of both operands to determine if the first operand is
1242///    greater than or equal to the second operand and returns the result of
1243///    the comparison.
1244///
1245///    If either of the two lower 32-bit values is NaN, 0 is returned.
1246///
1247/// \headerfile <x86intrin.h>
1248///
1249/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1250///
1251/// \param __a
1252///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1253///    used in the comparison.
1254/// \param __b
1255///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1256///    used in the comparison.
1257/// \returns An integer containing the comparison results. If either of the two
1258///     lower 32-bit values is NaN, 0 is returned.
1259static __inline__ int __DEFAULT_FN_ATTRS
1260_mm_ucomige_ss(__m128 __a, __m128 __b)
1261{
1262  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1263}
1264
1265/// Performs an unordered comparison of two 32-bit float values using
1266///    the low-order bits of both operands to determine inequality and returns
1267///    the result of the comparison.
1268///
1269///    If either of the two lower 32-bit values is NaN, 1 is returned.
1270///
1271/// \headerfile <x86intrin.h>
1272///
1273/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1274///
1275/// \param __a
1276///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277///    used in the comparison.
1278/// \param __b
1279///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1280///    used in the comparison.
1281/// \returns An integer containing the comparison results. If either of the two
1282///    lower 32-bit values is NaN, 1 is returned.
1283static __inline__ int __DEFAULT_FN_ATTRS
1284_mm_ucomineq_ss(__m128 __a, __m128 __b)
1285{
1286  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1287}
1288
1289/// Converts a float value contained in the lower 32 bits of a vector of
1290///    [4 x float] into a 32-bit integer.
1291///
1292/// \headerfile <x86intrin.h>
1293///
1294/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1295///   instructions.
1296///
1297/// \param __a
1298///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1299///    used in the conversion.
1300/// \returns A 32-bit integer containing the converted value.
1301static __inline__ int __DEFAULT_FN_ATTRS
1302_mm_cvtss_si32(__m128 __a)
1303{
1304  return __builtin_ia32_cvtss2si((__v4sf)__a);
1305}
1306
1307/// Converts a float value contained in the lower 32 bits of a vector of
1308///    [4 x float] into a 32-bit integer.
1309///
1310/// \headerfile <x86intrin.h>
1311///
1312/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1313///   instructions.
1314///
1315/// \param __a
1316///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1317///    used in the conversion.
1318/// \returns A 32-bit integer containing the converted value.
1319static __inline__ int __DEFAULT_FN_ATTRS
1320_mm_cvt_ss2si(__m128 __a)
1321{
1322  return _mm_cvtss_si32(__a);
1323}
1324
1325#ifdef __x86_64__
1326
1327/// Converts a float value contained in the lower 32 bits of a vector of
1328///    [4 x float] into a 64-bit integer.
1329///
1330/// \headerfile <x86intrin.h>
1331///
1332/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1333///   instructions.
1334///
1335/// \param __a
1336///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1337///    used in the conversion.
1338/// \returns A 64-bit integer containing the converted value.
1339static __inline__ long long __DEFAULT_FN_ATTRS
1340_mm_cvtss_si64(__m128 __a)
1341{
1342  return __builtin_ia32_cvtss2si64((__v4sf)__a);
1343}
1344
1345#endif
1346
1347/// Converts two low-order float values in a 128-bit vector of
1348///    [4 x float] into a 64-bit vector of [2 x i32].
1349///
1350/// \headerfile <x86intrin.h>
1351///
1352/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1353///
1354/// \param __a
1355///    A 128-bit vector of [4 x float].
1356/// \returns A 64-bit integer vector containing the converted values.
1357static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1358_mm_cvtps_pi32(__m128 __a)
1359{
1360  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1361}
1362
1363/// Converts two low-order float values in a 128-bit vector of
1364///    [4 x float] into a 64-bit vector of [2 x i32].
1365///
1366/// \headerfile <x86intrin.h>
1367///
1368/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1369///
1370/// \param __a
1371///    A 128-bit vector of [4 x float].
1372/// \returns A 64-bit integer vector containing the converted values.
1373static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1374_mm_cvt_ps2pi(__m128 __a)
1375{
1376  return _mm_cvtps_pi32(__a);
1377}
1378
1379/// Converts a float value contained in the lower 32 bits of a vector of
1380///    [4 x float] into a 32-bit integer, truncating the result when it is
1381///    inexact.
1382///
1383/// \headerfile <x86intrin.h>
1384///
1385/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1386///   instructions.
1387///
1388/// \param __a
1389///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1390///    used in the conversion.
1391/// \returns A 32-bit integer containing the converted value.
1392static __inline__ int __DEFAULT_FN_ATTRS
1393_mm_cvttss_si32(__m128 __a)
1394{
1395  return __builtin_ia32_cvttss2si((__v4sf)__a);
1396}
1397
1398/// Converts a float value contained in the lower 32 bits of a vector of
1399///    [4 x float] into a 32-bit integer, truncating the result when it is
1400///    inexact.
1401///
1402/// \headerfile <x86intrin.h>
1403///
1404/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1405///   instructions.
1406///
1407/// \param __a
1408///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1409///    used in the conversion.
1410/// \returns A 32-bit integer containing the converted value.
1411static __inline__ int __DEFAULT_FN_ATTRS
1412_mm_cvtt_ss2si(__m128 __a)
1413{
1414  return _mm_cvttss_si32(__a);
1415}
1416
1417#ifdef __x86_64__
1418/// Converts a float value contained in the lower 32 bits of a vector of
1419///    [4 x float] into a 64-bit integer, truncating the result when it is
1420///    inexact.
1421///
1422/// \headerfile <x86intrin.h>
1423///
1424/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1425///   instructions.
1426///
1427/// \param __a
1428///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1429///    used in the conversion.
1430/// \returns A 64-bit integer containing the converted value.
1431static __inline__ long long __DEFAULT_FN_ATTRS
1432_mm_cvttss_si64(__m128 __a)
1433{
1434  return __builtin_ia32_cvttss2si64((__v4sf)__a);
1435}
1436#endif
1437
1438/// Converts two low-order float values in a 128-bit vector of
1439///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1440///    when it is inexact.
1441///
1442/// \headerfile <x86intrin.h>
1443///
1444/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1445///   instructions.
1446///
1447/// \param __a
1448///    A 128-bit vector of [4 x float].
1449/// \returns A 64-bit integer vector containing the converted values.
1450static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1451_mm_cvttps_pi32(__m128 __a)
1452{
1453  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1454}
1455
1456/// Converts two low-order float values in a 128-bit vector of [4 x
1457///    float] into a 64-bit vector of [2 x i32], truncating the result when it
1458///    is inexact.
1459///
1460/// \headerfile <x86intrin.h>
1461///
1462/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1463///
1464/// \param __a
1465///    A 128-bit vector of [4 x float].
1466/// \returns A 64-bit integer vector containing the converted values.
1467static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1468_mm_cvtt_ps2pi(__m128 __a)
1469{
1470  return _mm_cvttps_pi32(__a);
1471}
1472
1473/// Converts a 32-bit signed integer value into a floating point value
1474///    and writes it to the lower 32 bits of the destination. The remaining
1475///    higher order elements of the destination vector are copied from the
1476///    corresponding elements in the first operand.
1477///
1478/// \headerfile <x86intrin.h>
1479///
1480/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1481///
1482/// \param __a
1483///    A 128-bit vector of [4 x float].
1484/// \param __b
1485///    A 32-bit signed integer operand containing the value to be converted.
1486/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487///    converted value of the second operand. The upper 96 bits are copied from
1488///    the upper 96 bits of the first operand.
1489static __inline__ __m128 __DEFAULT_FN_ATTRS
1490_mm_cvtsi32_ss(__m128 __a, int __b)
1491{
1492  __a[0] = __b;
1493  return __a;
1494}
1495
1496/// Converts a 32-bit signed integer value into a floating point value
1497///    and writes it to the lower 32 bits of the destination. The remaining
1498///    higher order elements of the destination are copied from the
1499///    corresponding elements in the first operand.
1500///
1501/// \headerfile <x86intrin.h>
1502///
1503/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1504///
1505/// \param __a
1506///    A 128-bit vector of [4 x float].
1507/// \param __b
1508///    A 32-bit signed integer operand containing the value to be converted.
1509/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1510///    converted value of the second operand. The upper 96 bits are copied from
1511///    the upper 96 bits of the first operand.
1512static __inline__ __m128 __DEFAULT_FN_ATTRS
1513_mm_cvt_si2ss(__m128 __a, int __b)
1514{
1515  return _mm_cvtsi32_ss(__a, __b);
1516}
1517
1518#ifdef __x86_64__
1519
1520/// Converts a 64-bit signed integer value into a floating point value
1521///    and writes it to the lower 32 bits of the destination. The remaining
1522///    higher order elements of the destination are copied from the
1523///    corresponding elements in the first operand.
1524///
1525/// \headerfile <x86intrin.h>
1526///
1527/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1528///
1529/// \param __a
1530///    A 128-bit vector of [4 x float].
1531/// \param __b
1532///    A 64-bit signed integer operand containing the value to be converted.
1533/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1534///    converted value of the second operand. The upper 96 bits are copied from
1535///    the upper 96 bits of the first operand.
1536static __inline__ __m128 __DEFAULT_FN_ATTRS
1537_mm_cvtsi64_ss(__m128 __a, long long __b)
1538{
1539  __a[0] = __b;
1540  return __a;
1541}
1542
1543#endif
1544
1545/// Converts two elements of a 64-bit vector of [2 x i32] into two
1546///    floating point values and writes them to the lower 64-bits of the
1547///    destination. The remaining higher order elements of the destination are
1548///    copied from the corresponding elements in the first operand.
1549///
1550/// \headerfile <x86intrin.h>
1551///
1552/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1553///
1554/// \param __a
1555///    A 128-bit vector of [4 x float].
1556/// \param __b
1557///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1558///    and written to the corresponding low-order elements in the destination.
1559/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1560///    converted value of the second operand. The upper 64 bits are copied from
1561///    the upper 64 bits of the first operand.
1562static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1563_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1564{
1565  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1566}
1567
1568/// Converts two elements of a 64-bit vector of [2 x i32] into two
1569///    floating point values and writes them to the lower 64-bits of the
1570///    destination. The remaining higher order elements of the destination are
1571///    copied from the corresponding elements in the first operand.
1572///
1573/// \headerfile <x86intrin.h>
1574///
1575/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1576///
1577/// \param __a
1578///    A 128-bit vector of [4 x float].
1579/// \param __b
1580///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1581///    and written to the corresponding low-order elements in the destination.
1582/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1583///    converted value from the second operand. The upper 64 bits are copied
1584///    from the upper 64 bits of the first operand.
1585static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1586_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1587{
1588  return _mm_cvtpi32_ps(__a, __b);
1589}
1590
1591/// Extracts a float value contained in the lower 32 bits of a vector of
1592///    [4 x float].
1593///
1594/// \headerfile <x86intrin.h>
1595///
1596/// This intrinsic has no corresponding instruction.
1597///
1598/// \param __a
1599///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1600///    used in the extraction.
1601/// \returns A 32-bit float containing the extracted value.
1602static __inline__ float __DEFAULT_FN_ATTRS
1603_mm_cvtss_f32(__m128 __a)
1604{
1605  return __a[0];
1606}
1607
1608/// Loads two packed float values from the address \a __p into the
1609///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1610///     are copied from the low-order bits of the first operand.
1611///
1612/// \headerfile <x86intrin.h>
1613///
1614/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1615///
1616/// \param __a
1617///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1618///    of the destination.
1619/// \param __p
1620///    A pointer to two packed float values. Bits [63:0] are written to bits
1621///    [127:64] of the destination.
1622/// \returns A 128-bit vector of [4 x float] containing the moved values.
1623static __inline__ __m128 __DEFAULT_FN_ATTRS
1624_mm_loadh_pi(__m128 __a, const __m64 *__p)
1625{
1626  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1627  struct __mm_loadh_pi_struct {
1628    __mm_loadh_pi_v2f32 __u;
1629  } __attribute__((__packed__, __may_alias__));
1630  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1631  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1632  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1633}
1634
1635/// Loads two packed float values from the address \a __p into the
1636///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1637///    are copied from the high-order bits of the first operand.
1638///
1639/// \headerfile <x86intrin.h>
1640///
1641/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1642///
1643/// \param __a
1644///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1645///    [127:64] of the destination.
1646/// \param __p
1647///    A pointer to two packed float values. Bits [63:0] are written to bits
1648///    [63:0] of the destination.
1649/// \returns A 128-bit vector of [4 x float] containing the moved values.
1650static __inline__ __m128 __DEFAULT_FN_ATTRS
1651_mm_loadl_pi(__m128 __a, const __m64 *__p)
1652{
1653  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1654  struct __mm_loadl_pi_struct {
1655    __mm_loadl_pi_v2f32 __u;
1656  } __attribute__((__packed__, __may_alias__));
1657  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1658  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1659  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1660}
1661
1662/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1663///    32 bits of the vector are initialized with the single-precision
1664///    floating-point value loaded from a specified memory location. The upper
1665///    96 bits are set to zero.
1666///
1667/// \headerfile <x86intrin.h>
1668///
1669/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1670///
1671/// \param __p
1672///    A pointer to a 32-bit memory location containing a single-precision
1673///    floating-point value.
1674/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1675///    lower 32 bits contain the value loaded from the memory location. The
1676///    upper 96 bits are set to zero.
1677static __inline__ __m128 __DEFAULT_FN_ATTRS
1678_mm_load_ss(const float *__p)
1679{
1680  struct __mm_load_ss_struct {
1681    float __u;
1682  } __attribute__((__packed__, __may_alias__));
1683  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1684  return __extension__ (__m128){ __u, 0, 0, 0 };
1685}
1686
1687/// Loads a 32-bit float value and duplicates it to all four vector
1688///    elements of a 128-bit vector of [4 x float].
1689///
1690/// \headerfile <x86intrin.h>
1691///
1692/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1693///    instruction.
1694///
1695/// \param __p
1696///    A pointer to a float value to be loaded and duplicated.
1697/// \returns A 128-bit vector of [4 x float] containing the loaded and
1698///    duplicated values.
1699static __inline__ __m128 __DEFAULT_FN_ATTRS
1700_mm_load1_ps(const float *__p)
1701{
1702  struct __mm_load1_ps_struct {
1703    float __u;
1704  } __attribute__((__packed__, __may_alias__));
1705  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1706  return __extension__ (__m128){ __u, __u, __u, __u };
1707}
1708
/// Alternate spelling of _mm_load1_ps(): expands to a call of
///    _mm_load1_ps() with the same argument.
#define _mm_load_ps1(p) _mm_load1_ps(p)
1710
1711/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1712///    memory location.
1713///
1714/// \headerfile <x86intrin.h>
1715///
1716/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1717///
1718/// \param __p
1719///    A pointer to a 128-bit memory location. The address of the memory
1720///    location has to be 128-bit aligned.
1721/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1722static __inline__ __m128 __DEFAULT_FN_ATTRS
1723_mm_load_ps(const float *__p)
1724{
1725  return *(const __m128*)__p;
1726}
1727
1728/// Loads a 128-bit floating-point vector of [4 x float] from an
1729///    unaligned memory location.
1730///
1731/// \headerfile <x86intrin.h>
1732///
1733/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1734///
1735/// \param __p
1736///    A pointer to a 128-bit memory location. The address of the memory
1737///    location does not have to be aligned.
1738/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1739static __inline__ __m128 __DEFAULT_FN_ATTRS
1740_mm_loadu_ps(const float *__p)
1741{
1742  struct __loadu_ps {
1743    __m128_u __v;
1744  } __attribute__((__packed__, __may_alias__));
1745  return ((const struct __loadu_ps*)__p)->__v;
1746}
1747
1748/// Loads four packed float values, in reverse order, from an aligned
1749///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1750///
1751/// \headerfile <x86intrin.h>
1752///
1753/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1754///    instruction.
1755///
1756/// \param __p
1757///    A pointer to a 128-bit memory location. The address of the memory
1758///    location has to be 128-bit aligned.
1759/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1760///    in reverse order.
1761static __inline__ __m128 __DEFAULT_FN_ATTRS
1762_mm_loadr_ps(const float *__p)
1763{
1764  __m128 __a = _mm_load_ps(__p);
1765  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1766}
1767
1768/// Create a 128-bit vector of [4 x float] with undefined values.
1769///
1770/// \headerfile <x86intrin.h>
1771///
1772/// This intrinsic has no corresponding instruction.
1773///
1774/// \returns A 128-bit vector of [4 x float] containing undefined values.
1775static __inline__ __m128 __DEFAULT_FN_ATTRS
1776_mm_undefined_ps(void)
1777{
1778  return (__m128)__builtin_ia32_undef128();
1779}
1780
1781/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1782///    32 bits of the vector are initialized with the specified single-precision
1783///    floating-point value. The upper 96 bits are set to zero.
1784///
1785/// \headerfile <x86intrin.h>
1786///
1787/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1788///
1789/// \param __w
1790///    A single-precision floating-point value used to initialize the lower 32
1791///    bits of the result.
1792/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1793///    lower 32 bits contain the value provided in the source operand. The
1794///    upper 96 bits are set to zero.
1795static __inline__ __m128 __DEFAULT_FN_ATTRS
1796_mm_set_ss(float __w)
1797{
1798  return __extension__ (__m128){ __w, 0, 0, 0 };
1799}
1800
1801/// Constructs a 128-bit floating-point vector of [4 x float], with each
1802///    of the four single-precision floating-point vector elements set to the
1803///    specified single-precision floating-point value.
1804///
1805/// \headerfile <x86intrin.h>
1806///
1807/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1808///
1809/// \param __w
1810///    A single-precision floating-point value used to initialize each vector
1811///    element of the result.
1812/// \returns An initialized 128-bit floating-point vector of [4 x float].
1813static __inline__ __m128 __DEFAULT_FN_ATTRS
1814_mm_set1_ps(float __w)
1815{
1816  return __extension__ (__m128){ __w, __w, __w, __w };
1817}
1818
1819/* Microsoft specific. */
1820/// Constructs a 128-bit floating-point vector of [4 x float], with each
1821///    of the four single-precision floating-point vector elements set to the
1822///    specified single-precision floating-point value.
1823///
1824/// \headerfile <x86intrin.h>
1825///
1826/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1827///
1828/// \param __w
1829///    A single-precision floating-point value used to initialize each vector
1830///    element of the result.
1831/// \returns An initialized 128-bit floating-point vector of [4 x float].
1832static __inline__ __m128 __DEFAULT_FN_ATTRS
1833_mm_set_ps1(float __w)
1834{
1835    return _mm_set1_ps(__w);
1836}
1837
1838/// Constructs a 128-bit floating-point vector of [4 x float]
1839///    initialized with the specified single-precision floating-point values.
1840///
1841/// \headerfile <x86intrin.h>
1842///
1843/// This intrinsic is a utility function and does not correspond to a specific
1844///    instruction.
1845///
1846/// \param __z
1847///    A single-precision floating-point value used to initialize bits [127:96]
1848///    of the result.
1849/// \param __y
1850///    A single-precision floating-point value used to initialize bits [95:64]
1851///    of the result.
1852/// \param __x
1853///    A single-precision floating-point value used to initialize bits [63:32]
1854///    of the result.
1855/// \param __w
1856///    A single-precision floating-point value used to initialize bits [31:0]
1857///    of the result.
1858/// \returns An initialized 128-bit floating-point vector of [4 x float].
1859static __inline__ __m128 __DEFAULT_FN_ATTRS
1860_mm_set_ps(float __z, float __y, float __x, float __w)
1861{
1862  return __extension__ (__m128){ __w, __x, __y, __z };
1863}
1864
1865/// Constructs a 128-bit floating-point vector of [4 x float],
1866///    initialized in reverse order with the specified 32-bit single-precision
1867///    float-point values.
1868///
1869/// \headerfile <x86intrin.h>
1870///
1871/// This intrinsic is a utility function and does not correspond to a specific
1872///    instruction.
1873///
1874/// \param __z
1875///    A single-precision floating-point value used to initialize bits [31:0]
1876///    of the result.
1877/// \param __y
1878///    A single-precision floating-point value used to initialize bits [63:32]
1879///    of the result.
1880/// \param __x
1881///    A single-precision floating-point value used to initialize bits [95:64]
1882///    of the result.
1883/// \param __w
1884///    A single-precision floating-point value used to initialize bits [127:96]
1885///    of the result.
1886/// \returns An initialized 128-bit floating-point vector of [4 x float].
1887static __inline__ __m128 __DEFAULT_FN_ATTRS
1888_mm_setr_ps(float __z, float __y, float __x, float __w)
1889{
1890  return __extension__ (__m128){ __z, __y, __x, __w };
1891}
1892
1893/// Constructs a 128-bit floating-point vector of [4 x float] initialized
1894///    to zero.
1895///
1896/// \headerfile <x86intrin.h>
1897///
1898/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1899///
1900/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1901///    all elements set to zero.
1902static __inline__ __m128 __DEFAULT_FN_ATTRS
1903_mm_setzero_ps(void)
1904{
1905  return __extension__ (__m128){ 0, 0, 0, 0 };
1906}
1907
1908/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1909///    memory location.
1910///
1911/// \headerfile <x86intrin.h>
1912///
1913/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1914///
1915/// \param __p
1916///    A pointer to a 64-bit memory location.
1917/// \param __a
1918///    A 128-bit vector of [4 x float] containing the values to be stored.
1919static __inline__ void __DEFAULT_FN_ATTRS
1920_mm_storeh_pi(__m64 *__p, __m128 __a)
1921{
1922  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1923  struct __mm_storeh_pi_struct {
1924    __mm_storeh_pi_v2f32 __u;
1925  } __attribute__((__packed__, __may_alias__));
1926  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1927}
1928
1929/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1930///     memory location.
1931///
1932/// \headerfile <x86intrin.h>
1933///
1934/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1935///
1936/// \param __p
1937///    A pointer to a memory location that will receive the float values.
1938/// \param __a
1939///    A 128-bit vector of [4 x float] containing the values to be stored.
1940static __inline__ void __DEFAULT_FN_ATTRS
1941_mm_storel_pi(__m64 *__p, __m128 __a)
1942{
1943  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1944  struct __mm_storeh_pi_struct {
1945    __mm_storeh_pi_v2f32 __u;
1946  } __attribute__((__packed__, __may_alias__));
1947  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1948}
1949
1950/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1951///     memory location.
1952///
1953/// \headerfile <x86intrin.h>
1954///
1955/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1956///
1957/// \param __p
1958///    A pointer to a 32-bit memory location.
1959/// \param __a
1960///    A 128-bit vector of [4 x float] containing the value to be stored.
1961static __inline__ void __DEFAULT_FN_ATTRS
1962_mm_store_ss(float *__p, __m128 __a)
1963{
1964  struct __mm_store_ss_struct {
1965    float __u;
1966  } __attribute__((__packed__, __may_alias__));
1967  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1968}
1969
1970/// Stores a 128-bit vector of [4 x float] to an unaligned memory
1971///    location.
1972///
1973/// \headerfile <x86intrin.h>
1974///
1975/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1976///
1977/// \param __p
1978///    A pointer to a 128-bit memory location. The address of the memory
1979///    location does not have to be aligned.
1980/// \param __a
1981///    A 128-bit vector of [4 x float] containing the values to be stored.
1982static __inline__ void __DEFAULT_FN_ATTRS
1983_mm_storeu_ps(float *__p, __m128 __a)
1984{
1985  struct __storeu_ps {
1986    __m128_u __v;
1987  } __attribute__((__packed__, __may_alias__));
1988  ((struct __storeu_ps*)__p)->__v = __a;
1989}
1990
1991/// Stores a 128-bit vector of [4 x float] into an aligned memory
1992///    location.
1993///
1994/// \headerfile <x86intrin.h>
1995///
1996/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1997///
1998/// \param __p
1999///    A pointer to a 128-bit memory location. The address of the memory
2000///    location has to be 16-byte aligned.
2001/// \param __a
2002///    A 128-bit vector of [4 x float] containing the values to be stored.
2003static __inline__ void __DEFAULT_FN_ATTRS
2004_mm_store_ps(float *__p, __m128 __a)
2005{
2006  *(__m128*)__p = __a;
2007}
2008
2009/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2010///    four contiguous elements in an aligned memory location.
2011///
2012/// \headerfile <x86intrin.h>
2013///
2014/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2015///    instruction.
2016///
2017/// \param __p
2018///    A pointer to a 128-bit memory location.
2019/// \param __a
2020///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2021///    of the four contiguous elements pointed by \a __p.
2022static __inline__ void __DEFAULT_FN_ATTRS
2023_mm_store1_ps(float *__p, __m128 __a)
2024{
2025  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2026  _mm_store_ps(__p, __a);
2027}
2028
2029/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2030///    four contiguous elements in an aligned memory location.
2031///
2032/// \headerfile <x86intrin.h>
2033///
2034/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2035///    instruction.
2036///
2037/// \param __p
2038///    A pointer to a 128-bit memory location.
2039/// \param __a
2040///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2041///    of the four contiguous elements pointed by \a __p.
2042static __inline__ void __DEFAULT_FN_ATTRS
2043_mm_store_ps1(float *__p, __m128 __a)
2044{
2045  _mm_store1_ps(__p, __a);
2046}
2047
2048/// Stores float values from a 128-bit vector of [4 x float] to an
2049///    aligned memory location in reverse order.
2050///
2051/// \headerfile <x86intrin.h>
2052///
2053/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2054///    instruction.
2055///
2056/// \param __p
2057///    A pointer to a 128-bit memory location. The address of the memory
2058///    location has to be 128-bit aligned.
2059/// \param __a
2060///    A 128-bit vector of [4 x float] containing the values to be stored.
2061static __inline__ void __DEFAULT_FN_ATTRS
2062_mm_storer_ps(float *__p, __m128 __a)
2063{
2064  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2065  _mm_store_ps(__p, __a);
2066}
2067
/* Hint selectors for _mm_prefetch(). That macro passes bit 2 of the selector
   as the second argument of __builtin_prefetch and bits [1:0] as the third,
   so the two ET values differ from T0/T1 only in having bit 2 set. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2074
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation. Bit 2 of the constant is forwarded as the second argument of
///    __builtin_prefetch and bits [1:0] as the third. \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), (((sel) >> 2) & 1), ((sel) & 0x3)))
#endif
2106
2107/// Stores a 64-bit integer in the specified aligned memory location. To
2108///    minimize caching, the data is flagged as non-temporal (unlikely to be
2109///    used again soon).
2110///
2111/// \headerfile <x86intrin.h>
2112///
2113/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2114///
2115/// \param __p
2116///    A pointer to an aligned memory location used to store the register value.
2117/// \param __a
2118///    A 64-bit integer containing the value to be stored.
2119static __inline__ void __DEFAULT_FN_ATTRS_MMX
2120_mm_stream_pi(__m64 *__p, __m64 __a)
2121{
2122  __builtin_ia32_movntq(__p, __a);
2123}
2124
2125/// Moves packed float values from a 128-bit vector of [4 x float] to a
2126///    128-bit aligned memory location. To minimize caching, the data is flagged
2127///    as non-temporal (unlikely to be used again soon).
2128///
2129/// \headerfile <x86intrin.h>
2130///
2131/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2132///
2133/// \param __p
2134///    A pointer to a 128-bit aligned memory location that will receive the
2135///    single-precision floating-point values.
2136/// \param __a
2137///    A 128-bit vector of [4 x float] containing the values to be moved.
2138static __inline__ void __DEFAULT_FN_ATTRS
2139_mm_stream_ps(float *__p, __m128 __a)
2140{
2141  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2142}
2143
#ifdef __cplusplus
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and those that follow it,
///    ensuring the system completes all previous stores before executing
///    subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2162
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns The extracted 16 bits of packed data, zero-extended to a 32-bit
///    integer.
/* Both arguments and the whole expansion are parenthesized so the macro
   composes safely with arbitrary argument expressions and surrounding
   operators. The (unsigned short) cast makes the 16-bit element zero-extend,
   as PEXTRW does, instead of sign-extending from short. */
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2185
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d: \n
///    0: Bits [15:0] are written. \n
///    1: Bits [31:16] are written. \n
///    2: Bits [47:32] are written. \n
///    3: Bits [63:48] are written. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* All arguments and the whole expansion are parenthesized so the macro
   composes safely with arbitrary argument expressions and surrounding
   operators. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2216
2217/// Compares each of the corresponding packed 16-bit integer values of
2218///    the 64-bit integer vectors, and writes the greater value to the
2219///    corresponding bits in the destination.
2220///
2221/// \headerfile <x86intrin.h>
2222///
2223/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2224///
2225/// \param __a
2226///    A 64-bit integer vector containing one of the source operands.
2227/// \param __b
2228///    A 64-bit integer vector containing one of the source operands.
2229/// \returns A 64-bit integer vector containing the comparison results.
2230static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2231_mm_max_pi16(__m64 __a, __m64 __b)
2232{
2233  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2234}
2235
2236/// Compares each of the corresponding packed 8-bit unsigned integer
2237///    values of the 64-bit integer vectors, and writes the greater value to the
2238///    corresponding bits in the destination.
2239///
2240/// \headerfile <x86intrin.h>
2241///
2242/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2243///
2244/// \param __a
2245///    A 64-bit integer vector containing one of the source operands.
2246/// \param __b
2247///    A 64-bit integer vector containing one of the source operands.
2248/// \returns A 64-bit integer vector containing the comparison results.
2249static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2250_mm_max_pu8(__m64 __a, __m64 __b)
2251{
2252  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2253}
2254
2255/// Compares each of the corresponding packed 16-bit integer values of
2256///    the 64-bit integer vectors, and writes the lesser value to the
2257///    corresponding bits in the destination.
2258///
2259/// \headerfile <x86intrin.h>
2260///
2261/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2262///
2263/// \param __a
2264///    A 64-bit integer vector containing one of the source operands.
2265/// \param __b
2266///    A 64-bit integer vector containing one of the source operands.
2267/// \returns A 64-bit integer vector containing the comparison results.
2268static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2269_mm_min_pi16(__m64 __a, __m64 __b)
2270{
2271  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2272}
2273
2274/// Compares each of the corresponding packed 8-bit unsigned integer
2275///    values of the 64-bit integer vectors, and writes the lesser value to the
2276///    corresponding bits in the destination.
2277///
2278/// \headerfile <x86intrin.h>
2279///
2280/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2281///
2282/// \param __a
2283///    A 64-bit integer vector containing one of the source operands.
2284/// \param __b
2285///    A 64-bit integer vector containing one of the source operands.
2286/// \returns A 64-bit integer vector containing the comparison results.
2287static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2288_mm_min_pu8(__m64 __a, __m64 __b)
2289{
2290  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2291}
2292
2293/// Takes the most significant bit from each 8-bit element in a 64-bit
2294///    integer vector to create an 8-bit mask value. Zero-extends the value to
2295///    32-bit integer and writes it to the destination.
2296///
2297/// \headerfile <x86intrin.h>
2298///
2299/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2300///
2301/// \param __a
2302///    A 64-bit integer vector containing the values with bits to be extracted.
2303/// \returns The most significant bit from each 8-bit element in \a __a,
2304///    written to bits [7:0].
2305static __inline__ int __DEFAULT_FN_ATTRS_MMX
2306_mm_movemask_pi8(__m64 __a)
2307{
2308  return __builtin_ia32_pmovmskb((__v8qi)__a);
2309}
2310
2311/// Multiplies packed 16-bit unsigned integer values and writes the
2312///    high-order 16 bits of each 32-bit product to the corresponding bits in
2313///    the destination.
2314///
2315/// \headerfile <x86intrin.h>
2316///
2317/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2318///
2319/// \param __a
2320///    A 64-bit integer vector containing one of the source operands.
2321/// \param __b
2322///    A 64-bit integer vector containing one of the source operands.
2323/// \returns A 64-bit integer vector containing the products of both operands.
2324static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2325_mm_mulhi_pu16(__m64 __a, __m64 __b)
2326{
2327  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2328}
2329
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so the leading cast cannot bind to
   an operator that follows the macro invocation. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2363
2364/// Conditionally copies the values from each 8-bit element in the first
2365///    64-bit integer vector operand to the specified memory location, as
2366///    specified by the most significant bit in the corresponding element in the
2367///    second 64-bit integer vector operand.
2368///
2369///    To minimize caching, the data is flagged as non-temporal
2370///    (unlikely to be used again soon).
2371///
2372/// \headerfile <x86intrin.h>
2373///
2374/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2375///
2376/// \param __d
2377///    A 64-bit integer vector containing the values with elements to be copied.
2378/// \param __n
2379///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2380///    element determines whether the corresponding element in operand \a __d
2381///    is copied. If the most significant bit of a given element is 1, the
2382///    corresponding element in operand \a __d is copied.
2383/// \param __p
2384///    A pointer to a 64-bit memory location that will receive the conditionally
2385///    copied integer values. The address of the memory location does not have
2386///    to be aligned.
2387static __inline__ void __DEFAULT_FN_ATTRS_MMX
2388_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2389{
2390  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2391}
2392
2393/// Computes the rounded averages of the packed unsigned 8-bit integer
2394///    values and writes the averages to the corresponding bits in the
2395///    destination.
2396///
2397/// \headerfile <x86intrin.h>
2398///
2399/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2400///
2401/// \param __a
2402///    A 64-bit integer vector containing one of the source operands.
2403/// \param __b
2404///    A 64-bit integer vector containing one of the source operands.
2405/// \returns A 64-bit integer vector containing the averages of both operands.
2406static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2407_mm_avg_pu8(__m64 __a, __m64 __b)
2408{
2409  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2410}
2411
2412/// Computes the rounded averages of the packed unsigned 16-bit integer
2413///    values and writes the averages to the corresponding bits in the
2414///    destination.
2415///
2416/// \headerfile <x86intrin.h>
2417///
2418/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2419///
2420/// \param __a
2421///    A 64-bit integer vector containing one of the source operands.
2422/// \param __b
2423///    A 64-bit integer vector containing one of the source operands.
2424/// \returns A 64-bit integer vector containing the averages of both operands.
2425static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2426_mm_avg_pu16(__m64 __a, __m64 __b)
2427{
2428  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2429}
2430
2431/// Subtracts the corresponding 8-bit unsigned integer values of the two
2432///    64-bit vector operands and computes the absolute value for each of the
2433///    difference. Then sum of the 8 absolute differences is written to the
2434///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2435///
2436/// \headerfile <x86intrin.h>
2437///
2438/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2439///
2440/// \param __a
2441///    A 64-bit integer vector containing one of the source operands.
2442/// \param __b
2443///    A 64-bit integer vector containing one of the source operands.
2444/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2445///    sets of absolute differences between both operands. The upper bits are
2446///    cleared.
2447static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2448_mm_sad_pu8(__m64 __a, __m64 __b)
2449{
2450  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2451}
2452
2453#if defined(__cplusplus)
2454extern "C" {
2455#endif
2456
/// Reads the MXCSR register and returns its contents as a 32-bit unsigned
///    integer value.
///
///    Several groups of macros are meant to be used with this intrinsic:
///    <ul>
///    <li>
///      Exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. Convenience wrapper: _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      Exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO,
///      _MM_MASK_INEXACT. Convenience wrapper: _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      Rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
///      _MM_ROUND_TOWARD_ZERO. Convenience wrapper: _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      Flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      Convenience wrapper: _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      Denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. Convenience wrapper:
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, this expression tests whether an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    This expression yields the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
extern unsigned int _mm_getcsr(void);
2508
/// Writes a 32-bit unsigned integer value to the MXCSR register.
///
///    Several groups of macros are meant to be used with this intrinsic:
///    <ul>
///    <li>
///      Exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. Convenience wrapper:
///      _MM_SET_EXCEPTION_STATE(x), where x is one of these macros.
///    </li>
///    <li>
///      Exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO,
///      _MM_MASK_INEXACT. Convenience wrapper: _MM_SET_EXCEPTION_MASK(x),
///      where x is one of these macros.
///    </li>
///    <li>
///      Rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
///      _MM_ROUND_TOWARD_ZERO. Convenience wrapper:
///      _MM_SET_ROUNDING_MODE(x), where x is one of these macros.
///    </li>
///    <li>
///      Flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      Convenience wrapper: _MM_SET_FLUSH_ZERO_MODE(x), where x is one of
///      these macros.
///    </li>
///    <li>
///      Denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. Convenience wrapper:
///      _MM_SET_DENORMALS_ZERO_MODE(x), where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up, provided the rounding mode currently in effect
///    is _MM_ROUND_NEAREST (the value is OR-ed into the existing bits; use
///    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP) to replace any other mode):
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
extern void _mm_setcsr(unsigned int __i);
2562
2563#if defined(__cplusplus)
2564} // extern "C"
2565#endif
2566
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
///    The expansion is fully parenthesized, so the macro can be used as an
///    operand of any operator (including postfix ones) like a function call.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2606
2607/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2608///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2609///
2610/// \headerfile <x86intrin.h>
2611///
2612/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2613///
2614/// \param __a
2615///    A 128-bit vector of [4 x float]. \n
2616///    Bits [95:64] are written to bits [31:0] of the destination. \n
2617///    Bits [127:96] are written to bits [95:64] of the destination.
2618/// \param __b
2619///    A 128-bit vector of [4 x float].
2620///    Bits [95:64] are written to bits [63:32] of the destination. \n
2621///    Bits [127:96] are written to bits [127:96] of the destination.
2622/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2623static __inline__ __m128 __DEFAULT_FN_ATTRS
2624_mm_unpackhi_ps(__m128 __a, __m128 __b)
2625{
2626  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2627}
2628
2629/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2630///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2631///
2632/// \headerfile <x86intrin.h>
2633///
2634/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2635///
2636/// \param __a
2637///    A 128-bit vector of [4 x float]. \n
2638///    Bits [31:0] are written to bits [31:0] of the destination.  \n
2639///    Bits [63:32] are written to bits [95:64] of the destination.
2640/// \param __b
2641///    A 128-bit vector of [4 x float]. \n
2642///    Bits [31:0] are written to bits [63:32] of the destination. \n
2643///    Bits [63:32] are written to bits [127:96] of the destination.
2644/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2645static __inline__ __m128 __DEFAULT_FN_ATTRS
2646_mm_unpacklo_ps(__m128 __a, __m128 __b)
2647{
2648  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2649}
2650
2651/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2652///    32 bits are set to the lower 32 bits of the second parameter. The upper
2653///    96 bits are set to the upper 96 bits of the first parameter.
2654///
2655/// \headerfile <x86intrin.h>
2656///
2657/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2658///    instruction.
2659///
2660/// \param __a
2661///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2662///    written to the upper 96 bits of the result.
2663/// \param __b
2664///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2665///    written to the lower 32 bits of the result.
2666/// \returns A 128-bit floating-point vector of [4 x float].
2667static __inline__ __m128 __DEFAULT_FN_ATTRS
2668_mm_move_ss(__m128 __a, __m128 __b)
2669{
2670  __a[0] = __b[0];
2671  return __a;
2672}
2673
2674/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2675///    64 bits are set to the upper 64 bits of the second parameter. The upper
2676///    64 bits are set to the upper 64 bits of the first parameter.
2677///
2678/// \headerfile <x86intrin.h>
2679///
2680/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2681///
2682/// \param __a
2683///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2684///    written to the upper 64 bits of the result.
2685/// \param __b
2686///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2687///    written to the lower 64 bits of the result.
2688/// \returns A 128-bit floating-point vector of [4 x float].
2689static __inline__ __m128 __DEFAULT_FN_ATTRS
2690_mm_movehl_ps(__m128 __a, __m128 __b)
2691{
2692  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2693}
2694
2695/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2696///    64 bits are set to the lower 64 bits of the first parameter. The upper
2697///    64 bits are set to the lower 64 bits of the second parameter.
2698///
2699/// \headerfile <x86intrin.h>
2700///
2701/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2702///
2703/// \param __a
2704///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2705///    written to the lower 64 bits of the result.
2706/// \param __b
2707///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2708///    written to the upper 64 bits of the result.
2709/// \returns A 128-bit floating-point vector of [4 x float].
2710static __inline__ __m128 __DEFAULT_FN_ATTRS
2711_mm_movelh_ps(__m128 __a, __m128 __b)
2712{
2713  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2714}
2715
2716/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2717///    float].
2718///
2719/// \headerfile <x86intrin.h>
2720///
2721/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2722///
2723/// \param __a
2724///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2725///    from the corresponding elements in this operand.
2726/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2727///    values from the operand.
2728static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2729_mm_cvtpi16_ps(__m64 __a)
2730{
2731  __m64 __b, __c;
2732  __m128 __r;
2733
2734  __b = _mm_setzero_si64();
2735  __b = _mm_cmpgt_pi16(__b, __a);
2736  __c = _mm_unpackhi_pi16(__a, __b);
2737  __r = _mm_setzero_ps();
2738  __r = _mm_cvtpi32_ps(__r, __c);
2739  __r = _mm_movelh_ps(__r, __r);
2740  __c = _mm_unpacklo_pi16(__a, __b);
2741  __r = _mm_cvtpi32_ps(__r, __c);
2742
2743  return __r;
2744}
2745
2746/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2747///    128-bit vector of [4 x float].
2748///
2749/// \headerfile <x86intrin.h>
2750///
2751/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2752///
2753/// \param __a
2754///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2755///    destination are copied from the corresponding elements in this operand.
2756/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2757///    values from the operand.
2758static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2759_mm_cvtpu16_ps(__m64 __a)
2760{
2761  __m64 __b, __c;
2762  __m128 __r;
2763
2764  __b = _mm_setzero_si64();
2765  __c = _mm_unpackhi_pi16(__a, __b);
2766  __r = _mm_setzero_ps();
2767  __r = _mm_cvtpi32_ps(__r, __c);
2768  __r = _mm_movelh_ps(__r, __r);
2769  __c = _mm_unpacklo_pi16(__a, __b);
2770  __r = _mm_cvtpi32_ps(__r, __c);
2771
2772  return __r;
2773}
2774
2775/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2776///    into a 128-bit vector of [4 x float].
2777///
2778/// \headerfile <x86intrin.h>
2779///
2780/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2781///
2782/// \param __a
2783///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2784///    from the corresponding lower 4 elements in this operand.
2785/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2786///    values from the operand.
2787static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2788_mm_cvtpi8_ps(__m64 __a)
2789{
2790  __m64 __b;
2791
2792  __b = _mm_setzero_si64();
2793  __b = _mm_cmpgt_pi8(__b, __a);
2794  __b = _mm_unpacklo_pi8(__a, __b);
2795
2796  return _mm_cvtpi16_ps(__b);
2797}
2798
2799/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2800///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2801///
2802/// \headerfile <x86intrin.h>
2803///
2804/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2805///
2806/// \param __a
2807///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2808///    destination are copied from the corresponding lower 4 elements in this
2809///    operand.
2810/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2811///    values from the source operand.
2812static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2813_mm_cvtpu8_ps(__m64 __a)
2814{
2815  __m64 __b;
2816
2817  __b = _mm_setzero_si64();
2818  __b = _mm_unpacklo_pi8(__a, __b);
2819
2820  return _mm_cvtpi16_ps(__b);
2821}
2822
2823/// Converts the two 32-bit signed integer values from each 64-bit vector
2824///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2825///
2826/// \headerfile <x86intrin.h>
2827///
2828/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2829///
2830/// \param __a
2831///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2832///    copied from the elements in this operand.
2833/// \param __b
2834///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2835///    copied from the elements in this operand.
2836/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2837///    copied and converted values from the first operand. The upper 64 bits
2838///    contain the copied and converted values from the second operand.
2839static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2840_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2841{
2842  __m128 __c;
2843
2844  __c = _mm_setzero_ps();
2845  __c = _mm_cvtpi32_ps(__c, __b);
2846  __c = _mm_movelh_ps(__c, __c);
2847
2848  return _mm_cvtpi32_ps(__c, __a);
2849}
2850
2851/// Converts each single-precision floating-point element of a 128-bit
2852///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2853///    packs the results into a 64-bit integer vector of [4 x i16].
2854///
2855///    If the floating-point element is NaN or infinity, or if the
2856///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2857///    it is converted to 0x8000. Otherwise if the floating-point element is
2858///    greater than 0x7FFF, it is converted to 0x7FFF.
2859///
2860/// \headerfile <x86intrin.h>
2861///
2862/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2863///
2864/// \param __a
2865///    A 128-bit floating-point vector of [4 x float].
2866/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2867///    values.
2868static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2869_mm_cvtps_pi16(__m128 __a)
2870{
2871  __m64 __b, __c;
2872
2873  __b = _mm_cvtps_pi32(__a);
2874  __a = _mm_movehl_ps(__a, __a);
2875  __c = _mm_cvtps_pi32(__a);
2876
2877  return _mm_packs_pi32(__b, __c);
2878}
2879
2880/// Converts each single-precision floating-point element of a 128-bit
2881///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2882///    packs the results into the lower 32 bits of a 64-bit integer vector of
2883///    [8 x i8]. The upper 32 bits of the vector are set to 0.
2884///
2885///    If the floating-point element is NaN or infinity, or if the
2886///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2887///    is converted to 0x80. Otherwise if the floating-point element is greater
2888///    than 0x7F, it is converted to 0x7F.
2889///
2890/// \headerfile <x86intrin.h>
2891///
2892/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2893///
2894/// \param __a
2895///    128-bit floating-point vector of [4 x float].
2896/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2897///    converted values and the uppper 32 bits are set to zero.
2898static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2899_mm_cvtps_pi8(__m128 __a)
2900{
2901  __m64 __b, __c;
2902
2903  __b = _mm_cvtps_pi16(__a);
2904  __c = _mm_setzero_si64();
2905
2906  return _mm_packs_pi16(__b, __c);
2907}
2908
2909/// Extracts the sign bits from each single-precision floating-point
2910///    element of a 128-bit floating-point vector of [4 x float] and returns the
2911///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2912///    to zero.
2913///
2914/// \headerfile <x86intrin.h>
2915///
2916/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2917///
2918/// \param __a
2919///    A 128-bit floating-point vector of [4 x float].
2920/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2921///    single-precision floating-point element of the parameter. Bits [31:4] are
2922///    set to zero.
2923static __inline__ int __DEFAULT_FN_ATTRS
2924_mm_movemask_ps(__m128 __a)
2925{
2926  return __builtin_ia32_movmskps((__v4sf)__a);
2927}
2928
2929
/* Declaration attribute requesting 16-byte alignment. The reserved
   __aligned__ spelling is used (as in the vector typedefs of this header)
   so that a user macro named "aligned" cannot change the expansion. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))
2931
/* Packs four 2-bit selectors into one 8-bit immediate: w lands in bits
   [1:0], x in bits [3:2], y in bits [5:4] and z in bits [7:6]. */
#define _MM_SHUFFLE(z, y, x, w) \
  ((w) | ((x) << 2) | ((y) << 4) | ((z) << 6))
2933
/* Exception state flags of the MXCSR value (bits 0-5). */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | \
                               _MM_EXCEPT_DIV_ZERO | _MM_EXCEPT_OVERFLOW | \
                               _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT)

/* Exception mask flags of the MXCSR value (bits 7-12). */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (_MM_MASK_INVALID | _MM_MASK_DENORM | \
                               _MM_MASK_DIV_ZERO | _MM_MASK_OVERFLOW | \
                               _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT)

/* Rounding mode field of the MXCSR value (bits 13-14). */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (_MM_ROUND_DOWN | _MM_ROUND_UP)
#define _MM_ROUND_MASK        (_MM_ROUND_DOWN | _MM_ROUND_UP)

/* Flush-to-zero flag of the MXCSR value (bit 15). */
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
#define _MM_FLUSH_ZERO_MASK   (_MM_FLUSH_ZERO_ON)
2959
/* Accessors for individual MXCSR fields. Each getter masks the value
   returned by _mm_getcsr(); each setter clears the field in the current
   value, ORs in (x) and writes the result back with _mm_setcsr(). */

/* Exception mask bits. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))

/* Exception state bits. */
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))

/* Flush-to-zero mode. */
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))

/* Rounding mode. */
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2969
/* Transposes, in place, the 4x4 matrix of floats whose rows are the four
   [4 x float] vectors row0..row3: afterwards element j of row i holds what
   was element i of row j.

   Each argument is evaluated more than once and is assigned to, so it must
   be a side-effect-free modifiable lvalue of type __m128.

   The temporaries use reserved names so they cannot shadow the caller's
   operands (for example, rows that are themselves called tmp0..tmp3) or be
   rewritten by a user-defined macro. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2982
/* Aliases for compatibility: each _m_* name expands to the corresponding
   _mm_* intrinsic defined in this header. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Bare-prefix alias; it used to be defined twice with identical text. */
#define _m_ _mm_
2999
3000#undef __DEFAULT_FN_ATTRS
3001#undef __DEFAULT_FN_ATTRS_MMX
3002
3003/* Ugly hack for backwards-compatibility (compatible with gcc) */
3004#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3005#include <emmintrin.h>
3006#endif
3007
3008#endif /* __XMMINTRIN_H */
3009