/* xmmintrin.h revision 341825 */
159078Smdodd/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
259078Smdodd *
359078Smdodd * Permission is hereby granted, free of charge, to any person obtaining a copy
434480Sjulian * of this software and associated documentation files (the "Software"), to deal
534480Sjulian * in the Software without restriction, including without limitation the rights
634480Sjulian * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
734480Sjulian * copies of the Software, and to permit persons to whom the Software is
834480Sjulian * furnished to do so, subject to the following conditions:
959078Smdodd *
1034480Sjulian * The above copyright notice and this permission notice shall be included in
1134480Sjulian * all copies or substantial portions of the Software.
1234480Sjulian *
1334480Sjulian * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1434480Sjulian * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1534480Sjulian * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1634480Sjulian * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1759078Smdodd * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1859078Smdodd * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1934480Sjulian * THE SOFTWARE.
2034480Sjulian *
2134480Sjulian *===-----------------------------------------------------------------------===
2234480Sjulian */
2334480Sjulian
2434480Sjulian#ifndef __XMMINTRIN_H
2559078Smdodd#define __XMMINTRIN_H
2659078Smdodd
2734480Sjulian#include <mmintrin.h>
2834480Sjulian
/* 16-byte (128-bit) vector types used by the intrinsics in this header.
 * __m128 is the type that appears in the intrinsic signatures; __v4sf is the
 * type the intrinsic bodies cast to before calling the SSE builtins. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

/* Unsigned types */
/* __v4su is the element type the bitwise intrinsics (and/andnot/or/xor) cast
 * to so that the operators act on the raw bits rather than on float values. */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
3539234Sgibbs
3639234Sgibbs/* This header should only be included in a hosted environment as it depends on
3745791Speter * a standard library to provide allocation routines. */
3845791Speter#if __STDC_HOSTED__
3934480Sjulian#include <mm_malloc.h>
4059078Smdodd#endif
4159078Smdodd
/* Default attribute sets for the functions defined in this file: one for the
 * SSE-only functions, one for the functions that also need MMX. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), \
                 __min_vector_width__(64)))
4539234Sgibbs
4652042Smdodd/// Adds the 32-bit float values in the low-order bits of the operands.
4752042Smdodd///
4852042Smdodd/// \headerfile <x86intrin.h>
4952042Smdodd///
5059078Smdodd/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
5159078Smdodd///
5259078Smdodd/// \param __a
5359078Smdodd///    A 128-bit vector of [4 x float] containing one of the source operands.
5459078Smdodd///    The lower 32 bits of this operand are used in the calculation.
5559078Smdodd/// \param __b
5659078Smdodd///    A 128-bit vector of [4 x float] containing one of the source operands.
5759078Smdodd///    The lower 32 bits of this operand are used in the calculation.
5859078Smdodd/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
5959078Smdodd///    of the lower 32 bits of both operands. The upper 96 bits are copied from
6059078Smdodd///    the upper 96 bits of the first source operand.
6159078Smdoddstatic __inline__ __m128 __DEFAULT_FN_ATTRS
6259078Smdodd_mm_add_ss(__m128 __a, __m128 __b)
6352042Smdodd{
6434480Sjulian  __a[0] += __b[0];
6534480Sjulian  return __a;
6659078Smdodd}
6759078Smdodd
6859078Smdodd/// Adds two 128-bit vectors of [4 x float], and returns the results of
6934480Sjulian///    the addition.
7059078Smdodd///
7139234Sgibbs/// \headerfile <x86intrin.h>
7259078Smdodd///
7334480Sjulian/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
7452042Smdodd///
7552042Smdodd/// \param __a
7652042Smdodd///    A 128-bit vector of [4 x float] containing one of the source operands.
7734480Sjulian/// \param __b
7845791Speter///    A 128-bit vector of [4 x float] containing one of the source operands.
7945791Speter/// \returns A 128-bit vector of [4 x float] containing the sums of both
8045791Speter///    operands.
8145791Speterstatic __inline__ __m128 __DEFAULT_FN_ATTRS
8234480Sjulian_mm_add_ps(__m128 __a, __m128 __b)
8352042Smdodd{
8445791Speter  return (__m128)((__v4sf)__a + (__v4sf)__b);
8552042Smdodd}
8652042Smdodd
8752042Smdodd/// Subtracts the 32-bit float value in the low-order bits of the second
8852042Smdodd///    operand from the corresponding value in the first operand.
8945791Speter///
9034480Sjulian/// \headerfile <x86intrin.h>
9152042Smdodd///
9252042Smdodd/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
9352042Smdodd///
9445791Speter/// \param __a
9545791Speter///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
9634480Sjulian///    of this operand are used in the calculation.
9734480Sjulian/// \param __b
9845791Speter///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
9959078Smdodd///    bits of this operand are used in the calculation.
10034480Sjulian/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
10159078Smdodd///    difference of the lower 32 bits of both operands. The upper 96 bits are
10245791Speter///    copied from the upper 96 bits of the first source operand.
10345791Speterstatic __inline__ __m128 __DEFAULT_FN_ATTRS
10439234Sgibbs_mm_sub_ss(__m128 __a, __m128 __b)
10545791Speter{
10659078Smdodd  __a[0] -= __b[0];
10759078Smdodd  return __a;
10834480Sjulian}
10945791Speter
11059078Smdodd/// Subtracts each of the values of the second operand from the first
11145791Speter///    operand, both of which are 128-bit vectors of [4 x float] and returns
11245791Speter///    the results of the subtraction.
11359078Smdodd///
11459078Smdodd/// \headerfile <x86intrin.h>
11534480Sjulian///
11634480Sjulian/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
11759078Smdodd///
11859078Smdodd/// \param __a
11959078Smdodd///    A 128-bit vector of [4 x float] containing the minuend.
12059078Smdodd/// \param __b
12159078Smdodd///    A 128-bit vector of [4 x float] containing the subtrahend.
12259078Smdodd/// \returns A 128-bit vector of [4 x float] containing the differences between
12359078Smdodd///    both operands.
12459078Smdoddstatic __inline__ __m128 __DEFAULT_FN_ATTRS
12559078Smdodd_mm_sub_ps(__m128 __a, __m128 __b)
12645791Speter{
12759078Smdodd  return (__m128)((__v4sf)__a - (__v4sf)__b);
12859078Smdodd}
12945791Speter
13059078Smdodd/// Multiplies two 32-bit float values in the low-order bits of the
13134480Sjulian///    operands.
13239234Sgibbs///
13339234Sgibbs/// \headerfile <x86intrin.h>
13459078Smdodd///
13559078Smdodd/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
13659078Smdodd///
13759078Smdodd/// \param __a
13859078Smdodd///    A 128-bit vector of [4 x float] containing one of the source operands.
13959078Smdodd///    The lower 32 bits of this operand are used in the calculation.
14059078Smdodd/// \param __b
14159078Smdodd///    A 128-bit vector of [4 x float] containing one of the source operands.
142104710Speter///    The lower 32 bits of this operand are used in the calculation.
14359078Smdodd/// \returns A 128-bit vector of [4 x float] containing the product of the lower
14459078Smdodd///    32 bits of both operands. The upper 96 bits are copied from the upper 96
14559078Smdodd///    bits of the first source operand.
14639234Sgibbsstatic __inline__ __m128 __DEFAULT_FN_ATTRS
14759078Smdodd_mm_mul_ss(__m128 __a, __m128 __b)
14845791Speter{
14934480Sjulian  __a[0] *= __b[0];
15034480Sjulian  return __a;
15159078Smdodd}
15234480Sjulian
15339234Sgibbs/// Multiplies two 128-bit vectors of [4 x float] and returns the
15439234Sgibbs///    results of the multiplication.
15559078Smdodd///
15645791Speter/// \headerfile <x86intrin.h>
15734480Sjulian///
15834480Sjulian/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
15939234Sgibbs///
16039234Sgibbs/// \param __a
16145791Speter///    A 128-bit vector of [4 x float] containing one of the source operands.
16239234Sgibbs/// \param __b
16334480Sjulian///    A 128-bit vector of [4 x float] containing one of the source operands.
16473280Smarkm/// \returns A 128-bit vector of [4 x float] containing the products of both
16573280Smarkm///    operands.
16659078Smdoddstatic __inline__ __m128 __DEFAULT_FN_ATTRS
16759078Smdodd_mm_mul_ps(__m128 __a, __m128 __b)
16859078Smdodd{
16959078Smdodd  return (__m128)((__v4sf)__a * (__v4sf)__b);
17045791Speter}
17159078Smdodd
17259078Smdodd/// Divides the value in the low-order 32 bits of the first operand by
17345791Speter///    the corresponding value in the second operand.
17445791Speter///
17545791Speter/// \headerfile <x86intrin.h>
17645791Speter///
17745791Speter/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
17859078Smdodd///
17959078Smdodd/// \param __a
18034480Sjulian///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
18134480Sjulian///    bits of this operand are used in the calculation.
18234480Sjulian/// \param __b
18334480Sjulian///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
18434480Sjulian///    of this operand are used in the calculation.
18534480Sjulian/// \returns A 128-bit vector of [4 x float] containing the quotients of the
18634480Sjulian///    lower 32 bits of both operands. The upper 96 bits are copied from the
18759078Smdodd///    upper 96 bits of the first source operand.
18859078Smdoddstatic __inline__ __m128 __DEFAULT_FN_ATTRS
18959078Smdodd_mm_div_ss(__m128 __a, __m128 __b)
19059078Smdodd{
19159078Smdodd  __a[0] /= __b[0];
19259078Smdodd  return __a;
19359078Smdodd}
19459078Smdodd
19559078Smdodd/// Divides two 128-bit vectors of [4 x float].
19659078Smdodd///
19759078Smdodd/// \headerfile <x86intrin.h>
19859078Smdodd///
19959078Smdodd/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
20059078Smdodd///
20134480Sjulian/// \param __a
20234480Sjulian///    A 128-bit vector of [4 x float] containing the dividend.
20334480Sjulian/// \param __b
20434480Sjulian///    A 128-bit vector of [4 x float] containing the divisor.
20534480Sjulian/// \returns A 128-bit vector of [4 x float] containing the quotients of both
20634480Sjulian///    operands.
20734480Sjulianstatic __inline__ __m128 __DEFAULT_FN_ATTRS
20834480Sjulian_mm_div_ps(__m128 __a, __m128 __b)
20945791Speter{
21045791Speter  return (__m128)((__v4sf)__a / (__v4sf)__b);
21145791Speter}
21245791Speter
21345791Speter/// Calculates the square root of the value stored in the low-order bits
21445791Speter///    of a 128-bit vector of [4 x float].
21545791Speter///
21645791Speter/// \headerfile <x86intrin.h>
21745791Speter///
21845791Speter/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
21945791Speter///
22059078Smdodd/// \param __a
22145791Speter///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
22245791Speter///    used in the calculation.
22345791Speter/// \returns A 128-bit vector of [4 x float] containing the square root of the
22445791Speter///    value in the low-order bits of the operand.
22545791Speterstatic __inline__ __m128 __DEFAULT_FN_ATTRS
226_mm_sqrt_ss(__m128 __a)
227{
228  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
229}
230
231/// Calculates the square roots of the values stored in a 128-bit vector
232///    of [4 x float].
233///
234/// \headerfile <x86intrin.h>
235///
236/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
237///
238/// \param __a
239///    A 128-bit vector of [4 x float].
240/// \returns A 128-bit vector of [4 x float] containing the square roots of the
241///    values in the operand.
242static __inline__ __m128 __DEFAULT_FN_ATTRS
243_mm_sqrt_ps(__m128 __a)
244{
245  return __builtin_ia32_sqrtps((__v4sf)__a);
246}
247
248/// Calculates the approximate reciprocal of the value stored in the
249///    low-order bits of a 128-bit vector of [4 x float].
250///
251/// \headerfile <x86intrin.h>
252///
253/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
254///
255/// \param __a
256///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
257///    used in the calculation.
258/// \returns A 128-bit vector of [4 x float] containing the approximate
259///    reciprocal of the value in the low-order bits of the operand.
260static __inline__ __m128 __DEFAULT_FN_ATTRS
261_mm_rcp_ss(__m128 __a)
262{
263  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
264}
265
266/// Calculates the approximate reciprocals of the values stored in a
267///    128-bit vector of [4 x float].
268///
269/// \headerfile <x86intrin.h>
270///
271/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
272///
273/// \param __a
274///    A 128-bit vector of [4 x float].
275/// \returns A 128-bit vector of [4 x float] containing the approximate
276///    reciprocals of the values in the operand.
277static __inline__ __m128 __DEFAULT_FN_ATTRS
278_mm_rcp_ps(__m128 __a)
279{
280  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
281}
282
283/// Calculates the approximate reciprocal of the square root of the value
284///    stored in the low-order bits of a 128-bit vector of [4 x float].
285///
286/// \headerfile <x86intrin.h>
287///
288/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
289///
290/// \param __a
291///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
292///    used in the calculation.
293/// \returns A 128-bit vector of [4 x float] containing the approximate
294///    reciprocal of the square root of the value in the low-order bits of the
295///    operand.
296static __inline__ __m128 __DEFAULT_FN_ATTRS
297_mm_rsqrt_ss(__m128 __a)
298{
299  return __builtin_ia32_rsqrtss((__v4sf)__a);
300}
301
302/// Calculates the approximate reciprocals of the square roots of the
303///    values stored in a 128-bit vector of [4 x float].
304///
305/// \headerfile <x86intrin.h>
306///
307/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
308///
309/// \param __a
310///    A 128-bit vector of [4 x float].
311/// \returns A 128-bit vector of [4 x float] containing the approximate
312///    reciprocals of the square roots of the values in the operand.
313static __inline__ __m128 __DEFAULT_FN_ATTRS
314_mm_rsqrt_ps(__m128 __a)
315{
316  return __builtin_ia32_rsqrtps((__v4sf)__a);
317}
318
319/// Compares two 32-bit float values in the low-order bits of both
320///    operands and returns the lesser value in the low-order bits of the
321///    vector of [4 x float].
322///
323/// \headerfile <x86intrin.h>
324///
325/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
326///
327/// \param __a
328///    A 128-bit vector of [4 x float] containing one of the operands. The lower
329///    32 bits of this operand are used in the comparison.
330/// \param __b
331///    A 128-bit vector of [4 x float] containing one of the operands. The lower
332///    32 bits of this operand are used in the comparison.
333/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
334///    minimum value between both operands. The upper 96 bits are copied from
335///    the upper 96 bits of the first source operand.
336static __inline__ __m128 __DEFAULT_FN_ATTRS
337_mm_min_ss(__m128 __a, __m128 __b)
338{
339  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
340}
341
342/// Compares two 128-bit vectors of [4 x float] and returns the lesser
343///    of each pair of values.
344///
345/// \headerfile <x86intrin.h>
346///
347/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
348///
349/// \param __a
350///    A 128-bit vector of [4 x float] containing one of the operands.
351/// \param __b
352///    A 128-bit vector of [4 x float] containing one of the operands.
353/// \returns A 128-bit vector of [4 x float] containing the minimum values
354///    between both operands.
355static __inline__ __m128 __DEFAULT_FN_ATTRS
356_mm_min_ps(__m128 __a, __m128 __b)
357{
358  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
359}
360
361/// Compares two 32-bit float values in the low-order bits of both
362///    operands and returns the greater value in the low-order bits of a 128-bit
363///    vector of [4 x float].
364///
365/// \headerfile <x86intrin.h>
366///
367/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
368///
369/// \param __a
370///    A 128-bit vector of [4 x float] containing one of the operands. The lower
371///    32 bits of this operand are used in the comparison.
372/// \param __b
373///    A 128-bit vector of [4 x float] containing one of the operands. The lower
374///    32 bits of this operand are used in the comparison.
375/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
376///    maximum value between both operands. The upper 96 bits are copied from
377///    the upper 96 bits of the first source operand.
378static __inline__ __m128 __DEFAULT_FN_ATTRS
379_mm_max_ss(__m128 __a, __m128 __b)
380{
381  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
382}
383
384/// Compares two 128-bit vectors of [4 x float] and returns the greater
385///    of each pair of values.
386///
387/// \headerfile <x86intrin.h>
388///
389/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
390///
391/// \param __a
392///    A 128-bit vector of [4 x float] containing one of the operands.
393/// \param __b
394///    A 128-bit vector of [4 x float] containing one of the operands.
395/// \returns A 128-bit vector of [4 x float] containing the maximum values
396///    between both operands.
397static __inline__ __m128 __DEFAULT_FN_ATTRS
398_mm_max_ps(__m128 __a, __m128 __b)
399{
400  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
401}
402
403/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
404///
405/// \headerfile <x86intrin.h>
406///
407/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
408///
409/// \param __a
410///    A 128-bit vector containing one of the source operands.
411/// \param __b
412///    A 128-bit vector containing one of the source operands.
413/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
414///    values between both operands.
415static __inline__ __m128 __DEFAULT_FN_ATTRS
416_mm_and_ps(__m128 __a, __m128 __b)
417{
418  return (__m128)((__v4su)__a & (__v4su)__b);
419}
420
421/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
422///    the one's complement of the values contained in the first source
423///    operand.
424///
425/// \headerfile <x86intrin.h>
426///
427/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
428///
429/// \param __a
430///    A 128-bit vector of [4 x float] containing the first source operand. The
431///    one's complement of this value is used in the bitwise AND.
432/// \param __b
433///    A 128-bit vector of [4 x float] containing the second source operand.
434/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
435///    one's complement of the first operand and the values in the second
436///    operand.
437static __inline__ __m128 __DEFAULT_FN_ATTRS
438_mm_andnot_ps(__m128 __a, __m128 __b)
439{
440  return (__m128)(~(__v4su)__a & (__v4su)__b);
441}
442
443/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
444///
445/// \headerfile <x86intrin.h>
446///
447/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
448///
449/// \param __a
450///    A 128-bit vector of [4 x float] containing one of the source operands.
451/// \param __b
452///    A 128-bit vector of [4 x float] containing one of the source operands.
453/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
454///    values between both operands.
455static __inline__ __m128 __DEFAULT_FN_ATTRS
456_mm_or_ps(__m128 __a, __m128 __b)
457{
458  return (__m128)((__v4su)__a | (__v4su)__b);
459}
460
461/// Performs a bitwise exclusive OR of two 128-bit vectors of
462///    [4 x float].
463///
464/// \headerfile <x86intrin.h>
465///
466/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
467///
468/// \param __a
469///    A 128-bit vector of [4 x float] containing one of the source operands.
470/// \param __b
471///    A 128-bit vector of [4 x float] containing one of the source operands.
472/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
473///    of the values between both operands.
474static __inline__ __m128 __DEFAULT_FN_ATTRS
475_mm_xor_ps(__m128 __a, __m128 __b)
476{
477  return (__m128)((__v4su)__a ^ (__v4su)__b);
478}
479
480/// Compares two 32-bit float values in the low-order bits of both
481///    operands for equality and returns the result of the comparison in the
482///    low-order bits of a vector [4 x float].
483///
484/// \headerfile <x86intrin.h>
485///
486/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
487///
488/// \param __a
489///    A 128-bit vector of [4 x float] containing one of the operands. The lower
490///    32 bits of this operand are used in the comparison.
491/// \param __b
492///    A 128-bit vector of [4 x float] containing one of the operands. The lower
493///    32 bits of this operand are used in the comparison.
494/// \returns A 128-bit vector of [4 x float] containing the comparison results
495///    in the low-order bits.
496static __inline__ __m128 __DEFAULT_FN_ATTRS
497_mm_cmpeq_ss(__m128 __a, __m128 __b)
498{
499  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
500}
501
502/// Compares each of the corresponding 32-bit float values of the
503///    128-bit vectors of [4 x float] for equality.
504///
505/// \headerfile <x86intrin.h>
506///
507/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
508///
509/// \param __a
510///    A 128-bit vector of [4 x float].
511/// \param __b
512///    A 128-bit vector of [4 x float].
513/// \returns A 128-bit vector of [4 x float] containing the comparison results.
514static __inline__ __m128 __DEFAULT_FN_ATTRS
515_mm_cmpeq_ps(__m128 __a, __m128 __b)
516{
517  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
518}
519
520/// Compares two 32-bit float values in the low-order bits of both
521///    operands to determine if the value in the first operand is less than the
522///    corresponding value in the second operand and returns the result of the
523///    comparison in the low-order bits of a vector of [4 x float].
524///
525/// \headerfile <x86intrin.h>
526///
527/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
528///
529/// \param __a
530///    A 128-bit vector of [4 x float] containing one of the operands. The lower
531///    32 bits of this operand are used in the comparison.
532/// \param __b
533///    A 128-bit vector of [4 x float] containing one of the operands. The lower
534///    32 bits of this operand are used in the comparison.
535/// \returns A 128-bit vector of [4 x float] containing the comparison results
536///    in the low-order bits.
537static __inline__ __m128 __DEFAULT_FN_ATTRS
538_mm_cmplt_ss(__m128 __a, __m128 __b)
539{
540  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
541}
542
543/// Compares each of the corresponding 32-bit float values of the
544///    128-bit vectors of [4 x float] to determine if the values in the first
545///    operand are less than those in the second operand.
546///
547/// \headerfile <x86intrin.h>
548///
549/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
550///
551/// \param __a
552///    A 128-bit vector of [4 x float].
553/// \param __b
554///    A 128-bit vector of [4 x float].
555/// \returns A 128-bit vector of [4 x float] containing the comparison results.
556static __inline__ __m128 __DEFAULT_FN_ATTRS
557_mm_cmplt_ps(__m128 __a, __m128 __b)
558{
559  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
560}
561
562/// Compares two 32-bit float values in the low-order bits of both
563///    operands to determine if the value in the first operand is less than or
564///    equal to the corresponding value in the second operand and returns the
565///    result of the comparison in the low-order bits of a vector of
566///    [4 x float].
567///
568/// \headerfile <x86intrin.h>
569///
570/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
571///
572/// \param __a
573///    A 128-bit vector of [4 x float] containing one of the operands. The lower
574///    32 bits of this operand are used in the comparison.
575/// \param __b
576///    A 128-bit vector of [4 x float] containing one of the operands. The lower
577///    32 bits of this operand are used in the comparison.
578/// \returns A 128-bit vector of [4 x float] containing the comparison results
579///    in the low-order bits.
580static __inline__ __m128 __DEFAULT_FN_ATTRS
581_mm_cmple_ss(__m128 __a, __m128 __b)
582{
583  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
584}
585
586/// Compares each of the corresponding 32-bit float values of the
587///    128-bit vectors of [4 x float] to determine if the values in the first
588///    operand are less than or equal to those in the second operand.
589///
590/// \headerfile <x86intrin.h>
591///
592/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
593///
594/// \param __a
595///    A 128-bit vector of [4 x float].
596/// \param __b
597///    A 128-bit vector of [4 x float].
598/// \returns A 128-bit vector of [4 x float] containing the comparison results.
599static __inline__ __m128 __DEFAULT_FN_ATTRS
600_mm_cmple_ps(__m128 __a, __m128 __b)
601{
602  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
603}
604
605/// Compares two 32-bit float values in the low-order bits of both
606///    operands to determine if the value in the first operand is greater than
607///    the corresponding value in the second operand and returns the result of
608///    the comparison in the low-order bits of a vector of [4 x float].
609///
610/// \headerfile <x86intrin.h>
611///
612/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
613///
614/// \param __a
615///    A 128-bit vector of [4 x float] containing one of the operands. The lower
616///    32 bits of this operand are used in the comparison.
617/// \param __b
618///    A 128-bit vector of [4 x float] containing one of the operands. The lower
619///    32 bits of this operand are used in the comparison.
620/// \returns A 128-bit vector of [4 x float] containing the comparison results
621///    in the low-order bits.
622static __inline__ __m128 __DEFAULT_FN_ATTRS
623_mm_cmpgt_ss(__m128 __a, __m128 __b)
624{
625  return (__m128)__builtin_shufflevector((__v4sf)__a,
626                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
627                                         4, 1, 2, 3);
628}
629
630/// Compares each of the corresponding 32-bit float values of the
631///    128-bit vectors of [4 x float] to determine if the values in the first
632///    operand are greater than those in the second operand.
633///
634/// \headerfile <x86intrin.h>
635///
636/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
637///
638/// \param __a
639///    A 128-bit vector of [4 x float].
640/// \param __b
641///    A 128-bit vector of [4 x float].
642/// \returns A 128-bit vector of [4 x float] containing the comparison results.
643static __inline__ __m128 __DEFAULT_FN_ATTRS
644_mm_cmpgt_ps(__m128 __a, __m128 __b)
645{
646  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
647}
648
649/// Compares two 32-bit float values in the low-order bits of both
650///    operands to determine if the value in the first operand is greater than
651///    or equal to the corresponding value in the second operand and returns
652///    the result of the comparison in the low-order bits of a vector of
653///    [4 x float].
654///
655/// \headerfile <x86intrin.h>
656///
657/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
658///
659/// \param __a
660///    A 128-bit vector of [4 x float] containing one of the operands. The lower
661///    32 bits of this operand are used in the comparison.
662/// \param __b
663///    A 128-bit vector of [4 x float] containing one of the operands. The lower
664///    32 bits of this operand are used in the comparison.
665/// \returns A 128-bit vector of [4 x float] containing the comparison results
666///    in the low-order bits.
667static __inline__ __m128 __DEFAULT_FN_ATTRS
668_mm_cmpge_ss(__m128 __a, __m128 __b)
669{
670  return (__m128)__builtin_shufflevector((__v4sf)__a,
671                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
672                                         4, 1, 2, 3);
673}
674
675/// Compares each of the corresponding 32-bit float values of the
676///    128-bit vectors of [4 x float] to determine if the values in the first
677///    operand are greater than or equal to those in the second operand.
678///
679/// \headerfile <x86intrin.h>
680///
681/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
682///
683/// \param __a
684///    A 128-bit vector of [4 x float].
685/// \param __b
686///    A 128-bit vector of [4 x float].
687/// \returns A 128-bit vector of [4 x float] containing the comparison results.
688static __inline__ __m128 __DEFAULT_FN_ATTRS
689_mm_cmpge_ps(__m128 __a, __m128 __b)
690{
691  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
692}
693
694/// Compares two 32-bit float values in the low-order bits of both
695///    operands for inequality and returns the result of the comparison in the
696///    low-order bits of a vector of [4 x float].
697///
698/// \headerfile <x86intrin.h>
699///
700/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
701///   instructions.
702///
703/// \param __a
704///    A 128-bit vector of [4 x float] containing one of the operands. The lower
705///    32 bits of this operand are used in the comparison.
706/// \param __b
707///    A 128-bit vector of [4 x float] containing one of the operands. The lower
708///    32 bits of this operand are used in the comparison.
709/// \returns A 128-bit vector of [4 x float] containing the comparison results
710///    in the low-order bits.
711static __inline__ __m128 __DEFAULT_FN_ATTRS
712_mm_cmpneq_ss(__m128 __a, __m128 __b)
713{
714  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
715}
716
717/// Compares each of the corresponding 32-bit float values of the
718///    128-bit vectors of [4 x float] for inequality.
719///
720/// \headerfile <x86intrin.h>
721///
722/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
723///   instructions.
724///
725/// \param __a
726///    A 128-bit vector of [4 x float].
727/// \param __b
728///    A 128-bit vector of [4 x float].
729/// \returns A 128-bit vector of [4 x float] containing the comparison results.
730static __inline__ __m128 __DEFAULT_FN_ATTRS
731_mm_cmpneq_ps(__m128 __a, __m128 __b)
732{
733  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
734}
735
736/// Compares two 32-bit float values in the low-order bits of both
737///    operands to determine if the value in the first operand is not less than
738///    the corresponding value in the second operand and returns the result of
739///    the comparison in the low-order bits of a vector of [4 x float].
740///
741/// \headerfile <x86intrin.h>
742///
743/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
744///   instructions.
745///
746/// \param __a
747///    A 128-bit vector of [4 x float] containing one of the operands. The lower
748///    32 bits of this operand are used in the comparison.
749/// \param __b
750///    A 128-bit vector of [4 x float] containing one of the operands. The lower
751///    32 bits of this operand are used in the comparison.
752/// \returns A 128-bit vector of [4 x float] containing the comparison results
753///    in the low-order bits.
754static __inline__ __m128 __DEFAULT_FN_ATTRS
755_mm_cmpnlt_ss(__m128 __a, __m128 __b)
756{
757  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
758}
759
760/// Compares each of the corresponding 32-bit float values of the
761///    128-bit vectors of [4 x float] to determine if the values in the first
762///    operand are not less than those in the second operand.
763///
764/// \headerfile <x86intrin.h>
765///
766/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
767///   instructions.
768///
769/// \param __a
770///    A 128-bit vector of [4 x float].
771/// \param __b
772///    A 128-bit vector of [4 x float].
773/// \returns A 128-bit vector of [4 x float] containing the comparison results.
774static __inline__ __m128 __DEFAULT_FN_ATTRS
775_mm_cmpnlt_ps(__m128 __a, __m128 __b)
776{
777  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
778}
779
780/// Compares two 32-bit float values in the low-order bits of both
781///    operands to determine if the value in the first operand is not less than
782///    or equal to the corresponding value in the second operand and returns
783///    the result of the comparison in the low-order bits of a vector of
784///    [4 x float].
785///
786/// \headerfile <x86intrin.h>
787///
788/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
789///   instructions.
790///
791/// \param __a
792///    A 128-bit vector of [4 x float] containing one of the operands. The lower
793///    32 bits of this operand are used in the comparison.
794/// \param __b
795///    A 128-bit vector of [4 x float] containing one of the operands. The lower
796///    32 bits of this operand are used in the comparison.
797/// \returns A 128-bit vector of [4 x float] containing the comparison results
798///    in the low-order bits.
799static __inline__ __m128 __DEFAULT_FN_ATTRS
800_mm_cmpnle_ss(__m128 __a, __m128 __b)
801{
802  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
803}
804
805/// Compares each of the corresponding 32-bit float values of the
806///    128-bit vectors of [4 x float] to determine if the values in the first
807///    operand are not less than or equal to those in the second operand.
808///
809/// \headerfile <x86intrin.h>
810///
811/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
812///   instructions.
813///
814/// \param __a
815///    A 128-bit vector of [4 x float].
816/// \param __b
817///    A 128-bit vector of [4 x float].
818/// \returns A 128-bit vector of [4 x float] containing the comparison results.
819static __inline__ __m128 __DEFAULT_FN_ATTRS
820_mm_cmpnle_ps(__m128 __a, __m128 __b)
821{
822  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
823}
824
825/// Compares two 32-bit float values in the low-order bits of both
826///    operands to determine if the value in the first operand is not greater
827///    than the corresponding value in the second operand and returns the
828///    result of the comparison in the low-order bits of a vector of
829///    [4 x float].
830///
831/// \headerfile <x86intrin.h>
832///
833/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
834///   instructions.
835///
836/// \param __a
837///    A 128-bit vector of [4 x float] containing one of the operands. The lower
838///    32 bits of this operand are used in the comparison.
839/// \param __b
840///    A 128-bit vector of [4 x float] containing one of the operands. The lower
841///    32 bits of this operand are used in the comparison.
842/// \returns A 128-bit vector of [4 x float] containing the comparison results
843///    in the low-order bits.
844static __inline__ __m128 __DEFAULT_FN_ATTRS
845_mm_cmpngt_ss(__m128 __a, __m128 __b)
846{
847  return (__m128)__builtin_shufflevector((__v4sf)__a,
848                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
849                                         4, 1, 2, 3);
850}
851
852/// Compares each of the corresponding 32-bit float values of the
853///    128-bit vectors of [4 x float] to determine if the values in the first
854///    operand are not greater than those in the second operand.
855///
856/// \headerfile <x86intrin.h>
857///
858/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
859///   instructions.
860///
861/// \param __a
862///    A 128-bit vector of [4 x float].
863/// \param __b
864///    A 128-bit vector of [4 x float].
865/// \returns A 128-bit vector of [4 x float] containing the comparison results.
866static __inline__ __m128 __DEFAULT_FN_ATTRS
867_mm_cmpngt_ps(__m128 __a, __m128 __b)
868{
869  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
870}
871
872/// Compares two 32-bit float values in the low-order bits of both
873///    operands to determine if the value in the first operand is not greater
874///    than or equal to the corresponding value in the second operand and
875///    returns the result of the comparison in the low-order bits of a vector
876///    of [4 x float].
877///
878/// \headerfile <x86intrin.h>
879///
880/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
881///   instructions.
882///
883/// \param __a
884///    A 128-bit vector of [4 x float] containing one of the operands. The lower
885///    32 bits of this operand are used in the comparison.
886/// \param __b
887///    A 128-bit vector of [4 x float] containing one of the operands. The lower
888///    32 bits of this operand are used in the comparison.
889/// \returns A 128-bit vector of [4 x float] containing the comparison results
890///    in the low-order bits.
891static __inline__ __m128 __DEFAULT_FN_ATTRS
892_mm_cmpnge_ss(__m128 __a, __m128 __b)
893{
894  return (__m128)__builtin_shufflevector((__v4sf)__a,
895                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
896                                         4, 1, 2, 3);
897}
898
899/// Compares each of the corresponding 32-bit float values of the
900///    128-bit vectors of [4 x float] to determine if the values in the first
901///    operand are not greater than or equal to those in the second operand.
902///
903/// \headerfile <x86intrin.h>
904///
905/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
906///   instructions.
907///
908/// \param __a
909///    A 128-bit vector of [4 x float].
910/// \param __b
911///    A 128-bit vector of [4 x float].
912/// \returns A 128-bit vector of [4 x float] containing the comparison results.
913static __inline__ __m128 __DEFAULT_FN_ATTRS
914_mm_cmpnge_ps(__m128 __a, __m128 __b)
915{
916  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
917}
918
919/// Compares two 32-bit float values in the low-order bits of both
920///    operands to determine if the value in the first operand is ordered with
921///    respect to the corresponding value in the second operand and returns the
922///    result of the comparison in the low-order bits of a vector of
923///    [4 x float].
924///
925/// \headerfile <x86intrin.h>
926///
927/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
928///   instructions.
929///
930/// \param __a
931///    A 128-bit vector of [4 x float] containing one of the operands. The lower
932///    32 bits of this operand are used in the comparison.
933/// \param __b
934///    A 128-bit vector of [4 x float] containing one of the operands. The lower
935///    32 bits of this operand are used in the comparison.
936/// \returns A 128-bit vector of [4 x float] containing the comparison results
937///    in the low-order bits.
938static __inline__ __m128 __DEFAULT_FN_ATTRS
939_mm_cmpord_ss(__m128 __a, __m128 __b)
940{
941  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
942}
943
944/// Compares each of the corresponding 32-bit float values of the
945///    128-bit vectors of [4 x float] to determine if the values in the first
946///    operand are ordered with respect to those in the second operand.
947///
948/// \headerfile <x86intrin.h>
949///
950/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
951///   instructions.
952///
953/// \param __a
954///    A 128-bit vector of [4 x float].
955/// \param __b
956///    A 128-bit vector of [4 x float].
957/// \returns A 128-bit vector of [4 x float] containing the comparison results.
958static __inline__ __m128 __DEFAULT_FN_ATTRS
959_mm_cmpord_ps(__m128 __a, __m128 __b)
960{
961  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
962}
963
964/// Compares two 32-bit float values in the low-order bits of both
965///    operands to determine if the value in the first operand is unordered
966///    with respect to the corresponding value in the second operand and
967///    returns the result of the comparison in the low-order bits of a vector
968///    of [4 x float].
969///
970/// \headerfile <x86intrin.h>
971///
972/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
973///   instructions.
974///
975/// \param __a
976///    A 128-bit vector of [4 x float] containing one of the operands. The lower
977///    32 bits of this operand are used in the comparison.
978/// \param __b
979///    A 128-bit vector of [4 x float] containing one of the operands. The lower
980///    32 bits of this operand are used in the comparison.
981/// \returns A 128-bit vector of [4 x float] containing the comparison results
982///    in the low-order bits.
983static __inline__ __m128 __DEFAULT_FN_ATTRS
984_mm_cmpunord_ss(__m128 __a, __m128 __b)
985{
986  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
987}
988
989/// Compares each of the corresponding 32-bit float values of the
990///    128-bit vectors of [4 x float] to determine if the values in the first
991///    operand are unordered with respect to those in the second operand.
992///
993/// \headerfile <x86intrin.h>
994///
995/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
996///   instructions.
997///
998/// \param __a
999///    A 128-bit vector of [4 x float].
1000/// \param __b
1001///    A 128-bit vector of [4 x float].
1002/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1003static __inline__ __m128 __DEFAULT_FN_ATTRS
1004_mm_cmpunord_ps(__m128 __a, __m128 __b)
1005{
1006  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1007}
1008
1009/// Compares two 32-bit float values in the low-order bits of both
1010///    operands for equality and returns the result of the comparison.
1011///
1012///    If either of the two lower 32-bit values is NaN, 0 is returned.
1013///
1014/// \headerfile <x86intrin.h>
1015///
1016/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1017///   instructions.
1018///
1019/// \param __a
1020///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1021///    used in the comparison.
1022/// \param __b
1023///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1024///    used in the comparison.
1025/// \returns An integer containing the comparison results. If either of the
1026///    two lower 32-bit values is NaN, 0 is returned.
1027static __inline__ int __DEFAULT_FN_ATTRS
1028_mm_comieq_ss(__m128 __a, __m128 __b)
1029{
1030  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1031}
1032
1033/// Compares two 32-bit float values in the low-order bits of both
1034///    operands to determine if the first operand is less than the second
1035///    operand and returns the result of the comparison.
1036///
1037///    If either of the two lower 32-bit values is NaN, 0 is returned.
1038///
1039/// \headerfile <x86intrin.h>
1040///
1041/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1042///   instructions.
1043///
1044/// \param __a
1045///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1046///    used in the comparison.
1047/// \param __b
1048///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1049///    used in the comparison.
1050/// \returns An integer containing the comparison results. If either of the two
1051///     lower 32-bit values is NaN, 0 is returned.
1052static __inline__ int __DEFAULT_FN_ATTRS
1053_mm_comilt_ss(__m128 __a, __m128 __b)
1054{
1055  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1056}
1057
1058/// Compares two 32-bit float values in the low-order bits of both
1059///    operands to determine if the first operand is less than or equal to the
1060///    second operand and returns the result of the comparison.
1061///
1062///    If either of the two lower 32-bit values is NaN, 0 is returned.
1063///
1064/// \headerfile <x86intrin.h>
1065///
1066/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1067///
1068/// \param __a
1069///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1070///    used in the comparison.
1071/// \param __b
1072///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1073///    used in the comparison.
1074/// \returns An integer containing the comparison results. If either of the two
1075///     lower 32-bit values is NaN, 0 is returned.
1076static __inline__ int __DEFAULT_FN_ATTRS
1077_mm_comile_ss(__m128 __a, __m128 __b)
1078{
1079  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1080}
1081
1082/// Compares two 32-bit float values in the low-order bits of both
1083///    operands to determine if the first operand is greater than the second
1084///    operand and returns the result of the comparison.
1085///
1086///    If either of the two lower 32-bit values is NaN, 0 is returned.
1087///
1088/// \headerfile <x86intrin.h>
1089///
1090/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1091///
1092/// \param __a
1093///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1094///    used in the comparison.
1095/// \param __b
1096///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1097///    used in the comparison.
1098/// \returns An integer containing the comparison results. If either of the
1099///     two lower 32-bit values is NaN, 0 is returned.
1100static __inline__ int __DEFAULT_FN_ATTRS
1101_mm_comigt_ss(__m128 __a, __m128 __b)
1102{
1103  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1104}
1105
1106/// Compares two 32-bit float values in the low-order bits of both
1107///    operands to determine if the first operand is greater than or equal to
1108///    the second operand and returns the result of the comparison.
1109///
1110///    If either of the two lower 32-bit values is NaN, 0 is returned.
1111///
1112/// \headerfile <x86intrin.h>
1113///
1114/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1115///
1116/// \param __a
1117///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1118///    used in the comparison.
1119/// \param __b
1120///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1121///    used in the comparison.
1122/// \returns An integer containing the comparison results. If either of the two
1123///    lower 32-bit values is NaN, 0 is returned.
1124static __inline__ int __DEFAULT_FN_ATTRS
1125_mm_comige_ss(__m128 __a, __m128 __b)
1126{
1127  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1128}
1129
1130/// Compares two 32-bit float values in the low-order bits of both
1131///    operands to determine if the first operand is not equal to the second
1132///    operand and returns the result of the comparison.
1133///
1134///    If either of the two lower 32-bit values is NaN, 1 is returned.
1135///
1136/// \headerfile <x86intrin.h>
1137///
1138/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1139///
1140/// \param __a
1141///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1142///    used in the comparison.
1143/// \param __b
1144///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1145///    used in the comparison.
1146/// \returns An integer containing the comparison results. If either of the
1147///     two lower 32-bit values is NaN, 1 is returned.
1148static __inline__ int __DEFAULT_FN_ATTRS
1149_mm_comineq_ss(__m128 __a, __m128 __b)
1150{
1151  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1152}
1153
1154/// Performs an unordered comparison of two 32-bit float values using
1155///    the low-order bits of both operands to determine equality and returns
1156///    the result of the comparison.
1157///
1158///    If either of the two lower 32-bit values is NaN, 0 is returned.
1159///
1160/// \headerfile <x86intrin.h>
1161///
1162/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1163///
1164/// \param __a
1165///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1166///    used in the comparison.
1167/// \param __b
1168///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1169///    used in the comparison.
1170/// \returns An integer containing the comparison results. If either of the two
1171///     lower 32-bit values is NaN, 0 is returned.
1172static __inline__ int __DEFAULT_FN_ATTRS
1173_mm_ucomieq_ss(__m128 __a, __m128 __b)
1174{
1175  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1176}
1177
1178/// Performs an unordered comparison of two 32-bit float values using
1179///    the low-order bits of both operands to determine if the first operand is
1180///    less than the second operand and returns the result of the comparison.
1181///
1182///    If either of the two lower 32-bit values is NaN, 0 is returned.
1183///
1184/// \headerfile <x86intrin.h>
1185///
1186/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1187///
1188/// \param __a
1189///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1190///    used in the comparison.
1191/// \param __b
1192///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1193///    used in the comparison.
1194/// \returns An integer containing the comparison results. If either of the two
1195///    lower 32-bit values is NaN, 0 is returned.
1196static __inline__ int __DEFAULT_FN_ATTRS
1197_mm_ucomilt_ss(__m128 __a, __m128 __b)
1198{
1199  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1200}
1201
1202/// Performs an unordered comparison of two 32-bit float values using
1203///    the low-order bits of both operands to determine if the first operand is
1204///    less than or equal to the second operand and returns the result of the
1205///    comparison.
1206///
1207///    If either of the two lower 32-bit values is NaN, 0 is returned.
1208///
1209/// \headerfile <x86intrin.h>
1210///
1211/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1212///
1213/// \param __a
1214///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1215///    used in the comparison.
1216/// \param __b
1217///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1218///    used in the comparison.
1219/// \returns An integer containing the comparison results. If either of the two
1220///     lower 32-bit values is NaN, 0 is returned.
1221static __inline__ int __DEFAULT_FN_ATTRS
1222_mm_ucomile_ss(__m128 __a, __m128 __b)
1223{
1224  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1225}
1226
1227/// Performs an unordered comparison of two 32-bit float values using
1228///    the low-order bits of both operands to determine if the first operand is
1229///    greater than the second operand and returns the result of the
1230///    comparison.
1231///
1232///    If either of the two lower 32-bit values is NaN, 0 is returned.
1233///
1234/// \headerfile <x86intrin.h>
1235///
1236/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1237///
1238/// \param __a
1239///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1240///    used in the comparison.
1241/// \param __b
1242///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1243///    used in the comparison.
1244/// \returns An integer containing the comparison results. If either of the two
1245///     lower 32-bit values is NaN, 0 is returned.
1246static __inline__ int __DEFAULT_FN_ATTRS
1247_mm_ucomigt_ss(__m128 __a, __m128 __b)
1248{
1249  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1250}
1251
1252/// Performs an unordered comparison of two 32-bit float values using
1253///    the low-order bits of both operands to determine if the first operand is
1254///    greater than or equal to the second operand and returns the result of
1255///    the comparison.
1256///
1257///    If either of the two lower 32-bit values is NaN, 0 is returned.
1258///
1259/// \headerfile <x86intrin.h>
1260///
1261/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1262///
1263/// \param __a
1264///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1265///    used in the comparison.
1266/// \param __b
1267///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1268///    used in the comparison.
1269/// \returns An integer containing the comparison results. If either of the two
1270///     lower 32-bit values is NaN, 0 is returned.
1271static __inline__ int __DEFAULT_FN_ATTRS
1272_mm_ucomige_ss(__m128 __a, __m128 __b)
1273{
1274  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1275}
1276
1277/// Performs an unordered comparison of two 32-bit float values using
1278///    the low-order bits of both operands to determine inequality and returns
1279///    the result of the comparison.
1280///
1281///    If either of the two lower 32-bit values is NaN, 1 is returned.
1282///
1283/// \headerfile <x86intrin.h>
1284///
1285/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1286///
1287/// \param __a
1288///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1289///    used in the comparison.
1290/// \param __b
1291///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1292///    used in the comparison.
1293/// \returns An integer containing the comparison results. If either of the two
1294///    lower 32-bit values is NaN, 1 is returned.
1295static __inline__ int __DEFAULT_FN_ATTRS
1296_mm_ucomineq_ss(__m128 __a, __m128 __b)
1297{
1298  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1299}
1300
1301/// Converts a float value contained in the lower 32 bits of a vector of
1302///    [4 x float] into a 32-bit integer.
1303///
1304/// \headerfile <x86intrin.h>
1305///
1306/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1307///   instructions.
1308///
1309/// \param __a
1310///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1311///    used in the conversion.
1312/// \returns A 32-bit integer containing the converted value.
1313static __inline__ int __DEFAULT_FN_ATTRS
1314_mm_cvtss_si32(__m128 __a)
1315{
1316  return __builtin_ia32_cvtss2si((__v4sf)__a);
1317}
1318
1319/// Converts a float value contained in the lower 32 bits of a vector of
1320///    [4 x float] into a 32-bit integer.
1321///
1322/// \headerfile <x86intrin.h>
1323///
1324/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1325///   instructions.
1326///
1327/// \param __a
1328///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1329///    used in the conversion.
1330/// \returns A 32-bit integer containing the converted value.
1331static __inline__ int __DEFAULT_FN_ATTRS
1332_mm_cvt_ss2si(__m128 __a)
1333{
1334  return _mm_cvtss_si32(__a);
1335}
1336
1337#ifdef __x86_64__
1338
1339/// Converts a float value contained in the lower 32 bits of a vector of
1340///    [4 x float] into a 64-bit integer.
1341///
1342/// \headerfile <x86intrin.h>
1343///
1344/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1345///   instructions.
1346///
1347/// \param __a
1348///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1349///    used in the conversion.
1350/// \returns A 64-bit integer containing the converted value.
1351static __inline__ long long __DEFAULT_FN_ATTRS
1352_mm_cvtss_si64(__m128 __a)
1353{
1354  return __builtin_ia32_cvtss2si64((__v4sf)__a);
1355}
1356
1357#endif
1358
1359/// Converts two low-order float values in a 128-bit vector of
1360///    [4 x float] into a 64-bit vector of [2 x i32].
1361///
1362/// \headerfile <x86intrin.h>
1363///
1364/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1365///
1366/// \param __a
1367///    A 128-bit vector of [4 x float].
1368/// \returns A 64-bit integer vector containing the converted values.
1369static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1370_mm_cvtps_pi32(__m128 __a)
1371{
1372  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1373}
1374
1375/// Converts two low-order float values in a 128-bit vector of
1376///    [4 x float] into a 64-bit vector of [2 x i32].
1377///
1378/// \headerfile <x86intrin.h>
1379///
1380/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1381///
1382/// \param __a
1383///    A 128-bit vector of [4 x float].
1384/// \returns A 64-bit integer vector containing the converted values.
1385static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1386_mm_cvt_ps2pi(__m128 __a)
1387{
1388  return _mm_cvtps_pi32(__a);
1389}
1390
1391/// Converts a float value contained in the lower 32 bits of a vector of
1392///    [4 x float] into a 32-bit integer, truncating the result when it is
1393///    inexact.
1394///
1395/// \headerfile <x86intrin.h>
1396///
1397/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1398///   instructions.
1399///
1400/// \param __a
1401///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1402///    used in the conversion.
1403/// \returns A 32-bit integer containing the converted value.
1404static __inline__ int __DEFAULT_FN_ATTRS
1405_mm_cvttss_si32(__m128 __a)
1406{
1407  return __builtin_ia32_cvttss2si((__v4sf)__a);
1408}
1409
1410/// Converts a float value contained in the lower 32 bits of a vector of
1411///    [4 x float] into a 32-bit integer, truncating the result when it is
1412///    inexact.
1413///
1414/// \headerfile <x86intrin.h>
1415///
1416/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1417///   instructions.
1418///
1419/// \param __a
1420///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1421///    used in the conversion.
1422/// \returns A 32-bit integer containing the converted value.
1423static __inline__ int __DEFAULT_FN_ATTRS
1424_mm_cvtt_ss2si(__m128 __a)
1425{
1426  return _mm_cvttss_si32(__a);
1427}
1428
1429#ifdef __x86_64__
1430/// Converts a float value contained in the lower 32 bits of a vector of
1431///    [4 x float] into a 64-bit integer, truncating the result when it is
1432///    inexact.
1433///
1434/// \headerfile <x86intrin.h>
1435///
1436/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1437///   instructions.
1438///
1439/// \param __a
1440///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1441///    used in the conversion.
1442/// \returns A 64-bit integer containing the converted value.
1443static __inline__ long long __DEFAULT_FN_ATTRS
1444_mm_cvttss_si64(__m128 __a)
1445{
1446  return __builtin_ia32_cvttss2si64((__v4sf)__a);
1447}
1448#endif
1449
1450/// Converts two low-order float values in a 128-bit vector of
1451///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1452///    when it is inexact.
1453///
1454/// \headerfile <x86intrin.h>
1455///
1456/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1457///   instructions.
1458///
1459/// \param __a
1460///    A 128-bit vector of [4 x float].
1461/// \returns A 64-bit integer vector containing the converted values.
1462static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1463_mm_cvttps_pi32(__m128 __a)
1464{
1465  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1466}
1467
1468/// Converts two low-order float values in a 128-bit vector of [4 x
1469///    float] into a 64-bit vector of [2 x i32], truncating the result when it
1470///    is inexact.
1471///
1472/// \headerfile <x86intrin.h>
1473///
1474/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1475///
1476/// \param __a
1477///    A 128-bit vector of [4 x float].
1478/// \returns A 64-bit integer vector containing the converted values.
1479static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1480_mm_cvtt_ps2pi(__m128 __a)
1481{
1482  return _mm_cvttps_pi32(__a);
1483}
1484
1485/// Converts a 32-bit signed integer value into a floating point value
1486///    and writes it to the lower 32 bits of the destination. The remaining
1487///    higher order elements of the destination vector are copied from the
1488///    corresponding elements in the first operand.
1489///
1490/// \headerfile <x86intrin.h>
1491///
1492/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1493///
1494/// \param __a
1495///    A 128-bit vector of [4 x float].
1496/// \param __b
1497///    A 32-bit signed integer operand containing the value to be converted.
1498/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1499///    converted value of the second operand. The upper 96 bits are copied from
1500///    the upper 96 bits of the first operand.
1501static __inline__ __m128 __DEFAULT_FN_ATTRS
1502_mm_cvtsi32_ss(__m128 __a, int __b)
1503{
1504  __a[0] = __b;
1505  return __a;
1506}
1507
1508/// Converts a 32-bit signed integer value into a floating point value
1509///    and writes it to the lower 32 bits of the destination. The remaining
1510///    higher order elements of the destination are copied from the
1511///    corresponding elements in the first operand.
1512///
1513/// \headerfile <x86intrin.h>
1514///
1515/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1516///
1517/// \param __a
1518///    A 128-bit vector of [4 x float].
1519/// \param __b
1520///    A 32-bit signed integer operand containing the value to be converted.
1521/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1522///    converted value of the second operand. The upper 96 bits are copied from
1523///    the upper 96 bits of the first operand.
1524static __inline__ __m128 __DEFAULT_FN_ATTRS
1525_mm_cvt_si2ss(__m128 __a, int __b)
1526{
1527  return _mm_cvtsi32_ss(__a, __b);
1528}
1529
1530#ifdef __x86_64__
1531
1532/// Converts a 64-bit signed integer value into a floating point value
1533///    and writes it to the lower 32 bits of the destination. The remaining
1534///    higher order elements of the destination are copied from the
1535///    corresponding elements in the first operand.
1536///
1537/// \headerfile <x86intrin.h>
1538///
1539/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1540///
1541/// \param __a
1542///    A 128-bit vector of [4 x float].
1543/// \param __b
1544///    A 64-bit signed integer operand containing the value to be converted.
1545/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1546///    converted value of the second operand. The upper 96 bits are copied from
1547///    the upper 96 bits of the first operand.
1548static __inline__ __m128 __DEFAULT_FN_ATTRS
1549_mm_cvtsi64_ss(__m128 __a, long long __b)
1550{
1551  __a[0] = __b;
1552  return __a;
1553}
1554
1555#endif
1556
1557/// Converts two elements of a 64-bit vector of [2 x i32] into two
1558///    floating point values and writes them to the lower 64-bits of the
1559///    destination. The remaining higher order elements of the destination are
1560///    copied from the corresponding elements in the first operand.
1561///
1562/// \headerfile <x86intrin.h>
1563///
1564/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1565///
1566/// \param __a
1567///    A 128-bit vector of [4 x float].
1568/// \param __b
1569///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1570///    and written to the corresponding low-order elements in the destination.
1571/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1572///    converted value of the second operand. The upper 64 bits are copied from
1573///    the upper 64 bits of the first operand.
1574static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1575_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1576{
1577  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1578}
1579
1580/// Converts two elements of a 64-bit vector of [2 x i32] into two
1581///    floating point values and writes them to the lower 64-bits of the
1582///    destination. The remaining higher order elements of the destination are
1583///    copied from the corresponding elements in the first operand.
1584///
1585/// \headerfile <x86intrin.h>
1586///
1587/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1588///
1589/// \param __a
1590///    A 128-bit vector of [4 x float].
1591/// \param __b
1592///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1593///    and written to the corresponding low-order elements in the destination.
1594/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1595///    converted value from the second operand. The upper 64 bits are copied
1596///    from the upper 64 bits of the first operand.
1597static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1598_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1599{
1600  return _mm_cvtpi32_ps(__a, __b);
1601}
1602
1603/// Extracts a float value contained in the lower 32 bits of a vector of
1604///    [4 x float].
1605///
1606/// \headerfile <x86intrin.h>
1607///
1608/// This intrinsic has no corresponding instruction.
1609///
1610/// \param __a
1611///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1612///    used in the extraction.
1613/// \returns A 32-bit float containing the extracted value.
1614static __inline__ float __DEFAULT_FN_ATTRS
1615_mm_cvtss_f32(__m128 __a)
1616{
1617  return __a[0];
1618}
1619
1620/// Loads two packed float values from the address \a __p into the
1621///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1622///     are copied from the low-order bits of the first operand.
1623///
1624/// \headerfile <x86intrin.h>
1625///
1626/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1627///
1628/// \param __a
1629///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1630///    of the destination.
1631/// \param __p
1632///    A pointer to two packed float values. Bits [63:0] are written to bits
1633///    [127:64] of the destination.
1634/// \returns A 128-bit vector of [4 x float] containing the moved values.
1635static __inline__ __m128 __DEFAULT_FN_ATTRS
1636_mm_loadh_pi(__m128 __a, const __m64 *__p)
1637{
1638  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1639  struct __mm_loadh_pi_struct {
1640    __mm_loadh_pi_v2f32 __u;
1641  } __attribute__((__packed__, __may_alias__));
1642  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
1643  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1644  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1645}
1646
1647/// Loads two packed float values from the address \a __p into the
1648///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1649///    are copied from the high-order bits of the first operand.
1650///
1651/// \headerfile <x86intrin.h>
1652///
1653/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1654///
1655/// \param __a
1656///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1657///    [127:64] of the destination.
1658/// \param __p
1659///    A pointer to two packed float values. Bits [63:0] are written to bits
1660///    [63:0] of the destination.
1661/// \returns A 128-bit vector of [4 x float] containing the moved values.
1662static __inline__ __m128 __DEFAULT_FN_ATTRS
1663_mm_loadl_pi(__m128 __a, const __m64 *__p)
1664{
1665  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1666  struct __mm_loadl_pi_struct {
1667    __mm_loadl_pi_v2f32 __u;
1668  } __attribute__((__packed__, __may_alias__));
1669  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
1670  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1671  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1672}
1673
1674/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1675///    32 bits of the vector are initialized with the single-precision
1676///    floating-point value loaded from a specified memory location. The upper
1677///    96 bits are set to zero.
1678///
1679/// \headerfile <x86intrin.h>
1680///
1681/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1682///
1683/// \param __p
1684///    A pointer to a 32-bit memory location containing a single-precision
1685///    floating-point value.
1686/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1687///    lower 32 bits contain the value loaded from the memory location. The
1688///    upper 96 bits are set to zero.
1689static __inline__ __m128 __DEFAULT_FN_ATTRS
1690_mm_load_ss(const float *__p)
1691{
1692  struct __mm_load_ss_struct {
1693    float __u;
1694  } __attribute__((__packed__, __may_alias__));
1695  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
1696  return __extension__ (__m128){ __u, 0, 0, 0 };
1697}
1698
1699/// Loads a 32-bit float value and duplicates it to all four vector
1700///    elements of a 128-bit vector of [4 x float].
1701///
1702/// \headerfile <x86intrin.h>
1703///
1704/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1705///    instruction.
1706///
1707/// \param __p
1708///    A pointer to a float value to be loaded and duplicated.
1709/// \returns A 128-bit vector of [4 x float] containing the loaded and
1710///    duplicated values.
1711static __inline__ __m128 __DEFAULT_FN_ATTRS
1712_mm_load1_ps(const float *__p)
1713{
1714  struct __mm_load1_ps_struct {
1715    float __u;
1716  } __attribute__((__packed__, __may_alias__));
1717  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
1718  return __extension__ (__m128){ __u, __u, __u, __u };
1719}
1720
/* Alternate spelling of _mm_load1_ps(): expands to the same call. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
1722
1723/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1724///    memory location.
1725///
1726/// \headerfile <x86intrin.h>
1727///
1728/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1729///
1730/// \param __p
1731///    A pointer to a 128-bit memory location. The address of the memory
1732///    location has to be 128-bit aligned.
1733/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1734static __inline__ __m128 __DEFAULT_FN_ATTRS
1735_mm_load_ps(const float *__p)
1736{
1737  return *(__m128*)__p;
1738}
1739
1740/// Loads a 128-bit floating-point vector of [4 x float] from an
1741///    unaligned memory location.
1742///
1743/// \headerfile <x86intrin.h>
1744///
1745/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1746///
1747/// \param __p
1748///    A pointer to a 128-bit memory location. The address of the memory
1749///    location does not have to be aligned.
1750/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1751static __inline__ __m128 __DEFAULT_FN_ATTRS
1752_mm_loadu_ps(const float *__p)
1753{
1754  struct __loadu_ps {
1755    __m128 __v;
1756  } __attribute__((__packed__, __may_alias__));
1757  return ((struct __loadu_ps*)__p)->__v;
1758}
1759
1760/// Loads four packed float values, in reverse order, from an aligned
1761///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1762///
1763/// \headerfile <x86intrin.h>
1764///
1765/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1766///    instruction.
1767///
1768/// \param __p
1769///    A pointer to a 128-bit memory location. The address of the memory
1770///    location has to be 128-bit aligned.
1771/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1772///    in reverse order.
1773static __inline__ __m128 __DEFAULT_FN_ATTRS
1774_mm_loadr_ps(const float *__p)
1775{
1776  __m128 __a = _mm_load_ps(__p);
1777  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1778}
1779
1780/// Create a 128-bit vector of [4 x float] with undefined values.
1781///
1782/// \headerfile <x86intrin.h>
1783///
1784/// This intrinsic has no corresponding instruction.
1785///
1786/// \returns A 128-bit vector of [4 x float] containing undefined values.
1787static __inline__ __m128 __DEFAULT_FN_ATTRS
1788_mm_undefined_ps(void)
1789{
1790  return (__m128)__builtin_ia32_undef128();
1791}
1792
1793/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1794///    32 bits of the vector are initialized with the specified single-precision
1795///    floating-point value. The upper 96 bits are set to zero.
1796///
1797/// \headerfile <x86intrin.h>
1798///
1799/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1800///
1801/// \param __w
1802///    A single-precision floating-point value used to initialize the lower 32
1803///    bits of the result.
1804/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1805///    lower 32 bits contain the value provided in the source operand. The
1806///    upper 96 bits are set to zero.
1807static __inline__ __m128 __DEFAULT_FN_ATTRS
1808_mm_set_ss(float __w)
1809{
1810  return __extension__ (__m128){ __w, 0, 0, 0 };
1811}
1812
1813/// Constructs a 128-bit floating-point vector of [4 x float], with each
1814///    of the four single-precision floating-point vector elements set to the
1815///    specified single-precision floating-point value.
1816///
1817/// \headerfile <x86intrin.h>
1818///
1819/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1820///
1821/// \param __w
1822///    A single-precision floating-point value used to initialize each vector
1823///    element of the result.
1824/// \returns An initialized 128-bit floating-point vector of [4 x float].
1825static __inline__ __m128 __DEFAULT_FN_ATTRS
1826_mm_set1_ps(float __w)
1827{
1828  return __extension__ (__m128){ __w, __w, __w, __w };
1829}
1830
1831/* Microsoft specific. */
1832/// Constructs a 128-bit floating-point vector of [4 x float], with each
1833///    of the four single-precision floating-point vector elements set to the
1834///    specified single-precision floating-point value.
1835///
1836/// \headerfile <x86intrin.h>
1837///
1838/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1839///
1840/// \param __w
1841///    A single-precision floating-point value used to initialize each vector
1842///    element of the result.
1843/// \returns An initialized 128-bit floating-point vector of [4 x float].
1844static __inline__ __m128 __DEFAULT_FN_ATTRS
1845_mm_set_ps1(float __w)
1846{
1847    return _mm_set1_ps(__w);
1848}
1849
1850/// Constructs a 128-bit floating-point vector of [4 x float]
1851///    initialized with the specified single-precision floating-point values.
1852///
1853/// \headerfile <x86intrin.h>
1854///
1855/// This intrinsic is a utility function and does not correspond to a specific
1856///    instruction.
1857///
1858/// \param __z
1859///    A single-precision floating-point value used to initialize bits [127:96]
1860///    of the result.
1861/// \param __y
1862///    A single-precision floating-point value used to initialize bits [95:64]
1863///    of the result.
1864/// \param __x
1865///    A single-precision floating-point value used to initialize bits [63:32]
1866///    of the result.
1867/// \param __w
1868///    A single-precision floating-point value used to initialize bits [31:0]
1869///    of the result.
1870/// \returns An initialized 128-bit floating-point vector of [4 x float].
1871static __inline__ __m128 __DEFAULT_FN_ATTRS
1872_mm_set_ps(float __z, float __y, float __x, float __w)
1873{
1874  return __extension__ (__m128){ __w, __x, __y, __z };
1875}
1876
1877/// Constructs a 128-bit floating-point vector of [4 x float],
1878///    initialized in reverse order with the specified 32-bit single-precision
1879///    float-point values.
1880///
1881/// \headerfile <x86intrin.h>
1882///
1883/// This intrinsic is a utility function and does not correspond to a specific
1884///    instruction.
1885///
1886/// \param __z
1887///    A single-precision floating-point value used to initialize bits [31:0]
1888///    of the result.
1889/// \param __y
1890///    A single-precision floating-point value used to initialize bits [63:32]
1891///    of the result.
1892/// \param __x
1893///    A single-precision floating-point value used to initialize bits [95:64]
1894///    of the result.
1895/// \param __w
1896///    A single-precision floating-point value used to initialize bits [127:96]
1897///    of the result.
1898/// \returns An initialized 128-bit floating-point vector of [4 x float].
1899static __inline__ __m128 __DEFAULT_FN_ATTRS
1900_mm_setr_ps(float __z, float __y, float __x, float __w)
1901{
1902  return __extension__ (__m128){ __z, __y, __x, __w };
1903}
1904
1905/// Constructs a 128-bit floating-point vector of [4 x float] initialized
1906///    to zero.
1907///
1908/// \headerfile <x86intrin.h>
1909///
1910/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1911///
1912/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1913///    all elements set to zero.
1914static __inline__ __m128 __DEFAULT_FN_ATTRS
1915_mm_setzero_ps(void)
1916{
1917  return __extension__ (__m128){ 0, 0, 0, 0 };
1918}
1919
1920/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1921///    memory location.
1922///
1923/// \headerfile <x86intrin.h>
1924///
1925/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1926///
1927/// \param __p
1928///    A pointer to a 64-bit memory location.
1929/// \param __a
1930///    A 128-bit vector of [4 x float] containing the values to be stored.
1931static __inline__ void __DEFAULT_FN_ATTRS
1932_mm_storeh_pi(__m64 *__p, __m128 __a)
1933{
1934  __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
1935}
1936
1937/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1938///     memory location.
1939///
1940/// \headerfile <x86intrin.h>
1941///
1942/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1943///
1944/// \param __p
1945///    A pointer to a memory location that will receive the float values.
1946/// \param __a
1947///    A 128-bit vector of [4 x float] containing the values to be stored.
1948static __inline__ void __DEFAULT_FN_ATTRS
1949_mm_storel_pi(__m64 *__p, __m128 __a)
1950{
1951  __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
1952}
1953
1954/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1955///     memory location.
1956///
1957/// \headerfile <x86intrin.h>
1958///
1959/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1960///
1961/// \param __p
1962///    A pointer to a 32-bit memory location.
1963/// \param __a
1964///    A 128-bit vector of [4 x float] containing the value to be stored.
1965static __inline__ void __DEFAULT_FN_ATTRS
1966_mm_store_ss(float *__p, __m128 __a)
1967{
1968  struct __mm_store_ss_struct {
1969    float __u;
1970  } __attribute__((__packed__, __may_alias__));
1971  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1972}
1973
1974/// Stores a 128-bit vector of [4 x float] to an unaligned memory
1975///    location.
1976///
1977/// \headerfile <x86intrin.h>
1978///
1979/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1980///
1981/// \param __p
1982///    A pointer to a 128-bit memory location. The address of the memory
1983///    location does not have to be aligned.
1984/// \param __a
1985///    A 128-bit vector of [4 x float] containing the values to be stored.
1986static __inline__ void __DEFAULT_FN_ATTRS
1987_mm_storeu_ps(float *__p, __m128 __a)
1988{
1989  struct __storeu_ps {
1990    __m128 __v;
1991  } __attribute__((__packed__, __may_alias__));
1992  ((struct __storeu_ps*)__p)->__v = __a;
1993}
1994
1995/// Stores a 128-bit vector of [4 x float] into an aligned memory
1996///    location.
1997///
1998/// \headerfile <x86intrin.h>
1999///
2000/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2001///
2002/// \param __p
2003///    A pointer to a 128-bit memory location. The address of the memory
2004///    location has to be 16-byte aligned.
2005/// \param __a
2006///    A 128-bit vector of [4 x float] containing the values to be stored.
2007static __inline__ void __DEFAULT_FN_ATTRS
2008_mm_store_ps(float *__p, __m128 __a)
2009{
2010  *(__m128*)__p = __a;
2011}
2012
2013/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2014///    four contiguous elements in an aligned memory location.
2015///
2016/// \headerfile <x86intrin.h>
2017///
2018/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2019///    instruction.
2020///
2021/// \param __p
2022///    A pointer to a 128-bit memory location.
2023/// \param __a
2024///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2025///    of the four contiguous elements pointed by \a __p.
2026static __inline__ void __DEFAULT_FN_ATTRS
2027_mm_store1_ps(float *__p, __m128 __a)
2028{
2029  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2030  _mm_store_ps(__p, __a);
2031}
2032
2033/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2034///    four contiguous elements in an aligned memory location.
2035///
2036/// \headerfile <x86intrin.h>
2037///
2038/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2039///    instruction.
2040///
2041/// \param __p
2042///    A pointer to a 128-bit memory location.
2043/// \param __a
2044///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2045///    of the four contiguous elements pointed by \a __p.
2046static __inline__ void __DEFAULT_FN_ATTRS
2047_mm_store_ps1(float *__p, __m128 __a)
2048{
2049  _mm_store1_ps(__p, __a);
2050}
2051
2052/// Stores float values from a 128-bit vector of [4 x float] to an
2053///    aligned memory location in reverse order.
2054///
2055/// \headerfile <x86intrin.h>
2056///
2057/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2058///    instruction.
2059///
2060/// \param __p
2061///    A pointer to a 128-bit memory location. The address of the memory
2062///    location has to be 128-bit aligned.
2063/// \param __a
2064///    A 128-bit vector of [4 x float] containing the values to be stored.
2065static __inline__ void __DEFAULT_FN_ATTRS
2066_mm_storer_ps(float *__p, __m128 __a)
2067{
2068  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2069  _mm_store_ps(__p, __a);
2070}
2071
/* Hint selectors accepted by _mm_prefetch(), listed in ascending value.
   _MM_HINT_ET1 and _MM_HINT_ET0 are _MM_HINT_T1 and _MM_HINT_T0 with bit 2
   additionally set. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2078
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data. The
///    pointer may be const-qualified; it is only converted to
///    <c> const void * </c>.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    Bit 2 of \a sel is forwarded as the second argument of
///    __builtin_prefetch and bits [1:0] as its third argument.
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
#endif
2110
2111/// Stores a 64-bit integer in the specified aligned memory location. To
2112///    minimize caching, the data is flagged as non-temporal (unlikely to be
2113///    used again soon).
2114///
2115/// \headerfile <x86intrin.h>
2116///
2117/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2118///
2119/// \param __p
2120///    A pointer to an aligned memory location used to store the register value.
2121/// \param __a
2122///    A 64-bit integer containing the value to be stored.
2123static __inline__ void __DEFAULT_FN_ATTRS_MMX
2124_mm_stream_pi(__m64 *__p, __m64 __a)
2125{
2126  __builtin_ia32_movntq(__p, __a);
2127}
2128
2129/// Moves packed float values from a 128-bit vector of [4 x float] to a
2130///    128-bit aligned memory location. To minimize caching, the data is flagged
2131///    as non-temporal (unlikely to be used again soon).
2132///
2133/// \headerfile <x86intrin.h>
2134///
2135/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2136///
2137/// \param __p
2138///    A pointer to a 128-bit aligned memory location that will receive the
2139///    single-precision floating-point values.
2140/// \param __a
2141///    A 128-bit vector of [4 x float] containing the values to be moved.
2142static __inline__ void __DEFAULT_FN_ATTRS
2143_mm_stream_ps(float *__p, __m128 __a)
2144{
2145  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2146}
2147
#ifdef __cplusplus
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and the store instructions
///    that follow it: all previous stores are completed before any subsequent
///    store is executed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2166
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
// Arguments and the whole expansion are parenthesized so the macro behaves
// like a function call inside larger expressions.
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)))
2189
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the lower 16 bits of \a d: \n
///    0: The value is written to bits [15:0] of the destination. \n
///    1: The value is written to bits [31:16] of the destination. \n
///    2: The value is written to bits [47:32] of the destination. \n
///    3: The value is written to bits [63:48] of the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
// Arguments and the whole expansion are parenthesized so the macro behaves
// like a function call inside larger expressions.
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))
2220
2221/// Compares each of the corresponding packed 16-bit integer values of
2222///    the 64-bit integer vectors, and writes the greater value to the
2223///    corresponding bits in the destination.
2224///
2225/// \headerfile <x86intrin.h>
2226///
2227/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2228///
2229/// \param __a
2230///    A 64-bit integer vector containing one of the source operands.
2231/// \param __b
2232///    A 64-bit integer vector containing one of the source operands.
2233/// \returns A 64-bit integer vector containing the comparison results.
2234static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2235_mm_max_pi16(__m64 __a, __m64 __b)
2236{
2237  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2238}
2239
2240/// Compares each of the corresponding packed 8-bit unsigned integer
2241///    values of the 64-bit integer vectors, and writes the greater value to the
2242///    corresponding bits in the destination.
2243///
2244/// \headerfile <x86intrin.h>
2245///
2246/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2247///
2248/// \param __a
2249///    A 64-bit integer vector containing one of the source operands.
2250/// \param __b
2251///    A 64-bit integer vector containing one of the source operands.
2252/// \returns A 64-bit integer vector containing the comparison results.
2253static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2254_mm_max_pu8(__m64 __a, __m64 __b)
2255{
2256  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2257}
2258
2259/// Compares each of the corresponding packed 16-bit integer values of
2260///    the 64-bit integer vectors, and writes the lesser value to the
2261///    corresponding bits in the destination.
2262///
2263/// \headerfile <x86intrin.h>
2264///
2265/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2266///
2267/// \param __a
2268///    A 64-bit integer vector containing one of the source operands.
2269/// \param __b
2270///    A 64-bit integer vector containing one of the source operands.
2271/// \returns A 64-bit integer vector containing the comparison results.
2272static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2273_mm_min_pi16(__m64 __a, __m64 __b)
2274{
2275  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2276}
2277
2278/// Compares each of the corresponding packed 8-bit unsigned integer
2279///    values of the 64-bit integer vectors, and writes the lesser value to the
2280///    corresponding bits in the destination.
2281///
2282/// \headerfile <x86intrin.h>
2283///
2284/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2285///
2286/// \param __a
2287///    A 64-bit integer vector containing one of the source operands.
2288/// \param __b
2289///    A 64-bit integer vector containing one of the source operands.
2290/// \returns A 64-bit integer vector containing the comparison results.
2291static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2292_mm_min_pu8(__m64 __a, __m64 __b)
2293{
2294  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2295}
2296
2297/// Takes the most significant bit from each 8-bit element in a 64-bit
2298///    integer vector to create an 8-bit mask value. Zero-extends the value to
2299///    32-bit integer and writes it to the destination.
2300///
2301/// \headerfile <x86intrin.h>
2302///
2303/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2304///
2305/// \param __a
2306///    A 64-bit integer vector containing the values with bits to be extracted.
2307/// \returns The most significant bit from each 8-bit element in \a __a,
2308///    written to bits [7:0].
2309static __inline__ int __DEFAULT_FN_ATTRS_MMX
2310_mm_movemask_pi8(__m64 __a)
2311{
2312  return __builtin_ia32_pmovmskb((__v8qi)__a);
2313}
2314
2315/// Multiplies packed 16-bit unsigned integer values and writes the
2316///    high-order 16 bits of each 32-bit product to the corresponding bits in
2317///    the destination.
2318///
2319/// \headerfile <x86intrin.h>
2320///
2321/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2322///
2323/// \param __a
2324///    A 64-bit integer vector containing one of the source operands.
2325/// \param __b
2326///    A 64-bit integer vector containing one of the source operands.
2327/// \returns A 64-bit integer vector containing the products of both operands.
2328static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2329_mm_mulhi_pu16(__m64 __a, __m64 __b)
2330{
2331  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2332}
2333
2334/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
2335///    destination, as specified by the immediate value operand.
2336///
2337/// \headerfile <x86intrin.h>
2338///
2339/// \code
2340/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2341/// \endcode
2342///
2343/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
2344///
2345/// \param a
2346///    A 64-bit integer vector containing the values to be shuffled.
2347/// \param n
2348///    An immediate value containing an 8-bit value specifying which elements to
2349///    copy from \a a. The destinations within the 64-bit destination are
2350///    assigned values as follows: \n
2351///    Bits [1:0] are used to assign values to bits [15:0] in the
2352///    destination. \n
2353///    Bits [3:2] are used to assign values to bits [31:16] in the
2354///    destination. \n
2355///    Bits [5:4] are used to assign values to bits [47:32] in the
2356///    destination. \n
2357///    Bits [7:6] are used to assign values to bits [63:48] in the
2358///    destination. \n
2359///    Bit value assignments: \n
2360///    00: assigned from bits [15:0] of \a a. \n
2361///    01: assigned from bits [31:16] of \a a. \n
2362///    10: assigned from bits [47:32] of \a a. \n
2363///    11: assigned from bits [63:48] of \a a.
2364/// \returns A 64-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that the (__m64) cast applies to
   the complete builtin call even when the macro invocation is followed by a
   postfix operator or embedded in a larger expression. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2367
2368/// Conditionally copies the values from each 8-bit element in the first
2369///    64-bit integer vector operand to the specified memory location, as
2370///    specified by the most significant bit in the corresponding element in the
2371///    second 64-bit integer vector operand.
2372///
2373///    To minimize caching, the data is flagged as non-temporal
2374///    (unlikely to be used again soon).
2375///
2376/// \headerfile <x86intrin.h>
2377///
2378/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2379///
2380/// \param __d
2381///    A 64-bit integer vector containing the values with elements to be copied.
2382/// \param __n
2383///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2384///    element determines whether the corresponding element in operand \a __d
2385///    is copied. If the most significant bit of a given element is 1, the
2386///    corresponding element in operand \a __d is copied.
2387/// \param __p
2388///    A pointer to a 64-bit memory location that will receive the conditionally
2389///    copied integer values. The address of the memory location does not have
2390///    to be aligned.
2391static __inline__ void __DEFAULT_FN_ATTRS_MMX
2392_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2393{
2394  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2395}
2396
2397/// Computes the rounded averages of the packed unsigned 8-bit integer
2398///    values and writes the averages to the corresponding bits in the
2399///    destination.
2400///
2401/// \headerfile <x86intrin.h>
2402///
2403/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2404///
2405/// \param __a
2406///    A 64-bit integer vector containing one of the source operands.
2407/// \param __b
2408///    A 64-bit integer vector containing one of the source operands.
2409/// \returns A 64-bit integer vector containing the averages of both operands.
2410static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2411_mm_avg_pu8(__m64 __a, __m64 __b)
2412{
2413  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2414}
2415
2416/// Computes the rounded averages of the packed unsigned 16-bit integer
2417///    values and writes the averages to the corresponding bits in the
2418///    destination.
2419///
2420/// \headerfile <x86intrin.h>
2421///
2422/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2423///
2424/// \param __a
2425///    A 64-bit integer vector containing one of the source operands.
2426/// \param __b
2427///    A 64-bit integer vector containing one of the source operands.
2428/// \returns A 64-bit integer vector containing the averages of both operands.
2429static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2430_mm_avg_pu16(__m64 __a, __m64 __b)
2431{
2432  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2433}
2434
/// Subtracts the corresponding 8-bit unsigned integer values of the two
///    64-bit vector operands and computes the absolute value of each
///    difference. The sum of the 8 absolute differences is then written to
///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2439///
2440/// \headerfile <x86intrin.h>
2441///
2442/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2443///
2444/// \param __a
2445///    A 64-bit integer vector containing one of the source operands.
2446/// \param __b
2447///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector whose lower 16 bits contain the sum of the
///    absolute differences between both operands. The upper bits are
///    cleared.
2451static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2452_mm_sad_pu8(__m64 __a, __m64 __b)
2453{
2454  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2455}
2456
2457#if defined(__cplusplus)
2458extern "C" {
2459#endif
2460
2461/// Returns the contents of the MXCSR register as a 32-bit unsigned
2462///    integer value.
2463///
2464///    There are several groups of macros associated with this
2465///    intrinsic, including:
2466///    <ul>
2467///    <li>
2468///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2469///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2470///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2471///      _MM_GET_EXCEPTION_STATE().
2472///    </li>
2473///    <li>
2474///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2475///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2476///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2477///    </li>
2478///    <li>
2479///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2480///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2481///      _MM_GET_ROUNDING_MODE().
2482///    </li>
2483///    <li>
2484///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2485///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2486///    </li>
2487///    <li>
2488///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2489///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2490///      _MM_GET_DENORMALS_ZERO_MODE().
2491///    </li>
2492///    </ul>
2493///
2494///    For example, the following expression checks if an overflow exception has
2495///    occurred:
2496///    \code
2497///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2498///    \endcode
2499///
2500///    The following expression gets the current rounding mode:
2501///    \code
2502///      _MM_GET_ROUNDING_MODE()
2503///    \endcode
2504///
2505/// \headerfile <x86intrin.h>
2506///
2507/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2508///
2509/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2510///    register.
extern unsigned int _mm_getcsr(void);
2512
2513/// Sets the MXCSR register with the 32-bit unsigned integer value.
2514///
2515///    There are several groups of macros associated with this intrinsic,
2516///    including:
2517///    <ul>
2518///    <li>
2519///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2520///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2521///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2522///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2523///    </li>
2524///    <li>
2525///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2526///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2527///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2528///      of these macros.
2529///    </li>
2530///    <li>
2531///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2532///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2533///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2534///    </li>
2535///    <li>
2536///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2537///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2538///      one of these macros.
2539///    </li>
2540///    <li>
2541///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2542///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2543///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2544///    </li>
2545///    </ul>
2546///
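///    For example, the following expression causes subsequent floating-point
///    operations to round up (the rounding-control bits are cleared first so
///    that a previously selected mode cannot combine with the new one):
///    \code
///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
///    \endcode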
2550///
2551///    The following example sets the DAZ and FTZ flags:
2552///    \code
2553///    void setFlags() {
2554///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2555///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2556///    }
2557///    \endcode
2558///
2559/// \headerfile <x86intrin.h>
2560///
2561/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2562///
2563/// \param __i
2564///    A 32-bit unsigned integer value to be written to the MXCSR register.
extern void _mm_setcsr(unsigned int __i);
2566
2567#if defined(__cplusplus)
2568} // extern "C"
2569#endif
2570
2571/// Selects 4 float values from the 128-bit operands of [4 x float], as
2572///    specified by the immediate value operand.
2573///
2574/// \headerfile <x86intrin.h>
2575///
2576/// \code
2577/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2578/// \endcode
2579///
2580/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2581///
2582/// \param a
2583///    A 128-bit vector of [4 x float].
2584/// \param b
2585///    A 128-bit vector of [4 x float].
2586/// \param mask
2587///    An immediate value containing an 8-bit value specifying which elements to
2588///    copy from \a a and \a b. \n
2589///    Bits [3:0] specify the values copied from operand \a a. \n
2590///    Bits [7:4] specify the values copied from operand \a b. \n
2591///    The destinations within the 128-bit destination are assigned values as
2592///    follows: \n
2593///    Bits [1:0] are used to assign values to bits [31:0] in the
2594///    destination. \n
2595///    Bits [3:2] are used to assign values to bits [63:32] in the
2596///    destination. \n
2597///    Bits [5:4] are used to assign values to bits [95:64] in the
2598///    destination. \n
2599///    Bits [7:6] are used to assign values to bits [127:96] in the
2600///    destination. \n
2601///    Bit value assignments: \n
2602///    00: Bits [31:0] copied from the specified operand. \n
2603///    01: Bits [63:32] copied from the specified operand. \n
2604///    10: Bits [95:64] copied from the specified operand. \n
2605///    11: Bits [127:96] copied from the specified operand.
2606/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* The whole expansion is parenthesized so that the (__m128) cast applies to
   the complete builtin call even when the macro invocation is followed by a
   postfix operator (for example a vector subscript) or embedded in a larger
   expression. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2610
2611/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2612///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2613///
2614/// \headerfile <x86intrin.h>
2615///
2616/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2617///
2618/// \param __a
2619///    A 128-bit vector of [4 x float]. \n
2620///    Bits [95:64] are written to bits [31:0] of the destination. \n
2621///    Bits [127:96] are written to bits [95:64] of the destination.
2622/// \param __b
///    A 128-bit vector of [4 x float]. \n
2624///    Bits [95:64] are written to bits [63:32] of the destination. \n
2625///    Bits [127:96] are written to bits [127:96] of the destination.
2626/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2627static __inline__ __m128 __DEFAULT_FN_ATTRS
2628_mm_unpackhi_ps(__m128 __a, __m128 __b)
2629{
2630  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2631}
2632
2633/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2634///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2635///
2636/// \headerfile <x86intrin.h>
2637///
2638/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2639///
2640/// \param __a
2641///    A 128-bit vector of [4 x float]. \n
2642///    Bits [31:0] are written to bits [31:0] of the destination.  \n
2643///    Bits [63:32] are written to bits [95:64] of the destination.
2644/// \param __b
2645///    A 128-bit vector of [4 x float]. \n
2646///    Bits [31:0] are written to bits [63:32] of the destination. \n
2647///    Bits [63:32] are written to bits [127:96] of the destination.
2648/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2649static __inline__ __m128 __DEFAULT_FN_ATTRS
2650_mm_unpacklo_ps(__m128 __a, __m128 __b)
2651{
2652  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2653}
2654
2655/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2656///    32 bits are set to the lower 32 bits of the second parameter. The upper
2657///    96 bits are set to the upper 96 bits of the first parameter.
2658///
2659/// \headerfile <x86intrin.h>
2660///
2661/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2662///    instruction.
2663///
2664/// \param __a
2665///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2666///    written to the upper 96 bits of the result.
2667/// \param __b
2668///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2669///    written to the lower 32 bits of the result.
2670/// \returns A 128-bit floating-point vector of [4 x float].
2671static __inline__ __m128 __DEFAULT_FN_ATTRS
2672_mm_move_ss(__m128 __a, __m128 __b)
2673{
2674  __a[0] = __b[0];
2675  return __a;
2676}
2677
2678/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2679///    64 bits are set to the upper 64 bits of the second parameter. The upper
2680///    64 bits are set to the upper 64 bits of the first parameter.
2681///
2682/// \headerfile <x86intrin.h>
2683///
2684/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2685///
2686/// \param __a
2687///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2688///    written to the upper 64 bits of the result.
2689/// \param __b
2690///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2691///    written to the lower 64 bits of the result.
2692/// \returns A 128-bit floating-point vector of [4 x float].
2693static __inline__ __m128 __DEFAULT_FN_ATTRS
2694_mm_movehl_ps(__m128 __a, __m128 __b)
2695{
2696  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2697}
2698
2699/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2700///    64 bits are set to the lower 64 bits of the first parameter. The upper
2701///    64 bits are set to the lower 64 bits of the second parameter.
2702///
2703/// \headerfile <x86intrin.h>
2704///
2705/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2706///
2707/// \param __a
2708///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2709///    written to the lower 64 bits of the result.
2710/// \param __b
2711///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2712///    written to the upper 64 bits of the result.
2713/// \returns A 128-bit floating-point vector of [4 x float].
2714static __inline__ __m128 __DEFAULT_FN_ATTRS
2715_mm_movelh_ps(__m128 __a, __m128 __b)
2716{
2717  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2718}
2719
2720/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2721///    float].
2722///
2723/// \headerfile <x86intrin.h>
2724///
2725/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2726///
2727/// \param __a
2728///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2729///    from the corresponding elements in this operand.
2730/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2731///    values from the operand.
2732static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2733_mm_cvtpi16_ps(__m64 __a)
2734{
2735  __m64 __b, __c;
2736  __m128 __r;
2737
2738  __b = _mm_setzero_si64();
2739  __b = _mm_cmpgt_pi16(__b, __a);
2740  __c = _mm_unpackhi_pi16(__a, __b);
2741  __r = _mm_setzero_ps();
2742  __r = _mm_cvtpi32_ps(__r, __c);
2743  __r = _mm_movelh_ps(__r, __r);
2744  __c = _mm_unpacklo_pi16(__a, __b);
2745  __r = _mm_cvtpi32_ps(__r, __c);
2746
2747  return __r;
2748}
2749
2750/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2751///    128-bit vector of [4 x float].
2752///
2753/// \headerfile <x86intrin.h>
2754///
2755/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2756///
2757/// \param __a
2758///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2759///    destination are copied from the corresponding elements in this operand.
2760/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2761///    values from the operand.
2762static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2763_mm_cvtpu16_ps(__m64 __a)
2764{
2765  __m64 __b, __c;
2766  __m128 __r;
2767
2768  __b = _mm_setzero_si64();
2769  __c = _mm_unpackhi_pi16(__a, __b);
2770  __r = _mm_setzero_ps();
2771  __r = _mm_cvtpi32_ps(__r, __c);
2772  __r = _mm_movelh_ps(__r, __r);
2773  __c = _mm_unpacklo_pi16(__a, __b);
2774  __r = _mm_cvtpi32_ps(__r, __c);
2775
2776  return __r;
2777}
2778
2779/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2780///    into a 128-bit vector of [4 x float].
2781///
2782/// \headerfile <x86intrin.h>
2783///
2784/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2785///
2786/// \param __a
2787///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2788///    from the corresponding lower 4 elements in this operand.
2789/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2790///    values from the operand.
2791static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2792_mm_cvtpi8_ps(__m64 __a)
2793{
2794  __m64 __b;
2795
2796  __b = _mm_setzero_si64();
2797  __b = _mm_cmpgt_pi8(__b, __a);
2798  __b = _mm_unpacklo_pi8(__a, __b);
2799
2800  return _mm_cvtpi16_ps(__b);
2801}
2802
2803/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2804///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2805///
2806/// \headerfile <x86intrin.h>
2807///
2808/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2809///
2810/// \param __a
2811///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2812///    destination are copied from the corresponding lower 4 elements in this
2813///    operand.
2814/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2815///    values from the source operand.
2816static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2817_mm_cvtpu8_ps(__m64 __a)
2818{
2819  __m64 __b;
2820
2821  __b = _mm_setzero_si64();
2822  __b = _mm_unpacklo_pi8(__a, __b);
2823
2824  return _mm_cvtpi16_ps(__b);
2825}
2826
2827/// Converts the two 32-bit signed integer values from each 64-bit vector
2828///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2829///
2830/// \headerfile <x86intrin.h>
2831///
2832/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2833///
2834/// \param __a
2835///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2836///    copied from the elements in this operand.
2837/// \param __b
2838///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2839///    copied from the elements in this operand.
2840/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2841///    copied and converted values from the first operand. The upper 64 bits
2842///    contain the copied and converted values from the second operand.
2843static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2844_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2845{
2846  __m128 __c;
2847
2848  __c = _mm_setzero_ps();
2849  __c = _mm_cvtpi32_ps(__c, __b);
2850  __c = _mm_movelh_ps(__c, __c);
2851
2852  return _mm_cvtpi32_ps(__c, __a);
2853}
2854
2855/// Converts each single-precision floating-point element of a 128-bit
2856///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2857///    packs the results into a 64-bit integer vector of [4 x i16].
2858///
2859///    If the floating-point element is NaN or infinity, or if the
2860///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2861///    it is converted to 0x8000. Otherwise if the floating-point element is
2862///    greater than 0x7FFF, it is converted to 0x7FFF.
2863///
2864/// \headerfile <x86intrin.h>
2865///
2866/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2867///
2868/// \param __a
2869///    A 128-bit floating-point vector of [4 x float].
2870/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2871///    values.
2872static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2873_mm_cvtps_pi16(__m128 __a)
2874{
2875  __m64 __b, __c;
2876
2877  __b = _mm_cvtps_pi32(__a);
2878  __a = _mm_movehl_ps(__a, __a);
2879  __c = _mm_cvtps_pi32(__a);
2880
2881  return _mm_packs_pi32(__b, __c);
2882}
2883
2884/// Converts each single-precision floating-point element of a 128-bit
2885///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2886///    packs the results into the lower 32 bits of a 64-bit integer vector of
2887///    [8 x i8]. The upper 32 bits of the vector are set to 0.
2888///
2889///    If the floating-point element is NaN or infinity, or if the
2890///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2891///    is converted to 0x80. Otherwise if the floating-point element is greater
2892///    than 0x7F, it is converted to 0x7F.
2893///
2894/// \headerfile <x86intrin.h>
2895///
2896/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2897///
2898/// \param __a
2899///    128-bit floating-point vector of [4 x float].
2900/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
///    converted values and the upper 32 bits are set to zero.
2902static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2903_mm_cvtps_pi8(__m128 __a)
2904{
2905  __m64 __b, __c;
2906
2907  __b = _mm_cvtps_pi16(__a);
2908  __c = _mm_setzero_si64();
2909
2910  return _mm_packs_pi16(__b, __c);
2911}
2912
/// Extracts the sign bits from each single-precision floating-point
///    element of a 128-bit floating-point vector of [4 x float] and returns the
///    sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
///    to zero.
2917///
2918/// \headerfile <x86intrin.h>
2919///
2920/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2921///
2922/// \param __a
2923///    A 128-bit floating-point vector of [4 x float].
2924/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2925///    single-precision floating-point element of the parameter. Bits [31:4] are
2926///    set to zero.
2927static __inline__ int __DEFAULT_FN_ATTRS
2928_mm_movemask_ps(__m128 __a)
2929{
2930  return __builtin_ia32_movmskps((__v4sf)__a);
2931}
2932
2933
/* Attribute requesting 16-byte alignment. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Packs four 2-bit selectors into one 8-bit immediate: z occupies bits [7:6],
   y bits [5:4], x bits [3:2] and w bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) \
  (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* Exception state flags; _MM_EXCEPT_MASK is the OR of all six. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* Exception mask bits; _MM_MASK_MASK is the OR of all six. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* Rounding mode field; _MM_ROUND_MASK covers both of its bits. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

/* Flush-to-zero mode bit. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
2963
/* Field accessors built on _mm_getcsr()/_mm_setcsr(). Each getter masks the
   current register value down to one field; each setter clears that field in
   the current value, ORs in (x) and writes the result back. */

/* Exception mask bits. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))

/* Exception state flags. */
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))

/* Flush-to-zero mode. */
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))

/* Rounding mode. */
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2973
/* Transposes, in place, the 4x4 matrix whose rows are the four [4 x float]
   vectors row0..row3. Every argument must be a modifiable lvalue and is
   evaluated more than once, so it must not have side effects.
   The temporaries use reserved (double-underscore) names so that they cannot
   capture a caller variable or macro that happens to be named tmp0..tmp3;
   with ordinary names, passing a row variable called e.g. tmp0 would make the
   macro read its own uninitialized temporary instead of the caller's row. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2986
/* Aliases for compatibility: each _m_* spelling expands to the corresponding
   _mm_* intrinsic defined in this header. */
#define _m_pextrw   _mm_extract_pi16
#define _m_pinsrw   _mm_insert_pi16
#define _m_pmaxsw   _mm_max_pi16
#define _m_pmaxub   _mm_max_pu8
#define _m_pminsw   _mm_min_pi16
#define _m_pminub   _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw  _mm_mulhi_pu16
#define _m_pshufw   _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb    _mm_avg_pu8
#define _m_pavgw    _mm_avg_pu16
#define _m_psadbw   _mm_sad_pu8
/* Defined once; an identical second definition adds nothing. */
#define _m_ _mm_
3003
3004#undef __DEFAULT_FN_ATTRS
3005#undef __DEFAULT_FN_ATTRS_MMX
3006
3007/* Ugly hack for backwards-compatibility (compatible with gcc) */
3008#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3009#include <emmintrin.h>
3010#endif
3011
3012#endif /* __XMMINTRIN_H */
3013