xmmintrin.h revision 309124
1193326Sed/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2193326Sed *
3193326Sed * Permission is hereby granted, free of charge, to any person obtaining a copy
4193326Sed * of this software and associated documentation files (the "Software"), to deal
5193326Sed * in the Software without restriction, including without limitation the rights
6193326Sed * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7193326Sed * copies of the Software, and to permit persons to whom the Software is
8193326Sed * furnished to do so, subject to the following conditions:
9193326Sed *
10193326Sed * The above copyright notice and this permission notice shall be included in
11193326Sed * all copies or substantial portions of the Software.
12193326Sed *
13193326Sed * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14193326Sed * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15193326Sed * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16193326Sed * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17193326Sed * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18193326Sed * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19193326Sed * THE SOFTWARE.
20193326Sed *
21193326Sed *===-----------------------------------------------------------------------===
22193326Sed */
23296417Sdim
24193326Sed#ifndef __XMMINTRIN_H
25193326Sed#define __XMMINTRIN_H
26193326Sed
27193326Sed#include <mmintrin.h>
28193326Sed
/* 128-bit vector of four floats; this is the type the intrinsics in this file
 * accept and return. */
typedef float __m128 __attribute__((__vector_size__(16)));

/* 16-byte vectors with an explicit element type, used for casts inside the
 * intrinsic bodies. */
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));

/* Unsigned element type, used by the bitwise intrinsics. */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
35309124Sdim
/* <mm_malloc.h> is only included in a hosted environment, because it depends
 * on a standard library to provide the allocation routines. */
38218893Sdim#if __STDC_HOSTED__
39193326Sed#include <mm_malloc.h>
40218893Sdim#endif
41193326Sed
/* Attribute set shared by the functions defined in this file: always inline,
 * no debug info, and the "sse" target feature. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse")))
44288943Sdim
45309124Sdim/// \brief Adds the 32-bit float values in the low-order bits of the operands.
46309124Sdim///
47309124Sdim/// \headerfile <x86intrin.h>
48309124Sdim///
49309124Sdim/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
50309124Sdim///
51309124Sdim/// \param __a
52309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
53309124Sdim///    The lower 32 bits of this operand are used in the calculation.
54309124Sdim/// \param __b
55309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
56309124Sdim///    The lower 32 bits of this operand are used in the calculation.
57309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
58309124Sdim///    of the lower 32 bits of both operands. The upper 96 bits are copied from
59309124Sdim///    the upper 96 bits of the first source operand.
60288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
61249423Sdim_mm_add_ss(__m128 __a, __m128 __b)
62193326Sed{
63249423Sdim  __a[0] += __b[0];
64249423Sdim  return __a;
65193326Sed}
66193326Sed
67309124Sdim/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
68309124Sdim///    the addition.
69309124Sdim///
70309124Sdim/// \headerfile <x86intrin.h>
71309124Sdim///
72309124Sdim/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
73309124Sdim///
74309124Sdim/// \param __a
75309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
76309124Sdim/// \param __b
77309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
78309124Sdim/// \returns A 128-bit vector of [4 x float] containing the sums of both
79309124Sdim///    operands.
80288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
81249423Sdim_mm_add_ps(__m128 __a, __m128 __b)
82193326Sed{
83309124Sdim  return (__m128)((__v4sf)__a + (__v4sf)__b);
84193326Sed}
85193326Sed
86309124Sdim/// \brief Subtracts the 32-bit float value in the low-order bits of the second
87309124Sdim///    operand from the corresponding value in the first operand.
88309124Sdim///
89309124Sdim/// \headerfile <x86intrin.h>
90309124Sdim///
91309124Sdim/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
92309124Sdim///
93309124Sdim/// \param __a
94309124Sdim///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
95309124Sdim///    of this operand are used in the calculation.
96309124Sdim/// \param __b
97309124Sdim///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
98309124Sdim///    bits of this operand are used in the calculation.
99309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
100309124Sdim///    difference of the lower 32 bits of both operands. The upper 96 bits are
101309124Sdim///    copied from the upper 96 bits of the first source operand.
102288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
103249423Sdim_mm_sub_ss(__m128 __a, __m128 __b)
104193326Sed{
105249423Sdim  __a[0] -= __b[0];
106249423Sdim  return __a;
107193326Sed}
108193326Sed
109309124Sdim/// \brief Subtracts each of the values of the second operand from the first
110309124Sdim///    operand, both of which are 128-bit vectors of [4 x float] and returns
111309124Sdim///    the results of the subtraction.
112309124Sdim///
113309124Sdim/// \headerfile <x86intrin.h>
114309124Sdim///
115309124Sdim/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
116309124Sdim///
117309124Sdim/// \param __a
118309124Sdim///    A 128-bit vector of [4 x float] containing the minuend.
119309124Sdim/// \param __b
120309124Sdim///    A 128-bit vector of [4 x float] containing the subtrahend.
121309124Sdim/// \returns A 128-bit vector of [4 x float] containing the differences between
122309124Sdim///    both operands.
123288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
124249423Sdim_mm_sub_ps(__m128 __a, __m128 __b)
125193326Sed{
126309124Sdim  return (__m128)((__v4sf)__a - (__v4sf)__b);
127193326Sed}
128193326Sed
129309124Sdim/// \brief Multiplies two 32-bit float values in the low-order bits of the
130309124Sdim///    operands.
131309124Sdim///
132309124Sdim/// \headerfile <x86intrin.h>
133309124Sdim///
134309124Sdim/// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
135309124Sdim///
136309124Sdim/// \param __a
137309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
138309124Sdim///    The lower 32 bits of this operand are used in the calculation.
139309124Sdim/// \param __b
140309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
141309124Sdim///    The lower 32 bits of this operand are used in the calculation.
142309124Sdim/// \returns A 128-bit vector of [4 x float] containing the product of the lower
143309124Sdim///    32 bits of both operands. The upper 96 bits are copied from the upper 96
144309124Sdim///    bits of the first source operand.
145288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
146249423Sdim_mm_mul_ss(__m128 __a, __m128 __b)
147193326Sed{
148249423Sdim  __a[0] *= __b[0];
149249423Sdim  return __a;
150193326Sed}
151193326Sed
152309124Sdim/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
153309124Sdim///    results of the multiplication.
154309124Sdim///
155309124Sdim/// \headerfile <x86intrin.h>
156309124Sdim///
157309124Sdim/// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
158309124Sdim///
159309124Sdim/// \param __a
160309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
161309124Sdim/// \param __b
162309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
163309124Sdim/// \returns A 128-bit vector of [4 x float] containing the products of both
164309124Sdim///    operands.
165288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
166249423Sdim_mm_mul_ps(__m128 __a, __m128 __b)
167193326Sed{
168309124Sdim  return (__m128)((__v4sf)__a * (__v4sf)__b);
169193326Sed}
170193326Sed
171309124Sdim/// \brief Divides the value in the low-order 32 bits of the first operand by
172309124Sdim///    the corresponding value in the second operand.
173309124Sdim///
174309124Sdim/// \headerfile <x86intrin.h>
175309124Sdim///
176309124Sdim/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
177309124Sdim///
178309124Sdim/// \param __a
179309124Sdim///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
180309124Sdim///    bits of this operand are used in the calculation.
181309124Sdim/// \param __b
182309124Sdim///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
183309124Sdim///    of this operand are used in the calculation.
184309124Sdim/// \returns A 128-bit vector of [4 x float] containing the quotients of the
185309124Sdim///    lower 32 bits of both operands. The upper 96 bits are copied from the
186309124Sdim///    upper 96 bits of the first source operand.
187288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
188249423Sdim_mm_div_ss(__m128 __a, __m128 __b)
189193326Sed{
190249423Sdim  __a[0] /= __b[0];
191249423Sdim  return __a;
192193326Sed}
193193326Sed
194309124Sdim/// \brief Divides two 128-bit vectors of [4 x float].
195309124Sdim///
196309124Sdim/// \headerfile <x86intrin.h>
197309124Sdim///
198309124Sdim/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
199309124Sdim///
200309124Sdim/// \param __a
201309124Sdim///    A 128-bit vector of [4 x float] containing the dividend.
202309124Sdim/// \param __b
203309124Sdim///    A 128-bit vector of [4 x float] containing the divisor.
204309124Sdim/// \returns A 128-bit vector of [4 x float] containing the quotients of both
205309124Sdim///    operands.
206288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
207249423Sdim_mm_div_ps(__m128 __a, __m128 __b)
208193326Sed{
209309124Sdim  return (__m128)((__v4sf)__a / (__v4sf)__b);
210193326Sed}
211193326Sed
212309124Sdim/// \brief Calculates the square root of the value stored in the low-order bits
213309124Sdim///    of a 128-bit vector of [4 x float].
214309124Sdim///
215309124Sdim/// \headerfile <x86intrin.h>
216309124Sdim///
217309124Sdim/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
218309124Sdim///
219309124Sdim/// \param __a
220309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
221309124Sdim///    used in the calculation.
222309124Sdim/// \returns A 128-bit vector of [4 x float] containing the square root of the
223309124Sdim///    value in the low-order bits of the operand.
224288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
225249423Sdim_mm_sqrt_ss(__m128 __a)
226193326Sed{
227309124Sdim  __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
228249423Sdim  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
229193326Sed}
230193326Sed
231309124Sdim/// \brief Calculates the square roots of the values stored in a 128-bit vector
232309124Sdim///    of [4 x float].
233309124Sdim///
234309124Sdim/// \headerfile <x86intrin.h>
235309124Sdim///
236309124Sdim/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
237309124Sdim///
238309124Sdim/// \param __a
239309124Sdim///    A 128-bit vector of [4 x float].
240309124Sdim/// \returns A 128-bit vector of [4 x float] containing the square roots of the
241309124Sdim///    values in the operand.
242288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
243249423Sdim_mm_sqrt_ps(__m128 __a)
244193326Sed{
245309124Sdim  return __builtin_ia32_sqrtps((__v4sf)__a);
246193326Sed}
247193326Sed
248309124Sdim/// \brief Calculates the approximate reciprocal of the value stored in the
249309124Sdim///    low-order bits of a 128-bit vector of [4 x float].
250309124Sdim///
251309124Sdim/// \headerfile <x86intrin.h>
252309124Sdim///
253309124Sdim/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
254309124Sdim///
255309124Sdim/// \param __a
256309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
257309124Sdim///    used in the calculation.
258309124Sdim/// \returns A 128-bit vector of [4 x float] containing the approximate
259309124Sdim///    reciprocal of the value in the low-order bits of the operand.
260288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
261249423Sdim_mm_rcp_ss(__m128 __a)
262193326Sed{
263309124Sdim  __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
264249423Sdim  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
265193326Sed}
266193326Sed
267309124Sdim/// \brief Calculates the approximate reciprocals of the values stored in a
268309124Sdim///    128-bit vector of [4 x float].
269309124Sdim///
270309124Sdim/// \headerfile <x86intrin.h>
271309124Sdim///
272309124Sdim/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
273309124Sdim///
274309124Sdim/// \param __a
275309124Sdim///    A 128-bit vector of [4 x float].
276309124Sdim/// \returns A 128-bit vector of [4 x float] containing the approximate
277309124Sdim///    reciprocals of the values in the operand.
278288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
279249423Sdim_mm_rcp_ps(__m128 __a)
280193326Sed{
281309124Sdim  return __builtin_ia32_rcpps((__v4sf)__a);
282193326Sed}
283193326Sed
284309124Sdim/// \brief Calculates the approximate reciprocal of the square root of the value
285309124Sdim///    stored in the low-order bits of a 128-bit vector of [4 x float].
286309124Sdim///
287309124Sdim/// \headerfile <x86intrin.h>
288309124Sdim///
289309124Sdim/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
290309124Sdim///
291309124Sdim/// \param __a
292309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
293309124Sdim///    used in the calculation.
294309124Sdim/// \returns A 128-bit vector of [4 x float] containing the approximate
295309124Sdim///    reciprocal of the square root of the value in the low-order bits of the
296309124Sdim///    operand.
297288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
298249423Sdim_mm_rsqrt_ss(__m128 __a)
299193326Sed{
300309124Sdim  __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
301249423Sdim  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
302193326Sed}
303193326Sed
304309124Sdim/// \brief Calculates the approximate reciprocals of the square roots of the
305309124Sdim///    values stored in a 128-bit vector of [4 x float].
306309124Sdim///
307309124Sdim/// \headerfile <x86intrin.h>
308309124Sdim///
309309124Sdim/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
310309124Sdim///
311309124Sdim/// \param __a
312309124Sdim///    A 128-bit vector of [4 x float].
313309124Sdim/// \returns A 128-bit vector of [4 x float] containing the approximate
314309124Sdim///    reciprocals of the square roots of the values in the operand.
315288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
316249423Sdim_mm_rsqrt_ps(__m128 __a)
317193326Sed{
318309124Sdim  return __builtin_ia32_rsqrtps((__v4sf)__a);
319193326Sed}
320193326Sed
321309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
322309124Sdim///    operands and returns the lesser value in the low-order bits of the
323309124Sdim///    vector of [4 x float].
324309124Sdim///
325309124Sdim/// \headerfile <x86intrin.h>
326309124Sdim///
327309124Sdim/// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
328309124Sdim///
329309124Sdim/// \param __a
330309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
331309124Sdim///    32 bits of this operand are used in the comparison.
332309124Sdim/// \param __b
333309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
334309124Sdim///    32 bits of this operand are used in the comparison.
335309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
336309124Sdim///    minimum value between both operands. The upper 96 bits are copied from
337309124Sdim///    the upper 96 bits of the first source operand.
338288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
339249423Sdim_mm_min_ss(__m128 __a, __m128 __b)
340193326Sed{
341309124Sdim  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
342193326Sed}
343193326Sed
344309124Sdim/// \brief Compares two 128-bit vectors of [4 x float] and returns the
345309124Sdim///    lesser of each pair of values.
346309124Sdim///
347309124Sdim/// \headerfile <x86intrin.h>
348309124Sdim///
349309124Sdim/// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
350309124Sdim///
351309124Sdim/// \param __a
352309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands.
353309124Sdim/// \param __b
354309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands.
355309124Sdim/// \returns A 128-bit vector of [4 x float] containing the minimum values
356309124Sdim///    between both operands.
357288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
358249423Sdim_mm_min_ps(__m128 __a, __m128 __b)
359193326Sed{
360309124Sdim  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
361193326Sed}
362193326Sed
363309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
364309124Sdim///    operands and returns the greater value in the low-order bits of
365309124Sdim///    a vector [4 x float].
366309124Sdim///
367309124Sdim/// \headerfile <x86intrin.h>
368309124Sdim///
369309124Sdim/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
370309124Sdim///
371309124Sdim/// \param __a
372309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
373309124Sdim///    32 bits of this operand are used in the comparison.
374309124Sdim/// \param __b
375309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
376309124Sdim///    32 bits of this operand are used in the comparison.
377309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
378309124Sdim///    maximum value between both operands. The upper 96 bits are copied from
379309124Sdim///    the upper 96 bits of the first source operand.
380288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
381249423Sdim_mm_max_ss(__m128 __a, __m128 __b)
382193326Sed{
383309124Sdim  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
384193326Sed}
385193326Sed
386309124Sdim/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
387309124Sdim///    of each pair of values.
388309124Sdim///
389309124Sdim/// \headerfile <x86intrin.h>
390309124Sdim///
391309124Sdim/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
392309124Sdim///
393309124Sdim/// \param __a
394309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands.
395309124Sdim/// \param __b
396309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands.
397309124Sdim/// \returns A 128-bit vector of [4 x float] containing the maximum values
398309124Sdim///    between both operands.
399288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
400249423Sdim_mm_max_ps(__m128 __a, __m128 __b)
401193326Sed{
402309124Sdim  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
403193326Sed}
404193326Sed
405309124Sdim/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
406309124Sdim///
407309124Sdim/// \headerfile <x86intrin.h>
408309124Sdim///
409309124Sdim/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
410309124Sdim///
411309124Sdim/// \param __a
412309124Sdim///    A 128-bit vector containing one of the source operands.
413309124Sdim/// \param __b
414309124Sdim///    A 128-bit vector containing one of the source operands.
415309124Sdim/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
416309124Sdim///    values between both operands.
417288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
418249423Sdim_mm_and_ps(__m128 __a, __m128 __b)
419193326Sed{
420309124Sdim  return (__m128)((__v4su)__a & (__v4su)__b);
421193326Sed}
422193326Sed
423309124Sdim/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
424309124Sdim///    the one's complement of the values contained in the first source
425309124Sdim///    operand.
426309124Sdim///
427309124Sdim/// \headerfile <x86intrin.h>
428309124Sdim///
429309124Sdim/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
430309124Sdim///
431309124Sdim/// \param __a
432309124Sdim///    A 128-bit vector of [4 x float] containing the first source operand. The
433309124Sdim///    one's complement of this value is used in the bitwise AND.
434309124Sdim/// \param __b
435309124Sdim///    A 128-bit vector of [4 x float] containing the second source operand.
436309124Sdim/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
437309124Sdim///    one's complement of the first operand and the values in the second
438309124Sdim///    operand.
439288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
440249423Sdim_mm_andnot_ps(__m128 __a, __m128 __b)
441193326Sed{
442309124Sdim  return (__m128)(~(__v4su)__a & (__v4su)__b);
443193326Sed}
444193326Sed
445309124Sdim/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
446309124Sdim///
447309124Sdim/// \headerfile <x86intrin.h>
448309124Sdim///
449309124Sdim/// This intrinsic corresponds to the \c VORPS / ORPS instructions.
450309124Sdim///
451309124Sdim/// \param __a
452309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
453309124Sdim/// \param __b
454309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
455309124Sdim/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
456309124Sdim///    values between both operands.
457288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
458249423Sdim_mm_or_ps(__m128 __a, __m128 __b)
459193326Sed{
460309124Sdim  return (__m128)((__v4su)__a | (__v4su)__b);
461193326Sed}
462193326Sed
463309124Sdim/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
464309124Sdim///    [4 x float].
465309124Sdim///
466309124Sdim/// \headerfile <x86intrin.h>
467309124Sdim///
468309124Sdim/// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
469309124Sdim///
470309124Sdim/// \param __a
471309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
472309124Sdim/// \param __b
473309124Sdim///    A 128-bit vector of [4 x float] containing one of the source operands.
474309124Sdim/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
475309124Sdim///    of the values between both operands.
476288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
477249423Sdim_mm_xor_ps(__m128 __a, __m128 __b)
478193326Sed{
479309124Sdim  return (__m128)((__v4su)__a ^ (__v4su)__b);
480193326Sed}
481193326Sed
482309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
483309124Sdim///    operands for equality and returns the result of the comparison in the
484309124Sdim///    low-order bits of a vector [4 x float].
485309124Sdim///
486309124Sdim/// \headerfile <x86intrin.h>
487309124Sdim///
488309124Sdim/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
489309124Sdim///
490309124Sdim/// \param __a
491309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
492309124Sdim///    32 bits of this operand are used in the comparison.
493309124Sdim/// \param __b
494309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
495309124Sdim///    32 bits of this operand are used in the comparison.
496309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
497309124Sdim///    in the low-order bits.
498288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
499249423Sdim_mm_cmpeq_ss(__m128 __a, __m128 __b)
500193326Sed{
501309124Sdim  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
502193326Sed}
503193326Sed
504309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
505309124Sdim///    128-bit vectors of [4 x float] for equality.
506309124Sdim///
507309124Sdim/// \headerfile <x86intrin.h>
508309124Sdim///
509309124Sdim/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
510309124Sdim///
511309124Sdim/// \param __a
512309124Sdim///    A 128-bit vector of [4 x float].
513309124Sdim/// \param __b
514309124Sdim///    A 128-bit vector of [4 x float].
515309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
516288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
517249423Sdim_mm_cmpeq_ps(__m128 __a, __m128 __b)
518193326Sed{
519309124Sdim  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
520193326Sed}
521193326Sed
522309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
523309124Sdim///    operands to determine if the value in the first operand is less than the
524309124Sdim///    corresponding value in the second operand and returns the result of the
525309124Sdim///    comparison in the low-order bits of a vector of [4 x float].
526309124Sdim///
527309124Sdim/// \headerfile <x86intrin.h>
528309124Sdim///
529309124Sdim/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
530309124Sdim///
531309124Sdim/// \param __a
532309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
533309124Sdim///    32 bits of this operand are used in the comparison.
534309124Sdim/// \param __b
535309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
536309124Sdim///    32 bits of this operand are used in the comparison.
537309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
538309124Sdim///    in the low-order bits.
539288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
540249423Sdim_mm_cmplt_ss(__m128 __a, __m128 __b)
541193326Sed{
542309124Sdim  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
543193326Sed}
544193326Sed
545309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
546309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
547309124Sdim///    operand are less than those in the second operand.
548309124Sdim///
549309124Sdim/// \headerfile <x86intrin.h>
550309124Sdim///
551309124Sdim/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
552309124Sdim///
553309124Sdim/// \param __a
554309124Sdim///    A 128-bit vector of [4 x float].
555309124Sdim/// \param __b
556309124Sdim///    A 128-bit vector of [4 x float].
557309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
558288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
559249423Sdim_mm_cmplt_ps(__m128 __a, __m128 __b)
560193326Sed{
561309124Sdim  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
562193326Sed}
563193326Sed
564309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
565309124Sdim///    operands to determine if the value in the first operand is less than or
566309124Sdim///    equal to the corresponding value in the second operand and returns the
567309124Sdim///    result of the comparison in the low-order bits of a vector of
568309124Sdim///    [4 x float].
569309124Sdim///
570309124Sdim/// \headerfile <x86intrin.h>
571309124Sdim///
572309124Sdim/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
573309124Sdim///
574309124Sdim/// \param __a
575309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
576309124Sdim///    32 bits of this operand are used in the comparison.
577309124Sdim/// \param __b
578309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
579309124Sdim///    32 bits of this operand are used in the comparison.
580309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
581309124Sdim///    in the low-order bits.
582288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
583249423Sdim_mm_cmple_ss(__m128 __a, __m128 __b)
584193326Sed{
585309124Sdim  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
586193326Sed}
587193326Sed
588309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
589309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
590309124Sdim///    operand are less than or equal to those in the second operand.
591309124Sdim///
592309124Sdim/// \headerfile <x86intrin.h>
593309124Sdim///
594309124Sdim/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
595309124Sdim///
596309124Sdim/// \param __a
597309124Sdim///    A 128-bit vector of [4 x float].
598309124Sdim/// \param __b
599309124Sdim///    A 128-bit vector of [4 x float].
600309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
601288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
602249423Sdim_mm_cmple_ps(__m128 __a, __m128 __b)
603193326Sed{
604309124Sdim  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
605193326Sed}
606193326Sed
607309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
608309124Sdim///    operands to determine if the value in the first operand is greater than
609309124Sdim///    the corresponding value in the second operand and returns the result of
610309124Sdim///    the comparison in the low-order bits of a vector of [4 x float].
611309124Sdim///
612309124Sdim/// \headerfile <x86intrin.h>
613309124Sdim///
614309124Sdim/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
615309124Sdim///
616309124Sdim/// \param __a
617309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
618309124Sdim///    32 bits of this operand are used in the comparison.
619309124Sdim/// \param __b
620309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
621309124Sdim///    32 bits of this operand are used in the comparison.
622309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
623309124Sdim///    in the low-order bits.
624288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
625249423Sdim_mm_cmpgt_ss(__m128 __a, __m128 __b)
626193326Sed{
627309124Sdim  return (__m128)__builtin_shufflevector((__v4sf)__a,
628309124Sdim                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
629261991Sdim                                         4, 1, 2, 3);
630193326Sed}
631193326Sed
632309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
633309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
634309124Sdim///    operand are greater than those in the second operand.
635309124Sdim///
636309124Sdim/// \headerfile <x86intrin.h>
637309124Sdim///
638309124Sdim/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
639309124Sdim///
640309124Sdim/// \param __a
641309124Sdim///    A 128-bit vector of [4 x float].
642309124Sdim/// \param __b
643309124Sdim///    A 128-bit vector of [4 x float].
644309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
645288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
646249423Sdim_mm_cmpgt_ps(__m128 __a, __m128 __b)
647193326Sed{
648309124Sdim  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
649193326Sed}
650193326Sed
651309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
652309124Sdim///    operands to determine if the value in the first operand is greater than
653309124Sdim///    or equal to the corresponding value in the second operand and returns
654309124Sdim///    the result of the comparison in the low-order bits of a vector of
655309124Sdim///    [4 x float].
656309124Sdim///
657309124Sdim/// \headerfile <x86intrin.h>
658309124Sdim///
659309124Sdim/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
660309124Sdim///
661309124Sdim/// \param __a
662309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
663309124Sdim///    32 bits of this operand are used in the comparison.
664309124Sdim/// \param __b
665309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
666309124Sdim///    32 bits of this operand are used in the comparison.
667309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
668309124Sdim///    in the low-order bits.
669288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
670249423Sdim_mm_cmpge_ss(__m128 __a, __m128 __b)
671193326Sed{
672309124Sdim  return (__m128)__builtin_shufflevector((__v4sf)__a,
673309124Sdim                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
674261991Sdim                                         4, 1, 2, 3);
675193326Sed}
676193326Sed
677309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
678309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
679309124Sdim///    operand are greater than or equal to those in the second operand.
680309124Sdim///
681309124Sdim/// \headerfile <x86intrin.h>
682309124Sdim///
683309124Sdim/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
684309124Sdim///
685309124Sdim/// \param __a
686309124Sdim///    A 128-bit vector of [4 x float].
687309124Sdim/// \param __b
688309124Sdim///    A 128-bit vector of [4 x float].
689309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
690288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
691249423Sdim_mm_cmpge_ps(__m128 __a, __m128 __b)
692193326Sed{
693309124Sdim  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
694193326Sed}
695193326Sed
696309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
697309124Sdim///    operands for inequality and returns the result of the comparison in the
698309124Sdim///    low-order bits of a vector of [4 x float].
699309124Sdim///
700309124Sdim/// \headerfile <x86intrin.h>
701309124Sdim///
702309124Sdim/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
703309124Sdim///
704309124Sdim/// \param __a
705309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
706309124Sdim///    32 bits of this operand are used in the comparison.
707309124Sdim/// \param __b
708309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
709309124Sdim///    32 bits of this operand are used in the comparison.
710309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
711309124Sdim///    in the low-order bits.
712288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
713249423Sdim_mm_cmpneq_ss(__m128 __a, __m128 __b)
714193326Sed{
715309124Sdim  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
716193326Sed}
717193326Sed
718309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
719309124Sdim///    128-bit vectors of [4 x float] for inequality.
720309124Sdim///
721309124Sdim/// \headerfile <x86intrin.h>
722309124Sdim///
723309124Sdim/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
724309124Sdim///
725309124Sdim/// \param __a
726309124Sdim///    A 128-bit vector of [4 x float].
727309124Sdim/// \param __b
728309124Sdim///    A 128-bit vector of [4 x float].
729309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
730288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
731249423Sdim_mm_cmpneq_ps(__m128 __a, __m128 __b)
732193326Sed{
733309124Sdim  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
734193326Sed}
735193326Sed
736309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
737309124Sdim///    operands to determine if the value in the first operand is not less than
738309124Sdim///    the corresponding value in the second operand and returns the result of
739309124Sdim///    the comparison in the low-order bits of a vector of [4 x float].
740309124Sdim///
741309124Sdim/// \headerfile <x86intrin.h>
742309124Sdim///
743309124Sdim/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
744309124Sdim///
745309124Sdim/// \param __a
746309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
747309124Sdim///    32 bits of this operand are used in the comparison.
748309124Sdim/// \param __b
749309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
750309124Sdim///    32 bits of this operand are used in the comparison.
751309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
752309124Sdim///    in the low-order bits.
753288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
754249423Sdim_mm_cmpnlt_ss(__m128 __a, __m128 __b)
755193326Sed{
756309124Sdim  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
757193326Sed}
758193326Sed
759309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
760309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
761309124Sdim///    operand are not less than those in the second operand.
762309124Sdim///
763309124Sdim/// \headerfile <x86intrin.h>
764309124Sdim///
765309124Sdim/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
766309124Sdim///
767309124Sdim/// \param __a
768309124Sdim///    A 128-bit vector of [4 x float].
769309124Sdim/// \param __b
770309124Sdim///    A 128-bit vector of [4 x float].
771309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
772288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
773249423Sdim_mm_cmpnlt_ps(__m128 __a, __m128 __b)
774193326Sed{
775309124Sdim  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
776193326Sed}
777193326Sed
778309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
779309124Sdim///    operands to determine if the value in the first operand is not less than
780309124Sdim///    or equal to the corresponding value in the second operand and returns
781309124Sdim///    the result of the comparison in the low-order bits of a vector of
782309124Sdim///    [4 x float].
783309124Sdim///
784309124Sdim/// \headerfile <x86intrin.h>
785309124Sdim///
786309124Sdim/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
787309124Sdim///
788309124Sdim/// \param __a
789309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
790309124Sdim///    32 bits of this operand are used in the comparison.
791309124Sdim/// \param __b
792309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
793309124Sdim///    32 bits of this operand are used in the comparison.
794309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
795309124Sdim///    in the low-order bits.
796288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
797249423Sdim_mm_cmpnle_ss(__m128 __a, __m128 __b)
798193326Sed{
799309124Sdim  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
800193326Sed}
801193326Sed
802309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
803309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
804309124Sdim///    operand are not less than or equal to those in the second operand.
805309124Sdim///
806309124Sdim/// \headerfile <x86intrin.h>
807309124Sdim///
808309124Sdim/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
809309124Sdim///
810309124Sdim/// \param __a
811309124Sdim///    A 128-bit vector of [4 x float].
812309124Sdim/// \param __b
813309124Sdim///    A 128-bit vector of [4 x float].
814309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
815288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
816249423Sdim_mm_cmpnle_ps(__m128 __a, __m128 __b)
817193326Sed{
818309124Sdim  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
819193326Sed}
820193326Sed
821309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
822309124Sdim///    operands to determine if the value in the first operand is not greater
823309124Sdim///    than the corresponding value in the second operand and returns the
824309124Sdim///    result of the comparison in the low-order bits of a vector of
825309124Sdim///    [4 x float].
826309124Sdim///
827309124Sdim/// \headerfile <x86intrin.h>
828309124Sdim///
829309124Sdim/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
830309124Sdim///
831309124Sdim/// \param __a
832309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
833309124Sdim///    32 bits of this operand are used in the comparison.
834309124Sdim/// \param __b
835309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
836309124Sdim///    32 bits of this operand are used in the comparison.
837309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
838309124Sdim///    in the low-order bits.
839288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
840249423Sdim_mm_cmpngt_ss(__m128 __a, __m128 __b)
841193326Sed{
842309124Sdim  return (__m128)__builtin_shufflevector((__v4sf)__a,
843309124Sdim                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
844261991Sdim                                         4, 1, 2, 3);
845193326Sed}
846193326Sed
847309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
848309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
849309124Sdim///    operand are not greater than those in the second operand.
850309124Sdim///
851309124Sdim/// \headerfile <x86intrin.h>
852309124Sdim///
853309124Sdim/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
854309124Sdim///
855309124Sdim/// \param __a
856309124Sdim///    A 128-bit vector of [4 x float].
857309124Sdim/// \param __b
858309124Sdim///    A 128-bit vector of [4 x float].
859309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
860288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
861249423Sdim_mm_cmpngt_ps(__m128 __a, __m128 __b)
862193326Sed{
863309124Sdim  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
864193326Sed}
865193326Sed
866309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
867309124Sdim///    operands to determine if the value in the first operand is not greater
868309124Sdim///    than or equal to the corresponding value in the second operand and
869309124Sdim///    returns the result of the comparison in the low-order bits of a vector
870309124Sdim///    of [4 x float].
871309124Sdim///
872309124Sdim/// \headerfile <x86intrin.h>
873309124Sdim///
874309124Sdim/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
875309124Sdim///
876309124Sdim/// \param __a
877309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
878309124Sdim///    32 bits of this operand are used in the comparison.
879309124Sdim/// \param __b
880309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
881309124Sdim///    32 bits of this operand are used in the comparison.
882309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
883309124Sdim///    in the low-order bits.
884288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
885249423Sdim_mm_cmpnge_ss(__m128 __a, __m128 __b)
886193326Sed{
887309124Sdim  return (__m128)__builtin_shufflevector((__v4sf)__a,
888309124Sdim                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
889261991Sdim                                         4, 1, 2, 3);
890193326Sed}
891193326Sed
892309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
893309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
894309124Sdim///    operand are not greater than or equal to those in the second operand.
895309124Sdim///
896309124Sdim/// \headerfile <x86intrin.h>
897309124Sdim///
898309124Sdim/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
899309124Sdim///
900309124Sdim/// \param __a
901309124Sdim///    A 128-bit vector of [4 x float].
902309124Sdim/// \param __b
903309124Sdim///    A 128-bit vector of [4 x float].
904309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
905288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
906249423Sdim_mm_cmpnge_ps(__m128 __a, __m128 __b)
907193326Sed{
908309124Sdim  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
909193326Sed}
910193326Sed
911309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
912309124Sdim///    operands to determine if the value in the first operand is ordered with
913309124Sdim///    respect to the corresponding value in the second operand and returns the
914309124Sdim///    result of the comparison in the low-order bits of a vector of
915309124Sdim///    [4 x float].
916309124Sdim///
917309124Sdim/// \headerfile <x86intrin.h>
918309124Sdim///
919309124Sdim/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
920309124Sdim///
921309124Sdim/// \param __a
922309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
923309124Sdim///    32 bits of this operand are used in the comparison.
924309124Sdim/// \param __b
925309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
926309124Sdim///    32 bits of this operand are used in the comparison.
927309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
928309124Sdim///    in the low-order bits.
929288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
930249423Sdim_mm_cmpord_ss(__m128 __a, __m128 __b)
931193326Sed{
932309124Sdim  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
933193326Sed}
934193326Sed
935309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
936309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
937309124Sdim///    operand are ordered with respect to those in the second operand.
938309124Sdim///
939309124Sdim/// \headerfile <x86intrin.h>
940309124Sdim///
941309124Sdim/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
942309124Sdim///
943309124Sdim/// \param __a
944309124Sdim///    A 128-bit vector of [4 x float].
945309124Sdim/// \param __b
946309124Sdim///    A 128-bit vector of [4 x float].
947309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
948288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
949249423Sdim_mm_cmpord_ps(__m128 __a, __m128 __b)
950193326Sed{
951309124Sdim  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
952193326Sed}
953193326Sed
954309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
955309124Sdim///    operands to determine if the value in the first operand is unordered
956309124Sdim///    with respect to the corresponding value in the second operand and
957309124Sdim///    returns the result of the comparison in the low-order bits of a vector
958309124Sdim///    of [4 x float].
959309124Sdim///
960309124Sdim/// \headerfile <x86intrin.h>
961309124Sdim///
962309124Sdim/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
963309124Sdim///
964309124Sdim/// \param __a
965309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
966309124Sdim///    32 bits of this operand are used in the comparison.
967309124Sdim/// \param __b
968309124Sdim///    A 128-bit vector of [4 x float] containing one of the operands. The lower
969309124Sdim///    32 bits of this operand are used in the comparison.
970309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results
971309124Sdim///    in the low-order bits.
972288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
973249423Sdim_mm_cmpunord_ss(__m128 __a, __m128 __b)
974193326Sed{
975309124Sdim  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
976193326Sed}
977193326Sed
978309124Sdim/// \brief Compares each of the corresponding 32-bit float values of the
979309124Sdim///    128-bit vectors of [4 x float] to determine if the values in the first
980309124Sdim///    operand are unordered with respect to those in the second operand.
981309124Sdim///
982309124Sdim/// \headerfile <x86intrin.h>
983309124Sdim///
984309124Sdim/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
985309124Sdim///
986309124Sdim/// \param __a
987309124Sdim///    A 128-bit vector of [4 x float].
988309124Sdim/// \param __b
989309124Sdim///    A 128-bit vector of [4 x float].
990309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results.
991288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
992249423Sdim_mm_cmpunord_ps(__m128 __a, __m128 __b)
993193326Sed{
994309124Sdim  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
995193326Sed}
996193326Sed
997309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
998309124Sdim///    operands for equality and returns the result of the comparison.
999309124Sdim///
1000309124Sdim/// \headerfile <x86intrin.h>
1001309124Sdim///
1002309124Sdim/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1003309124Sdim///
1004309124Sdim/// \param __a
1005309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1006309124Sdim///    used in the comparison.
1007309124Sdim/// \param __b
1008309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009309124Sdim///    used in the comparison.
1010309124Sdim/// \returns An integer containing the comparison results.
1011288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1012249423Sdim_mm_comieq_ss(__m128 __a, __m128 __b)
1013193326Sed{
1014309124Sdim  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1015193326Sed}
1016193326Sed
1017309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
1018309124Sdim///    operands to determine if the first operand is less than the second
1019309124Sdim///    operand and returns the result of the comparison.
1020309124Sdim///
1021309124Sdim/// \headerfile <x86intrin.h>
1022309124Sdim///
1023309124Sdim/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1024309124Sdim///
1025309124Sdim/// \param __a
1026309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1027309124Sdim///    used in the comparison.
1028309124Sdim/// \param __b
1029309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1030309124Sdim///    used in the comparison.
1031309124Sdim/// \returns An integer containing the comparison results.
1032288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1033249423Sdim_mm_comilt_ss(__m128 __a, __m128 __b)
1034193326Sed{
1035309124Sdim  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1036193326Sed}
1037193326Sed
1038309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
1039309124Sdim///    operands to determine if the first operand is less than or equal to the
1040309124Sdim///    second operand and returns the result of the comparison.
1041309124Sdim///
1042309124Sdim/// \headerfile <x86intrin.h>
1043309124Sdim///
1044309124Sdim/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1045309124Sdim///
1046309124Sdim/// \param __a
1047309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1048309124Sdim///    used in the comparison.
1049309124Sdim/// \param __b
1050309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1051309124Sdim///    used in the comparison.
1052309124Sdim/// \returns An integer containing the comparison results.
1053288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1054249423Sdim_mm_comile_ss(__m128 __a, __m128 __b)
1055193326Sed{
1056309124Sdim  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1057193326Sed}
1058193326Sed
1059309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
1060309124Sdim///    operands to determine if the first operand is greater than the second
1061309124Sdim///    operand and returns the result of the comparison.
1062309124Sdim///
1063309124Sdim/// \headerfile <x86intrin.h>
1064309124Sdim///
1065309124Sdim/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1066309124Sdim///
1067309124Sdim/// \param __a
1068309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1069309124Sdim///    used in the comparison.
1070309124Sdim/// \param __b
1071309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1072309124Sdim///    used in the comparison.
1073309124Sdim/// \returns An integer containing the comparison results.
1074288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1075249423Sdim_mm_comigt_ss(__m128 __a, __m128 __b)
1076193326Sed{
1077309124Sdim  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1078193326Sed}
1079193326Sed
1080309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
1081309124Sdim///    operands to determine if the first operand is greater than or equal to
1082309124Sdim///    the second operand and returns the result of the comparison.
1083309124Sdim///
1084309124Sdim/// \headerfile <x86intrin.h>
1085309124Sdim///
1086309124Sdim/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1087309124Sdim///
1088309124Sdim/// \param __a
1089309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1090309124Sdim///    used in the comparison.
1091309124Sdim/// \param __b
1092309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093309124Sdim///    used in the comparison.
1094309124Sdim/// \returns An integer containing the comparison results.
1095288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1096249423Sdim_mm_comige_ss(__m128 __a, __m128 __b)
1097193326Sed{
1098309124Sdim  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1099193326Sed}
1100193326Sed
1101309124Sdim/// \brief Compares two 32-bit float values in the low-order bits of both
1102309124Sdim///    operands to determine if the first operand is not equal to the second
1103309124Sdim///    operand and returns the result of the comparison.
1104309124Sdim///
1105309124Sdim/// \headerfile <x86intrin.h>
1106309124Sdim///
1107309124Sdim/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1108309124Sdim///
1109309124Sdim/// \param __a
1110309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1111309124Sdim///    used in the comparison.
1112309124Sdim/// \param __b
1113309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1114309124Sdim///    used in the comparison.
1115309124Sdim/// \returns An integer containing the comparison results.
1116288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1117249423Sdim_mm_comineq_ss(__m128 __a, __m128 __b)
1118193326Sed{
1119309124Sdim  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1120193326Sed}
1121193326Sed
1122309124Sdim/// \brief Performs an unordered comparison of two 32-bit float values using
1123309124Sdim///    the low-order bits of both operands to determine equality and returns
1124309124Sdim///    the result of the comparison.
1125309124Sdim///
1126309124Sdim/// \headerfile <x86intrin.h>
1127309124Sdim///
1128309124Sdim/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1129309124Sdim///
1130309124Sdim/// \param __a
1131309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1132309124Sdim///    used in the comparison.
1133309124Sdim/// \param __b
1134309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1135309124Sdim///    used in the comparison.
1136309124Sdim/// \returns An integer containing the comparison results.
1137288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1138249423Sdim_mm_ucomieq_ss(__m128 __a, __m128 __b)
1139193326Sed{
1140309124Sdim  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1141193326Sed}
1142193326Sed
1143309124Sdim/// \brief Performs an unordered comparison of two 32-bit float values using
1144309124Sdim///    the low-order bits of both operands to determine if the first operand is
1145309124Sdim///    less than the second operand and returns the result of the comparison.
1146309124Sdim///
1147309124Sdim/// \headerfile <x86intrin.h>
1148309124Sdim///
1149309124Sdim/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1150309124Sdim///
1151309124Sdim/// \param __a
1152309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1153309124Sdim///    used in the comparison.
1154309124Sdim/// \param __b
1155309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1156309124Sdim///    used in the comparison.
1157309124Sdim/// \returns An integer containing the comparison results.
1158288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1159249423Sdim_mm_ucomilt_ss(__m128 __a, __m128 __b)
1160193326Sed{
1161309124Sdim  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1162193326Sed}
1163193326Sed
1164309124Sdim/// \brief Performs an unordered comparison of two 32-bit float values using
1165309124Sdim///    the low-order bits of both operands to determine if the first operand
1166309124Sdim///    is less than or equal to the second operand and returns the result of
1167309124Sdim///    the comparison.
1168309124Sdim///
1169309124Sdim/// \headerfile <x86intrin.h>
1170309124Sdim///
1171309124Sdim/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1172309124Sdim///
1173309124Sdim/// \param __a
1174309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1175309124Sdim///    used in the comparison.
1176309124Sdim/// \param __b
1177309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178309124Sdim///    used in the comparison.
1179309124Sdim/// \returns An integer containing the comparison results.
1180288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1181249423Sdim_mm_ucomile_ss(__m128 __a, __m128 __b)
1182193326Sed{
1183309124Sdim  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1184193326Sed}
1185193326Sed
1186309124Sdim/// \brief Performs an unordered comparison of two 32-bit float values using
1187309124Sdim///    the low-order bits of both operands to determine if the first operand
1188309124Sdim///    is greater than the second operand and returns the result of the
1189309124Sdim///    comparison.
1190309124Sdim///
1191309124Sdim/// \headerfile <x86intrin.h>
1192309124Sdim///
1193309124Sdim/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1194309124Sdim///
1195309124Sdim/// \param __a
1196309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1197309124Sdim///    used in the comparison.
1198309124Sdim/// \param __b
1199309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1200309124Sdim///    used in the comparison.
1201309124Sdim/// \returns An integer containing the comparison results.
1202288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1203249423Sdim_mm_ucomigt_ss(__m128 __a, __m128 __b)
1204193326Sed{
1205309124Sdim  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1206193326Sed}
1207193326Sed
1208309124Sdim/// \brief Performs an unordered comparison of two 32-bit float values using
1209309124Sdim///    the low-order bits of both operands to determine if the first operand is
1210309124Sdim///    greater than or equal to the second operand and returns the result of
1211309124Sdim///    the comparison.
1212309124Sdim///
1213309124Sdim/// \headerfile <x86intrin.h>
1214309124Sdim///
1215309124Sdim/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1216309124Sdim///
1217309124Sdim/// \param __a
1218309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1219309124Sdim///    used in the comparison.
1220309124Sdim/// \param __b
1221309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1222309124Sdim///    used in the comparison.
1223309124Sdim/// \returns An integer containing the comparison results.
1224288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1225249423Sdim_mm_ucomige_ss(__m128 __a, __m128 __b)
1226193326Sed{
1227309124Sdim  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1228193326Sed}
1229193326Sed
1230309124Sdim/// \brief Performs an unordered comparison of two 32-bit float values using
1231309124Sdim///    the low-order bits of both operands to determine inequality and returns
1232309124Sdim///    the result of the comparison.
1233309124Sdim///
1234309124Sdim/// \headerfile <x86intrin.h>
1235309124Sdim///
1236309124Sdim/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1237309124Sdim///
1238309124Sdim/// \param __a
1239309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1240309124Sdim///    used in the comparison.
1241309124Sdim/// \param __b
1242309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1243309124Sdim///    used in the comparison.
1244309124Sdim/// \returns An integer containing the comparison results.
1245288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1246249423Sdim_mm_ucomineq_ss(__m128 __a, __m128 __b)
1247193326Sed{
1248309124Sdim  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1249193326Sed}
1250193326Sed
1251309124Sdim/// \brief Converts a float value contained in the lower 32 bits of a vector of
1252309124Sdim///    [4 x float] into a 32-bit integer.
1253309124Sdim///
1254309124Sdim/// \headerfile <x86intrin.h>
1255309124Sdim///
1256309124Sdim/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1257309124Sdim///
1258309124Sdim/// \param __a
1259309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1260309124Sdim///    used in the conversion.
1261309124Sdim/// \returns A 32-bit integer containing the converted value.
1262288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1263249423Sdim_mm_cvtss_si32(__m128 __a)
1264193326Sed{
1265309124Sdim  return __builtin_ia32_cvtss2si((__v4sf)__a);
1266193326Sed}
1267193326Sed
1268309124Sdim/// \brief Converts a float value contained in the lower 32 bits of a vector of
1269309124Sdim///    [4 x float] into a 32-bit integer.
1270309124Sdim///
1271309124Sdim/// \headerfile <x86intrin.h>
1272309124Sdim///
1273309124Sdim/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1274309124Sdim///
1275309124Sdim/// \param __a
1276309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277309124Sdim///    used in the conversion.
1278309124Sdim/// \returns A 32-bit integer containing the converted value.
1279288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1280249423Sdim_mm_cvt_ss2si(__m128 __a)
1281204643Srdivacky{
1282249423Sdim  return _mm_cvtss_si32(__a);
1283204643Srdivacky}
1284204643Srdivacky
1285193576Sed#ifdef __x86_64__
1286193576Sed
1287309124Sdim/// \brief Converts a float value contained in the lower 32 bits of a vector of
1288309124Sdim///    [4 x float] into a 64-bit integer.
1289309124Sdim///
1290309124Sdim/// \headerfile <x86intrin.h>
1291309124Sdim///
1292309124Sdim/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1293309124Sdim///
1294309124Sdim/// \param __a
1295309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1296309124Sdim///    used in the conversion.
1297309124Sdim/// \returns A 64-bit integer containing the converted value.
1298288943Sdimstatic __inline__ long long __DEFAULT_FN_ATTRS
1299249423Sdim_mm_cvtss_si64(__m128 __a)
1300193326Sed{
1301309124Sdim  return __builtin_ia32_cvtss2si64((__v4sf)__a);
1302193326Sed}
1303193326Sed
1304193576Sed#endif
1305193576Sed
1306309124Sdim/// \brief Converts two low-order float values in a 128-bit vector of
1307309124Sdim///    [4 x float] into a 64-bit vector of [2 x i32].
1308309124Sdim///
1309309124Sdim/// \headerfile <x86intrin.h>
1310309124Sdim///
1311309124Sdim/// This intrinsic corresponds to the \c CVTPS2PI instruction.
1312309124Sdim///
1313309124Sdim/// \param __a
1314309124Sdim///    A 128-bit vector of [4 x float].
1315309124Sdim/// \returns A 64-bit integer vector containing the converted values.
1316288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
1317249423Sdim_mm_cvtps_pi32(__m128 __a)
1318193326Sed{
1319309124Sdim  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1320193326Sed}
1321193326Sed
1322309124Sdim/// \brief Converts two low-order float values in a 128-bit vector of
1323309124Sdim///    [4 x float] into a 64-bit vector of [2 x i32].
1324309124Sdim///
1325309124Sdim/// \headerfile <x86intrin.h>
1326309124Sdim///
1327309124Sdim/// This intrinsic corresponds to the \c CVTPS2PI instruction.
1328309124Sdim///
1329309124Sdim/// \param __a
1330309124Sdim///    A 128-bit vector of [4 x float].
1331309124Sdim/// \returns A 64-bit integer vector containing the converted values.
1332288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
1333249423Sdim_mm_cvt_ps2pi(__m128 __a)
1334212904Sdim{
1335249423Sdim  return _mm_cvtps_pi32(__a);
1336212904Sdim}
1337212904Sdim
1338309124Sdim/// \brief Converts a float value contained in the lower 32 bits of a vector of
1339309124Sdim///    [4 x float] into a 32-bit integer, truncating the result when it is
1340309124Sdim///    inexact.
1341309124Sdim///
1342309124Sdim/// \headerfile <x86intrin.h>
1343309124Sdim///
1344309124Sdim/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1345309124Sdim///
1346309124Sdim/// \param __a
1347309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1348309124Sdim///    used in the conversion.
1349309124Sdim/// \returns A 32-bit integer containing the converted value.
1350288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1351249423Sdim_mm_cvttss_si32(__m128 __a)
1352193326Sed{
1353309124Sdim  return __builtin_ia32_cvttss2si((__v4sf)__a);
1354193326Sed}
1355193326Sed
1356309124Sdim/// \brief Converts a float value contained in the lower 32 bits of a vector of
1357309124Sdim///    [4 x float] into a 32-bit integer, truncating the result when it is
1358309124Sdim///    inexact.
1359309124Sdim///
1360309124Sdim/// \headerfile <x86intrin.h>
1361309124Sdim///
1362309124Sdim/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1363309124Sdim///
1364309124Sdim/// \param __a
1365309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1366309124Sdim///    used in the conversion.
1367309124Sdim/// \returns A 32-bit integer containing the converted value.
1368288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
1369249423Sdim_mm_cvtt_ss2si(__m128 __a)
1370204643Srdivacky{
1371249423Sdim  return _mm_cvttss_si32(__a);
1372204643Srdivacky}
1373204643Srdivacky
1374309124Sdim/// \brief Converts a float value contained in the lower 32 bits of a vector of
1375309124Sdim///    [4 x float] into a 64-bit integer, truncating the result when it is
1376309124Sdim///    inexact.
1377309124Sdim///
1378309124Sdim/// \headerfile <x86intrin.h>
1379309124Sdim///
1380309124Sdim/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1381309124Sdim///
1382309124Sdim/// \param __a
1383309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1384309124Sdim///    used in the conversion.
1385309124Sdim/// \returns A 64-bit integer containing the converted value.
1386288943Sdimstatic __inline__ long long __DEFAULT_FN_ATTRS
1387249423Sdim_mm_cvttss_si64(__m128 __a)
1388193326Sed{
1389309124Sdim  return __builtin_ia32_cvttss2si64((__v4sf)__a);
1390193326Sed}
1391193326Sed
1392309124Sdim/// \brief Converts two low-order float values in a 128-bit vector of
1393309124Sdim///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1394309124Sdim///    when it is inexact.
1395309124Sdim///
1396309124Sdim/// \headerfile <x86intrin.h>
1397309124Sdim///
1398309124Sdim/// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions.
1399309124Sdim///
1400309124Sdim/// \param __a
1401309124Sdim///    A 128-bit vector of [4 x float].
1402309124Sdim/// \returns A 64-bit integer vector containing the converted values.
1403288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
1404249423Sdim_mm_cvttps_pi32(__m128 __a)
1405193326Sed{
1406309124Sdim  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1407193326Sed}
1408193326Sed
1409309124Sdim/// \brief Converts two low-order float values in a 128-bit vector of [4 x
1410309124Sdim///    float] into a 64-bit vector of [2 x i32], truncating the result when it
1411309124Sdim///    is inexact.
1412309124Sdim///
1413309124Sdim/// \headerfile <x86intrin.h>
1414309124Sdim///
1415309124Sdim/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
1416309124Sdim///
1417309124Sdim/// \param __a
1418309124Sdim///    A 128-bit vector of [4 x float].
1419309124Sdim/// \returns A 64-bit integer vector containing the converted values.
1420288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
1421249423Sdim_mm_cvtt_ps2pi(__m128 __a)
1422212904Sdim{
1423249423Sdim  return _mm_cvttps_pi32(__a);
1424212904Sdim}
1425212904Sdim
1426309124Sdim/// \brief Converts a 32-bit signed integer value into a floating point value
1427309124Sdim///    and writes it to the lower 32 bits of the destination. The remaining
1428309124Sdim///    higher order elements of the destination vector are copied from the
1429309124Sdim///    corresponding elements in the first operand.
1430309124Sdim///
1431309124Sdim/// \headerfile <x86intrin.h>
1432309124Sdim///
1433309124Sdim/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
1434309124Sdim///
1435309124Sdim/// \param __a
1436309124Sdim///    A 128-bit vector of [4 x float].
1437309124Sdim/// \param __b
1438309124Sdim///    A 32-bit signed integer operand containing the value to be converted.
1439309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1440309124Sdim///    converted value of the second operand. The upper 96 bits are copied from
1441309124Sdim///    the upper 96 bits of the first operand.
1442288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1443249423Sdim_mm_cvtsi32_ss(__m128 __a, int __b)
1444193326Sed{
1445249423Sdim  __a[0] = __b;
1446249423Sdim  return __a;
1447193326Sed}
1448193326Sed
1449309124Sdim/// \brief Converts a 32-bit signed integer value into a floating point value
1450309124Sdim///    and writes it to the lower 32 bits of the destination. The remaining
1451309124Sdim///    higher order elements of the destination are copied from the
1452309124Sdim///    corresponding elements in the first operand.
1453309124Sdim///
1454309124Sdim/// \headerfile <x86intrin.h>
1455309124Sdim///
1456309124Sdim/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
1457309124Sdim///
1458309124Sdim/// \param __a
1459309124Sdim///    A 128-bit vector of [4 x float].
1460309124Sdim/// \param __b
1461309124Sdim///    A 32-bit signed integer operand containing the value to be converted.
1462309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1463309124Sdim///    converted value of the second operand. The upper 96 bits are copied from
1464309124Sdim///    the upper 96 bits of the first operand.
1465288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1466249423Sdim_mm_cvt_si2ss(__m128 __a, int __b)
1467212904Sdim{
1468249423Sdim  return _mm_cvtsi32_ss(__a, __b);
1469212904Sdim}
1470212904Sdim
1471193326Sed#ifdef __x86_64__
1472193326Sed
1473309124Sdim/// \brief Converts a 64-bit signed integer value into a floating point value
1474309124Sdim///    and writes it to the lower 32 bits of the destination. The remaining
1475309124Sdim///    higher order elements of the destination are copied from the
1476309124Sdim///    corresponding elements in the first operand.
1477309124Sdim///
1478309124Sdim/// \headerfile <x86intrin.h>
1479309124Sdim///
1480309124Sdim/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
1481309124Sdim///
1482309124Sdim/// \param __a
1483309124Sdim///    A 128-bit vector of [4 x float].
1484309124Sdim/// \param __b
1485309124Sdim///    A 64-bit signed integer operand containing the value to be converted.
1486309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487309124Sdim///    converted value of the second operand. The upper 96 bits are copied from
1488309124Sdim///    the upper 96 bits of the first operand.
1489288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1490249423Sdim_mm_cvtsi64_ss(__m128 __a, long long __b)
1491193326Sed{
1492249423Sdim  __a[0] = __b;
1493249423Sdim  return __a;
1494193326Sed}
1495193326Sed
1496193326Sed#endif
1497193326Sed
1498309124Sdim/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1499309124Sdim///    floating point values and writes them to the lower 64-bits of the
1500309124Sdim///    destination. The remaining higher order elements of the destination are
1501309124Sdim///    copied from the corresponding elements in the first operand.
1502309124Sdim///
1503309124Sdim/// \headerfile <x86intrin.h>
1504309124Sdim///
1505309124Sdim/// This intrinsic corresponds to the \c CVTPI2PS instruction.
1506309124Sdim///
1507309124Sdim/// \param __a
1508309124Sdim///    A 128-bit vector of [4 x float].
1509309124Sdim/// \param __b
1510309124Sdim///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1511309124Sdim///    and written to the corresponding low-order elements in the destination.
1512309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1513309124Sdim///    converted value of the second operand. The upper 64 bits are copied from
1514309124Sdim///    the upper 64 bits of the first operand.
1515288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1516249423Sdim_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1517193326Sed{
1518309124Sdim  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1519193326Sed}
1520193326Sed
1521309124Sdim/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1522309124Sdim///    floating point values and writes them to the lower 64-bits of the
1523309124Sdim///    destination. The remaining higher order elements of the destination are
1524309124Sdim///    copied from the corresponding elements in the first operand.
1525309124Sdim///
1526309124Sdim/// \headerfile <x86intrin.h>
1527309124Sdim///
1528309124Sdim/// This intrinsic corresponds to the \c CVTPI2PS instruction.
1529309124Sdim///
1530309124Sdim/// \param __a
1531309124Sdim///    A 128-bit vector of [4 x float].
1532309124Sdim/// \param __b
1533309124Sdim///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1534309124Sdim///    and written to the corresponding low-order elements in the destination.
1535309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1536309124Sdim///    converted value from the second operand. The upper 64 bits are copied
1537309124Sdim///    from the upper 64 bits of the first operand.
1538288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1539249423Sdim_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1540212904Sdim{
1541249423Sdim  return _mm_cvtpi32_ps(__a, __b);
1542212904Sdim}
1543212904Sdim
1544309124Sdim/// \brief Extracts a float value contained in the lower 32 bits of a vector of
1545309124Sdim///    [4 x float].
1546309124Sdim///
1547309124Sdim/// \headerfile <x86intrin.h>
1548309124Sdim///
1549309124Sdim/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1550309124Sdim///
1551309124Sdim/// \param __a
1552309124Sdim///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1553309124Sdim///    used in the extraction.
1554309124Sdim/// \returns A 32-bit float containing the extracted value.
1555288943Sdimstatic __inline__ float __DEFAULT_FN_ATTRS
1556249423Sdim_mm_cvtss_f32(__m128 __a)
1557193326Sed{
1558249423Sdim  return __a[0];
1559193326Sed}
1560193326Sed
1561309124Sdim/// \brief Loads two packed float values from the address __p into the
1562309124Sdim///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1563309124Sdim///     are copied from the low-order bits of the first operand.
1564309124Sdim///
1565309124Sdim/// \headerfile <x86intrin.h>
1566309124Sdim///
1567309124Sdim/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
1568309124Sdim///
1569309124Sdim/// \param __a
1570309124Sdim///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1571309124Sdim///    of the destination.
1572309124Sdim/// \param __p
1573309124Sdim///    A pointer to two packed float values. Bits [63:0] are written to bits
1574309124Sdim///    [127:64] of the destination.
1575309124Sdim/// \returns A 128-bit vector of [4 x float] containing the moved values.
1576288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1577249423Sdim_mm_loadh_pi(__m128 __a, const __m64 *__p)
1578193326Sed{
1579226633Sdim  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1580226633Sdim  struct __mm_loadh_pi_struct {
1581249423Sdim    __mm_loadh_pi_v2f32 __u;
1582226633Sdim  } __attribute__((__packed__, __may_alias__));
1583249423Sdim  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
1584249423Sdim  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1585249423Sdim  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1586193326Sed}
1587193326Sed
1588309124Sdim/// \brief Loads two packed float values from the address __p into the low-order
1589309124Sdim///    bits of a 128-bit vector of [4 x float]. The high-order bits are copied
1590309124Sdim///    from the high-order bits of the first operand.
1591309124Sdim///
1592309124Sdim/// \headerfile <x86intrin.h>
1593309124Sdim///
1594309124Sdim/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
1595309124Sdim///
1596309124Sdim/// \param __a
1597309124Sdim///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1598309124Sdim///    [127:64] of the destination.
1599309124Sdim/// \param __p
1600309124Sdim///    A pointer to two packed float values. Bits [63:0] are written to bits
1601309124Sdim///    [63:0] of the destination.
1602309124Sdim/// \returns A 128-bit vector of [4 x float] containing the moved values.
1603288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1604249423Sdim_mm_loadl_pi(__m128 __a, const __m64 *__p)
1605193326Sed{
1606226633Sdim  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1607226633Sdim  struct __mm_loadl_pi_struct {
1608249423Sdim    __mm_loadl_pi_v2f32 __u;
1609226633Sdim  } __attribute__((__packed__, __may_alias__));
1610249423Sdim  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
1611249423Sdim  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1612249423Sdim  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1613193326Sed}
1614193326Sed
1615309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1616309124Sdim///    32 bits of the vector are initialized with the single-precision
1617309124Sdim///    floating-point value loaded from a specified memory location. The upper
1618309124Sdim///    96 bits are set to zero.
1619309124Sdim///
1620309124Sdim/// \headerfile <x86intrin.h>
1621309124Sdim///
1622309124Sdim/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1623309124Sdim///
1624309124Sdim/// \param __p
1625309124Sdim///    A pointer to a 32-bit memory location containing a single-precision
1626309124Sdim///    floating-point value.
1627309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1628309124Sdim///    lower 32 bits contain the value loaded from the memory location. The
1629309124Sdim///    upper 96 bits are set to zero.
1630288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1631249423Sdim_mm_load_ss(const float *__p)
1632193326Sed{
1633226633Sdim  struct __mm_load_ss_struct {
1634249423Sdim    float __u;
1635226633Sdim  } __attribute__((__packed__, __may_alias__));
1636249423Sdim  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
1637249423Sdim  return (__m128){ __u, 0, 0, 0 };
1638193326Sed}
1639193326Sed
1640309124Sdim/// \brief Loads a 32-bit float value and duplicates it to all four vector
1641309124Sdim///    elements of a 128-bit vector of [4 x float].
1642309124Sdim///
1643309124Sdim/// \headerfile <x86intrin.h>
1644309124Sdim///
1645309124Sdim/// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
1646309124Sdim///    instruction.
1647309124Sdim///
1648309124Sdim/// \param __p
1649309124Sdim///    A pointer to a float value to be loaded and duplicated.
1650309124Sdim/// \returns A 128-bit vector of [4 x float] containing the loaded
1651309124Sdim///    and duplicated values.
1652288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1653249423Sdim_mm_load1_ps(const float *__p)
1654193326Sed{
1655226633Sdim  struct __mm_load1_ps_struct {
1656249423Sdim    float __u;
1657226633Sdim  } __attribute__((__packed__, __may_alias__));
1658249423Sdim  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
1659249423Sdim  return (__m128){ __u, __u, __u, __u };
1660193326Sed}
1661193326Sed
/* Alternate spelling: expands to a call to _mm_load1_ps() with the same
   argument. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
1663193326Sed
1664309124Sdim/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
1665309124Sdim///    memory location.
1666309124Sdim///
1667309124Sdim/// \headerfile <x86intrin.h>
1668309124Sdim///
1669309124Sdim/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
1670309124Sdim///
1671309124Sdim/// \param __p
1672309124Sdim///    A pointer to a 128-bit memory location. The address of the memory
1673309124Sdim///    location has to be 128-bit aligned.
1674309124Sdim/// \returns A 128-bit vector of [4 x float] containing the loaded valus.
1675288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1676249423Sdim_mm_load_ps(const float *__p)
1677193326Sed{
1678249423Sdim  return *(__m128*)__p;
1679193326Sed}
1680193326Sed
1681309124Sdim/// \brief Loads a 128-bit floating-point vector of [4 x float] from an
1682309124Sdim///    unaligned memory location.
1683309124Sdim///
1684309124Sdim/// \headerfile <x86intrin.h>
1685309124Sdim///
1686309124Sdim/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
1687309124Sdim///
1688309124Sdim/// \param __p
1689309124Sdim///    A pointer to a 128-bit memory location. The address of the memory
1690309124Sdim///    location does not have to be aligned.
1691309124Sdim/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1692288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1693249423Sdim_mm_loadu_ps(const float *__p)
1694193326Sed{
1695223017Sdim  struct __loadu_ps {
1696249423Sdim    __m128 __v;
1697226633Sdim  } __attribute__((__packed__, __may_alias__));
1698249423Sdim  return ((struct __loadu_ps*)__p)->__v;
1699193326Sed}
1700193326Sed
1701309124Sdim/// \brief Loads four packed float values, in reverse order, from an aligned
1702309124Sdim///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1703309124Sdim///
1704309124Sdim/// \headerfile <x86intrin.h>
1705309124Sdim///
1706309124Sdim/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
1707309124Sdim///    instruction.
1708309124Sdim///
1709309124Sdim/// \param __p
1710309124Sdim///    A pointer to a 128-bit memory location. The address of the memory
1711309124Sdim///    location has to be 128-bit aligned.
1712309124Sdim/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1713309124Sdim///    in reverse order.
1714288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1715249423Sdim_mm_loadr_ps(const float *__p)
1716193326Sed{
1717249423Sdim  __m128 __a = _mm_load_ps(__p);
1718309124Sdim  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1719193326Sed}
1720193326Sed
1721309124Sdim/// \brief Create a 128-bit vector of [4 x float] with undefined values.
1722309124Sdim///
1723309124Sdim/// \headerfile <x86intrin.h>
1724309124Sdim///
1725309124Sdim/// This intrinsic has no corresponding instruction.
1726309124Sdim///
1727309124Sdim/// \returns A 128-bit vector of [4 x float] containing undefined values.
1728309124Sdim
1729288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1730309124Sdim_mm_undefined_ps(void)
1731296417Sdim{
1732296417Sdim  return (__m128)__builtin_ia32_undef128();
1733296417Sdim}
1734296417Sdim
1735309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1736309124Sdim///    32 bits of the vector are initialized with the specified single-precision
1737309124Sdim///    floating-point value. The upper 96 bits are set to zero.
1738309124Sdim///
1739309124Sdim/// \headerfile <x86intrin.h>
1740309124Sdim///
1741309124Sdim/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1742309124Sdim///
1743309124Sdim/// \param __w
1744309124Sdim///    A single-precision floating-point value used to initialize the lower 32
1745309124Sdim///    bits of the result.
1746309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1747309124Sdim///    lower 32 bits contain the value provided in the source operand. The
1748309124Sdim///    upper 96 bits are set to zero.
1749296417Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1750249423Sdim_mm_set_ss(float __w)
1751193326Sed{
1752249423Sdim  return (__m128){ __w, 0, 0, 0 };
1753193326Sed}
1754193326Sed
1755309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1756309124Sdim///    of the four single-precision floating-point vector elements set to the
1757309124Sdim///    specified single-precision floating-point value.
1758309124Sdim///
1759309124Sdim/// \headerfile <x86intrin.h>
1760309124Sdim///
1761309124Sdim/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
1762309124Sdim///
1763309124Sdim/// \param __w
1764309124Sdim///    A single-precision floating-point value used to initialize each vector
1765309124Sdim///    element of the result.
1766309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float].
1767288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1768249423Sdim_mm_set1_ps(float __w)
1769193326Sed{
1770249423Sdim  return (__m128){ __w, __w, __w, __w };
1771193326Sed}
1772193326Sed
1773276479Sdim/* Microsoft specific. */
1774309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1775309124Sdim///    of the four single-precision floating-point vector elements set to the
1776309124Sdim///    specified single-precision floating-point value.
1777309124Sdim///
1778309124Sdim/// \headerfile <x86intrin.h>
1779309124Sdim///
1780309124Sdim/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
1781309124Sdim///
1782309124Sdim/// \param __w
1783309124Sdim///    A single-precision floating-point value used to initialize each vector
1784309124Sdim///    element of the result.
1785309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float].
1786288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1787249423Sdim_mm_set_ps1(float __w)
1788193326Sed{
1789249423Sdim    return _mm_set1_ps(__w);
1790193326Sed}
1791193326Sed
1792309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float]
1793309124Sdim///    initialized with the specified single-precision floating-point values.
1794309124Sdim///
1795309124Sdim/// \headerfile <x86intrin.h>
1796309124Sdim///
1797309124Sdim/// This intrinsic is a utility function and does not correspond to a specific
1798309124Sdim///    instruction.
1799309124Sdim///
1800309124Sdim/// \param __z
1801309124Sdim///    A single-precision floating-point value used to initialize bits [127:96]
1802309124Sdim///    of the result.
1803309124Sdim/// \param __y
1804309124Sdim///    A single-precision floating-point value used to initialize bits [95:64]
1805309124Sdim///    of the result.
1806309124Sdim/// \param __x
1807309124Sdim///    A single-precision floating-point value used to initialize bits [63:32]
1808309124Sdim///    of the result.
1809309124Sdim/// \param __w
1810309124Sdim///    A single-precision floating-point value used to initialize bits [31:0]
1811309124Sdim///    of the result.
1812309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float].
1813288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1814249423Sdim_mm_set_ps(float __z, float __y, float __x, float __w)
1815193326Sed{
1816249423Sdim  return (__m128){ __w, __x, __y, __z };
1817193326Sed}
1818193326Sed
1819309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float],
1820309124Sdim///    initialized in reverse order with the specified 32-bit single-precision
1821309124Sdim///    float-point values.
1822309124Sdim///
1823309124Sdim/// \headerfile <x86intrin.h>
1824309124Sdim///
1825309124Sdim/// This intrinsic is a utility function and does not correspond to a specific
1826309124Sdim///    instruction.
1827309124Sdim///
1828309124Sdim/// \param __z
1829309124Sdim///    A single-precision floating-point value used to initialize bits [31:0]
1830309124Sdim///    of the result.
1831309124Sdim/// \param __y
1832309124Sdim///    A single-precision floating-point value used to initialize bits [63:32]
1833309124Sdim///    of the result.
1834309124Sdim/// \param __x
1835309124Sdim///    A single-precision floating-point value used to initialize bits [95:64]
1836309124Sdim///    of the result.
1837309124Sdim/// \param __w
1838309124Sdim///    A single-precision floating-point value used to initialize bits [127:96]
1839309124Sdim///    of the result.
1840309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float].
1841288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1842249423Sdim_mm_setr_ps(float __z, float __y, float __x, float __w)
1843193326Sed{
1844249423Sdim  return (__m128){ __z, __y, __x, __w };
1845193326Sed}
1846193326Sed
1847309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
1848309124Sdim///    to zero.
1849309124Sdim///
1850309124Sdim/// \headerfile <x86intrin.h>
1851309124Sdim///
1852309124Sdim/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
1853309124Sdim///
1854309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1855309124Sdim///    all elements set to zero.
1856288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
1857193326Sed_mm_setzero_ps(void)
1858193326Sed{
1859193326Sed  return (__m128){ 0, 0, 0, 0 };
1860193326Sed}
1861193326Sed
1862309124Sdim/// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1863309124Sdim///    memory location.
1864309124Sdim///
1865309124Sdim/// \headerfile <x86intrin.h>
1866309124Sdim///
1867309124Sdim/// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction.
1868309124Sdim///
1869309124Sdim/// \param __p
1870309124Sdim///    A pointer to a 64-bit memory location.
1871309124Sdim/// \param __a
1872309124Sdim///    A 128-bit vector of [4 x float] containing the values to be stored.
1873288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
1874249423Sdim_mm_storeh_pi(__m64 *__p, __m128 __a)
1875193326Sed{
1876309124Sdim  __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
1877193326Sed}
1878193326Sed
1879309124Sdim/// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1880309124Sdim///     memory location.
1881309124Sdim///
1882309124Sdim/// \headerfile <x86intrin.h>
1883309124Sdim///
1884309124Sdim/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
1885309124Sdim///
1886309124Sdim/// \param __p
1887309124Sdim///    A pointer to a memory location that will receive the float values.
1888309124Sdim/// \param __a
1889309124Sdim///    A 128-bit vector of [4 x float] containing the values to be stored.
1890288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
1891249423Sdim_mm_storel_pi(__m64 *__p, __m128 __a)
1892193326Sed{
1893309124Sdim  __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
1894193326Sed}
1895193326Sed
1896309124Sdim/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1897309124Sdim///     memory location.
1898309124Sdim///
1899309124Sdim/// \headerfile <x86intrin.h>
1900309124Sdim///
1901309124Sdim/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1902309124Sdim///
1903309124Sdim/// \param __p
1904309124Sdim///    A pointer to a 32-bit memory location.
1905309124Sdim/// \param __a
1906309124Sdim///    A 128-bit vector of [4 x float] containing the value to be stored.
1907288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
1908249423Sdim_mm_store_ss(float *__p, __m128 __a)
1909193326Sed{
1910226633Sdim  struct __mm_store_ss_struct {
1911249423Sdim    float __u;
1912226633Sdim  } __attribute__((__packed__, __may_alias__));
1913249423Sdim  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1914193326Sed}
1915193326Sed
1916309124Sdim/// \brief Stores float values from a 128-bit vector of [4 x float] to an
1917309124Sdim///    unaligned memory location.
1918309124Sdim///
1919309124Sdim/// \headerfile <x86intrin.h>
1920309124Sdim///
1921309124Sdim/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
1922309124Sdim///
1923309124Sdim/// \param __p
1924309124Sdim///    A pointer to a 128-bit memory location. The address of the memory
1925309124Sdim///    location does not have to be aligned.
1926309124Sdim/// \param __a
1927309124Sdim///    A 128-bit vector of [4 x float] containing the values to be stored.
1928288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
1929249423Sdim_mm_storeu_ps(float *__p, __m128 __a)
1930193326Sed{
1931309124Sdim  struct __storeu_ps {
1932309124Sdim    __m128 __v;
1933309124Sdim  } __attribute__((__packed__, __may_alias__)); /* alignment 1, so the store may be unaligned */
1934309124Sdim  ((struct __storeu_ps*)__p)->__v = __a;
1935193326Sed}
1936193326Sed
1937309124Sdim/// \brief Stores float values from a 128-bit vector of [4 x float] to an
1938309124Sdim///    aligned memory location.
1939309124Sdim///
1940309124Sdim/// \headerfile <x86intrin.h>
1941309124Sdim///
1942309124Sdim/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS
1943309124Sdim///    instruction.
1944309124Sdim///
1945309124Sdim/// \param __p
1946309124Sdim///    A pointer to a 128-bit memory location that has to be 128-bit aligned.
1947309124Sdim/// \param __a
1948309124Sdim///    A 128-bit vector of [4 x float] containing the values to be stored to
1949309124Sdim///    the memory location pointed to by __p.
1950288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
1951309124Sdim_mm_store_ps(float *__p, __m128 __a)
1952193326Sed{
1953309124Sdim  *(__m128*)__p = __a;
1954193326Sed}
1955193326Sed
1956309124Sdim/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1957309124Sdim///    four contiguous elements in an aligned memory location.
1958309124Sdim///
1959309124Sdim/// \headerfile <x86intrin.h>
1960309124Sdim///
1961309124Sdim/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
1962309124Sdim///    instruction.
1963309124Sdim///
1964309124Sdim/// \param __p
1965309124Sdim///    A pointer to a 128-bit aligned memory location.
1966309124Sdim/// \param __a
1967309124Sdim///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
1968309124Sdim///    of the four contiguous elements pointed to by __p.
1969288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
1970309124Sdim_mm_store1_ps(float *__p, __m128 __a)
1971212904Sdim{
1972309124Sdim  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
1973309124Sdim  _mm_store_ps(__p, __a);
1974212904Sdim}
1975212904Sdim
1976309124Sdim/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1977309124Sdim///    four contiguous elements in an aligned memory location.
1978309124Sdim///
1979309124Sdim/// \headerfile <x86intrin.h>
1980309124Sdim///
1981309124Sdim/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling instruction.
1982309124Sdim///
1983309124Sdim/// \param __p
1984309124Sdim///    A pointer to a 128-bit memory location. The address of the memory
1985309124Sdim///    location has to be 128-bit aligned.
1986309124Sdim/// \param __a
1987309124Sdim///    A 128-bit vector of [4 x float] whose lower 32 bits fill all four elements.
1988288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
1989309124Sdim_mm_store_ps1(float *__p, __m128 __a)
1990193326Sed{
1991309124Sdim  _mm_store1_ps(__p, __a); /* forwards to _mm_store1_ps; nothing to return */
1992193326Sed}
1993193326Sed
1994309124Sdim/// \brief Stores float values from a 128-bit vector of [4 x float] to an
1995309124Sdim///    aligned memory location in reverse order.
1996309124Sdim///
1997309124Sdim/// \headerfile <x86intrin.h>
1998309124Sdim///
1999309124Sdim/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
2000309124Sdim///    instruction.
2001309124Sdim///
2002309124Sdim/// \param __p
2003309124Sdim///    A pointer to a 128-bit memory location. The address of the memory
2004309124Sdim///    location has to be 128-bit aligned.
2005309124Sdim/// \param __a
2006309124Sdim///    A 128-bit vector of [4 x float]; __a[3] goes to __p[0], __a[0] to __p[3].
2007288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
2008249423Sdim_mm_storer_ps(float *__p, __m128 __a)
2009193326Sed{
2010309124Sdim  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2011249423Sdim  _mm_store_ps(__p, __a);
2012193326Sed}
2013193326Sed
2014212904Sdim#define _MM_HINT_T0 3 /* T0 hint for _mm_prefetch (PREFETCHT0) */
2015193326Sed#define _MM_HINT_T1 2 /* T1 hint for _mm_prefetch (PREFETCHT1) */
2016212904Sdim#define _MM_HINT_T2 1 /* T2 hint for _mm_prefetch (PREFETCHT2) */
2017193326Sed#define _MM_HINT_NTA 0 /* non-temporal access hint for _mm_prefetch (PREFETCHNTA) */
2018193326Sed
2019276479Sdim#ifndef _MSC_VER
2020210299Sed/* FIXME: We have to #define this because "sel" must be a constant integer, and
2021193326Sed   Sema doesn't do any form of constant propagation yet. */
2022193326Sed
2023309124Sdim/// \brief Loads one cache line of data from the specified address to a location
2024309124Sdim///    closer to the processor.
2025309124Sdim///
2026309124Sdim/// \headerfile <x86intrin.h>
2027309124Sdim///
2028309124Sdim/// \code
2029309124Sdim/// void _mm_prefetch(const void * a, const int sel);
2030309124Sdim/// \endcode
2031309124Sdim///
2032309124Sdim/// This intrinsic corresponds to a \c PREFETCHNTA / PREFETCHT0-T2 instruction.
2033309124Sdim///
2034309124Sdim/// \param a
2035309124Sdim///    A pointer to a memory location containing a cache line of data.
2036309124Sdim/// \param sel
2037309124Sdim///    A predefined integer constant specifying the type of prefetch operation:
2038309124Sdim///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
2039309124Sdim///    The PREFETCHNTA instruction will be generated.
2040309124Sdim///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
2041309124Sdim///    be generated.
2042309124Sdim///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
2043309124Sdim///    be generated.
2044309124Sdim///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
2045309124Sdim///    be generated.
2046234353Sdim#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
2047276479Sdim#endif
2048193326Sed
2049309124Sdim/// \brief Stores a 64-bit integer in the specified aligned memory location. To
2050309124Sdim///    minimize caching, the data is flagged as non-temporal (unlikely to be
2051309124Sdim///    used again soon).
2052309124Sdim///
2053309124Sdim/// \headerfile <x86intrin.h>
2054309124Sdim///
2055309124Sdim/// This intrinsic corresponds to the \c MOVNTQ instruction.
2056309124Sdim///
2057309124Sdim/// \param __p
2058309124Sdim///    A pointer to an aligned memory location used to store the register value.
2059309124Sdim/// \param __a
2060309124Sdim///    A 64-bit integer vector containing the value to be stored.
2061288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
2062249423Sdim_mm_stream_pi(__m64 *__p, __m64 __a)
2063193326Sed{
2064249423Sdim  __builtin_ia32_movntq(__p, __a);
2065193326Sed}
2066193326Sed
2067309124Sdim/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
2068309124Sdim///    128-bit aligned memory location. To minimize caching, the data is flagged
2069309124Sdim///    as non-temporal (unlikely to be used again soon).
2070309124Sdim///
2071309124Sdim/// \headerfile <x86intrin.h>
2072309124Sdim///
2073309124Sdim/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
2074309124Sdim///
2075309124Sdim/// \param __p
2076309124Sdim///    A pointer to a 128-bit aligned memory location that will receive the
2077309124Sdim///    float values.
2078309124Sdim/// \param __a
2079309124Sdim///    A 128-bit vector of [4 x float] containing the values to be moved.
2080288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
2081249423Sdim_mm_stream_ps(float *__p, __m128 __a)
2082193326Sed{
2083309124Sdim  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2084193326Sed}
2085193326Sed
2086309124Sdim/// \brief Forces strong memory ordering (serialization) between store
2087309124Sdim///    instructions preceding this instruction and store instructions following
2088309124Sdim///    this instruction, ensuring the system completes all previous stores
2089309124Sdim///    before executing subsequent stores.
2090309124Sdim///
2091309124Sdim/// \headerfile <x86intrin.h>
2092309124Sdim///
2093309124Sdim/// This intrinsic corresponds to the \c SFENCE instruction.
2094309124Sdim/// It takes no operands and returns no value.
2095288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
2096193326Sed_mm_sfence(void)
2097193326Sed{
2098193326Sed  __builtin_ia32_sfence();
2099193326Sed}
2100193326Sed
2101309124Sdim/// \brief Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
2102309124Sdim///    returns it, as specified by the immediate integer operand.
2103309124Sdim///
2104309124Sdim/// \headerfile <x86intrin.h>
2105309124Sdim///
2106309124Sdim/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
2107309124Sdim///
2108309124Sdim/// \param a
2109309124Sdim///    A 64-bit vector of [4 x i16].
2110309124Sdim/// \param n
2111309124Sdim///    An immediate integer operand that determines which bits are extracted:
2112309124Sdim///    0: Bits [15:0] are copied to the destination.
2113309124Sdim///    1: Bits [31:16] are copied to the destination.
2114309124Sdim///    2: Bits [47:32] are copied to the destination.
2115309124Sdim///    3: Bits [63:48] are copied to the destination.
2116309124Sdim/// \returns The extracted 16 bits of packed data, converted to int.
2117309124Sdim#define _mm_extract_pi16(a, n) __extension__ ({ \
2118309124Sdim  (int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)); })
2119193326Sed
2120309124Sdim/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
2121309124Sdim///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
2122309124Sdim///    specified by the immediate operand n.
2123309124Sdim///
2124309124Sdim/// \headerfile <x86intrin.h>
2125309124Sdim///
2126309124Sdim/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
2127309124Sdim///
2128309124Sdim/// \param a
2129309124Sdim///    A 64-bit vector of [4 x i16].
2130309124Sdim/// \param d
2131309124Sdim///    An integer. The lower 16-bit value from this operand is written to the
2132309124Sdim///    destination at the offset specified by operand n.
2133309124Sdim/// \param n
2134309124Sdim///    An immediate integer operand that determines which bits of the
2135309124Sdim///    destination receive the lower 16 bits of operand d:
2136309124Sdim///    0: Bits [15:0] of the destination are written.
2137309124Sdim///    1: Bits [31:16] of the destination are written.
2138309124Sdim///    2: Bits [47:32] of the destination are written.
2139309124Sdim///    3: Bits [63:48] of the destination are written.
2140309124Sdim///    The remaining bits in the destination are copied from the corresponding
2141309124Sdim///    bits in operand a.
2142309124Sdim/// \returns A 64-bit integer vector containing the copied packed data from the
2143309124Sdim///    operands.
2144309124Sdim#define _mm_insert_pi16(a, d, n) __extension__ ({ \
2145309124Sdim  (__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)); })
2146193326Sed
2147309124Sdim/// \brief Compares each of the corresponding packed signed 16-bit integer
2148309124Sdim///    values of the 64-bit integer vectors, and writes the greater value to the
2149309124Sdim///    corresponding bits in the destination.
2150309124Sdim///
2151309124Sdim/// \headerfile <x86intrin.h>
2152309124Sdim///
2153309124Sdim/// This intrinsic corresponds to the \c PMAXSW instruction.
2154309124Sdim///
2155309124Sdim/// \param __a
2156309124Sdim///    A 64-bit integer vector containing one of the source operands.
2157309124Sdim/// \param __b
2158309124Sdim///    A 64-bit integer vector containing one of the source operands.
2159309124Sdim/// \returns A 64-bit integer vector containing the greater value of each pair.
2160288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2161249423Sdim_mm_max_pi16(__m64 __a, __m64 __b)
2162193326Sed{
2163249423Sdim  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2164193326Sed}
2165193326Sed
2166309124Sdim/// \brief Compares each of the corresponding packed 8-bit unsigned integer
2167309124Sdim///    values of the 64-bit integer vectors, and writes the greater value to the
2168309124Sdim///    corresponding bits in the destination.
2169309124Sdim///
2170309124Sdim/// \headerfile <x86intrin.h>
2171309124Sdim///
2172309124Sdim/// This intrinsic corresponds to the \c PMAXUB instruction.
2173309124Sdim///
2174309124Sdim/// \param __a
2175309124Sdim///    A 64-bit integer vector containing one of the source operands.
2176309124Sdim/// \param __b
2177309124Sdim///    A 64-bit integer vector containing one of the source operands.
2178309124Sdim/// \returns A 64-bit integer vector containing the greater value of each pair.
2179288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2180249423Sdim_mm_max_pu8(__m64 __a, __m64 __b)
2181193326Sed{
2182249423Sdim  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2183193326Sed}
2184193326Sed
2185309124Sdim/// \brief Compares each of the corresponding packed signed 16-bit integer
2186309124Sdim///    values of the 64-bit integer vectors, and writes the lesser value to the
2187309124Sdim///    corresponding bits in the destination.
2188309124Sdim///
2189309124Sdim/// \headerfile <x86intrin.h>
2190309124Sdim///
2191309124Sdim/// This intrinsic corresponds to the \c PMINSW instruction.
2192309124Sdim///
2193309124Sdim/// \param __a
2194309124Sdim///    A 64-bit integer vector containing one of the source operands.
2195309124Sdim/// \param __b
2196309124Sdim///    A 64-bit integer vector containing one of the source operands.
2197309124Sdim/// \returns A 64-bit integer vector containing the lesser value of each pair.
2198288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2199249423Sdim_mm_min_pi16(__m64 __a, __m64 __b)
2200193326Sed{
2201249423Sdim  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2202193326Sed}
2203193326Sed
2204309124Sdim/// \brief Compares each of the corresponding packed 8-bit unsigned integer
2205309124Sdim///    values of the 64-bit integer vectors, and writes the lesser value to the
2206309124Sdim///    corresponding bits in the destination.
2207309124Sdim///
2208309124Sdim/// \headerfile <x86intrin.h>
2209309124Sdim///
2210309124Sdim/// This intrinsic corresponds to the \c PMINUB instruction.
2211309124Sdim///
2212309124Sdim/// \param __a
2213309124Sdim///    A 64-bit integer vector containing one of the source operands.
2214309124Sdim/// \param __b
2215309124Sdim///    A 64-bit integer vector containing one of the source operands.
2216309124Sdim/// \returns A 64-bit integer vector containing the lesser value of each pair.
2217288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2218249423Sdim_mm_min_pu8(__m64 __a, __m64 __b)
2219193326Sed{
2220249423Sdim  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2221193326Sed}
2222193326Sed
2223309124Sdim/// \brief Takes the most significant bit from each 8-bit element in a 64-bit
2224309124Sdim///    integer vector to create an 8-bit mask value. Zero-extends the value to
2225309124Sdim///    32-bit integer and writes it to the destination.
2226309124Sdim///
2227309124Sdim/// \headerfile <x86intrin.h>
2228309124Sdim///
2229309124Sdim/// This intrinsic corresponds to the \c PMOVMSKB instruction.
2230309124Sdim///
2231309124Sdim/// \param __a
2232309124Sdim///    A 64-bit integer vector containing the values with bits to be extracted.
2233309124Sdim/// \returns The most significant bit from each 8-bit element in the operand,
2234309124Sdim///    written to bits [7:0].
2235288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
2236249423Sdim_mm_movemask_pi8(__m64 __a)
2237193326Sed{
2238249423Sdim  return __builtin_ia32_pmovmskb((__v8qi)__a);
2239193326Sed}
2240193326Sed
2241309124Sdim/// \brief Multiplies packed 16-bit unsigned integer values and writes the
2242309124Sdim///    high-order 16 bits of each 32-bit product to the corresponding bits in
2243309124Sdim///    the destination.
2244309124Sdim///
2245309124Sdim/// \headerfile <x86intrin.h>
2246309124Sdim///
2247309124Sdim/// This intrinsic corresponds to the \c PMULHUW instruction.
2248309124Sdim///
2249309124Sdim/// \param __a
2250309124Sdim///    A 64-bit integer vector containing one of the source operands.
2251309124Sdim/// \param __b
2252309124Sdim///    A 64-bit integer vector containing one of the source operands.
2253309124Sdim/// \returns A 64-bit integer vector with the high 16 bits of each product.
2254288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2255249423Sdim_mm_mulhi_pu16(__m64 __a, __m64 __b)
2256193326Sed{
2257249423Sdim  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2258193326Sed}
2259193326Sed
2260309124Sdim/// \brief Shuffles the four 16-bit integers from a 64-bit integer vector to
2261309124Sdim///    the destination, as specified by the immediate value operand.
2262309124Sdim///
2263309124Sdim/// \headerfile <x86intrin.h>
2264309124Sdim///
2265309124Sdim/// This intrinsic corresponds to the \c PSHUFW instruction.
2266309124Sdim///
2267309124Sdim/// \code
2268309124Sdim/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2269309124Sdim/// \endcode
2270309124Sdim///
2271309124Sdim/// \param a
2272309124Sdim///    A 64-bit integer vector containing the values to be shuffled.
2273309124Sdim/// \param n
2274309124Sdim///    An immediate value containing an 8-bit value specifying which elements to
2275309124Sdim///    copy from a. The destinations within the 64-bit destination are assigned
2276309124Sdim///    values as follows:
2277309124Sdim///    Bits [1:0] are used to assign values to bits [15:0] in the destination.
2278309124Sdim///    Bits [3:2] are used to assign values to bits [31:16] in the destination.
2279309124Sdim///    Bits [5:4] are used to assign values to bits [47:32] in the destination.
2280309124Sdim///    Bits [7:6] are used to assign values to bits [63:48] in the destination.
2281309124Sdim///    Bit value assignments:
2282309124Sdim///    00: assigned from bits [15:0] of a.
2283309124Sdim///    01: assigned from bits [31:16] of a.
2284309124Sdim///    10: assigned from bits [47:32] of a.
2285309124Sdim///    11: assigned from bits [63:48] of a.
2286309124Sdim/// \returns A 64-bit integer vector containing the shuffled values.
2287234353Sdim#define _mm_shuffle_pi16(a, n) __extension__ ({ \
2288296417Sdim  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
2289193326Sed
2290309124Sdim/// \brief Conditionally copies the values from each 8-bit element in the first
2291309124Sdim///    64-bit integer vector operand to the specified memory location, as
2292309124Sdim///    specified by the most significant bit in the corresponding element in the
2293309124Sdim///    second 64-bit integer vector operand. To minimize caching, the data is
2294309124Sdim///    flagged as non-temporal (unlikely to be used again soon).
2295309124Sdim///
2296309124Sdim/// \headerfile <x86intrin.h>
2297309124Sdim///
2298309124Sdim/// This intrinsic corresponds to the \c MASKMOVQ instruction.
2299309124Sdim///
2300309124Sdim/// \param __d
2301309124Sdim///    A 64-bit integer vector containing the values with elements to be copied.
2302309124Sdim/// \param __n
2303309124Sdim///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2304309124Sdim///    element determines whether the corresponding element in operand __d is
2305309124Sdim///    copied. If the most significant bit of a given element is 1, the
2306309124Sdim///    corresponding element in operand __d is copied; otherwise it is skipped.
2307309124Sdim/// \param __p
2308309124Sdim///    A pointer to a 64-bit memory location that will receive the conditionally
2309309124Sdim///    copied integer values. The address of the memory location does not have
2310309124Sdim///    to be aligned.
2311288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
2312249423Sdim_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2313193326Sed{
2314249423Sdim  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2315193326Sed}
2316193326Sed
2317309124Sdim/// \brief Computes the rounded averages of the packed unsigned 8-bit integer
2318309124Sdim///    values and writes the averages to the corresponding bits in the
2319309124Sdim///    destination.
2320309124Sdim///
2321309124Sdim/// \headerfile <x86intrin.h>
2322309124Sdim///
2323309124Sdim/// This intrinsic corresponds to the \c PAVGB instruction.
2324309124Sdim///
2325309124Sdim/// \param __a
2326309124Sdim///    A 64-bit integer vector containing one of the source operands.
2327309124Sdim/// \param __b
2328309124Sdim///    A 64-bit integer vector containing one of the source operands.
2329309124Sdim/// \returns A 64-bit integer vector containing the rounded 8-bit averages.
2330288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2331249423Sdim_mm_avg_pu8(__m64 __a, __m64 __b)
2332193326Sed{
2333249423Sdim  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2334193326Sed}
2335193326Sed
2336309124Sdim/// \brief Computes the rounded averages of the packed unsigned 16-bit integer
2337309124Sdim///    values and writes the averages to the corresponding bits in the
2338309124Sdim///    destination.
2339309124Sdim///
2340309124Sdim/// \headerfile <x86intrin.h>
2341309124Sdim///
2342309124Sdim/// This intrinsic corresponds to the \c PAVGW instruction.
2343309124Sdim///
2344309124Sdim/// \param __a
2345309124Sdim///    A 64-bit integer vector containing one of the source operands.
2346309124Sdim/// \param __b
2347309124Sdim///    A 64-bit integer vector containing one of the source operands.
2348309124Sdim/// \returns A 64-bit integer vector containing the rounded 16-bit averages.
2349288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2350249423Sdim_mm_avg_pu16(__m64 __a, __m64 __b)
2351193326Sed{
2352249423Sdim  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2353193326Sed}
2354193326Sed
2355309124Sdim/// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
2356309124Sdim///    64-bit vector operands and computes the absolute value of each
2357309124Sdim///    difference. The sum of the 8 absolute differences is then written to
2358309124Sdim///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2359309124Sdim///
2360309124Sdim/// \headerfile <x86intrin.h>
2361309124Sdim///
2362309124Sdim/// This intrinsic corresponds to the \c PSADBW instruction.
2363309124Sdim///
2364309124Sdim/// \param __a
2365309124Sdim///    A 64-bit integer vector containing one of the source operands.
2366309124Sdim/// \param __b
2367309124Sdim///    A 64-bit integer vector containing one of the source operands.
2368309124Sdim/// \returns A 64-bit integer vector whose lower 16 bits contain the sum of the
2369309124Sdim///    absolute differences between both operands. The upper bits are
2370309124Sdim///    cleared.
2371288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2372249423Sdim_mm_sad_pu8(__m64 __a, __m64 __b)
2373193326Sed{
2374249423Sdim  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2375193326Sed}
2376193326Sed
2377309124Sdim/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
2378309124Sdim///    integer value. There are several groups of macros associated with this
2379309124Sdim///    intrinsic, including:
2380309124Sdim///    * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2381309124Sdim///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2382309124Sdim///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2383309124Sdim///      _MM_GET_EXCEPTION_STATE().
2384309124Sdim///    * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2385309124Sdim///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2386309124Sdim///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2387309124Sdim///    * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2388309124Sdim///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2389309124Sdim///      _MM_GET_ROUNDING_MODE().
2390309124Sdim///    * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2391309124Sdim///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2392309124Sdim///    * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2393309124Sdim///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2394309124Sdim///      _MM_GET_DENORMALS_ZERO_MODE().
2395309124Sdim///
2396309124Sdim///    For example, the expression below checks if an overflow exception has
2397309124Sdim///    occurred:
2398309124Sdim///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2399309124Sdim///
2400309124Sdim///    The following example gets the current rounding mode:
2401309124Sdim///      _MM_GET_ROUNDING_MODE()
2402309124Sdim///
2403309124Sdim/// \headerfile <x86intrin.h>
2404309124Sdim///
2405309124Sdim/// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
2406309124Sdim///
2407309124Sdim/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2408309124Sdim///    register.
2409288943Sdimstatic __inline__ unsigned int __DEFAULT_FN_ATTRS
2410193326Sed_mm_getcsr(void)
2411193326Sed{
2412193326Sed  return __builtin_ia32_stmxcsr();
2413193326Sed}
2414193326Sed
2415309124Sdim/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
2416309124Sdim///    are several groups of macros associated with this intrinsic, including:
2417309124Sdim///    * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2418309124Sdim///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2419309124Sdim///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2420309124Sdim///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2421309124Sdim///    * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2422309124Sdim///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2423309124Sdim///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2424309124Sdim///      of these macros.
2425309124Sdim///    * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2426309124Sdim///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2427309124Sdim///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2428309124Sdim///    * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2429309124Sdim///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2430309124Sdim///      one of these macros.
2431309124Sdim///    * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2432309124Sdim///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2433309124Sdim///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2434309124Sdim///
2435309124Sdim///    For example, the following expression causes subsequent floating-point
2436309124Sdim///    operations to round up:
2437309124Sdim///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
2438309124Sdim///
2439309124Sdim///    The following example sets the DAZ and FTZ flags:
2440309124Sdim///      void setFlags() {
2441309124Sdim///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2442309124Sdim///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2443309124Sdim///      }
2444309124Sdim///
2445309124Sdim/// \headerfile <x86intrin.h>
2446309124Sdim///
2447309124Sdim/// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
2448309124Sdim///
2449309124Sdim/// \param __i
2450309124Sdim///    A 32-bit unsigned integer value to be written to the MXCSR register.
2451288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS
2452249423Sdim_mm_setcsr(unsigned int __i)
2453193326Sed{
2454249423Sdim  __builtin_ia32_ldmxcsr(__i);
2455193326Sed}
2456193326Sed
2457309124Sdim/// \brief Selects four float values from the 128-bit operands of [4 x float],
2458309124Sdim///    as specified by the immediate value operand.
2459309124Sdim///
2460309124Sdim/// \headerfile <x86intrin.h>
2461309124Sdim///
2462309124Sdim/// \code
2463309124Sdim/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2464309124Sdim/// \endcode
2465309124Sdim///
2466309124Sdim/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
2467309124Sdim///
2468309124Sdim/// \param a
2469309124Sdim///    A 128-bit vector of [4 x float].
2470309124Sdim/// \param b
2471309124Sdim///    A 128-bit vector of [4 x float].
2472309124Sdim/// \param mask
2473309124Sdim///    An immediate value containing an 8-bit value specifying which elements to
2474309124Sdim///    copy from a and b.
2475309124Sdim///    Bits [3:0] specify the values copied from operand a.
2476309124Sdim///    Bits [7:4] specify the values copied from operand b. The destinations
2477309124Sdim///    within the 128-bit destination are assigned values as follows:
2478309124Sdim///    Bits [1:0] are used to assign values to bits [31:0] in the destination.
2479309124Sdim///    Bits [3:2] are used to assign values to bits [63:32] in the destination.
2480309124Sdim///    Bits [5:4] are used to assign values to bits [95:64] in the destination.
2481309124Sdim///    Bits [7:6] are used to assign values to bits [127:96] in the destination.
2482309124Sdim///    Bit value assignments:
2483309124Sdim///    00: Bits [31:0] copied from the specified operand.
2484309124Sdim///    01: Bits [63:32] copied from the specified operand.
2485309124Sdim///    10: Bits [95:64] copied from the specified operand.
2486309124Sdim///    11: Bits [127:96] copied from the specified operand.
2487309124Sdim/// \returns A 128-bit vector of [4 x float]: low half from a, high half from b.
2488234353Sdim#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
2489296417Sdim  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
2490309124Sdim                                  0 + (((mask) >> 0) & 0x3), \
2491309124Sdim                                  0 + (((mask) >> 2) & 0x3), \
2492309124Sdim                                  4 + (((mask) >> 4) & 0x3), \
2493309124Sdim                                  4 + (((mask) >> 6) & 0x3)); })
2494193326Sed
2495309124Sdim/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2496309124Sdim///    [4 x float] and interleaves them into a 128-bit vector of [4 x
2497309124Sdim///    float].
2498309124Sdim///
2499309124Sdim/// \headerfile <x86intrin.h>
2500309124Sdim///
2501309124Sdim/// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
2502309124Sdim///
2503309124Sdim/// \param __a
2504309124Sdim///    A 128-bit vector of [4 x float].
2505309124Sdim///    Bits [95:64] are written to bits [31:0] of the destination.
2506309124Sdim///    Bits [127:96] are written to bits [95:64] of the destination.
2507309124Sdim/// \param __b
2508309124Sdim///    A 128-bit vector of [4 x float].
2509309124Sdim///    Bits [95:64] are written to bits [63:32] of the destination.
2510309124Sdim///    Bits [127:96] are written to bits [127:96] of the destination.
2511309124Sdim/// \returns A 128-bit vector of [4 x float]: {__a[2], __b[2], __a[3], __b[3]}.
2512288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2513249423Sdim_mm_unpackhi_ps(__m128 __a, __m128 __b)
2514193326Sed{
2515309124Sdim  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2516193326Sed}
2517193326Sed
2518309124Sdim/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2519309124Sdim///    [4 x float] and interleaves them into a 128-bit vector of [4 x
2520309124Sdim///    float].
2521309124Sdim///
2522309124Sdim/// \headerfile <x86intrin.h>
2523309124Sdim///
2524309124Sdim/// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
2525309124Sdim///
2526309124Sdim/// \param __a
2527309124Sdim///    A 128-bit vector of [4 x float].
2528309124Sdim///    Bits [31:0] are written to bits [31:0] of the destination.
2529309124Sdim///    Bits [63:32] are written to bits [95:64] of the destination.
2530309124Sdim/// \param __b
2531309124Sdim///    A 128-bit vector of [4 x float].
2532309124Sdim///    Bits [31:0] are written to bits [63:32] of the destination.
2533309124Sdim///    Bits [63:32] are written to bits [127:96] of the destination.
2534309124Sdim/// \returns A 128-bit vector of [4 x float]: {__a[0], __b[0], __a[1], __b[1]}.
2535288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2536249423Sdim_mm_unpacklo_ps(__m128 __a, __m128 __b)
2537193326Sed{
2538309124Sdim  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2539193326Sed}
2540193326Sed
2541309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2542309124Sdim///    32 bits are set to the lower 32 bits of the second parameter. The upper
2543309124Sdim///    96 bits are set to the upper 96 bits of the first parameter.
2544309124Sdim///
2545309124Sdim/// \headerfile <x86intrin.h>
2546309124Sdim///
2547309124Sdim/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
2548309124Sdim///
2549309124Sdim/// \param __a
2550309124Sdim///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2551309124Sdim///    written to the upper 96 bits of the result.
2552309124Sdim/// \param __b
2553309124Sdim///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2554309124Sdim///    written to the lower 32 bits of the result.
2555309124Sdim/// \returns A 128-bit floating-point vector of [4 x float].
2556288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2557249423Sdim_mm_move_ss(__m128 __a, __m128 __b)
2558193326Sed{
2559309124Sdim  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
2560193326Sed}
2561193326Sed
2562309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2563309124Sdim///    64 bits are set to the upper 64 bits of the second parameter. The upper
2564309124Sdim///    64 bits are set to the upper 64 bits of the first parameter.
2565309124Sdim///
2566309124Sdim/// \headerfile <x86intrin.h>
2567309124Sdim///
2568309124Sdim/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
2569309124Sdim///
2570309124Sdim/// \param __a
2571309124Sdim///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2572309124Sdim///    written to the upper 64 bits of the result.
2573309124Sdim/// \param __b
2574309124Sdim///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2575309124Sdim///    written to the lower 64 bits of the result.
2576309124Sdim/// \returns A 128-bit floating-point vector of [4 x float].
2577288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2578249423Sdim_mm_movehl_ps(__m128 __a, __m128 __b)
2579193326Sed{
2580309124Sdim  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2581193326Sed}
2582193326Sed
2583309124Sdim/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2584309124Sdim///    64 bits are set to the lower 64 bits of the first parameter. The upper
2585309124Sdim///    64 bits are set to the lower 64 bits of the second parameter.
2586309124Sdim///
2587309124Sdim/// \headerfile <x86intrin.h>
2588309124Sdim///
2589309124Sdim/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
2590309124Sdim///
2591309124Sdim/// \param __a
2592309124Sdim///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2593309124Sdim///    written to the lower 64 bits of the result.
2594309124Sdim/// \param __b
2595309124Sdim///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2596309124Sdim///    written to the upper 64 bits of the result.
2597309124Sdim/// \returns A 128-bit floating-point vector of [4 x float].
2598288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2599249423Sdim_mm_movelh_ps(__m128 __a, __m128 __b)
2600193326Sed{
2601309124Sdim  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2602193326Sed}
2603193326Sed
2604309124Sdim/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2605309124Sdim///    float].
2606309124Sdim///
2607309124Sdim/// \headerfile <x86intrin.h>
2608309124Sdim///
2609309124Sdim/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2610309124Sdim///
2611309124Sdim/// \param __a
2612309124Sdim///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2613309124Sdim///    from the corresponding elements in this operand.
2614309124Sdim/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2615309124Sdim///    values from the operand.
2616288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2617249423Sdim_mm_cvtpi16_ps(__m64 __a)
2618193326Sed{
2619249423Sdim  __m64 __b, __c;
2620249423Sdim  __m128 __r;
2621193326Sed
2622249423Sdim  __b = _mm_setzero_si64();
2623249423Sdim  __b = _mm_cmpgt_pi16(__b, __a);
2624249423Sdim  __c = _mm_unpackhi_pi16(__a, __b);
2625249423Sdim  __r = _mm_setzero_ps();
2626249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
2627249423Sdim  __r = _mm_movelh_ps(__r, __r);
2628249423Sdim  __c = _mm_unpacklo_pi16(__a, __b);
2629249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
2630193326Sed
2631249423Sdim  return __r;
2632193326Sed}
2633193326Sed
2634309124Sdim/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
2635309124Sdim///    128-bit vector of [4 x float].
2636309124Sdim///
2637309124Sdim/// \headerfile <x86intrin.h>
2638309124Sdim///
2639309124Sdim/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2640309124Sdim///
2641309124Sdim/// \param __a
2642309124Sdim///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2643309124Sdim///    destination are copied from the corresponding elements in this operand.
2644309124Sdim/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2645309124Sdim///    values from the operand.
2646288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2647249423Sdim_mm_cvtpu16_ps(__m64 __a)
2648193326Sed{
2649249423Sdim  __m64 __b, __c;
2650249423Sdim  __m128 __r;
2651193326Sed
2652249423Sdim  __b = _mm_setzero_si64();
2653249423Sdim  __c = _mm_unpackhi_pi16(__a, __b);
2654249423Sdim  __r = _mm_setzero_ps();
2655249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
2656249423Sdim  __r = _mm_movelh_ps(__r, __r);
2657249423Sdim  __c = _mm_unpacklo_pi16(__a, __b);
2658249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
2659193326Sed
2660249423Sdim  return __r;
2661193326Sed}
2662193326Sed
2663309124Sdim/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2664309124Sdim///    into a 128-bit vector of [4 x float].
2665309124Sdim///
2666309124Sdim/// \headerfile <x86intrin.h>
2667309124Sdim///
2668309124Sdim/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2669309124Sdim///
2670309124Sdim/// \param __a
2671309124Sdim///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2672309124Sdim///    from the corresponding lower 4 elements in this operand.
2673309124Sdim/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2674309124Sdim///    values from the operand.
2675288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2676249423Sdim_mm_cvtpi8_ps(__m64 __a)
2677193326Sed{
2678249423Sdim  __m64 __b;
2679296417Sdim
2680249423Sdim  __b = _mm_setzero_si64();
2681249423Sdim  __b = _mm_cmpgt_pi8(__b, __a);
2682249423Sdim  __b = _mm_unpacklo_pi8(__a, __b);
2683193326Sed
2684249423Sdim  return _mm_cvtpi16_ps(__b);
2685193326Sed}
2686193326Sed
2687309124Sdim/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
2688309124Sdim///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2689309124Sdim///
2690309124Sdim/// \headerfile <x86intrin.h>
2691309124Sdim///
2692309124Sdim/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2693309124Sdim///
2694309124Sdim/// \param __a
2695309124Sdim///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2696309124Sdim///    destination are copied from the corresponding lower 4 elements in this
2697309124Sdim///    operand.
2698309124Sdim/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2699309124Sdim///    values from the source operand.
2700288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2701249423Sdim_mm_cvtpu8_ps(__m64 __a)
2702193326Sed{
2703249423Sdim  __m64 __b;
2704296417Sdim
2705249423Sdim  __b = _mm_setzero_si64();
2706249423Sdim  __b = _mm_unpacklo_pi8(__a, __b);
2707193326Sed
2708249423Sdim  return _mm_cvtpi16_ps(__b);
2709193326Sed}
2710193326Sed
2711309124Sdim/// \brief Converts the two 32-bit signed integer values from each 64-bit vector
2712309124Sdim///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2713309124Sdim///
2714309124Sdim/// \headerfile <x86intrin.h>
2715309124Sdim///
2716309124Sdim/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2717309124Sdim///
2718309124Sdim/// \param __a
2719309124Sdim///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2720309124Sdim///    copied from the elements in this operand.
2721309124Sdim/// \param __b
2722309124Sdim///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2723309124Sdim///    copied from the elements in this operand.
2724309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2725309124Sdim///    copied and converted values from the first operand. The upper 64 bits
2726309124Sdim///    contain the copied and converted values from the second operand.
2727288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS
2728249423Sdim_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2729193326Sed{
2730249423Sdim  __m128 __c;
2731296417Sdim
2732249423Sdim  __c = _mm_setzero_ps();
2733249423Sdim  __c = _mm_cvtpi32_ps(__c, __b);
2734249423Sdim  __c = _mm_movelh_ps(__c, __c);
2735193326Sed
2736249423Sdim  return _mm_cvtpi32_ps(__c, __a);
2737193326Sed}
2738193326Sed
2739309124Sdim/// \brief Converts each single-precision floating-point element of a 128-bit
2740309124Sdim///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2741309124Sdim///    packs the results into a 64-bit integer vector of [4 x i16]. If the
2742309124Sdim///    floating-point element is NaN or infinity, or if the floating-point
2743309124Sdim///    element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
2744309124Sdim///    to 0x8000. Otherwise if the floating-point element is greater
2745309124Sdim///    than 0x7FFF, it is converted to 0x7FFF.
2746309124Sdim///
2747309124Sdim/// \headerfile <x86intrin.h>
2748309124Sdim///
2749309124Sdim/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
2750309124Sdim///
2751309124Sdim/// \param __a
2752309124Sdim///    A 128-bit floating-point vector of [4 x float].
2753309124Sdim/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2754309124Sdim///    values.
2755288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2756249423Sdim_mm_cvtps_pi16(__m128 __a)
2757193326Sed{
2758249423Sdim  __m64 __b, __c;
2759296417Sdim
2760249423Sdim  __b = _mm_cvtps_pi32(__a);
2761249423Sdim  __a = _mm_movehl_ps(__a, __a);
2762249423Sdim  __c = _mm_cvtps_pi32(__a);
2763296417Sdim
2764266674Sdim  return _mm_packs_pi32(__b, __c);
2765193326Sed}
2766193326Sed
2767309124Sdim/// \brief Converts each single-precision floating-point element of a 128-bit
2768309124Sdim///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2769309124Sdim///    packs the results into the lower 32 bits of a 64-bit integer vector of
2770309124Sdim///    [8 x i8]. The upper 32 bits of the vector are set to 0. If the
2771309124Sdim///    floating-point element is NaN or infinity, or if the floating-point
2772309124Sdim///    element is greater than 0x7FFFFFFF or less than -0x80, it is converted
2773309124Sdim///    to 0x80. Otherwise if the floating-point element is greater
2774309124Sdim///    than 0x7F, it is converted to 0x7F.
2775309124Sdim///
2776309124Sdim/// \headerfile <x86intrin.h>
2777309124Sdim///
2778309124Sdim/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
2779309124Sdim///
2780309124Sdim/// \param __a
2781309124Sdim///    128-bit floating-point vector of [4 x float].
2782309124Sdim/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2783309124Sdim///    converted values and the uppper 32 bits are set to zero.
2784288943Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS
2785249423Sdim_mm_cvtps_pi8(__m128 __a)
2786193326Sed{
2787249423Sdim  __m64 __b, __c;
2788296417Sdim
2789249423Sdim  __b = _mm_cvtps_pi16(__a);
2790249423Sdim  __c = _mm_setzero_si64();
2791296417Sdim
2792249423Sdim  return _mm_packs_pi16(__b, __c);
2793193326Sed}
2794193326Sed
2795309124Sdim/// \brief Extracts the sign bits from each single-precision floating-point
2796309124Sdim///    element of a 128-bit floating-point vector of [4 x float] and returns the
2797309124Sdim///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2798309124Sdim///    to zero.
2799309124Sdim///
2800309124Sdim/// \headerfile <x86intrin.h>
2801309124Sdim///
2802309124Sdim/// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
2803309124Sdim///
2804309124Sdim/// \param __a
2805309124Sdim///    A 128-bit floating-point vector of [4 x float].
2806309124Sdim/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2807309124Sdim///    single-precision floating-point element of the parameter. Bits [31:4] are
2808309124Sdim///    set to zero.
2809288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS
2810249423Sdim_mm_movemask_ps(__m128 __a)
2811193326Sed{
2812309124Sdim  return __builtin_ia32_movmskps((__v4sf)__a);
2813193326Sed}
2814193326Sed
2815296417Sdim
/* Declaration attribute requesting 16-byte alignment. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Packs four 2-bit element selectors into one 8-bit immediate. The first
 * argument lands in bits [7:6] and the last argument in bits [1:0]. */
#define _MM_SHUFFLE(sel3, sel2, sel1, sel0) \
  (((sel3) << 6) | ((sel2) << 4) | ((sel1) << 2) | (sel0))
2819193326Sed
/* Bit fields of the control/status word handled by _mm_getcsr() and
 * _mm_setcsr(); the _MM_GET_* and _MM_SET_* macros select them by mask. */

/* Exception state flags, bits [5:0]. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* Exception mask bits, bits [12:7], in the same order as the flags above. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* Rounding mode field, bits [14:13]. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

/* Flush-to-zero field, bit 15. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
2845193326Sed
/* Each getter returns the value of _mm_getcsr() with every bit outside the
 * named field cleared. */
#define _MM_GET_EXCEPTION_MASK() \
  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() \
  (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() \
  (_mm_getcsr() & _MM_ROUND_MASK)

/* Each setter reads the current word with _mm_getcsr(), clears the named
 * field, ORs in the argument and writes the result back with _mm_setcsr().
 * The argument is not masked, so any bits it has outside the field are
 * written as well. */
#define _MM_SET_EXCEPTION_MASK(bits) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (bits)))
#define _MM_SET_EXCEPTION_STATE(bits) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (bits)))
#define _MM_SET_FLUSH_ZERO_MODE(bits) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (bits)))
#define _MM_SET_ROUNDING_MODE(bits) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (bits)))
2855193326Sed
/* Transposes in place the 4x4 matrix of floats whose rows are row0..row3.
 *
 * Each argument must be a modifiable lvalue of type __m128; every argument
 * is read twice and assigned once. On completion rowN holds what was
 * column N of the original matrix.
 *
 * The temporaries use reserved (double-underscore) names so that they can
 * neither capture caller variables passed as arguments (for example a
 * caller's own tmp0..tmp3) nor be rewritten by user-defined macros.
 */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  /* Interleave row pairs: __tmp0/__tmp2 hold columns 0-1, */ \
  /* __tmp1/__tmp3 hold columns 2-3. */ \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  /* Join matching 64-bit halves to form each output row. */ \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2868193326Sed
/* Aliases for compatibility: each _m_* name expands to the _mm_* intrinsic
 * on the same line. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Kept for backward compatibility; this definition used to appear twice. */
#define _m_ _mm_
2885212904Sdim
2886288943Sdim#undef __DEFAULT_FN_ATTRS
2887288943Sdim
2888194179Sed/* Ugly hack for backwards-compatibility (compatible with gcc) */
2889309124Sdim#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
2890193326Sed#include <emmintrin.h>
2891194179Sed#endif
2892193326Sed
2893193326Sed#endif /* __XMMINTRIN_H */
2894