smmintrin.h revision 327952
/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H

#include <tmmintrin.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))

/* SSE4 Rounding macros. */

/* Bits [1:0]: rounding mode; bit [2]: use the current MXCSR setting instead. */
#define _MM_FROUND_TO_NEAREST_INT    0x00
#define _MM_FROUND_TO_NEG_INF        0x01
#define _MM_FROUND_TO_POS_INF        0x02
#define _MM_FROUND_TO_ZERO           0x03
#define _MM_FROUND_CUR_DIRECTION     0x04

/* Bit [3]: precision exception control. */
#define _MM_FROUND_RAISE_EXC         0x00
#define _MM_FROUND_NO_EXC            0x08

/* Ready-made combinations of the fields above. */
#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)

/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// \brief Rounds down each element of the 128-bit vector of [4 x float] to
///    an integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)

/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// \brief Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) __extension__ ({ \
  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to an integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) __extension__ ({ \
  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                 (__v4sf)(__m128)(Y), (M)); })

/// \brief Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) __extension__ ({ \
  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to an integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) __extension__ ({ \
  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                  (__v2df)(__m128d)(Y), (M)); })

/* SSE4 Packed Blending Intrinsics.  */
/// \brief Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
                                   (__v2df)(__m128d)(V2), \
                                   (((M) & 0x01) ? 2 : 0), \
                                   (((M) & 0x02) ? 3 : 1)); })

/// \brief Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
                                  (((M) & 0x01) ? 4 : 0), \
                                  (((M) & 0x02) ? 5 : 1), \
                                  (((M) & 0x04) ? 6 : 2), \
                                  (((M) & 0x08) ? 7 : 3)); })

429238926Smm/// \brief Returns a 128-bit vector of [2 x double] where the values are
430236884Smm///    selected from either the first or second operand as specified by the
431236884Smm///    third operand, the control mask.
432185029Spjd///
433185029Spjd/// \headerfile <x86intrin.h>
434185029Spjd///
435185029Spjd/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
436238926Smm///
437238926Smm/// \param __V1
438238926Smm///    A 128-bit vector of [2 x double].
439238926Smm/// \param __V2
440238926Smm///    A 128-bit vector of [2 x double].
441238926Smm/// \param __M
442238926Smm///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
443238926Smm///    values are to be copied. The position of the mask bit corresponds to the
444238926Smm///    most significant bit of a copied value. When a mask bit is 0, the
445238926Smm///    corresponding 64-bit element in operand \a __V1 is copied to the same
446238926Smm///    position in the result. When a mask bit is 1, the corresponding 64-bit
447238926Smm///    element in operand \a __V2 is copied to the same position in the result.
448238926Smm/// \returns A 128-bit vector of [2 x double] containing the copied values.
449238926Smmstatic __inline__ __m128d __DEFAULT_FN_ATTRS
450238926Smm_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
451238926Smm{
452236884Smm  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
453236884Smm                                            (__v2df)__M);
454236884Smm}
455236884Smm
456185029Spjd/// \brief Returns a 128-bit vector of [4 x float] where the values are
457209962Smm///    selected from either the first or second operand as specified by the
458209962Smm///    third operand, the control mask.
459209962Smm///
460209962Smm/// \headerfile <x86intrin.h>
461185029Spjd///
462185029Spjd/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
463185029Spjd///
464185029Spjd/// \param __V1
465185029Spjd///    A 128-bit vector of [4 x float].
466185029Spjd/// \param __V2
467185029Spjd///    A 128-bit vector of [4 x float].
468185029Spjd/// \param __M
469185029Spjd///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
470185029Spjd///    how the values are to be copied. The position of the mask bit corresponds
471185029Spjd///    to the most significant bit of a copied value. When a mask bit is 0, the
472185029Spjd///    corresponding 32-bit element in operand \a __V1 is copied to the same
473185029Spjd///    position in the result. When a mask bit is 1, the corresponding 32-bit
474185029Spjd///    element in operand \a __V2 is copied to the same position in the result.
475185029Spjd/// \returns A 128-bit vector of [4 x float] containing the copied values.
476185029Spjdstatic __inline__ __m128 __DEFAULT_FN_ATTRS
477185029Spjd_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
478185029Spjd{
479185029Spjd  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
480185029Spjd                                           (__v4sf)__M);
481168404Spjd}
482168404Spjd
483168404Spjd/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected
484168404Spjd///    from either of the first or second operand as specified by the third
485168404Spjd///    operand, the control mask.
486168404Spjd///
487168404Spjd/// \headerfile <x86intrin.h>
488168404Spjd///
489168404Spjd/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
490168404Spjd///
491168404Spjd/// \param __V1
492168404Spjd///    A 128-bit vector of [16 x i8].
493168404Spjd/// \param __V2
494168404Spjd///    A 128-bit vector of [16 x i8].
495168404Spjd/// \param __M
496168404Spjd///    A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying
497168404Spjd///    how the values are to be copied. The position of the mask bit corresponds
498168404Spjd///    to the most significant bit of a copied value. When a mask bit is 0, the
499168404Spjd///    corresponding 8-bit element in operand \a __V1 is copied to the same
500168404Spjd///    position in the result. When a mask bit is 1, the corresponding 8-bit
501168404Spjd///    element in operand \a __V2 is copied to the same position in the result.
502168404Spjd/// \returns A 128-bit vector of [16 x i8] containing the copied values.
503168404Spjdstatic __inline__ __m128i __DEFAULT_FN_ATTRS
504168404Spjd_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
505168404Spjd{
506168404Spjd  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
507168404Spjd                                               (__v16qi)__M);
508168404Spjd}
509168404Spjd
/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected
///    from either of the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
///
/// \param V1
///    A 128-bit vector of [8 x i16].
/// \param V2
///    A 128-bit vector of [8 x i16].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
                                   (__v8hi)(__m128i)(V2), \
                                   (((M) & 0x01) ?  8 : 0), \
                                   (((M) & 0x02) ?  9 : 1), \
                                   (((M) & 0x04) ? 10 : 2), \
                                   (((M) & 0x08) ? 11 : 3), \
                                   (((M) & 0x10) ? 12 : 4), \
                                   (((M) & 0x20) ? 13 : 5), \
                                   (((M) & 0x40) ? 14 : 6), \
                                   (((M) & 0x80) ? 15 : 7)); })

546168404Spjd/* SSE4 Dword Multiply Instructions.  */
547168404Spjd/// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32]
548185029Spjd///    and returns the lower 32 bits of the each product in a 128-bit vector of
549185029Spjd///    [4 x i32].
550168404Spjd///
551168404Spjd/// \headerfile <x86intrin.h>
552168404Spjd///
553168404Spjd/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
554168404Spjd///
555168404Spjd/// \param __V1
556168404Spjd///    A 128-bit integer vector.
557168404Spjd/// \param __V2
558168404Spjd///    A 128-bit integer vector.
559168404Spjd/// \returns A 128-bit integer vector containing the products of both operands.
560168404Spjdstatic __inline__  __m128i __DEFAULT_FN_ATTRS
561168404Spjd_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
562168404Spjd{
563168404Spjd  return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
564185029Spjd}
565185029Spjd
566185029Spjd/// \brief Multiplies corresponding even-indexed elements of two 128-bit
567168404Spjd///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
568185029Spjd///    containing the products.
569185029Spjd///
570185029Spjd/// \headerfile <x86intrin.h>
571185029Spjd///
572185029Spjd/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
573185029Spjd///
574185029Spjd/// \param __V1
575185029Spjd///    A 128-bit vector of [4 x i32].
576168404Spjd/// \param __V2
577168404Spjd///    A 128-bit vector of [4 x i32].
578168404Spjd/// \returns A 128-bit vector of [2 x i64] containing the products of both
579168404Spjd///    operands.
580168404Spjdstatic __inline__  __m128i __DEFAULT_FN_ATTRS
581168404Spjd_mm_mul_epi32 (__m128i __V1, __m128i __V2)
582168404Spjd{
583168404Spjd  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
584168404Spjd}
585168404Spjd
586168404Spjd/* SSE4 Floating Point Dot Product Instructions.  */
/// \brief Computes the dot product of two 128-bit vectors of [4 x float] and
///    returns it in selected elements of a 128-bit vector of [4 x float].
///
///    The immediate integer operand chooses which input elements take part
///    in the dot product and which result elements receive it.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param Y
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand. \n
///    Bits [7:4] select the inputs, with bit [4] corresponding to the lowest
///    element and bit [7] to the highest element of each [4 x float] vector.
///    If a bit is set, the corresponding elements of the two input vectors
///    are used as an input for the dot product; otherwise that input is
///    treated as zero. \n
///    Bits [3:0] select the outputs, with bit [0] corresponding to the lowest
///    element and bit [3] to the highest element of the result. If a bit is
///    set, the dot product is returned in the corresponding element;
///    otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M)                                 \
  __extension__ ({                                         \
    (__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X),       \
                                (__v4sf)(__m128)(Y),       \
                                (M));                      \
  })
622168404Spjd
/// \brief Computes the dot product of two 128-bit vectors of [2 x double] and
///    returns it in selected elements of a 128-bit vector of [2 x double].
///
///    The immediate integer operand chooses which input elements take part
///    in the dot product and which result elements receive it.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. \n
///    Bits [5:4] select the inputs, with bit [4] corresponding to the lowest
///    element and bit [5] to the highest element of each [2 x double] vector.
///    If a bit is set, the corresponding elements of the two input vectors
///    are used as an input for the dot product; otherwise that input is
///    treated as zero. \n
///    Bits [1:0] select the outputs, with bit [0] corresponding to the lowest
///    element and bit [1] to the highest element of the result. If a bit is
///    set, the dot product is returned in the corresponding element;
///    otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M)                                  \
  __extension__ ({                                          \
    (__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X),      \
                                 (__v2df)(__m128d)(Y),      \
                                 (M));                      \
  })
657224171Sgibbs
658224171Sgibbs/* SSE4 Streaming Load Hint Instruction.  */
659224171Sgibbs/// \brief Loads integer values from a 128-bit aligned memory location to a
660224171Sgibbs///    128-bit integer vector.
661224171Sgibbs///
662224171Sgibbs/// \headerfile <x86intrin.h>
663224171Sgibbs///
664224171Sgibbs/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
665224171Sgibbs///
666224171Sgibbs/// \param __V
667224171Sgibbs///    A pointer to a 128-bit aligned memory location that contains the integer
668224171Sgibbs///    values.
669224171Sgibbs/// \returns A 128-bit integer vector containing the data stored at the
670224171Sgibbs///    specified memory location.
671224171Sgibbsstatic __inline__  __m128i __DEFAULT_FN_ATTRS
672224171Sgibbs_mm_stream_load_si128 (__m128i const *__V)
673224171Sgibbs{
674224171Sgibbs  return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
675224171Sgibbs}
676224171Sgibbs
677224171Sgibbs/* SSE4 Packed Integer Min/Max Instructions.  */
678224171Sgibbs/// \brief Compares the corresponding elements of two 128-bit vectors of
679224171Sgibbs///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
680224171Sgibbs///    of the two values.
681224171Sgibbs///
682224171Sgibbs/// \headerfile <x86intrin.h>
683224171Sgibbs///
684224171Sgibbs/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
685224171Sgibbs///
686224171Sgibbs/// \param __V1
687224171Sgibbs///    A 128-bit vector of [16 x i8].
688224171Sgibbs/// \param __V2
689224171Sgibbs///    A 128-bit vector of [16 x i8]
690224171Sgibbs/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
691224171Sgibbsstatic __inline__  __m128i __DEFAULT_FN_ATTRS
692224171Sgibbs_mm_min_epi8 (__m128i __V1, __m128i __V2)
693224171Sgibbs{
694224171Sgibbs  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
695224171Sgibbs}
696224171Sgibbs
697224171Sgibbs/// \brief Compares the corresponding elements of two 128-bit vectors of
698224171Sgibbs///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
699224171Sgibbs///    greater value of the two.
700224171Sgibbs///
701224171Sgibbs/// \headerfile <x86intrin.h>
702224171Sgibbs///
703224171Sgibbs/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
704224171Sgibbs///
705224171Sgibbs/// \param __V1
706224171Sgibbs///    A 128-bit vector of [16 x i8].
707224171Sgibbs/// \param __V2
708224171Sgibbs///    A 128-bit vector of [16 x i8].
709224171Sgibbs/// \returns A 128-bit vector of [16 x i8] containing the greater values.
710224171Sgibbsstatic __inline__  __m128i __DEFAULT_FN_ATTRS
711224171Sgibbs_mm_max_epi8 (__m128i __V1, __m128i __V2)
712224171Sgibbs{
713224171Sgibbs  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
714224171Sgibbs}
715224171Sgibbs
716224171Sgibbs/// \brief Compares the corresponding elements of two 128-bit vectors of
717224171Sgibbs///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
718224171Sgibbs///    value of the two.
719224171Sgibbs///
720224171Sgibbs/// \headerfile <x86intrin.h>
721224171Sgibbs///
722224171Sgibbs/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
723224171Sgibbs///
724224171Sgibbs/// \param __V1
725224171Sgibbs///    A 128-bit vector of [8 x u16].
726224171Sgibbs/// \param __V2
727224171Sgibbs///    A 128-bit vector of [8 x u16].
728224171Sgibbs/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
729224171Sgibbsstatic __inline__  __m128i __DEFAULT_FN_ATTRS
730224171Sgibbs_mm_min_epu16 (__m128i __V1, __m128i __V2)
731224171Sgibbs{
732224171Sgibbs  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
733224171Sgibbs}
734224171Sgibbs
735224171Sgibbs/// \brief Compares the corresponding elements of two 128-bit vectors of
736224171Sgibbs///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
737224171Sgibbs///    greater value of the two.
738224171Sgibbs///
739224171Sgibbs/// \headerfile <x86intrin.h>
740224171Sgibbs///
741224171Sgibbs/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
742224171Sgibbs///
743224171Sgibbs/// \param __V1
744224171Sgibbs///    A 128-bit vector of [8 x u16].
745224171Sgibbs/// \param __V2
746236884Smm///    A 128-bit vector of [8 x u16].
747185029Spjd/// \returns A 128-bit vector of [8 x u16] containing the greater values.
748185029Spjdstatic __inline__  __m128i __DEFAULT_FN_ATTRS
749168404Spjd_mm_max_epu16 (__m128i __V1, __m128i __V2)
750168404Spjd{
751168404Spjd  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
752168404Spjd}
753168404Spjd
754168404Spjd/// \brief Compares the corresponding elements of two 128-bit vectors of
755236884Smm///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
756185029Spjd///    value of the two.
757236884Smm///
758236884Smm/// \headerfile <x86intrin.h>
759185029Spjd///
760168404Spjd/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
761168404Spjd///
762168404Spjd/// \param __V1
763168404Spjd///    A 128-bit vector of [4 x i32].
764168404Spjd/// \param __V2
765168404Spjd///    A 128-bit vector of [4 x i32].
766168404Spjd/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
767168404Spjdstatic __inline__  __m128i __DEFAULT_FN_ATTRS
768168404Spjd_mm_min_epi32 (__m128i __V1, __m128i __V2)
769168404Spjd{
770168404Spjd  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
771236884Smm}
772168404Spjd
773185029Spjd/// \brief Compares the corresponding elements of two 128-bit vectors of
774168404Spjd///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
775185029Spjd///    greater value of the two.
776168404Spjd///
777168404Spjd/// \headerfile <x86intrin.h>
778185029Spjd///
779185029Spjd/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
780185029Spjd///
781168404Spjd/// \param __V1
782168404Spjd///    A 128-bit vector of [4 x i32].
783236884Smm/// \param __V2
784168404Spjd///    A 128-bit vector of [4 x i32].
785168404Spjd/// \returns A 128-bit vector of [4 x i32] containing the greater values.
786168404Spjdstatic __inline__  __m128i __DEFAULT_FN_ATTRS
787168404Spjd_mm_max_epi32 (__m128i __V1, __m128i __V2)
788168404Spjd{
789168404Spjd  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
790168404Spjd}
791236884Smm
792236884Smm/// \brief Compares the corresponding elements of two 128-bit vectors of
793236884Smm///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
794168404Spjd///    value of the two.
795168404Spjd///
796185029Spjd/// \headerfile <x86intrin.h>
797185029Spjd///
798185029Spjd/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c>  instruction.
799185029Spjd///
800185029Spjd/// \param __V1
801185029Spjd///    A 128-bit vector of [4 x u32].
802185029Spjd/// \param __V2
803185029Spjd///    A 128-bit vector of [4 x u32].
804185029Spjd/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
805185029Spjdstatic __inline__  __m128i __DEFAULT_FN_ATTRS
806168404Spjd_mm_min_epu32 (__m128i __V1, __m128i __V2)
807168404Spjd{
808251634Sdelphij  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
809168404Spjd}
810168404Spjd
811185029Spjd/// \brief Compares the corresponding elements of two 128-bit vectors of
812185029Spjd///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
813185029Spjd///    greater value of the two.
814185029Spjd///
815185029Spjd/// \headerfile <x86intrin.h>
816185029Spjd///
817185029Spjd/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
818185029Spjd///
819185029Spjd/// \param __V1
820185029Spjd///    A 128-bit vector of [4 x u32].
821185029Spjd/// \param __V2
822236884Smm///    A 128-bit vector of [4 x u32].
823236884Smm/// \returns A 128-bit vector of [4 x u32] containing the greater values.
824236884Smmstatic __inline__  __m128i __DEFAULT_FN_ATTRS
825236884Smm_mm_max_epu32 (__m128i __V1, __m128i __V2)
826236884Smm{
827236884Smm  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
828236884Smm}
829236884Smm
830236884Smm/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
/// \brief Takes the first argument \a X and inserts an element from the second
///    argument \a Y as selected by the third argument \a N. That result then
///    has elements zeroed out also as selected by the third argument \a N. The
///    resulting 128-bit vector of [4 x float] is then returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTPS / INSERTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector source operand of [4 x float]. With the exception of
///    those bits in the result copied from parameter \a Y and zeroed by bits
///    [3:0] of \a N, all bits from this parameter are copied to the result.
/// \param Y
///    A 128-bit vector source operand of [4 x float]. One single-precision
///    floating-point element from this source, as determined by the immediate
///    parameter, is copied to the result.
/// \param N
///    Specifies which bits from operand \a Y will be copied, which bits in the
///    result they will be copied to, and which bits in the result will be
///    cleared. The following assignments are made: \n
///    Bits [7:6] specify the bits to copy from operand \a Y: \n
///      00: Selects bits [31:0] from operand \a Y. \n
///      01: Selects bits [63:32] from operand \a Y. \n
///      10: Selects bits [95:64] from operand \a Y. \n
///      11: Selects bits [127:96] from operand \a Y. \n
///    Bits [5:4] specify the bits in the result to which the selected bits
///    from operand \a Y are copied: \n
///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
///    Bits [3:0]: If any of these bits are set, the corresponding result
///    element is cleared.
/// \returns A 128-bit vector of [4 x float] containing the copied
///    single-precision floating point elements from the operands.
/* The operands and the result are cast explicitly, in the same way as
   _mm_dp_ps, so the macro has the documented __m128 signature. */
#define _mm_insert_ps(X, Y, N)                                   \
  ((__m128)__builtin_ia32_insertps128((__v4sf)(__m128)(X),       \
                                      (__v4sf)(__m128)(Y), (N)))
872168404Spjd
/// \brief Extracts the bit pattern of one element of a 128-bit vector of
///    [4 x float] and returns it as a 32-bit integer, using the immediate
///    value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] determine which bits from the argument
///    \a X are extracted and returned: \n
///    00: Bits [31:0] of parameter \a X are returned. \n
///    01: Bits [63:32] of parameter \a X are returned. \n
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N)                              \
  (__extension__ ({                                       \
    __v4sf __src = (__v4sf)(__m128)(X);                   \
    /* Reinterpret the float's bits through a union. */   \
    union { float __f; int __i; } __bits;                 \
    __bits.__f = __src[(N) & 3];                          \
    __bits.__i;                                           \
  }))
900185029Spjd
901168404Spjd/* Miscellaneous insert and extract macros.  */
/* Stores element N of the [4 x float] vector X into the lvalue D.  N is used
   as the element index without masking.  */
#define _MM_EXTRACT_FLOAT(D, X, N)            \
  (__extension__ ({                           \
    __v4sf __src = (__v4sf)(X);               \
    (D) = __src[(N)];                         \
  }))
905185029Spjd
/* Builds an immediate suitable for _mm_insert_ps: X is shifted into bits
   [7:6], Y into bits [5:4], and the zeroing bits Z are OR-ed in unshifted.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) \
  (((X) << 6) | ((Y) << 4) | (Z))
909185029Spjd
/* Returns a vector whose first element is element N of X: the element is
   inserted into position 0 of a zero vector with zeroing bits 0x0e.  */
#define _MM_PICK_OUT_PS(X, N)                              \
  _mm_insert_ps(_mm_setzero_ps(), (X),                     \
                _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
913168404Spjd
914168404Spjd/* Insert int into packed integer array at index.  */
/// \brief Constructs a 128-bit vector of [16 x i8] by copying the 128-bit
///    integer vector parameter and then replacing one element of the copy
///    with the lower 8 bits of the integer parameter \a I. The element is
///    chosen by the immediate value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
///    result and then one of the sixteen elements in the result vector is
///    replaced by the lower 8 bits of \a I.
/// \param I
///    An integer. The lower 8 bits of this operand are written to the result
///    at the element specified by \a N.
/// \param N
///    An immediate value. Bits [3:0] give the index <c>i</c> (0 through 15)
///    of the element that is replaced; the remaining bits are ignored. \n
///    Element <c>i</c> occupies bits [8*i+7:8*i] of the result, so 0000
///    selects bits [7:0], 0001 selects bits [15:8], and so on up to 1111,
///    which selects bits [127:120].
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N)                     \
  (__extension__ ({                                  \
    __v16qi __dst = (__v16qi)(__m128i)(X);           \
    __dst[(N) & 15] = (I);                           \
    (__m128i)__dst;                                  \
  }))
959185029Spjd
/// \brief Constructs a 128-bit vector of [4 x i32] by copying the 128-bit
///    integer vector parameter and then replacing one element of the copy
///    with the 32-bit integer parameter \a I. The element is chosen by the
///    immediate value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
///    result and then one of the four elements in the result vector is
///    replaced by \a I.
/// \param I
///    A 32-bit integer that is written to the result at the element
///    specified by \a N.
/// \param N
///    An immediate value. Bits [1:0] select the element of the result that
///    is replaced by \a I: \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N)                    \
  (__extension__ ({                                  \
    __v4si __dst = (__v4si)(__m128i)(X);             \
    __dst[(N) & 3] = (I);                            \
    (__m128i)__dst;                                  \
  }))
992168404Spjd
#ifdef __x86_64__
/// \brief Constructs a 128-bit vector of [2 x i64] by copying the 128-bit
///    integer vector parameter and then replacing one element of the copy
///    with the 64-bit integer parameter \a I. The element is chosen by the
///    immediate value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
///    result and then one of the two elements in the result vector is replaced
///    by \a I.
/// \param I
///    A 64-bit integer that is written to the result at the element
///    specified by \a N.
/// \param N
///    An immediate value. Bit [0] selects the element of the result that is
///    replaced by \a I: \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N)                    \
  (__extension__ ({                                  \
    __v2di __dst = (__v2di)(__m128i)(X);             \
    __dst[(N) & 1] = (I);                            \
    (__m128i)__dst;                                  \
  }))
#endif /* __x86_64__ */
1025236884Smm
1026236884Smm/* Extract int from packed integer array at index.  This returns the element
1027236884Smm * as a zero extended value, so it is unsigned.
1028251634Sdelphij */
/// \brief Extracts an 8-bit element from the 128-bit integer vector of
///    [16 x i8], using the immediate value parameter \a N as a selector, and
///    returns it zero-extended.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] give the index <c>i</c> (0 through 15)
///    of the 8-bit element of \a X to extract; the remaining bits are
///    ignored. \n
///    Element <c>i</c> occupies bits [8*i+7:8*i] of \a X, so 0000 selects
///    bits [7:0], 0001 selects bits [15:8], and so on up to 1111, which
///    selects bits [127:120].
/// \returns An unsigned integer, whose lower 8 bits are selected from the
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
#define _mm_extract_epi8(X, N)                             \
  (__extension__ ({                                        \
    __v16qi __src = (__v16qi)(__m128i)(X);                 \
    /* Going through unsigned char zero-extends. */        \
    (int)(unsigned char)__src[(N) & 15];                   \
  }))
1067168404Spjd
/// \brief Extracts a 32-bit element from the 128-bit integer vector of
///    [4 x i32], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] specify which 32-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    00: Bits [31:0] of the parameter \a X are extracted. \n
///    01: Bits [63:32] of the parameter \a X are extracted. \n
///    10: Bits [95:64] of the parameter \a X are extracted. \n
///    11: Bits [127:96] of the parameter \a X are extracted.
/// \returns An integer containing the selected 32-bit element of the 128-bit
///    integer vector parameter.
#define _mm_extract_epi32(X, N)                      \
  (__extension__ ({                                  \
    __v4si __src = (__v4si)(__m128i)(X);             \
    (int)__src[(N) & 3];                             \
  }))
1093168404Spjd
#ifdef __x86_64__
/// \brief Extracts a 64-bit element from the 128-bit integer vector of
///    [2 x i64], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] specifies which 64-bit vector element from
///    the argument \a X to return. \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned.
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N)                      \
  (__extension__ ({                                  \
    __v2di __src = (__v2di)(__m128i)(X);             \
    (long long)__src[(N) & 1];                       \
  }))
#endif /* __x86_64__ */
1118248571Smm
1119248571Smm/* SSE4 128-bit Packed Integer Comparisons.  */
1120168404Spjd/// \brief Tests whether the specified bits in a 128-bit integer vector are all
1121248571Smm///    zeros.
1122248571Smm///
1123168404Spjd/// \headerfile <x86intrin.h>
1124168404Spjd///
1125168404Spjd/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1126168404Spjd///
1127168404Spjd/// \param __M
1128168404Spjd///    A 128-bit integer vector containing the bits to be tested.
1129168404Spjd/// \param __V
1130168404Spjd///    A 128-bit integer vector selecting which bits to test in operand \a __M.
1131168404Spjd/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1132168404Spjdstatic __inline__ int __DEFAULT_FN_ATTRS
1133168404Spjd_mm_testz_si128(__m128i __M, __m128i __V)
1134168404Spjd{
1135168404Spjd  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1136168404Spjd}
1137168404Spjd
1138168404Spjd/// \brief Tests whether the specified bits in a 128-bit integer vector are all
1139168404Spjd///    ones.
1140168404Spjd///
1141207670Smm/// \headerfile <x86intrin.h>
1142168404Spjd///
1143168404Spjd/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1144168404Spjd///
1145168404Spjd/// \param __M
1146168404Spjd///    A 128-bit integer vector containing the bits to be tested.
1147168404Spjd/// \param __V
1148207670Smm///    A 128-bit integer vector selecting which bits to test in operand \a __M.
1149168404Spjd/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1150168404Spjdstatic __inline__ int __DEFAULT_FN_ATTRS
1151168404Spjd_mm_testc_si128(__m128i __M, __m128i __V)
1152168404Spjd{
1153207670Smm  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1154207670Smm}
1155207670Smm
1156168404Spjd/// \brief Tests whether the specified bits in a 128-bit integer vector are
1157168404Spjd///    neither all zeros nor all ones.
1158168404Spjd///
1159168404Spjd/// \headerfile <x86intrin.h>
1160168404Spjd///
1161168404Spjd/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1162168404Spjd///
1163168404Spjd/// \param __M
1164168404Spjd///    A 128-bit integer vector containing the bits to be tested.
1165168404Spjd/// \param __V
1166168404Spjd///    A 128-bit integer vector selecting which bits to test in operand \a __M.
1167168404Spjd/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1168168404Spjd///    FALSE otherwise.
1169168404Spjdstatic __inline__ int __DEFAULT_FN_ATTRS
1170168404Spjd_mm_testnzc_si128(__m128i __M, __m128i __V)
1171168404Spjd{
1172168404Spjd  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1173168404Spjd}
1174168404Spjd
/// \brief Tests whether all bits in a 128-bit integer vector are set to 1.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_ones(__m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param V
///    A 128-bit integer vector containing the bits to be tested.
/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
///    otherwise.
///
/// The mask handed to _mm_testc_si128() is a constant all-ones vector, so the
/// macro argument \a V is expanded (and therefore evaluated) exactly once.
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
1191248571Smm
/// \brief Checks whether the bits of a 128-bit integer vector that are
///    selected by a mask are a mixture, i.e. neither all zeros nor all ones.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector holding the bits under test.
/// \param V
///    A 128-bit integer vector that selects which bits of \a M are tested.
/// \returns TRUE if the selected bits are neither all zeros nor all ones;
///    FALSE otherwise.
///
/// This is an alias that forwards both operands to _mm_testnzc_si128().
#define _mm_test_mix_ones_zeros(M, V) \
  _mm_testnzc_si128((M), (V))
1210168404Spjd
/// \brief Checks whether the bits of a 128-bit integer vector that are
///    selected by a mask are all zeros.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector holding the bits under test.
/// \param V
///    A 128-bit integer vector that selects which bits of \a M are tested.
/// \returns TRUE if the selected bits are all zeros; FALSE otherwise.
///
/// This is an alias that forwards both operands to _mm_testz_si128().
#define _mm_test_all_zeros(M, V) \
  _mm_testz_si128((M), (V))
1228185029Spjd
1229185029Spjd/* SSE4 64-bit Packed Integer Comparisons.  */
1230185029Spjd/// \brief Compares each of the corresponding 64-bit values of the 128-bit
1231185029Spjd///    integer vectors for equality.
1232185029Spjd///
1233185029Spjd/// \headerfile <x86intrin.h>
1234168404Spjd///
1235168404Spjd/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1236168404Spjd///
1237168404Spjd/// \param __V1
1238168404Spjd///    A 128-bit integer vector.
1239168404Spjd/// \param __V2
1240168404Spjd///    A 128-bit integer vector.
1241168404Spjd/// \returns A 128-bit integer vector containing the comparison results.
1242168404Spjdstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1243168404Spjd_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
1244168404Spjd{
1245168404Spjd  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1246213197Smm}
1247213197Smm
1248213197Smm/* SSE4 Packed Integer Sign-Extension.  */
1249213197Smm/// \brief Sign-extends each of the lower eight 8-bit integer elements of a
1250168404Spjd///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
1251213197Smm///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
1252213197Smm///    are unused.
1253213197Smm///
1254213197Smm/// \headerfile <x86intrin.h>
1255213197Smm///
1256213197Smm/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1257213197Smm///
1258213197Smm/// \param __V
1259213197Smm///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
1260213197Smm///    extended to 16-bit values.
1261213197Smm/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1262213197Smmstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1263213197Smm_mm_cvtepi8_epi16(__m128i __V)
1264213197Smm{
1265213197Smm  /* This function always performs a signed extension, but __v16qi is a char
1266213197Smm     which may be signed or unsigned, so use __v16qs. */
1267213197Smm  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1268213197Smm}
1269213197Smm
1270213197Smm/// \brief Sign-extends each of the lower four 8-bit integer elements of a
1271213197Smm///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
1272213197Smm///    128-bit vector of [4 x i32]. The upper twelve elements of the input
1273213197Smm///    vector are unused.
1274213197Smm///
1275213197Smm/// \headerfile <x86intrin.h>
1276213197Smm///
1277213197Smm/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1278213197Smm///
1279213197Smm/// \param __V
1280213197Smm///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign-
1281213197Smm///    extended to 32-bit values.
1282213197Smm/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1283213197Smmstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1284213197Smm_mm_cvtepi8_epi32(__m128i __V)
1285213197Smm{
1286213197Smm  /* This function always performs a signed extension, but __v16qi is a char
1287213197Smm     which may be signed or unsigned, so use __v16qs. */
1288213197Smm  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1289213197Smm}
1290213197Smm
1291168404Spjd/// \brief Sign-extends each of the lower two 8-bit integer elements of a
1292213197Smm///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1293213197Smm///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1294213197Smm///    vector are unused.
1295213197Smm///
1296213197Smm/// \headerfile <x86intrin.h>
1297213197Smm///
1298213197Smm/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1299254591Sgibbs///
1300219089Spjd/// \param __V
1301213197Smm///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign-
1302219089Spjd///    extended to 64-bit values.
1303213197Smm/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1304213197Smmstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1305254591Sgibbs_mm_cvtepi8_epi64(__m128i __V)
1306213197Smm{
1307224169Sgibbs  /* This function always performs a signed extension, but __v16qi is a char
1308213197Smm     which may be signed or unsigned, so use __v16qs. */
1309213197Smm  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1310213197Smm}
1311213197Smm
1312213197Smm/// \brief Sign-extends each of the lower four 16-bit integer elements of a
1313219089Spjd///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1314254591Sgibbs///    a 128-bit vector of [4 x i32]. The upper four elements of the input
1315219089Spjd///    vector are unused.
1316213197Smm///
1317213197Smm/// \headerfile <x86intrin.h>
1318213197Smm///
1319213197Smm/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1320213197Smm///
1321213197Smm/// \param __V
1322213197Smm///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign-
1323213197Smm///    extended to 32-bit values.
1324213197Smm/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1325213197Smmstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1326213197Smm_mm_cvtepi16_epi32(__m128i __V)
1327213197Smm{
1328213197Smm  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1329213197Smm}
1330213197Smm
1331213197Smm/// \brief Sign-extends each of the lower two 16-bit integer elements of a
1332213197Smm///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1333213197Smm///    a 128-bit vector of [2 x i64]. The upper six elements of the input
1334213197Smm///    vector are unused.
1335213197Smm///
1336213197Smm/// \headerfile <x86intrin.h>
1337213197Smm///
1338213197Smm/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1339224170Sgibbs///
1340224170Sgibbs/// \param __V
1341213197Smm///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign-
1342224170Sgibbs///    extended to 64-bit values.
1343224170Sgibbs/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1344213197Smmstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1345213197Smm_mm_cvtepi16_epi64(__m128i __V)
1346213197Smm{
1347213197Smm  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1348213197Smm}
1349213197Smm
1350213197Smm/// \brief Sign-extends each of the lower two 32-bit integer elements of a
1351213197Smm///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1352213197Smm///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1353213197Smm///    are unused.
1354213197Smm///
1355213197Smm/// \headerfile <x86intrin.h>
1356213197Smm///
1357213197Smm/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1358213197Smm///
1359213197Smm/// \param __V
1360213197Smm///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign-
1361213197Smm///    extended to 64-bit values.
1362213197Smm/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1363213197Smmstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1364236884Smm_mm_cvtepi32_epi64(__m128i __V)
1365236884Smm{
1366236884Smm  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1367236884Smm}
1368254591Sgibbs
1369254591Sgibbs/* SSE4 Packed Integer Zero-Extension.  */
1370254591Sgibbs/// \brief Zero-extends each of the lower eight 8-bit integer elements of a
1371254591Sgibbs///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
1372213197Smm///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
1373213197Smm///    are unused.
1374213197Smm///
1375213197Smm/// \headerfile <x86intrin.h>
1376213197Smm///
1377213197Smm/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1378213197Smm///
1379213197Smm/// \param __V
1380213197Smm///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero-
1381213197Smm///    extended to 16-bit values.
1382213197Smm/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1383213197Smmstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1384213197Smm_mm_cvtepu8_epi16(__m128i __V)
1385213197Smm{
1386213197Smm  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1387213197Smm}
1388213197Smm
1389213197Smm/// \brief Zero-extends each of the lower four 8-bit integer elements of a
1390213197Smm///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
1391213197Smm///    128-bit vector of [4 x i32]. The upper twelve elements of the input
1392213197Smm///    vector are unused.
1393213197Smm///
1394213197Smm/// \headerfile <x86intrin.h>
1395213197Smm///
1396213197Smm/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1397213197Smm///
1398213197Smm/// \param __V
1399213197Smm///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero-
1400213197Smm///    extended to 32-bit values.
1401213197Smm/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1402219089Spjdstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1403219089Spjd_mm_cvtepu8_epi32(__m128i __V)
1404219089Spjd{
1405219089Spjd  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1406219089Spjd}
1407219089Spjd
1408219089Spjd/// \brief Zero-extends each of the lower two 8-bit integer elements of a
1409219089Spjd///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1410213197Smm///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1411213197Smm///    vector are unused.
1412213197Smm///
1413213197Smm/// \headerfile <x86intrin.h>
1414254591Sgibbs///
1415254591Sgibbs/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1416254591Sgibbs///
1417254591Sgibbs/// \param __V
1418254591Sgibbs///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero-
1419254591Sgibbs///    extended to 64-bit values.
1420213197Smm/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1421213197Smmstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1422219089Spjd_mm_cvtepu8_epi64(__m128i __V)
1423219089Spjd{
1424219089Spjd  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1425219089Spjd}
1426219089Spjd
1427219089Spjd/// \brief Zero-extends each of the lower four 16-bit integer elements of a
1428219089Spjd///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1429219089Spjd///    a 128-bit vector of [4 x i32]. The upper four elements of the input
1430219089Spjd///    vector are unused.
1431219089Spjd///
1432213197Smm/// \headerfile <x86intrin.h>
1433213197Smm///
1434213197Smm/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1435219089Spjd///
1436213197Smm/// \param __V
1437219089Spjd///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero-
1438213197Smm///    extended to 32-bit values.
1439219089Spjd/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1440219089Spjdstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1441219089Spjd_mm_cvtepu16_epi32(__m128i __V)
1442219089Spjd{
1443213197Smm  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1444219089Spjd}
1445213197Smm
1446213197Smm/// \brief Zero-extends each of the lower two 16-bit integer elements of a
1447213197Smm///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1448213197Smm///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1449213197Smm///    are unused.
1450213197Smm///
1451213197Smm/// \headerfile <x86intrin.h>
1452213197Smm///
1453168404Spjd/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1454168404Spjd///
1455168404Spjd/// \param __V
1456168404Spjd///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero-
1457213197Smm///    extended to 64-bit values.
1458168404Spjd/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1459168404Spjdstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1460168404Spjd_mm_cvtepu16_epi64(__m128i __V)
1461168404Spjd{
1462168404Spjd  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1463168404Spjd}
1464168404Spjd
1465219089Spjd/// \brief Zero-extends each of the lower two 32-bit integer elements of a
1466219089Spjd///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1467168404Spjd///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1468168404Spjd///    are unused.
1469219089Spjd///
1470168404Spjd/// \headerfile <x86intrin.h>
1471168404Spjd///
1472168404Spjd/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1473185029Spjd///
1474168404Spjd/// \param __V
1475168404Spjd///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero-
1476185029Spjd///    extended to 64-bit values.
1477168404Spjd/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1478168404Spjdstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1479168404Spjd_mm_cvtepu32_epi64(__m128i __V)
1480168404Spjd{
1481168404Spjd  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1482168404Spjd}
1483168404Spjd
1484168404Spjd/* SSE4 Pack with Unsigned Saturation.  */
1485168404Spjd/// \brief Converts 32-bit signed integers from both 128-bit integer vector
1486168404Spjd///    operands into 16-bit unsigned integers, and returns the packed result.
1487168404Spjd///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1488168404Spjd///    0x0000 are saturated to 0x0000.
1489168404Spjd///
1490168404Spjd/// \headerfile <x86intrin.h>
1491168404Spjd///
1492168404Spjd/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1493168404Spjd///
1494168404Spjd/// \param __V1
1495236884Smm///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1496236884Smm///    signed integer and is converted to a 16-bit unsigned integer with
1497236884Smm///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1498236884Smm///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1499185029Spjd///    are written to the lower 64 bits of the result.
1500185029Spjd/// \param __V2
1501185029Spjd///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1502185029Spjd///    signed integer and is converted to a 16-bit unsigned integer with
1503168404Spjd///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1504168404Spjd///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1505168404Spjd///    are written to the higher 64 bits of the result.
1506168404Spjd/// \returns A 128-bit vector of [8 x i16] containing the converted values.
1507168404Spjdstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1508168404Spjd_mm_packus_epi32(__m128i __V1, __m128i __V2)
1509168404Spjd{
1510168404Spjd  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1511168404Spjd}
1512168404Spjd
1513168404Spjd/* SSE4 Multiple Packed Sums of Absolute Difference.  */
/// \brief Computes eight sums of absolute differences. Each sum combines the
///    absolute differences between four consecutive 8-bit unsigned integers
///    of the first operand and a fixed group of four 8-bit unsigned integers
///    of the second operand. The starting positions in both operands are
///    selected by bit fields of the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand specifying how the absolute differences are to
///    be calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4
///    j = M10 * 4
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0])
///      d1 = abs(X[i + k + 1] - Y[j + 1])
///      d2 = abs(X[i + k + 2] - Y[j + 2])
///      d3 = abs(X[i + k + 3] - Y[j + 3])
///      r[k] = d0 + d1 + d2 + d3
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
                                      (__v16qi)(__m128i)(Y), (M)))
1552213197Smm
1553213197Smm/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit
1554213197Smm///    vector of [8 x u16] and returns it and along with its index.
1555213197Smm///
1556213197Smm/// \headerfile <x86intrin.h>
1557213197Smm///
1558213197Smm/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1559213197Smm/// instruction.
1560213197Smm///
1561213197Smm/// \param __V
1562213197Smm///    A 128-bit vector of [8 x u16].
1563213197Smm/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1564213197Smm///    in parameter \a __V, bits [18:16] contain the index of the minimum value
1565213197Smm///    and the remaining bits are set to 0.
1566213197Smmstatic __inline__ __m128i __DEFAULT_FN_ATTRS
1567213197Smm_mm_minpos_epu16(__m128i __V)
1568213197Smm{
1569213197Smm  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
1570213197Smm}
1571213197Smm
/* Handle the sse4.2 definitions here. */

/* These definitions are normally in nmmintrin.h, but gcc puts them in here
   so we'll do the same.  */

/* Redefine the default function attributes so that everything declared from
   this point on is compiled for the "sse4.2" target instead of "sse4.1". */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
1579213197Smm
/* Flag values for the immediate operand of the SSE4.2 string comparison
   intrinsics (_mm_cmpXstrm() / _mm_cmpXstri()). One value from each group
   below may be OR'ed together to form the immediate.  */

/* These specify the type of data that we're comparing (bits [1:0]).  */
#define _SIDD_UBYTE_OPS                 0x00
#define _SIDD_UWORD_OPS                 0x01
#define _SIDD_SBYTE_OPS                 0x02
#define _SIDD_SWORD_OPS                 0x03

/* These specify the type of comparison operation (bits [3:2]).  */
#define _SIDD_CMP_EQUAL_ANY             0x00
#define _SIDD_CMP_RANGES                0x04
#define _SIDD_CMP_EQUAL_EACH            0x08
#define _SIDD_CMP_EQUAL_ORDERED         0x0c

/* These macros specify the polarity of the operation (bits [5:4]).  */
#define _SIDD_POSITIVE_POLARITY         0x00
#define _SIDD_NEGATIVE_POLARITY         0x10
#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30

/* These macros are used in _mm_cmpXstri() to specify the return: the index
   of the least or the most significant set bit (bit [6]).  */
#define _SIDD_LEAST_SIGNIFICANT         0x00
#define _SIDD_MOST_SIGNIFICANT          0x40

/* These macros are used in _mm_cmpXstrm() to specify the return: a
   zero-extended bit mask or a mask expanded to 16 bytes (bit [6]).  */
#define _SIDD_BIT_MASK                  0x00
#define _SIDD_UNIT_MASK                 0x40
1605168404Spjd
1606168404Spjd/* SSE4.2 Packed Comparison Intrinsics.  */
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
///
/// The whole expansion is parenthesized so that operators applied to the
/// macro invocation act on the converted result, not on the builtin call.
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))
1662168404Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit.
/// \returns Returns an integer representing the result index of the comparison.
///
/// The whole expansion is parenthesized so that operators applied to the
/// macro invocation act on the converted result, not on the builtin call.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))
1716168404Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))
1777228103Smm
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))
1836168404Spjd
1837168404Spjd/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum; otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1887219089Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero; otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is non-zero; otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1936185029Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1984168404Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum; otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum; otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
2034219089Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum; otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum; otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
2084219089Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum; otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2139168404Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero; otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero; otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2193185029Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2246168404Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum; otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum; otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2301168404Spjd
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
/* The whole replacement list is parenthesized so that the expansion is a
   single primary expression wherever the macro is used (e.g. as the operand
   of sizeof). */
#define _mm_cmpestrz(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2355168404Spjd
2356168404Spjd/* SSE4.2 Compare Packed Data -- Greater Than.  */
2357168404Spjd/// \brief Compares each of the corresponding 64-bit values of the 128-bit
2358168404Spjd///    integer vectors to determine if the values in the first operand are
2359168404Spjd///    greater than those in the second operand.
2360168404Spjd///
2361168404Spjd/// \headerfile <x86intrin.h>
2362168404Spjd///
2363168404Spjd/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2364168404Spjd///
2365168404Spjd/// \param __V1
2366168404Spjd///    A 128-bit integer vector.
2367168404Spjd/// \param __V2
2368168404Spjd///    A 128-bit integer vector.
2369168404Spjd/// \returns A 128-bit integer vector containing the comparison results.
2370168404Spjdstatic __inline__ __m128i __DEFAULT_FN_ATTRS
2371168404Spjd_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
2372168404Spjd{
2373168404Spjd  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2374168404Spjd}
2375168404Spjd
2376168404Spjd/* SSE4.2 Accumulate CRC32.  */
2377168404Spjd/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
2378168404Spjd///    unsigned char operand.
2379168404Spjd///
2380168404Spjd/// \headerfile <x86intrin.h>
2381168404Spjd///
2382168404Spjd/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
2383168404Spjd///
2384168404Spjd/// \param __C
2385168404Spjd///    An unsigned integer operand to add to the CRC-32C checksum of operand
2386168404Spjd///    \a  __D.
2387168404Spjd/// \param __D
2388168404Spjd///    An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
2389168404Spjd/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2390168404Spjd///    operand \a __D.
2391168404Spjdstatic __inline__ unsigned int __DEFAULT_FN_ATTRS
2392168404Spjd_mm_crc32_u8(unsigned int __C, unsigned char __D)
2393168404Spjd{
2394168404Spjd  return __builtin_ia32_crc32qi(__C, __D);
2395168404Spjd}
2396168404Spjd
2397168404Spjd/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
2398168404Spjd///    unsigned short operand.
2399168404Spjd///
2400168404Spjd/// \headerfile <x86intrin.h>
2401168404Spjd///
2402168404Spjd/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
2403227497Smm///
2404219089Spjd/// \param __C
2405227497Smm///    An unsigned integer operand to add to the CRC-32C checksum of operand
2406227497Smm///    \a __D.
2407227497Smm/// \param __D
2408227497Smm///    An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
2409227497Smm/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2410227497Smm///    operand \a __D.
2411227497Smmstatic __inline__ unsigned int __DEFAULT_FN_ATTRS
2412219089Spjd_mm_crc32_u16(unsigned int __C, unsigned short __D)
2413219089Spjd{
2414219089Spjd  return __builtin_ia32_crc32hi(__C, __D);
2415168404Spjd}
2416168404Spjd
2417168404Spjd/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of
2418168404Spjd///    the second unsigned integer operand.
2419185029Spjd///
2420185029Spjd/// \headerfile <x86intrin.h>
2421227497Smm///
2422227497Smm/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
2423227497Smm///
2424227497Smm/// \param __C
2425227497Smm///    An unsigned integer operand to add to the CRC-32C checksum of operand
2426227497Smm///    \a __D.
2427227497Smm/// \param __D
2428227497Smm///    An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
2429227497Smm/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2430227497Smm///    operand \a __D.
2431227497Smmstatic __inline__ unsigned int __DEFAULT_FN_ATTRS
2432227497Smm_mm_crc32_u32(unsigned int __C, unsigned int __D)
2433227497Smm{
2434227497Smm  return __builtin_ia32_crc32si(__C, __D);
2435227497Smm}
2436227497Smm
2437227497Smm#ifdef __x86_64__
2438227497Smm/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
2439227497Smm///    unsigned 64-bit integer operand.
2440227497Smm///
2441227497Smm/// \headerfile <x86intrin.h>
2442227497Smm///
2443227497Smm/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
2444227497Smm///
2445227497Smm/// \param __C
2446185029Spjd///    An unsigned integer operand to add to the CRC-32C checksum of operand
2447185029Spjd///    \a __D.
2448185029Spjd/// \param __D
2449185029Spjd///    An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
2450185029Spjd/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2451185029Spjd///    operand \a __D.
2452185029Spjdstatic __inline__ unsigned long long __DEFAULT_FN_ATTRS
2453185029Spjd_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
2454185029Spjd{
2455185029Spjd  return __builtin_ia32_crc32di(__C, __D);
2456185029Spjd}
2457185029Spjd#endif /* __x86_64__ */
2458185029Spjd
/* The default function attributes are private to this header; do not let the
   macro leak into files that include it. */
#undef __DEFAULT_FN_ATTRS

/* Also provide the POPCNT intrinsics when __POPCNT__ is defined. */
#if defined(__POPCNT__)
#include <popcntintrin.h>
#endif /* __POPCNT__ */
2464185029Spjd
2465185029Spjd#endif /* _SMMINTRIN_H */
2466185029Spjd