1/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __MMINTRIN_H
11#define __MMINTRIN_H
12
/* Public MMX vector type: 8 bytes wide with 8-byte alignment. */
typedef long long __m64
    __attribute__((__vector_size__(8), __aligned__(8)));

/* Element-typed 8-byte vector types. The intrinsics in this header cast
 * their __m64 arguments to one of these before handing them to a builtin. */
typedef long long __v1di
    __attribute__((__vector_size__(8)));
typedef int __v2si
    __attribute__((__vector_size__(8)));
typedef short __v4hi
    __attribute__((__vector_size__(8)));
typedef char __v8qi
    __attribute__((__vector_size__(8)));
19
/* Attribute set shared by the intrinsics defined in this file: always
 * inlined, no debug info, "mmx" target feature, minimum vector width 64. */
#define __DEFAULT_FN_ATTRS                                                     \
    __attribute__((__always_inline__, __nodebug__, __target__("mmx"),         \
                   __min_vector_width__(64)))
22
/// Empties the MMX state by marking the x87 stack registers as empty.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EMMS </c> instruction.
///
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("mmx")))
_mm_empty(void)
{
    __builtin_ia32_emms();
}
35
36/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
37///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
38///
39/// \headerfile <x86intrin.h>
40///
41/// This intrinsic corresponds to the <c> MOVD </c> instruction.
42///
43/// \param __i
44///    A 32-bit integer value.
45/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
46///    parameter. The upper 32 bits are set to 0.
47static __inline__ __m64 __DEFAULT_FN_ATTRS
48_mm_cvtsi32_si64(int __i)
49{
50    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
51}
52
53/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
54///    signed integer.
55///
56/// \headerfile <x86intrin.h>
57///
58/// This intrinsic corresponds to the <c> MOVD </c> instruction.
59///
60/// \param __m
61///    A 64-bit integer vector.
62/// \returns A 32-bit signed integer value containing the lower 32 bits of the
63///    parameter.
64static __inline__ int __DEFAULT_FN_ATTRS
65_mm_cvtsi64_si32(__m64 __m)
66{
67    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
68}
69
70/// Casts a 64-bit signed integer value into a 64-bit integer vector.
71///
72/// \headerfile <x86intrin.h>
73///
74/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
75///
76/// \param __i
77///    A 64-bit signed integer.
78/// \returns A 64-bit integer vector containing the same bitwise pattern as the
79///    parameter.
80static __inline__ __m64 __DEFAULT_FN_ATTRS
81_mm_cvtsi64_m64(long long __i)
82{
83    return (__m64)__i;
84}
85
86/// Casts a 64-bit integer vector into a 64-bit signed integer value.
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
91///
92/// \param __m
93///    A 64-bit integer vector.
94/// \returns A 64-bit signed integer containing the same bitwise pattern as the
95///    parameter.
96static __inline__ long long __DEFAULT_FN_ATTRS
97_mm_cvtm64_si64(__m64 __m)
98{
99    return (long long)__m;
100}
101
102/// Converts 16-bit signed integers from both 64-bit integer vector
103///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
104///    a 64-bit integer vector of [8 x i8] as the result. Positive values
105///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
106///    are saturated to 0x80.
107///
108/// \headerfile <x86intrin.h>
109///
110/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
111///
112/// \param __m1
113///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
114///    16-bit signed integer and is converted to an 8-bit signed integer with
115///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
116///    Negative values less than 0x80 are saturated to 0x80. The converted
117///    [4 x i8] values are written to the lower 32 bits of the result.
118/// \param __m2
119///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
120///    16-bit signed integer and is converted to an 8-bit signed integer with
121///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
122///    Negative values less than 0x80 are saturated to 0x80. The converted
123///    [4 x i8] values are written to the upper 32 bits of the result.
124/// \returns A 64-bit integer vector of [8 x i8] containing the converted
125///    values.
126static __inline__ __m64 __DEFAULT_FN_ATTRS
127_mm_packs_pi16(__m64 __m1, __m64 __m2)
128{
129    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
130}
131
132/// Converts 32-bit signed integers from both 64-bit integer vector
133///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
134///    a 64-bit integer vector of [4 x i16] as the result. Positive values
135///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
136///    0x8000 are saturated to 0x8000.
137///
138/// \headerfile <x86intrin.h>
139///
140/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
141///
142/// \param __m1
143///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
144///    32-bit signed integer and is converted to a 16-bit signed integer with
145///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
146///    Negative values less than 0x8000 are saturated to 0x8000. The converted
147///    [2 x i16] values are written to the lower 32 bits of the result.
148/// \param __m2
149///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
150///    32-bit signed integer and is converted to a 16-bit signed integer with
151///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
152///    Negative values less than 0x8000 are saturated to 0x8000. The converted
153///    [2 x i16] values are written to the upper 32 bits of the result.
154/// \returns A 64-bit integer vector of [4 x i16] containing the converted
155///    values.
156static __inline__ __m64 __DEFAULT_FN_ATTRS
157_mm_packs_pi32(__m64 __m1, __m64 __m2)
158{
159    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
160}
161
162/// Converts 16-bit signed integers from both 64-bit integer vector
163///    parameters of [4 x i16] into 8-bit unsigned integer values, and
164///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
165///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
166///    to 0.
167///
168/// \headerfile <x86intrin.h>
169///
170/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
171///
172/// \param __m1
173///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
174///    16-bit signed integer and is converted to an 8-bit unsigned integer with
175///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
176///    than 0 are saturated to 0. The converted [4 x i8] values are written to
177///    the lower 32 bits of the result.
178/// \param __m2
179///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
180///    16-bit signed integer and is converted to an 8-bit unsigned integer with
181///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
182///    than 0 are saturated to 0. The converted [4 x i8] values are written to
183///    the upper 32 bits of the result.
184/// \returns A 64-bit integer vector of [8 x i8] containing the converted
185///    values.
186static __inline__ __m64 __DEFAULT_FN_ATTRS
187_mm_packs_pu16(__m64 __m1, __m64 __m2)
188{
189    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
190}
191
192/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
193///    and interleaves them into a 64-bit integer vector of [8 x i8].
194///
195/// \headerfile <x86intrin.h>
196///
197/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
198///
199/// \param __m1
200///    A 64-bit integer vector of [8 x i8]. \n
201///    Bits [39:32] are written to bits [7:0] of the result. \n
202///    Bits [47:40] are written to bits [23:16] of the result. \n
203///    Bits [55:48] are written to bits [39:32] of the result. \n
204///    Bits [63:56] are written to bits [55:48] of the result.
205/// \param __m2
206///    A 64-bit integer vector of [8 x i8].
207///    Bits [39:32] are written to bits [15:8] of the result. \n
208///    Bits [47:40] are written to bits [31:24] of the result. \n
209///    Bits [55:48] are written to bits [47:40] of the result. \n
210///    Bits [63:56] are written to bits [63:56] of the result.
211/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
212///    values.
213static __inline__ __m64 __DEFAULT_FN_ATTRS
214_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
215{
216    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
217}
218
219/// Unpacks the upper 32 bits from two 64-bit integer vectors of
220///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
221///
222/// \headerfile <x86intrin.h>
223///
224/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
225///
226/// \param __m1
227///    A 64-bit integer vector of [4 x i16].
228///    Bits [47:32] are written to bits [15:0] of the result. \n
229///    Bits [63:48] are written to bits [47:32] of the result.
230/// \param __m2
231///    A 64-bit integer vector of [4 x i16].
232///    Bits [47:32] are written to bits [31:16] of the result. \n
233///    Bits [63:48] are written to bits [63:48] of the result.
234/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
235///    values.
236static __inline__ __m64 __DEFAULT_FN_ATTRS
237_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
238{
239    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
240}
241
242/// Unpacks the upper 32 bits from two 64-bit integer vectors of
243///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
244///
245/// \headerfile <x86intrin.h>
246///
247/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
248///
249/// \param __m1
250///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
251///    the lower 32 bits of the result.
252/// \param __m2
253///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
254///    the upper 32 bits of the result.
255/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
256///    values.
257static __inline__ __m64 __DEFAULT_FN_ATTRS
258_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
259{
260    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
261}
262
263/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
264///    and interleaves them into a 64-bit integer vector of [8 x i8].
265///
266/// \headerfile <x86intrin.h>
267///
268/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
269///
270/// \param __m1
271///    A 64-bit integer vector of [8 x i8].
272///    Bits [7:0] are written to bits [7:0] of the result. \n
273///    Bits [15:8] are written to bits [23:16] of the result. \n
274///    Bits [23:16] are written to bits [39:32] of the result. \n
275///    Bits [31:24] are written to bits [55:48] of the result.
276/// \param __m2
277///    A 64-bit integer vector of [8 x i8].
278///    Bits [7:0] are written to bits [15:8] of the result. \n
279///    Bits [15:8] are written to bits [31:24] of the result. \n
280///    Bits [23:16] are written to bits [47:40] of the result. \n
281///    Bits [31:24] are written to bits [63:56] of the result.
282/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
283///    values.
284static __inline__ __m64 __DEFAULT_FN_ATTRS
285_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
286{
287    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
288}
289
290/// Unpacks the lower 32 bits from two 64-bit integer vectors of
291///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
292///
293/// \headerfile <x86intrin.h>
294///
295/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
296///
297/// \param __m1
298///    A 64-bit integer vector of [4 x i16].
299///    Bits [15:0] are written to bits [15:0] of the result. \n
300///    Bits [31:16] are written to bits [47:32] of the result.
301/// \param __m2
302///    A 64-bit integer vector of [4 x i16].
303///    Bits [15:0] are written to bits [31:16] of the result. \n
304///    Bits [31:16] are written to bits [63:48] of the result.
305/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
306///    values.
307static __inline__ __m64 __DEFAULT_FN_ATTRS
308_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
309{
310    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
311}
312
313/// Unpacks the lower 32 bits from two 64-bit integer vectors of
314///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
315///
316/// \headerfile <x86intrin.h>
317///
318/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
319///
320/// \param __m1
321///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
322///    the lower 32 bits of the result.
323/// \param __m2
324///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
325///    the upper 32 bits of the result.
326/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
327///    values.
328static __inline__ __m64 __DEFAULT_FN_ATTRS
329_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
330{
331    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
332}
333
334/// Adds each 8-bit integer element of the first 64-bit integer vector
335///    of [8 x i8] to the corresponding 8-bit integer element of the second
336///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
337///    packed into a 64-bit integer vector of [8 x i8].
338///
339/// \headerfile <x86intrin.h>
340///
341/// This intrinsic corresponds to the <c> PADDB </c> instruction.
342///
343/// \param __m1
344///    A 64-bit integer vector of [8 x i8].
345/// \param __m2
346///    A 64-bit integer vector of [8 x i8].
347/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
348///    parameters.
349static __inline__ __m64 __DEFAULT_FN_ATTRS
350_mm_add_pi8(__m64 __m1, __m64 __m2)
351{
352    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
353}
354
355/// Adds each 16-bit integer element of the first 64-bit integer vector
356///    of [4 x i16] to the corresponding 16-bit integer element of the second
357///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
358///    packed into a 64-bit integer vector of [4 x i16].
359///
360/// \headerfile <x86intrin.h>
361///
362/// This intrinsic corresponds to the <c> PADDW </c> instruction.
363///
364/// \param __m1
365///    A 64-bit integer vector of [4 x i16].
366/// \param __m2
367///    A 64-bit integer vector of [4 x i16].
368/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
369///    parameters.
370static __inline__ __m64 __DEFAULT_FN_ATTRS
371_mm_add_pi16(__m64 __m1, __m64 __m2)
372{
373    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
374}
375
376/// Adds each 32-bit integer element of the first 64-bit integer vector
377///    of [2 x i32] to the corresponding 32-bit integer element of the second
378///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
379///    packed into a 64-bit integer vector of [2 x i32].
380///
381/// \headerfile <x86intrin.h>
382///
383/// This intrinsic corresponds to the <c> PADDD </c> instruction.
384///
385/// \param __m1
386///    A 64-bit integer vector of [2 x i32].
387/// \param __m2
388///    A 64-bit integer vector of [2 x i32].
389/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
390///    parameters.
391static __inline__ __m64 __DEFAULT_FN_ATTRS
392_mm_add_pi32(__m64 __m1, __m64 __m2)
393{
394    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
395}
396
397/// Adds each 8-bit signed integer element of the first 64-bit integer
398///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
399///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
400///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
401///    0x80. The results are packed into a 64-bit integer vector of [8 x i8].
402///
403/// \headerfile <x86intrin.h>
404///
405/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
406///
407/// \param __m1
408///    A 64-bit integer vector of [8 x i8].
409/// \param __m2
410///    A 64-bit integer vector of [8 x i8].
411/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
412///    of both parameters.
413static __inline__ __m64 __DEFAULT_FN_ATTRS
414_mm_adds_pi8(__m64 __m1, __m64 __m2)
415{
416    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
417}
418
419/// Adds each 16-bit signed integer element of the first 64-bit integer
420///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
421///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
422///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
423///    saturated to 0x8000. The results are packed into a 64-bit integer vector
424///    of [4 x i16].
425///
426/// \headerfile <x86intrin.h>
427///
428/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
429///
430/// \param __m1
431///    A 64-bit integer vector of [4 x i16].
432/// \param __m2
433///    A 64-bit integer vector of [4 x i16].
434/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
435///    of both parameters.
436static __inline__ __m64 __DEFAULT_FN_ATTRS
437_mm_adds_pi16(__m64 __m1, __m64 __m2)
438{
439    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
440}
441
442/// Adds each 8-bit unsigned integer element of the first 64-bit integer
443///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
444///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
445///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
446///    [8 x i8].
447///
448/// \headerfile <x86intrin.h>
449///
450/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
451///
452/// \param __m1
453///    A 64-bit integer vector of [8 x i8].
454/// \param __m2
455///    A 64-bit integer vector of [8 x i8].
456/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
457///    unsigned sums of both parameters.
458static __inline__ __m64 __DEFAULT_FN_ATTRS
459_mm_adds_pu8(__m64 __m1, __m64 __m2)
460{
461    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
462}
463
464/// Adds each 16-bit unsigned integer element of the first 64-bit integer
465///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
466///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
467///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
468///    integer vector of [4 x i16].
469///
470/// \headerfile <x86intrin.h>
471///
472/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
473///
474/// \param __m1
475///    A 64-bit integer vector of [4 x i16].
476/// \param __m2
477///    A 64-bit integer vector of [4 x i16].
478/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
479///    unsigned sums of both parameters.
480static __inline__ __m64 __DEFAULT_FN_ATTRS
481_mm_adds_pu16(__m64 __m1, __m64 __m2)
482{
483    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
484}
485
486/// Subtracts each 8-bit integer element of the second 64-bit integer
487///    vector of [8 x i8] from the corresponding 8-bit integer element of the
488///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
489///    are packed into a 64-bit integer vector of [8 x i8].
490///
491/// \headerfile <x86intrin.h>
492///
493/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
494///
495/// \param __m1
496///    A 64-bit integer vector of [8 x i8] containing the minuends.
497/// \param __m2
498///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
499/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
500///    both parameters.
501static __inline__ __m64 __DEFAULT_FN_ATTRS
502_mm_sub_pi8(__m64 __m1, __m64 __m2)
503{
504    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
505}
506
507/// Subtracts each 16-bit integer element of the second 64-bit integer
508///    vector of [4 x i16] from the corresponding 16-bit integer element of the
509///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
510///    results are packed into a 64-bit integer vector of [4 x i16].
511///
512/// \headerfile <x86intrin.h>
513///
514/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
515///
516/// \param __m1
517///    A 64-bit integer vector of [4 x i16] containing the minuends.
518/// \param __m2
519///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
520/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
521///    both parameters.
522static __inline__ __m64 __DEFAULT_FN_ATTRS
523_mm_sub_pi16(__m64 __m1, __m64 __m2)
524{
525    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
526}
527
528/// Subtracts each 32-bit integer element of the second 64-bit integer
529///    vector of [2 x i32] from the corresponding 32-bit integer element of the
530///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
531///    results are packed into a 64-bit integer vector of [2 x i32].
532///
533/// \headerfile <x86intrin.h>
534///
535/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
536///
537/// \param __m1
538///    A 64-bit integer vector of [2 x i32] containing the minuends.
539/// \param __m2
540///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
541/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
542///    both parameters.
543static __inline__ __m64 __DEFAULT_FN_ATTRS
544_mm_sub_pi32(__m64 __m1, __m64 __m2)
545{
546    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
547}
548
549/// Subtracts each 8-bit signed integer element of the second 64-bit
550///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
551///    element of the first 64-bit integer vector of [8 x i8]. Positive results
552///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
553///    are saturated to 0x80. The results are packed into a 64-bit integer
554///    vector of [8 x i8].
555///
556/// \headerfile <x86intrin.h>
557///
558/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
559///
560/// \param __m1
561///    A 64-bit integer vector of [8 x i8] containing the minuends.
562/// \param __m2
563///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
564/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
565///    differences of both parameters.
566static __inline__ __m64 __DEFAULT_FN_ATTRS
567_mm_subs_pi8(__m64 __m1, __m64 __m2)
568{
569    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
570}
571
572/// Subtracts each 16-bit signed integer element of the second 64-bit
573///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
574///    element of the first 64-bit integer vector of [4 x i16]. Positive results
575///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
576///    0x8000 are saturated to 0x8000. The results are packed into a 64-bit
577///    integer vector of [4 x i16].
578///
579/// \headerfile <x86intrin.h>
580///
581/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
582///
583/// \param __m1
584///    A 64-bit integer vector of [4 x i16] containing the minuends.
585/// \param __m2
586///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
587/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
588///    differences of both parameters.
589static __inline__ __m64 __DEFAULT_FN_ATTRS
590_mm_subs_pi16(__m64 __m1, __m64 __m2)
591{
592    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
593}
594
595/// Subtracts each 8-bit unsigned integer element of the second 64-bit
596///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
597///    element of the first 64-bit integer vector of [8 x i8].
598///
599///    If an element of the first vector is less than the corresponding element
600///    of the second vector, the result is saturated to 0. The results are
601///    packed into a 64-bit integer vector of [8 x i8].
602///
603/// \headerfile <x86intrin.h>
604///
605/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
606///
607/// \param __m1
608///    A 64-bit integer vector of [8 x i8] containing the minuends.
609/// \param __m2
610///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
611/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
612///    differences of both parameters.
613static __inline__ __m64 __DEFAULT_FN_ATTRS
614_mm_subs_pu8(__m64 __m1, __m64 __m2)
615{
616    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
617}
618
619/// Subtracts each 16-bit unsigned integer element of the second 64-bit
620///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
621///    integer element of the first 64-bit integer vector of [4 x i16].
622///
623///    If an element of the first vector is less than the corresponding element
624///    of the second vector, the result is saturated to 0. The results are
625///    packed into a 64-bit integer vector of [4 x i16].
626///
627/// \headerfile <x86intrin.h>
628///
629/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
630///
631/// \param __m1
632///    A 64-bit integer vector of [4 x i16] containing the minuends.
633/// \param __m2
634///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
635/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
636///    differences of both parameters.
637static __inline__ __m64 __DEFAULT_FN_ATTRS
638_mm_subs_pu16(__m64 __m1, __m64 __m2)
639{
640    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
641}
642
643/// Multiplies each 16-bit signed integer element of the first 64-bit
644///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
645///    element of the second 64-bit integer vector of [4 x i16] and get four
646///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
647///    The lower 32 bits of these two sums are packed into a 64-bit integer
648///    vector of [2 x i32].
649///
650///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
651///    of both parameters are multiplied, and the sum of both results is written
652///    to bits [31:0] of the result.
653///
654/// \headerfile <x86intrin.h>
655///
656/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
657///
658/// \param __m1
659///    A 64-bit integer vector of [4 x i16].
660/// \param __m2
661///    A 64-bit integer vector of [4 x i16].
662/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
663///    products of both parameters.
664static __inline__ __m64 __DEFAULT_FN_ATTRS
665_mm_madd_pi16(__m64 __m1, __m64 __m2)
666{
667    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
668}
669
670/// Multiplies each 16-bit signed integer element of the first 64-bit
671///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
672///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
673///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
674///
675/// \headerfile <x86intrin.h>
676///
677/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
678///
679/// \param __m1
680///    A 64-bit integer vector of [4 x i16].
681/// \param __m2
682///    A 64-bit integer vector of [4 x i16].
683/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
684///    of the products of both parameters.
685static __inline__ __m64 __DEFAULT_FN_ATTRS
686_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
687{
688    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
689}
690
691/// Multiplies each 16-bit signed integer element of the first 64-bit
692///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
693///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
694///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
695///
696/// \headerfile <x86intrin.h>
697///
698/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
699///
700/// \param __m1
701///    A 64-bit integer vector of [4 x i16].
702/// \param __m2
703///    A 64-bit integer vector of [4 x i16].
704/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
705///    of the products of both parameters.
706static __inline__ __m64 __DEFAULT_FN_ATTRS
707_mm_mullo_pi16(__m64 __m1, __m64 __m2)
708{
709    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
710}
711
712/// Left-shifts each 16-bit signed integer element of the first
713///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
714///    of bits specified by the second parameter, which is a 64-bit integer. The
715///    lower 16 bits of the results are packed into a 64-bit integer vector of
716///    [4 x i16].
717///
718/// \headerfile <x86intrin.h>
719///
720/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
721///
722/// \param __m
723///    A 64-bit integer vector of [4 x i16].
724/// \param __count
725///    A 64-bit integer vector interpreted as a single 64-bit integer.
726/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
727///    values. If \a __count is greater or equal to 16, the result is set to all
728///    0.
729static __inline__ __m64 __DEFAULT_FN_ATTRS
730_mm_sll_pi16(__m64 __m, __m64 __count)
731{
732    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
733}
734
735/// Left-shifts each 16-bit signed integer element of a 64-bit integer
736///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
737///    The lower 16 bits of the results are packed into a 64-bit integer vector
738///    of [4 x i16].
739///
740/// \headerfile <x86intrin.h>
741///
742/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
743///
744/// \param __m
745///    A 64-bit integer vector of [4 x i16].
746/// \param __count
747///    A 32-bit integer value.
748/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
749///    values. If \a __count is greater or equal to 16, the result is set to all
750///    0.
751static __inline__ __m64 __DEFAULT_FN_ATTRS
752_mm_slli_pi16(__m64 __m, int __count)
753{
754    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
755}
756
757/// Left-shifts each 32-bit signed integer element of the first
758///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
759///    of bits specified by the second parameter, which is a 64-bit integer. The
760///    lower 32 bits of the results are packed into a 64-bit integer vector of
761///    [2 x i32].
762///
763/// \headerfile <x86intrin.h>
764///
765/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
766///
767/// \param __m
768///    A 64-bit integer vector of [2 x i32].
769/// \param __count
770///    A 64-bit integer vector interpreted as a single 64-bit integer.
771/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
772///    values. If \a __count is greater or equal to 32, the result is set to all
773///    0.
774static __inline__ __m64 __DEFAULT_FN_ATTRS
775_mm_sll_pi32(__m64 __m, __m64 __count)
776{
777    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
778}
779
780/// Left-shifts each 32-bit signed integer element of a 64-bit integer
781///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
782///    The lower 32 bits of the results are packed into a 64-bit integer vector
783///    of [2 x i32].
784///
785/// \headerfile <x86intrin.h>
786///
787/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
788///
789/// \param __m
790///    A 64-bit integer vector of [2 x i32].
791/// \param __count
792///    A 32-bit integer value.
793/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
794///    values. If \a __count is greater or equal to 32, the result is set to all
795///    0.
796static __inline__ __m64 __DEFAULT_FN_ATTRS
797_mm_slli_pi32(__m64 __m, int __count)
798{
799    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
800}
801
802/// Left-shifts the first 64-bit integer parameter by the number of bits
803///    specified by the second 64-bit integer parameter. The lower 64 bits of
804///    result are returned.
805///
806/// \headerfile <x86intrin.h>
807///
808/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
809///
810/// \param __m
811///    A 64-bit integer vector interpreted as a single 64-bit integer.
812/// \param __count
813///    A 64-bit integer vector interpreted as a single 64-bit integer.
814/// \returns A 64-bit integer vector containing the left-shifted value. If
815///     \a __count is greater or equal to 64, the result is set to 0.
816static __inline__ __m64 __DEFAULT_FN_ATTRS
817_mm_sll_si64(__m64 __m, __m64 __count)
818{
819    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
820}
821
822/// Left-shifts the first parameter, which is a 64-bit integer, by the
823///    number of bits specified by the second parameter, which is a 32-bit
824///    integer. The lower 64 bits of result are returned.
825///
826/// \headerfile <x86intrin.h>
827///
828/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
829///
830/// \param __m
831///    A 64-bit integer vector interpreted as a single 64-bit integer.
832/// \param __count
833///    A 32-bit integer value.
834/// \returns A 64-bit integer vector containing the left-shifted value. If
835///     \a __count is greater or equal to 64, the result is set to 0.
836static __inline__ __m64 __DEFAULT_FN_ATTRS
837_mm_slli_si64(__m64 __m, int __count)
838{
839    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
840}
841
842/// Right-shifts each 16-bit integer element of the first parameter,
843///    which is a 64-bit integer vector of [4 x i16], by the number of bits
844///    specified by the second parameter, which is a 64-bit integer.
845///
846///    High-order bits are filled with the sign bit of the initial value of each
847///    16-bit element. The 16-bit results are packed into a 64-bit integer
848///    vector of [4 x i16].
849///
850/// \headerfile <x86intrin.h>
851///
852/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
853///
854/// \param __m
855///    A 64-bit integer vector of [4 x i16].
856/// \param __count
857///    A 64-bit integer vector interpreted as a single 64-bit integer.
858/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
859///    values.
860static __inline__ __m64 __DEFAULT_FN_ATTRS
861_mm_sra_pi16(__m64 __m, __m64 __count)
862{
863    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
864}
865
866/// Right-shifts each 16-bit integer element of a 64-bit integer vector
867///    of [4 x i16] by the number of bits specified by a 32-bit integer.
868///
869///    High-order bits are filled with the sign bit of the initial value of each
870///    16-bit element. The 16-bit results are packed into a 64-bit integer
871///    vector of [4 x i16].
872///
873/// \headerfile <x86intrin.h>
874///
875/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
876///
877/// \param __m
878///    A 64-bit integer vector of [4 x i16].
879/// \param __count
880///    A 32-bit integer value.
881/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
882///    values.
883static __inline__ __m64 __DEFAULT_FN_ATTRS
884_mm_srai_pi16(__m64 __m, int __count)
885{
886    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
887}
888
889/// Right-shifts each 32-bit integer element of the first parameter,
890///    which is a 64-bit integer vector of [2 x i32], by the number of bits
891///    specified by the second parameter, which is a 64-bit integer.
892///
893///    High-order bits are filled with the sign bit of the initial value of each
894///    32-bit element. The 32-bit results are packed into a 64-bit integer
895///    vector of [2 x i32].
896///
897/// \headerfile <x86intrin.h>
898///
899/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
900///
901/// \param __m
902///    A 64-bit integer vector of [2 x i32].
903/// \param __count
904///    A 64-bit integer vector interpreted as a single 64-bit integer.
905/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
906///    values.
907static __inline__ __m64 __DEFAULT_FN_ATTRS
908_mm_sra_pi32(__m64 __m, __m64 __count)
909{
910    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
911}
912
913/// Right-shifts each 32-bit integer element of a 64-bit integer vector
914///    of [2 x i32] by the number of bits specified by a 32-bit integer.
915///
916///    High-order bits are filled with the sign bit of the initial value of each
917///    32-bit element. The 32-bit results are packed into a 64-bit integer
918///    vector of [2 x i32].
919///
920/// \headerfile <x86intrin.h>
921///
922/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
923///
924/// \param __m
925///    A 64-bit integer vector of [2 x i32].
926/// \param __count
927///    A 32-bit integer value.
928/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
929///    values.
930static __inline__ __m64 __DEFAULT_FN_ATTRS
931_mm_srai_pi32(__m64 __m, int __count)
932{
933    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
934}
935
936/// Right-shifts each 16-bit integer element of the first parameter,
937///    which is a 64-bit integer vector of [4 x i16], by the number of bits
938///    specified by the second parameter, which is a 64-bit integer.
939///
940///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
941///    integer vector of [4 x i16].
942///
943/// \headerfile <x86intrin.h>
944///
945/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
946///
947/// \param __m
948///    A 64-bit integer vector of [4 x i16].
949/// \param __count
950///    A 64-bit integer vector interpreted as a single 64-bit integer.
951/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
952///    values.
953static __inline__ __m64 __DEFAULT_FN_ATTRS
954_mm_srl_pi16(__m64 __m, __m64 __count)
955{
956    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
957}
958
959/// Right-shifts each 16-bit integer element of a 64-bit integer vector
960///    of [4 x i16] by the number of bits specified by a 32-bit integer.
961///
962///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
963///    integer vector of [4 x i16].
964///
965/// \headerfile <x86intrin.h>
966///
967/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
968///
969/// \param __m
970///    A 64-bit integer vector of [4 x i16].
971/// \param __count
972///    A 32-bit integer value.
973/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
974///    values.
975static __inline__ __m64 __DEFAULT_FN_ATTRS
976_mm_srli_pi16(__m64 __m, int __count)
977{
978    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
979}
980
981/// Right-shifts each 32-bit integer element of the first parameter,
982///    which is a 64-bit integer vector of [2 x i32], by the number of bits
983///    specified by the second parameter, which is a 64-bit integer.
984///
985///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
986///    integer vector of [2 x i32].
987///
988/// \headerfile <x86intrin.h>
989///
990/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
991///
992/// \param __m
993///    A 64-bit integer vector of [2 x i32].
994/// \param __count
995///    A 64-bit integer vector interpreted as a single 64-bit integer.
996/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
997///    values.
998static __inline__ __m64 __DEFAULT_FN_ATTRS
999_mm_srl_pi32(__m64 __m, __m64 __count)
1000{
1001    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
1002}
1003
1004/// Right-shifts each 32-bit integer element of a 64-bit integer vector
1005///    of [2 x i32] by the number of bits specified by a 32-bit integer.
1006///
1007///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
1008///    integer vector of [2 x i32].
1009///
1010/// \headerfile <x86intrin.h>
1011///
1012/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1013///
1014/// \param __m
1015///    A 64-bit integer vector of [2 x i32].
1016/// \param __count
1017///    A 32-bit integer value.
1018/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1019///    values.
1020static __inline__ __m64 __DEFAULT_FN_ATTRS
1021_mm_srli_pi32(__m64 __m, int __count)
1022{
1023    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
1024}
1025
1026/// Right-shifts the first 64-bit integer parameter by the number of bits
1027///    specified by the second 64-bit integer parameter.
1028///
1029///    High-order bits are cleared.
1030///
1031/// \headerfile <x86intrin.h>
1032///
1033/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1034///
1035/// \param __m
1036///    A 64-bit integer vector interpreted as a single 64-bit integer.
1037/// \param __count
1038///    A 64-bit integer vector interpreted as a single 64-bit integer.
1039/// \returns A 64-bit integer vector containing the right-shifted value.
1040static __inline__ __m64 __DEFAULT_FN_ATTRS
1041_mm_srl_si64(__m64 __m, __m64 __count)
1042{
1043    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
1044}
1045
1046/// Right-shifts the first parameter, which is a 64-bit integer, by the
1047///    number of bits specified by the second parameter, which is a 32-bit
1048///    integer.
1049///
1050///    High-order bits are cleared.
1051///
1052/// \headerfile <x86intrin.h>
1053///
1054/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1055///
1056/// \param __m
1057///    A 64-bit integer vector interpreted as a single 64-bit integer.
1058/// \param __count
1059///    A 32-bit integer value.
1060/// \returns A 64-bit integer vector containing the right-shifted value.
1061static __inline__ __m64 __DEFAULT_FN_ATTRS
1062_mm_srli_si64(__m64 __m, int __count)
1063{
1064    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
1065}
1066
1067/// Performs a bitwise AND of two 64-bit integer vectors.
1068///
1069/// \headerfile <x86intrin.h>
1070///
1071/// This intrinsic corresponds to the <c> PAND </c> instruction.
1072///
1073/// \param __m1
1074///    A 64-bit integer vector.
1075/// \param __m2
1076///    A 64-bit integer vector.
1077/// \returns A 64-bit integer vector containing the bitwise AND of both
1078///    parameters.
1079static __inline__ __m64 __DEFAULT_FN_ATTRS
1080_mm_and_si64(__m64 __m1, __m64 __m2)
1081{
1082    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
1083}
1084
1085/// Performs a bitwise NOT of the first 64-bit integer vector, and then
1086///    performs a bitwise AND of the intermediate result and the second 64-bit
1087///    integer vector.
1088///
1089/// \headerfile <x86intrin.h>
1090///
1091/// This intrinsic corresponds to the <c> PANDN </c> instruction.
1092///
1093/// \param __m1
1094///    A 64-bit integer vector. The one's complement of this parameter is used
1095///    in the bitwise AND.
1096/// \param __m2
1097///    A 64-bit integer vector.
1098/// \returns A 64-bit integer vector containing the bitwise AND of the second
1099///    parameter and the one's complement of the first parameter.
1100static __inline__ __m64 __DEFAULT_FN_ATTRS
1101_mm_andnot_si64(__m64 __m1, __m64 __m2)
1102{
1103    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
1104}
1105
1106/// Performs a bitwise OR of two 64-bit integer vectors.
1107///
1108/// \headerfile <x86intrin.h>
1109///
1110/// This intrinsic corresponds to the <c> POR </c> instruction.
1111///
1112/// \param __m1
1113///    A 64-bit integer vector.
1114/// \param __m2
1115///    A 64-bit integer vector.
1116/// \returns A 64-bit integer vector containing the bitwise OR of both
1117///    parameters.
1118static __inline__ __m64 __DEFAULT_FN_ATTRS
1119_mm_or_si64(__m64 __m1, __m64 __m2)
1120{
1121    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
1122}
1123
1124/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
1125///
1126/// \headerfile <x86intrin.h>
1127///
1128/// This intrinsic corresponds to the <c> PXOR </c> instruction.
1129///
1130/// \param __m1
1131///    A 64-bit integer vector.
1132/// \param __m2
1133///    A 64-bit integer vector.
1134/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
1135///    parameters.
1136static __inline__ __m64 __DEFAULT_FN_ATTRS
1137_mm_xor_si64(__m64 __m1, __m64 __m2)
1138{
1139    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
1140}
1141
1142/// Compares the 8-bit integer elements of two 64-bit integer vectors of
1143///    [8 x i8] to determine if the element of the first vector is equal to the
1144///    corresponding element of the second vector.
1145///
1146///    The comparison yields 0 for false, 0xFF for true.
1147///
1148/// \headerfile <x86intrin.h>
1149///
1150/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
1151///
1152/// \param __m1
1153///    A 64-bit integer vector of [8 x i8].
1154/// \param __m2
1155///    A 64-bit integer vector of [8 x i8].
1156/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1157///    results.
1158static __inline__ __m64 __DEFAULT_FN_ATTRS
1159_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
1160{
1161    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
1162}
1163
1164/// Compares the 16-bit integer elements of two 64-bit integer vectors of
1165///    [4 x i16] to determine if the element of the first vector is equal to the
1166///    corresponding element of the second vector.
1167///
1168///    The comparison yields 0 for false, 0xFFFF for true.
1169///
1170/// \headerfile <x86intrin.h>
1171///
1172/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
1173///
1174/// \param __m1
1175///    A 64-bit integer vector of [4 x i16].
1176/// \param __m2
1177///    A 64-bit integer vector of [4 x i16].
1178/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1179///    results.
1180static __inline__ __m64 __DEFAULT_FN_ATTRS
1181_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
1182{
1183    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
1184}
1185
1186/// Compares the 32-bit integer elements of two 64-bit integer vectors of
1187///    [2 x i32] to determine if the element of the first vector is equal to the
1188///    corresponding element of the second vector.
1189///
1190///    The comparison yields 0 for false, 0xFFFFFFFF for true.
1191///
1192/// \headerfile <x86intrin.h>
1193///
1194/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
1195///
1196/// \param __m1
1197///    A 64-bit integer vector of [2 x i32].
1198/// \param __m2
1199///    A 64-bit integer vector of [2 x i32].
1200/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1201///    results.
1202static __inline__ __m64 __DEFAULT_FN_ATTRS
1203_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
1204{
1205    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
1206}
1207
1208/// Compares the 8-bit integer elements of two 64-bit integer vectors of
1209///    [8 x i8] to determine if the element of the first vector is greater than
1210///    the corresponding element of the second vector.
1211///
1212///    The comparison yields 0 for false, 0xFF for true.
1213///
1214/// \headerfile <x86intrin.h>
1215///
1216/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
1217///
1218/// \param __m1
1219///    A 64-bit integer vector of [8 x i8].
1220/// \param __m2
1221///    A 64-bit integer vector of [8 x i8].
1222/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1223///    results.
1224static __inline__ __m64 __DEFAULT_FN_ATTRS
1225_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
1226{
1227    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
1228}
1229
1230/// Compares the 16-bit integer elements of two 64-bit integer vectors of
1231///    [4 x i16] to determine if the element of the first vector is greater than
1232///    the corresponding element of the second vector.
1233///
1234///    The comparison yields 0 for false, 0xFFFF for true.
1235///
1236/// \headerfile <x86intrin.h>
1237///
1238/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
1239///
1240/// \param __m1
1241///    A 64-bit integer vector of [4 x i16].
1242/// \param __m2
1243///    A 64-bit integer vector of [4 x i16].
1244/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1245///    results.
1246static __inline__ __m64 __DEFAULT_FN_ATTRS
1247_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
1248{
1249    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
1250}
1251
1252/// Compares the 32-bit integer elements of two 64-bit integer vectors of
1253///    [2 x i32] to determine if the element of the first vector is greater than
1254///    the corresponding element of the second vector.
1255///
1256///    The comparison yields 0 for false, 0xFFFFFFFF for true.
1257///
1258/// \headerfile <x86intrin.h>
1259///
1260/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
1261///
1262/// \param __m1
1263///    A 64-bit integer vector of [2 x i32].
1264/// \param __m2
1265///    A 64-bit integer vector of [2 x i32].
1266/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1267///    results.
1268static __inline__ __m64 __DEFAULT_FN_ATTRS
1269_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
1270{
1271    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
1272}
1273
1274/// Constructs a 64-bit integer vector initialized to zero.
1275///
1276/// \headerfile <x86intrin.h>
1277///
1278/// This intrinsic corresponds to the <c> PXOR </c> instruction.
1279///
1280/// \returns An initialized 64-bit integer vector with all elements set to zero.
1281static __inline__ __m64 __DEFAULT_FN_ATTRS
1282_mm_setzero_si64(void)
1283{
1284    return __extension__ (__m64){ 0LL };
1285}
1286
1287/// Constructs a 64-bit integer vector initialized with the specified
1288///    32-bit integer values.
1289///
1290/// \headerfile <x86intrin.h>
1291///
1292/// This intrinsic is a utility function and does not correspond to a specific
1293///    instruction.
1294///
1295/// \param __i1
1296///    A 32-bit integer value used to initialize the upper 32 bits of the
1297///    result.
1298/// \param __i0
1299///    A 32-bit integer value used to initialize the lower 32 bits of the
1300///    result.
1301/// \returns An initialized 64-bit integer vector.
1302static __inline__ __m64 __DEFAULT_FN_ATTRS
1303_mm_set_pi32(int __i1, int __i0)
1304{
1305    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
1306}
1307
1308/// Constructs a 64-bit integer vector initialized with the specified
1309///    16-bit integer values.
1310///
1311/// \headerfile <x86intrin.h>
1312///
1313/// This intrinsic is a utility function and does not correspond to a specific
1314///    instruction.
1315///
1316/// \param __s3
1317///    A 16-bit integer value used to initialize bits [63:48] of the result.
1318/// \param __s2
1319///    A 16-bit integer value used to initialize bits [47:32] of the result.
1320/// \param __s1
1321///    A 16-bit integer value used to initialize bits [31:16] of the result.
1322/// \param __s0
1323///    A 16-bit integer value used to initialize bits [15:0] of the result.
1324/// \returns An initialized 64-bit integer vector.
1325static __inline__ __m64 __DEFAULT_FN_ATTRS
1326_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
1327{
1328    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
1329}
1330
1331/// Constructs a 64-bit integer vector initialized with the specified
1332///    8-bit integer values.
1333///
1334/// \headerfile <x86intrin.h>
1335///
1336/// This intrinsic is a utility function and does not correspond to a specific
1337///    instruction.
1338///
1339/// \param __b7
1340///    An 8-bit integer value used to initialize bits [63:56] of the result.
1341/// \param __b6
1342///    An 8-bit integer value used to initialize bits [55:48] of the result.
1343/// \param __b5
1344///    An 8-bit integer value used to initialize bits [47:40] of the result.
1345/// \param __b4
1346///    An 8-bit integer value used to initialize bits [39:32] of the result.
1347/// \param __b3
1348///    An 8-bit integer value used to initialize bits [31:24] of the result.
1349/// \param __b2
1350///    An 8-bit integer value used to initialize bits [23:16] of the result.
1351/// \param __b1
1352///    An 8-bit integer value used to initialize bits [15:8] of the result.
1353/// \param __b0
1354///    An 8-bit integer value used to initialize bits [7:0] of the result.
1355/// \returns An initialized 64-bit integer vector.
1356static __inline__ __m64 __DEFAULT_FN_ATTRS
1357_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
1358            char __b1, char __b0)
1359{
1360    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
1361                                               __b4, __b5, __b6, __b7);
1362}
1363
1364/// Constructs a 64-bit integer vector of [2 x i32], with each of the
1365///    32-bit integer vector elements set to the specified 32-bit integer
1366///    value.
1367///
1368/// \headerfile <x86intrin.h>
1369///
1370/// This intrinsic is a utility function and does not correspond to a specific
1371///    instruction.
1372///
1373/// \param __i
1374///    A 32-bit integer value used to initialize each vector element of the
1375///    result.
1376/// \returns An initialized 64-bit integer vector of [2 x i32].
1377static __inline__ __m64 __DEFAULT_FN_ATTRS
1378_mm_set1_pi32(int __i)
1379{
1380    return _mm_set_pi32(__i, __i);
1381}
1382
1383/// Constructs a 64-bit integer vector of [4 x i16], with each of the
1384///    16-bit integer vector elements set to the specified 16-bit integer
1385///    value.
1386///
1387/// \headerfile <x86intrin.h>
1388///
1389/// This intrinsic is a utility function and does not correspond to a specific
1390///    instruction.
1391///
1392/// \param __w
1393///    A 16-bit integer value used to initialize each vector element of the
1394///    result.
1395/// \returns An initialized 64-bit integer vector of [4 x i16].
1396static __inline__ __m64 __DEFAULT_FN_ATTRS
1397_mm_set1_pi16(short __w)
1398{
1399    return _mm_set_pi16(__w, __w, __w, __w);
1400}
1401
1402/// Constructs a 64-bit integer vector of [8 x i8], with each of the
1403///    8-bit integer vector elements set to the specified 8-bit integer value.
1404///
1405/// \headerfile <x86intrin.h>
1406///
1407/// This intrinsic is a utility function and does not correspond to a specific
1408///    instruction.
1409///
1410/// \param __b
1411///    An 8-bit integer value used to initialize each vector element of the
1412///    result.
1413/// \returns An initialized 64-bit integer vector of [8 x i8].
1414static __inline__ __m64 __DEFAULT_FN_ATTRS
1415_mm_set1_pi8(char __b)
1416{
1417    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
1418}
1419
1420/// Constructs a 64-bit integer vector, initialized in reverse order with
1421///    the specified 32-bit integer values.
1422///
1423/// \headerfile <x86intrin.h>
1424///
1425/// This intrinsic is a utility function and does not correspond to a specific
1426///    instruction.
1427///
1428/// \param __i0
1429///    A 32-bit integer value used to initialize the lower 32 bits of the
1430///    result.
1431/// \param __i1
1432///    A 32-bit integer value used to initialize the upper 32 bits of the
1433///    result.
1434/// \returns An initialized 64-bit integer vector.
1435static __inline__ __m64 __DEFAULT_FN_ATTRS
1436_mm_setr_pi32(int __i0, int __i1)
1437{
1438    return _mm_set_pi32(__i1, __i0);
1439}
1440
1441/// Constructs a 64-bit integer vector, initialized in reverse order with
1442///    the specified 16-bit integer values.
1443///
1444/// \headerfile <x86intrin.h>
1445///
1446/// This intrinsic is a utility function and does not correspond to a specific
1447///    instruction.
1448///
1449/// \param __w0
1450///    A 16-bit integer value used to initialize bits [15:0] of the result.
1451/// \param __w1
1452///    A 16-bit integer value used to initialize bits [31:16] of the result.
1453/// \param __w2
1454///    A 16-bit integer value used to initialize bits [47:32] of the result.
1455/// \param __w3
1456///    A 16-bit integer value used to initialize bits [63:48] of the result.
1457/// \returns An initialized 64-bit integer vector.
1458static __inline__ __m64 __DEFAULT_FN_ATTRS
1459_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
1460{
1461    return _mm_set_pi16(__w3, __w2, __w1, __w0);
1462}
1463
1464/// Constructs a 64-bit integer vector, initialized in reverse order with
1465///    the specified 8-bit integer values.
1466///
1467/// \headerfile <x86intrin.h>
1468///
1469/// This intrinsic is a utility function and does not correspond to a specific
1470///    instruction.
1471///
1472/// \param __b0
1473///    An 8-bit integer value used to initialize bits [7:0] of the result.
1474/// \param __b1
1475///    An 8-bit integer value used to initialize bits [15:8] of the result.
1476/// \param __b2
1477///    An 8-bit integer value used to initialize bits [23:16] of the result.
1478/// \param __b3
1479///    An 8-bit integer value used to initialize bits [31:24] of the result.
1480/// \param __b4
1481///    An 8-bit integer value used to initialize bits [39:32] of the result.
1482/// \param __b5
1483///    An 8-bit integer value used to initialize bits [47:40] of the result.
1484/// \param __b6
1485///    An 8-bit integer value used to initialize bits [55:48] of the result.
1486/// \param __b7
1487///    An 8-bit integer value used to initialize bits [63:56] of the result.
1488/// \returns An initialized 64-bit integer vector.
1489static __inline__ __m64 __DEFAULT_FN_ATTRS
1490_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
1491             char __b6, char __b7)
1492{
1493    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1494}
1495
1496#undef __DEFAULT_FN_ATTRS
1497
/* Aliases for compatibility.
 *
 * Each _m_* name below is an object-like macro that expands to the _mm_* name
 * on its right, so either spelling can be used to call the same intrinsic.
 * The alias names follow the instruction-style mnemonics (for example
 * _m_psllw for _mm_sll_pi16).
 */

/* State management and scalar conversions. */
#define _m_empty _mm_empty
#define _m_from_int _mm_cvtsi32_si64
#define _m_from_int64 _mm_cvtsi64_m64
#define _m_to_int _mm_cvtsi64_si32
#define _m_to_int64 _mm_cvtm64_si64
/* Pack and unpack. */
#define _m_packsswb _mm_packs_pi16
#define _m_packssdw _mm_packs_pi32
#define _m_packuswb _mm_packs_pu16
#define _m_punpckhbw _mm_unpackhi_pi8
#define _m_punpckhwd _mm_unpackhi_pi16
#define _m_punpckhdq _mm_unpackhi_pi32
#define _m_punpcklbw _mm_unpacklo_pi8
#define _m_punpcklwd _mm_unpacklo_pi16
#define _m_punpckldq _mm_unpacklo_pi32
/* Addition. */
#define _m_paddb _mm_add_pi8
#define _m_paddw _mm_add_pi16
#define _m_paddd _mm_add_pi32
#define _m_paddsb _mm_adds_pi8
#define _m_paddsw _mm_adds_pi16
#define _m_paddusb _mm_adds_pu8
#define _m_paddusw _mm_adds_pu16
/* Subtraction. */
#define _m_psubb _mm_sub_pi8
#define _m_psubw _mm_sub_pi16
#define _m_psubd _mm_sub_pi32
#define _m_psubsb _mm_subs_pi8
#define _m_psubsw _mm_subs_pi16
#define _m_psubusb _mm_subs_pu8
#define _m_psubusw _mm_subs_pu16
/* Multiplication. */
#define _m_pmaddwd _mm_madd_pi16
#define _m_pmulhw _mm_mulhi_pi16
#define _m_pmullw _mm_mullo_pi16
/* Shifts: the aliases ending in "i" map to the variants taking an int count. */
#define _m_psllw _mm_sll_pi16
#define _m_psllwi _mm_slli_pi16
#define _m_pslld _mm_sll_pi32
#define _m_pslldi _mm_slli_pi32
#define _m_psllq _mm_sll_si64
#define _m_psllqi _mm_slli_si64
#define _m_psraw _mm_sra_pi16
#define _m_psrawi _mm_srai_pi16
#define _m_psrad _mm_sra_pi32
#define _m_psradi _mm_srai_pi32
#define _m_psrlw _mm_srl_pi16
#define _m_psrlwi _mm_srli_pi16
#define _m_psrld _mm_srl_pi32
#define _m_psrldi _mm_srli_pi32
#define _m_psrlq _mm_srl_si64
#define _m_psrlqi _mm_srli_si64
/* Bitwise logic. */
#define _m_pand _mm_and_si64
#define _m_pandn _mm_andnot_si64
#define _m_por _mm_or_si64
#define _m_pxor _mm_xor_si64
/* Comparisons. */
#define _m_pcmpeqb _mm_cmpeq_pi8
#define _m_pcmpeqw _mm_cmpeq_pi16
#define _m_pcmpeqd _mm_cmpeq_pi32
#define _m_pcmpgtb _mm_cmpgt_pi8
#define _m_pcmpgtw _mm_cmpgt_pi16
#define _m_pcmpgtd _mm_cmpgt_pi32
1556
1557#endif /* __MMINTRIN_H */
1558
1559