1/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __TMMINTRIN_H
11#define __TMMINTRIN_H
12
13#include <pmmintrin.h>
14
/* Default attribute sets for the intrinsics defined in this header:
 *   __DEFAULT_FN_ATTRS     - applied to the functions operating on __m128i.
 *   __DEFAULT_FN_ATTRS_MMX - applied to the functions operating on __m64;
 *                            its target string additionally names "mmx".
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3"),          \
                 __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"),      \
                 __min_vector_width__(64)))
18
19/// Computes the absolute value of each of the packed 8-bit signed
20///    integers in the source operand and stores the 8-bit unsigned integer
21///    results in the destination.
22///
23/// \headerfile <x86intrin.h>
24///
25/// This intrinsic corresponds to the \c PABSB instruction.
26///
27/// \param __a
28///    A 64-bit vector of [8 x i8].
29/// \returns A 64-bit integer vector containing the absolute values of the
30///    elements in the operand.
31static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
32_mm_abs_pi8(__m64 __a)
33{
34    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
35}
36
37/// Computes the absolute value of each of the packed 8-bit signed
38///    integers in the source operand and stores the 8-bit unsigned integer
39///    results in the destination.
40///
41/// \headerfile <x86intrin.h>
42///
43/// This intrinsic corresponds to the \c VPABSB instruction.
44///
45/// \param __a
46///    A 128-bit vector of [16 x i8].
47/// \returns A 128-bit integer vector containing the absolute values of the
48///    elements in the operand.
49static __inline__ __m128i __DEFAULT_FN_ATTRS
50_mm_abs_epi8(__m128i __a)
51{
52    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
53}
54
55/// Computes the absolute value of each of the packed 16-bit signed
56///    integers in the source operand and stores the 16-bit unsigned integer
57///    results in the destination.
58///
59/// \headerfile <x86intrin.h>
60///
61/// This intrinsic corresponds to the \c PABSW instruction.
62///
63/// \param __a
64///    A 64-bit vector of [4 x i16].
65/// \returns A 64-bit integer vector containing the absolute values of the
66///    elements in the operand.
67static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
68_mm_abs_pi16(__m64 __a)
69{
70    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
71}
72
73/// Computes the absolute value of each of the packed 16-bit signed
74///    integers in the source operand and stores the 16-bit unsigned integer
75///    results in the destination.
76///
77/// \headerfile <x86intrin.h>
78///
79/// This intrinsic corresponds to the \c VPABSW instruction.
80///
81/// \param __a
82///    A 128-bit vector of [8 x i16].
83/// \returns A 128-bit integer vector containing the absolute values of the
84///    elements in the operand.
85static __inline__ __m128i __DEFAULT_FN_ATTRS
86_mm_abs_epi16(__m128i __a)
87{
88    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
89}
90
91/// Computes the absolute value of each of the packed 32-bit signed
92///    integers in the source operand and stores the 32-bit unsigned integer
93///    results in the destination.
94///
95/// \headerfile <x86intrin.h>
96///
97/// This intrinsic corresponds to the \c PABSD instruction.
98///
99/// \param __a
100///    A 64-bit vector of [2 x i32].
101/// \returns A 64-bit integer vector containing the absolute values of the
102///    elements in the operand.
103static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
104_mm_abs_pi32(__m64 __a)
105{
106    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
107}
108
109/// Computes the absolute value of each of the packed 32-bit signed
110///    integers in the source operand and stores the 32-bit unsigned integer
111///    results in the destination.
112///
113/// \headerfile <x86intrin.h>
114///
115/// This intrinsic corresponds to the \c VPABSD instruction.
116///
117/// \param __a
118///    A 128-bit vector of [4 x i32].
119/// \returns A 128-bit integer vector containing the absolute values of the
120///    elements in the operand.
121static __inline__ __m128i __DEFAULT_FN_ATTRS
122_mm_abs_epi32(__m128i __a)
123{
124    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
125}
126
/// Concatenates the two 128-bit integer vector operands, and
///    right-shifts the result by the number of bytes specified in the immediate
///    operand.
///
///    The expansion is wrapped in parentheses so that it behaves as a single
///    primary expression: operators written after the macro invocation apply
///    to the \c __m128i result rather than to the inner builtin call.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param b
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_epi8(a, b, n) \
  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n)))
150
/// Concatenates the two 64-bit integer vector operands, and right-shifts
///    the result by the number of bytes specified in the immediate operand.
///
///    The expansion is wrapped in parentheses so that it behaves as a single
///    primary expression: operators written after the macro invocation apply
///    to the \c __m64 result rather than to the inner builtin call.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param b
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
172
173/// Horizontally adds the adjacent pairs of values contained in 2 packed
174///    128-bit vectors of [8 x i16].
175///
176/// \headerfile <x86intrin.h>
177///
178/// This intrinsic corresponds to the \c VPHADDW instruction.
179///
180/// \param __a
181///    A 128-bit vector of [8 x i16] containing one of the source operands. The
182///    horizontal sums of the values are stored in the lower bits of the
183///    destination.
184/// \param __b
185///    A 128-bit vector of [8 x i16] containing one of the source operands. The
186///    horizontal sums of the values are stored in the upper bits of the
187///    destination.
188/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
189///    both operands.
190static __inline__ __m128i __DEFAULT_FN_ATTRS
191_mm_hadd_epi16(__m128i __a, __m128i __b)
192{
193    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
194}
195
196/// Horizontally adds the adjacent pairs of values contained in 2 packed
197///    128-bit vectors of [4 x i32].
198///
199/// \headerfile <x86intrin.h>
200///
201/// This intrinsic corresponds to the \c VPHADDD instruction.
202///
203/// \param __a
204///    A 128-bit vector of [4 x i32] containing one of the source operands. The
205///    horizontal sums of the values are stored in the lower bits of the
206///    destination.
207/// \param __b
208///    A 128-bit vector of [4 x i32] containing one of the source operands. The
209///    horizontal sums of the values are stored in the upper bits of the
210///    destination.
211/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
212///    both operands.
213static __inline__ __m128i __DEFAULT_FN_ATTRS
214_mm_hadd_epi32(__m128i __a, __m128i __b)
215{
216    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
217}
218
219/// Horizontally adds the adjacent pairs of values contained in 2 packed
220///    64-bit vectors of [4 x i16].
221///
222/// \headerfile <x86intrin.h>
223///
224/// This intrinsic corresponds to the \c PHADDW instruction.
225///
226/// \param __a
227///    A 64-bit vector of [4 x i16] containing one of the source operands. The
228///    horizontal sums of the values are stored in the lower bits of the
229///    destination.
230/// \param __b
231///    A 64-bit vector of [4 x i16] containing one of the source operands. The
232///    horizontal sums of the values are stored in the upper bits of the
233///    destination.
234/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
235///    operands.
236static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
237_mm_hadd_pi16(__m64 __a, __m64 __b)
238{
239    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
240}
241
242/// Horizontally adds the adjacent pairs of values contained in 2 packed
243///    64-bit vectors of [2 x i32].
244///
245/// \headerfile <x86intrin.h>
246///
247/// This intrinsic corresponds to the \c PHADDD instruction.
248///
249/// \param __a
250///    A 64-bit vector of [2 x i32] containing one of the source operands. The
251///    horizontal sums of the values are stored in the lower bits of the
252///    destination.
253/// \param __b
254///    A 64-bit vector of [2 x i32] containing one of the source operands. The
255///    horizontal sums of the values are stored in the upper bits of the
256///    destination.
257/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
258///    operands.
259static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
260_mm_hadd_pi32(__m64 __a, __m64 __b)
261{
262    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
263}
264
265/// Horizontally adds the adjacent pairs of values contained in 2 packed
266///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
267///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
268///    0x8000.
269///
270/// \headerfile <x86intrin.h>
271///
272/// This intrinsic corresponds to the \c VPHADDSW instruction.
273///
274/// \param __a
275///    A 128-bit vector of [8 x i16] containing one of the source operands. The
276///    horizontal sums of the values are stored in the lower bits of the
277///    destination.
278/// \param __b
279///    A 128-bit vector of [8 x i16] containing one of the source operands. The
280///    horizontal sums of the values are stored in the upper bits of the
281///    destination.
282/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
283///    sums of both operands.
284static __inline__ __m128i __DEFAULT_FN_ATTRS
285_mm_hadds_epi16(__m128i __a, __m128i __b)
286{
287    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
288}
289
290/// Horizontally adds the adjacent pairs of values contained in 2 packed
291///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
292///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
293///    0x8000.
294///
295/// \headerfile <x86intrin.h>
296///
297/// This intrinsic corresponds to the \c PHADDSW instruction.
298///
299/// \param __a
300///    A 64-bit vector of [4 x i16] containing one of the source operands. The
301///    horizontal sums of the values are stored in the lower bits of the
302///    destination.
303/// \param __b
304///    A 64-bit vector of [4 x i16] containing one of the source operands. The
305///    horizontal sums of the values are stored in the upper bits of the
306///    destination.
307/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
308///    sums of both operands.
309static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
310_mm_hadds_pi16(__m64 __a, __m64 __b)
311{
312    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
313}
314
315/// Horizontally subtracts the adjacent pairs of values contained in 2
316///    packed 128-bit vectors of [8 x i16].
317///
318/// \headerfile <x86intrin.h>
319///
320/// This intrinsic corresponds to the \c VPHSUBW instruction.
321///
322/// \param __a
323///    A 128-bit vector of [8 x i16] containing one of the source operands. The
324///    horizontal differences between the values are stored in the lower bits of
325///    the destination.
326/// \param __b
327///    A 128-bit vector of [8 x i16] containing one of the source operands. The
328///    horizontal differences between the values are stored in the upper bits of
329///    the destination.
330/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
331///    of both operands.
332static __inline__ __m128i __DEFAULT_FN_ATTRS
333_mm_hsub_epi16(__m128i __a, __m128i __b)
334{
335    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
336}
337
338/// Horizontally subtracts the adjacent pairs of values contained in 2
339///    packed 128-bit vectors of [4 x i32].
340///
341/// \headerfile <x86intrin.h>
342///
343/// This intrinsic corresponds to the \c VPHSUBD instruction.
344///
345/// \param __a
346///    A 128-bit vector of [4 x i32] containing one of the source operands. The
347///    horizontal differences between the values are stored in the lower bits of
348///    the destination.
349/// \param __b
350///    A 128-bit vector of [4 x i32] containing one of the source operands. The
351///    horizontal differences between the values are stored in the upper bits of
352///    the destination.
353/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
354///    of both operands.
355static __inline__ __m128i __DEFAULT_FN_ATTRS
356_mm_hsub_epi32(__m128i __a, __m128i __b)
357{
358    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
359}
360
361/// Horizontally subtracts the adjacent pairs of values contained in 2
362///    packed 64-bit vectors of [4 x i16].
363///
364/// \headerfile <x86intrin.h>
365///
366/// This intrinsic corresponds to the \c PHSUBW instruction.
367///
368/// \param __a
369///    A 64-bit vector of [4 x i16] containing one of the source operands. The
370///    horizontal differences between the values are stored in the lower bits of
371///    the destination.
372/// \param __b
373///    A 64-bit vector of [4 x i16] containing one of the source operands. The
374///    horizontal differences between the values are stored in the upper bits of
375///    the destination.
376/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
377///    of both operands.
378static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
379_mm_hsub_pi16(__m64 __a, __m64 __b)
380{
381    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
382}
383
384/// Horizontally subtracts the adjacent pairs of values contained in 2
385///    packed 64-bit vectors of [2 x i32].
386///
387/// \headerfile <x86intrin.h>
388///
389/// This intrinsic corresponds to the \c PHSUBD instruction.
390///
391/// \param __a
392///    A 64-bit vector of [2 x i32] containing one of the source operands. The
393///    horizontal differences between the values are stored in the lower bits of
394///    the destination.
395/// \param __b
396///    A 64-bit vector of [2 x i32] containing one of the source operands. The
397///    horizontal differences between the values are stored in the upper bits of
398///    the destination.
399/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
400///    of both operands.
401static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
402_mm_hsub_pi32(__m64 __a, __m64 __b)
403{
404    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
405}
406
407/// Horizontally subtracts the adjacent pairs of values contained in 2
408///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
409///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
410///    saturated to 0x8000.
411///
412/// \headerfile <x86intrin.h>
413///
414/// This intrinsic corresponds to the \c VPHSUBSW instruction.
415///
416/// \param __a
417///    A 128-bit vector of [8 x i16] containing one of the source operands. The
418///    horizontal differences between the values are stored in the lower bits of
419///    the destination.
420/// \param __b
421///    A 128-bit vector of [8 x i16] containing one of the source operands. The
422///    horizontal differences between the values are stored in the upper bits of
423///    the destination.
424/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
425///    differences of both operands.
426static __inline__ __m128i __DEFAULT_FN_ATTRS
427_mm_hsubs_epi16(__m128i __a, __m128i __b)
428{
429    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
430}
431
432/// Horizontally subtracts the adjacent pairs of values contained in 2
433///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
434///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
435///    saturated to 0x8000.
436///
437/// \headerfile <x86intrin.h>
438///
439/// This intrinsic corresponds to the \c PHSUBSW instruction.
440///
441/// \param __a
442///    A 64-bit vector of [4 x i16] containing one of the source operands. The
443///    horizontal differences between the values are stored in the lower bits of
444///    the destination.
445/// \param __b
446///    A 64-bit vector of [4 x i16] containing one of the source operands. The
447///    horizontal differences between the values are stored in the upper bits of
448///    the destination.
449/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
450///    differences of both operands.
451static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
452_mm_hsubs_pi16(__m64 __a, __m64 __b)
453{
454    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
455}
456
457/// Multiplies corresponding pairs of packed 8-bit unsigned integer
458///    values contained in the first source operand and packed 8-bit signed
459///    integer values contained in the second source operand, adds pairs of
460///    contiguous products with signed saturation, and writes the 16-bit sums to
461///    the corresponding bits in the destination.
462///
463///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
464///    both operands are multiplied, and the sum of both results is written to
465///    bits [15:0] of the destination.
466///
467/// \headerfile <x86intrin.h>
468///
469/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
470///
471/// \param __a
472///    A 128-bit integer vector containing the first source operand.
473/// \param __b
474///    A 128-bit integer vector containing the second source operand.
475/// \returns A 128-bit integer vector containing the sums of products of both
476///    operands: \n
477///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
478///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
479///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
480///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
481///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
482///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
483///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
484///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
485static __inline__ __m128i __DEFAULT_FN_ATTRS
486_mm_maddubs_epi16(__m128i __a, __m128i __b)
487{
488    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
489}
490
491/// Multiplies corresponding pairs of packed 8-bit unsigned integer
492///    values contained in the first source operand and packed 8-bit signed
493///    integer values contained in the second source operand, adds pairs of
494///    contiguous products with signed saturation, and writes the 16-bit sums to
495///    the corresponding bits in the destination.
496///
497///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
498///    both operands are multiplied, and the sum of both results is written to
499///    bits [15:0] of the destination.
500///
501/// \headerfile <x86intrin.h>
502///
503/// This intrinsic corresponds to the \c PMADDUBSW instruction.
504///
505/// \param __a
506///    A 64-bit integer vector containing the first source operand.
507/// \param __b
508///    A 64-bit integer vector containing the second source operand.
509/// \returns A 64-bit integer vector containing the sums of products of both
510///    operands: \n
511///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
512///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
513///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
514///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
515static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
516_mm_maddubs_pi16(__m64 __a, __m64 __b)
517{
518    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
519}
520
521/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
522///    products to the 18 most significant bits by right-shifting, rounds the
523///    truncated value by adding 1, and writes bits [16:1] to the destination.
524///
525/// \headerfile <x86intrin.h>
526///
527/// This intrinsic corresponds to the \c VPMULHRSW instruction.
528///
529/// \param __a
530///    A 128-bit vector of [8 x i16] containing one of the source operands.
531/// \param __b
532///    A 128-bit vector of [8 x i16] containing one of the source operands.
533/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
534///    products of both operands.
535static __inline__ __m128i __DEFAULT_FN_ATTRS
536_mm_mulhrs_epi16(__m128i __a, __m128i __b)
537{
538    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
539}
540
541/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
542///    products to the 18 most significant bits by right-shifting, rounds the
543///    truncated value by adding 1, and writes bits [16:1] to the destination.
544///
545/// \headerfile <x86intrin.h>
546///
547/// This intrinsic corresponds to the \c PMULHRSW instruction.
548///
549/// \param __a
550///    A 64-bit vector of [4 x i16] containing one of the source operands.
551/// \param __b
552///    A 64-bit vector of [4 x i16] containing one of the source operands.
553/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
554///    products of both operands.
555static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
556_mm_mulhrs_pi16(__m64 __a, __m64 __b)
557{
558    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
559}
560
561/// Copies the 8-bit integers from a 128-bit integer vector to the
562///    destination or clears 8-bit values in the destination, as specified by
563///    the second source operand.
564///
565/// \headerfile <x86intrin.h>
566///
567/// This intrinsic corresponds to the \c VPSHUFB instruction.
568///
569/// \param __a
570///    A 128-bit integer vector containing the values to be copied.
571/// \param __b
572///    A 128-bit integer vector containing control bytes corresponding to
573///    positions in the destination:
574///    Bit 7: \n
575///    1: Clear the corresponding byte in the destination. \n
576///    0: Copy the selected source byte to the corresponding byte in the
577///    destination. \n
578///    Bits [6:4] Reserved.  \n
579///    Bits [3:0] select the source byte to be copied.
580/// \returns A 128-bit integer vector containing the copied or cleared values.
581static __inline__ __m128i __DEFAULT_FN_ATTRS
582_mm_shuffle_epi8(__m128i __a, __m128i __b)
583{
584    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
585}
586
587/// Copies the 8-bit integers from a 64-bit integer vector to the
588///    destination or clears 8-bit values in the destination, as specified by
589///    the second source operand.
590///
591/// \headerfile <x86intrin.h>
592///
593/// This intrinsic corresponds to the \c PSHUFB instruction.
594///
595/// \param __a
596///    A 64-bit integer vector containing the values to be copied.
597/// \param __b
598///    A 64-bit integer vector containing control bytes corresponding to
599///    positions in the destination:
600///    Bit 7: \n
601///    1: Clear the corresponding byte in the destination. \n
602///    0: Copy the selected source byte to the corresponding byte in the
603///    destination. \n
604///    Bits [3:0] select the source byte to be copied.
605/// \returns A 64-bit integer vector containing the copied or cleared values.
606static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
607_mm_shuffle_pi8(__m64 __a, __m64 __b)
608{
609    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
610}
611
612/// For each 8-bit integer in the first source operand, perform one of
613///    the following actions as specified by the second source operand.
614///
615///    If the byte in the second source is negative, calculate the two's
616///    complement of the corresponding byte in the first source, and write that
617///    value to the destination. If the byte in the second source is positive,
618///    copy the corresponding byte from the first source to the destination. If
619///    the byte in the second source is zero, clear the corresponding byte in
620///    the destination.
621///
622/// \headerfile <x86intrin.h>
623///
624/// This intrinsic corresponds to the \c VPSIGNB instruction.
625///
626/// \param __a
627///    A 128-bit integer vector containing the values to be copied.
628/// \param __b
629///    A 128-bit integer vector containing control bytes corresponding to
630///    positions in the destination.
631/// \returns A 128-bit integer vector containing the resultant values.
632static __inline__ __m128i __DEFAULT_FN_ATTRS
633_mm_sign_epi8(__m128i __a, __m128i __b)
634{
635    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
636}
637
638/// For each 16-bit integer in the first source operand, perform one of
639///    the following actions as specified by the second source operand.
640///
641///    If the word in the second source is negative, calculate the two's
642///    complement of the corresponding word in the first source, and write that
643///    value to the destination. If the word in the second source is positive,
644///    copy the corresponding word from the first source to the destination. If
645///    the word in the second source is zero, clear the corresponding word in
646///    the destination.
647///
648/// \headerfile <x86intrin.h>
649///
650/// This intrinsic corresponds to the \c VPSIGNW instruction.
651///
652/// \param __a
653///    A 128-bit integer vector containing the values to be copied.
654/// \param __b
655///    A 128-bit integer vector containing control words corresponding to
656///    positions in the destination.
657/// \returns A 128-bit integer vector containing the resultant values.
658static __inline__ __m128i __DEFAULT_FN_ATTRS
659_mm_sign_epi16(__m128i __a, __m128i __b)
660{
661    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
662}
663
664/// For each 32-bit integer in the first source operand, perform one of
665///    the following actions as specified by the second source operand.
666///
667///    If the doubleword in the second source is negative, calculate the two's
668///    complement of the corresponding word in the first source, and write that
669///    value to the destination. If the doubleword in the second source is
670///    positive, copy the corresponding word from the first source to the
671///    destination. If the doubleword in the second source is zero, clear the
672///    corresponding word in the destination.
673///
674/// \headerfile <x86intrin.h>
675///
676/// This intrinsic corresponds to the \c VPSIGND instruction.
677///
678/// \param __a
679///    A 128-bit integer vector containing the values to be copied.
680/// \param __b
681///    A 128-bit integer vector containing control doublewords corresponding to
682///    positions in the destination.
683/// \returns A 128-bit integer vector containing the resultant values.
684static __inline__ __m128i __DEFAULT_FN_ATTRS
685_mm_sign_epi32(__m128i __a, __m128i __b)
686{
687    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
688}
689
690/// For each 8-bit integer in the first source operand, perform one of
691///    the following actions as specified by the second source operand.
692///
693///    If the byte in the second source is negative, calculate the two's
694///    complement of the corresponding byte in the first source, and write that
695///    value to the destination. If the byte in the second source is positive,
696///    copy the corresponding byte from the first source to the destination. If
697///    the byte in the second source is zero, clear the corresponding byte in
698///    the destination.
699///
700/// \headerfile <x86intrin.h>
701///
702/// This intrinsic corresponds to the \c PSIGNB instruction.
703///
704/// \param __a
705///    A 64-bit integer vector containing the values to be copied.
706/// \param __b
707///    A 64-bit integer vector containing control bytes corresponding to
708///    positions in the destination.
709/// \returns A 64-bit integer vector containing the resultant values.
710static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
711_mm_sign_pi8(__m64 __a, __m64 __b)
712{
713    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
714}
715
716/// For each 16-bit integer in the first source operand, perform one of
717///    the following actions as specified by the second source operand.
718///
719///    If the word in the second source is negative, calculate the two's
720///    complement of the corresponding word in the first source, and write that
721///    value to the destination. If the word in the second source is positive,
722///    copy the corresponding word from the first source to the destination. If
723///    the word in the second source is zero, clear the corresponding word in
724///    the destination.
725///
726/// \headerfile <x86intrin.h>
727///
728/// This intrinsic corresponds to the \c PSIGNW instruction.
729///
730/// \param __a
731///    A 64-bit integer vector containing the values to be copied.
732/// \param __b
733///    A 64-bit integer vector containing control words corresponding to
734///    positions in the destination.
735/// \returns A 64-bit integer vector containing the resultant values.
736static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
737_mm_sign_pi16(__m64 __a, __m64 __b)
738{
739    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
740}
741
742/// For each 32-bit integer in the first source operand, perform one of
743///    the following actions as specified by the second source operand.
744///
745///    If the doubleword in the second source is negative, calculate the two's
746///    complement of the corresponding doubleword in the first source, and
747///    write that value to the destination. If the doubleword in the second
748///    source is positive, copy the corresponding doubleword from the first
749///    source to the destination. If the doubleword in the second source is
750///    zero, clear the corresponding doubleword in the destination.
751///
752/// \headerfile <x86intrin.h>
753///
754/// This intrinsic corresponds to the \c PSIGND instruction.
755///
756/// \param __a
757///    A 64-bit integer vector containing the values to be copied.
758/// \param __b
759///    A 64-bit integer vector containing two control doublewords corresponding
760///    to positions in the destination.
761/// \returns A 64-bit integer vector containing the resultant values.
762static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
763_mm_sign_pi32(__m64 __a, __m64 __b)
764{
765    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
766}
767
768#undef __DEFAULT_FN_ATTRS
769#undef __DEFAULT_FN_ATTRS_MMX
770
771#endif /* __TMMINTRIN_H */
772