/* tmmintrin.h revision 309124 */
/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

2432516Sgibbs#ifndef __TMMINTRIN_H
2532516Sgibbs#define __TMMINTRIN_H
2650477Speter
2732516Sgibbs#include <pmmintrin.h>
2832516Sgibbs
/* Default attributes for every intrinsic function in this file: force
 * inlining, omit debug info for the wrapper, and compile the body with the
 * SSSE3 target feature enabled. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))

3267551Sjhb/// \brief Computes the absolute value of each of the packed 8-bit signed
3367551Sjhb///    integers in the source operand and stores the 8-bit unsigned integer
34112346Smux///    results in the destination.
3576827Salfred///
3679224Sdillon/// \headerfile <x86intrin.h>
3776827Salfred///
38104486Ssam/// This intrinsic corresponds to the \c PABSB instruction.
39104486Ssam///
4032516Sgibbs/// \param __a
4132516Sgibbs///    A 64-bit vector of [8 x i8].
4232516Sgibbs/// \returns A 64-bit integer vector containing the absolute values of the
43104486Ssam///    elements in the operand.
4432516Sgibbsstatic __inline__ __m64 __DEFAULT_FN_ATTRS
45112436Smux_mm_abs_pi8(__m64 __a)
4632516Sgibbs{
4732516Sgibbs    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
4832516Sgibbs}
49113228Sjake
5032516Sgibbs/// \brief Computes the absolute value of each of the packed 8-bit signed
5132516Sgibbs///    integers in the source operand and stores the 8-bit unsigned integer
5232516Sgibbs///    results in the destination.
5335767Sgibbs///
5432516Sgibbs/// \headerfile <x86intrin.h>
5532516Sgibbs///
5632516Sgibbs/// This intrinsic corresponds to the \c VPABSB instruction.
5732516Sgibbs///
5832516Sgibbs/// \param __a
5932516Sgibbs///    A 128-bit vector of [16 x i8].
6035767Sgibbs/// \returns A 128-bit integer vector containing the absolute values of the
6132516Sgibbs///    elements in the operand.
6232516Sgibbsstatic __inline__ __m128i __DEFAULT_FN_ATTRS
6332516Sgibbs_mm_abs_epi8(__m128i __a)
6432516Sgibbs{
6532516Sgibbs    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
6632516Sgibbs}
6732516Sgibbs
6832516Sgibbs/// \brief Computes the absolute value of each of the packed 16-bit signed
6932516Sgibbs///    integers in the source operand and stores the 16-bit unsigned integer
7032516Sgibbs///    results in the destination.
7132516Sgibbs///
7260938Sjake/// \headerfile <x86intrin.h>
7332516Sgibbs///
7432516Sgibbs/// This intrinsic corresponds to the \c PABSW instruction.
7532516Sgibbs///
7632516Sgibbs/// \param __a
7760938Sjake///    A 64-bit vector of [4 x i16].
7832516Sgibbs/// \returns A 64-bit integer vector containing the absolute values of the
7932516Sgibbs///    elements in the operand.
8032516Sgibbsstatic __inline__ __m64 __DEFAULT_FN_ATTRS
8132516Sgibbs_mm_abs_pi16(__m64 __a)
8232516Sgibbs{
8332516Sgibbs    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
8432516Sgibbs}
8532516Sgibbs
8632516Sgibbs/// \brief Computes the absolute value of each of the packed 16-bit signed
8732516Sgibbs///    integers in the source operand and stores the 16-bit unsigned integer
8832516Sgibbs///    results in the destination.
8932516Sgibbs///
9032516Sgibbs/// \headerfile <x86intrin.h>
9132516Sgibbs///
9232516Sgibbs/// This intrinsic corresponds to the \c VPABSW instruction.
9360938Sjake///
9432516Sgibbs/// \param __a
9532516Sgibbs///    A 128-bit vector of [8 x i16].
9660938Sjake/// \returns A 128-bit integer vector containing the absolute values of the
9760938Sjake///    elements in the operand.
9832516Sgibbsstatic __inline__ __m128i __DEFAULT_FN_ATTRS
9932516Sgibbs_mm_abs_epi16(__m128i __a)
100112346Smux{
10132516Sgibbs    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
102113228Sjake}
103113228Sjake
104112569Sjake/// \brief Computes the absolute value of each of the packed 32-bit signed
10532516Sgibbs///    integers in the source operand and stores the 32-bit unsigned integer
10632516Sgibbs///    results in the destination.
10732516Sgibbs///
10832516Sgibbs/// \headerfile <x86intrin.h>
109112346Smux///
110112346Smux/// This intrinsic corresponds to the \c PABSD instruction.
111112346Smux///
11295076Salfred/// \param __a
11395076Salfred///    A 64-bit vector of [2 x i32].
11495076Salfred/// \returns A 64-bit integer vector containing the absolute values of the
11595076Salfred///    elements in the operand.
11695076Salfredstatic __inline__ __m64 __DEFAULT_FN_ATTRS
11795076Salfred_mm_abs_pi32(__m64 __a)
11895076Salfred{
11995076Salfred    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
12032516Sgibbs}
12132516Sgibbs
12232516Sgibbs/// \brief Computes the absolute value of each of the packed 32-bit signed
12332516Sgibbs///    integers in the source operand and stores the 32-bit unsigned integer
12432516Sgibbs///    results in the destination.
12532516Sgibbs///
12632516Sgibbs/// \headerfile <x86intrin.h>
12732516Sgibbs///
12832516Sgibbs/// This intrinsic corresponds to the \c VPABSD instruction.
12932516Sgibbs///
13032516Sgibbs/// \param __a
13132516Sgibbs///    A 128-bit vector of [4 x i32].
13232516Sgibbs/// \returns A 128-bit integer vector containing the absolute values of the
13332516Sgibbs///    elements in the operand.
13432516Sgibbsstatic __inline__ __m128i __DEFAULT_FN_ATTRS
13532516Sgibbs_mm_abs_epi32(__m128i __a)
13632516Sgibbs{
13732516Sgibbs    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
13835767Sgibbs}
13932516Sgibbs
/// \brief Concatenates the two 128-bit integer vector operands, and
///    right-shifts the result by the number of bytes specified in the immediate
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR / PALIGNR instruction.
///
/// \param a
///    A 128-bit vector of [16 x i8] containing one of the source operands;
///    it forms the upper half of the concatenated value.
/// \param b
///    A 128-bit vector of [16 x i8] containing one of the source operands;
///    it forms the lower half of the concatenated value.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_epi8(a, b, n) \
  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n)))

/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
///    the result by the number of bytes specified in the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 64-bit vector of [8 x i8] containing one of the source operands;
///    it forms the upper half of the concatenated value.
/// \param b
///    A 64-bit vector of [8 x i8] containing one of the source operands;
///    it forms the lower half of the concatenated value.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))

18635256Sdes/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
18732516Sgibbs///    128-bit vectors of [8 x i16].
18832516Sgibbs///
18932516Sgibbs/// \headerfile <x86intrin.h>
19032516Sgibbs///
19132516Sgibbs/// This intrinsic corresponds to the \c VPHADDW instruction.
192112436Smux///
193112436Smux/// \param __a
19432516Sgibbs///    A 128-bit vector of [8 x i16] containing one of the source operands. The
19532516Sgibbs///    horizontal sums of the values are stored in the lower bits of the
196112569Sjake///    destination.
197112569Sjake/// \param __b
19832516Sgibbs///    A 128-bit vector of [8 x i16] containing one of the source operands. The
19932516Sgibbs///    horizontal sums of the values are stored in the upper bits of the
20032516Sgibbs///    destination.
20132516Sgibbs/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
20232516Sgibbs///    both operands.
20332516Sgibbsstatic __inline__ __m128i __DEFAULT_FN_ATTRS
20432516Sgibbs_mm_hadd_epi16(__m128i __a, __m128i __b)
20535767Sgibbs{
20632516Sgibbs    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
20732516Sgibbs}
20832516Sgibbs
20932516Sgibbs/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
21032516Sgibbs///    128-bit vectors of [4 x i32].
21132516Sgibbs///
21232516Sgibbs/// \headerfile <x86intrin.h>
21332516Sgibbs///
21432516Sgibbs/// This intrinsic corresponds to the \c VPHADDD instruction.
21532516Sgibbs///
21632516Sgibbs/// \param __a
21735767Sgibbs///    A 128-bit vector of [4 x i32] containing one of the source operands. The
21835767Sgibbs///    horizontal sums of the values are stored in the lower bits of the
21932516Sgibbs///    destination.
22032516Sgibbs/// \param __b
22132516Sgibbs///    A 128-bit vector of [4 x i32] containing one of the source operands. The
22232516Sgibbs///    horizontal sums of the values are stored in the upper bits of the
22332516Sgibbs///    destination.
22432516Sgibbs/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
22532516Sgibbs///    both operands.
22632516Sgibbsstatic __inline__ __m128i __DEFAULT_FN_ATTRS
22732516Sgibbs_mm_hadd_epi32(__m128i __a, __m128i __b)
22832516Sgibbs{
22932516Sgibbs    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
23032516Sgibbs}
23132516Sgibbs
23232516Sgibbs/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
23332516Sgibbs///    64-bit vectors of [4 x i16].
23432516Sgibbs///
23532516Sgibbs/// \headerfile <x86intrin.h>
23632516Sgibbs///
23732516Sgibbs/// This intrinsic corresponds to the \c PHADDW instruction.
23832516Sgibbs///
23932516Sgibbs/// \param __a
24032516Sgibbs///    A 64-bit vector of [4 x i16] containing one of the source operands. The
241112436Smux///    horizontal sums of the values are stored in the lower bits of the
24232516Sgibbs///    destination.
24332516Sgibbs/// \param __b
24440029Sgibbs///    A 64-bit vector of [4 x i16] containing one of the source operands. The
24540029Sgibbs///    horizontal sums of the values are stored in the upper bits of the
24640029Sgibbs///    destination.
24740029Sgibbs/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
24840029Sgibbs///    operands.
24940029Sgibbsstatic __inline__ __m64 __DEFAULT_FN_ATTRS
25040029Sgibbs_mm_hadd_pi16(__m64 __a, __m64 __b)
25140029Sgibbs{
25232516Sgibbs    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
25332516Sgibbs}
25432516Sgibbs
25532516Sgibbs/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
25632516Sgibbs///    64-bit vectors of [2 x i32].
25732516Sgibbs///
25832516Sgibbs/// \headerfile <x86intrin.h>
25932516Sgibbs///
26032516Sgibbs/// This intrinsic corresponds to the \c PHADDD instruction.
26132516Sgibbs///
26232516Sgibbs/// \param __a
26332516Sgibbs///    A 64-bit vector of [2 x i32] containing one of the source operands. The
26432516Sgibbs///    horizontal sums of the values are stored in the lower bits of the
26532516Sgibbs///    destination.
26632516Sgibbs/// \param __b
26732516Sgibbs///    A 64-bit vector of [2 x i32] containing one of the source operands. The
268112569Sjake///    horizontal sums of the values are stored in the upper bits of the
26932516Sgibbs///    destination.
27032516Sgibbs/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
27132516Sgibbs///    operands.
27232516Sgibbsstatic __inline__ __m64 __DEFAULT_FN_ATTRS
27369781Sdwmalone_mm_hadd_pi32(__m64 __a, __m64 __b)
27469781Sdwmalone{
27535767Sgibbs    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
27669781Sdwmalone}
27769781Sdwmalone
27869781Sdwmalone/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
27969781Sdwmalone///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
28032516Sgibbs///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
28132516Sgibbs///
28232516Sgibbs/// \headerfile <x86intrin.h>
28332516Sgibbs///
28432516Sgibbs/// This intrinsic corresponds to the \c VPHADDSW instruction.
28535767Sgibbs///
28635767Sgibbs/// \param __a
28735767Sgibbs///    A 128-bit vector of [8 x i16] containing one of the source operands. The
28832516Sgibbs///    horizontal sums of the values are stored in the lower bits of the
28932516Sgibbs///    destination.
29035767Sgibbs/// \param __b
29135767Sgibbs///    A 128-bit vector of [8 x i16] containing one of the source operands. The
29235767Sgibbs///    horizontal sums of the values are stored in the upper bits of the
29335767Sgibbs///    destination.
29435767Sgibbs/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
29535767Sgibbs///    sums of both operands.
29635767Sgibbsstatic __inline__ __m128i __DEFAULT_FN_ATTRS
29735767Sgibbs_mm_hadds_epi16(__m128i __a, __m128i __b)
298113228Sjake{
29932516Sgibbs    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
300113228Sjake}
301113228Sjake
30235767Sgibbs/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
30335767Sgibbs///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
30435767Sgibbs///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
30535767Sgibbs///
30635767Sgibbs/// \headerfile <x86intrin.h>
30735767Sgibbs///
30835767Sgibbs/// This intrinsic corresponds to the \c PHADDSW instruction.
30932516Sgibbs///
31032516Sgibbs/// \param __a
31140029Sgibbs///    A 64-bit vector of [4 x i16] containing one of the source operands. The
31232516Sgibbs///    horizontal sums of the values are stored in the lower bits of the
31332516Sgibbs///    destination.
31432516Sgibbs/// \param __b
31532516Sgibbs///    A 64-bit vector of [4 x i16] containing one of the source operands. The
31632516Sgibbs///    horizontal sums of the values are stored in the upper bits of the
31732516Sgibbs///    destination.
31832516Sgibbs/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
31932516Sgibbs///    sums of both operands.
32032516Sgibbsstatic __inline__ __m64 __DEFAULT_FN_ATTRS
32132516Sgibbs_mm_hadds_pi16(__m64 __a, __m64 __b)
32232516Sgibbs{
32332516Sgibbs    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
32432516Sgibbs}
32532516Sgibbs
32632516Sgibbs/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
32732516Sgibbs///    packed 128-bit vectors of [8 x i16].
32832516Sgibbs///
32932516Sgibbs/// \headerfile <x86intrin.h>
33032516Sgibbs///
33132516Sgibbs/// This intrinsic corresponds to the \c VPHSUBW instruction.
33232516Sgibbs///
33332516Sgibbs/// \param __a
33435767Sgibbs///    A 128-bit vector of [8 x i16] containing one of the source operands. The
33535767Sgibbs///    horizontal differences between the values are stored in the lower bits of
33635767Sgibbs///    the destination.
33735767Sgibbs/// \param __b
33835767Sgibbs///    A 128-bit vector of [8 x i16] containing one of the source operands. The
33935767Sgibbs///    horizontal differences between the values are stored in the upper bits of
34035767Sgibbs///    the destination.
341110030Sscottl/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
342110030Sscottl///    of both operands.
34335767Sgibbsstatic __inline__ __m128i __DEFAULT_FN_ATTRS
344110030Sscottl_mm_hsub_epi16(__m128i __a, __m128i __b)
345110030Sscottl{
346110030Sscottl    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
347110030Sscottl}
34835767Sgibbs
34940029Sgibbs/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
35035767Sgibbs///    packed 128-bit vectors of [4 x i32].
351112569Sjake///
352112569Sjake/// \headerfile <x86intrin.h>
353110030Sscottl///
354111119Simp/// This intrinsic corresponds to the \c VPHSUBD instruction.
35535767Sgibbs///
35635767Sgibbs/// \param __a
35735767Sgibbs///    A 128-bit vector of [4 x i32] containing one of the source operands. The
35835767Sgibbs///    horizontal differences between the values are stored in the lower bits of
35935767Sgibbs///    the destination.
36035767Sgibbs/// \param __b
361112196Smux///    A 128-bit vector of [4 x i32] containing one of the source operands. The
362110030Sscottl///    horizontal differences between the values are stored in the upper bits of
363111119Simp///    the destination.
36448449Smjacob/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
36548449Smjacob///    of both operands.
366112196Smuxstatic __inline__ __m128i __DEFAULT_FN_ATTRS
36735767Sgibbs_mm_hsub_epi32(__m128i __a, __m128i __b)
36835767Sgibbs{
36935767Sgibbs    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
37035767Sgibbs}
37135767Sgibbs
37235767Sgibbs/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
373110030Sscottl///    packed 64-bit vectors of [4 x i16].
374110030Sscottl///
375110030Sscottl/// \headerfile <x86intrin.h>
376110030Sscottl///
377110030Sscottl/// This intrinsic corresponds to the \c PHSUBW instruction.
378110030Sscottl///
379110030Sscottl/// \param __a
38035767Sgibbs///    A 64-bit vector of [4 x i16] containing one of the source operands. The
38135767Sgibbs///    horizontal differences between the values are stored in the lower bits of
38295076Salfred///    the destination.
38335767Sgibbs/// \param __b
38435767Sgibbs///    A 64-bit vector of [4 x i16] containing one of the source operands. The
385110030Sscottl///    horizontal differences between the values are stored in the upper bits of
386110030Sscottl///    the destination.
38735767Sgibbs/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
38835767Sgibbs///    of both operands.
38935767Sgibbsstatic __inline__ __m64 __DEFAULT_FN_ATTRS
39035767Sgibbs_mm_hsub_pi16(__m64 __a, __m64 __b)
39135767Sgibbs{
39249859Sgibbs    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
39335767Sgibbs}
394112569Sjake
39540029Sgibbs/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
396112196Smux///    packed 64-bit vectors of [2 x i32].
397112196Smux///
398110030Sscottl/// \headerfile <x86intrin.h>
399112196Smux///
400112196Smux/// This intrinsic corresponds to the \c PHSUBD instruction.
40135767Sgibbs///
40235767Sgibbs/// \param __a
403110030Sscottl///    A 64-bit vector of [2 x i32] containing one of the source operands. The
404110030Sscottl///    horizontal differences between the values are stored in the lower bits of
405110030Sscottl///    the destination.
406110030Sscottl/// \param __b
407110030Sscottl///    A 64-bit vector of [2 x i32] containing one of the source operands. The
408110030Sscottl///    horizontal differences between the values are stored in the upper bits of
40932516Sgibbs///    the destination.
410104486Ssam/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
411104486Ssam///    of both operands.
412104486Ssamstatic __inline__ __m64 __DEFAULT_FN_ATTRS
413104486Ssam_mm_hsub_pi32(__m64 __a, __m64 __b)
414104486Ssam{
415104486Ssam    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
416104486Ssam}
417113228Sjake
418104486Ssam/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
419104486Ssam///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
420104486Ssam///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
421104486Ssam///    saturated to 8000h.
422113228Sjake///
423104486Ssam/// \headerfile <x86intrin.h>
424104486Ssam///
425104486Ssam/// This intrinsic corresponds to the \c VPHSUBSW instruction.
426104486Ssam///
427104486Ssam/// \param __a
428113228Sjake///    A 128-bit vector of [8 x i16] containing one of the source operands. The
429113228Sjake///    horizontal differences between the values are stored in the lower bits of
430113228Sjake///    the destination.
431104486Ssam/// \param __b
432104486Ssam///    A 128-bit vector of [8 x i16] containing one of the source operands. The
433104486Ssam///    horizontal differences between the values are stored in the upper bits of
434113228Sjake///    the destination.
435113228Sjake/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
436113228Sjake///    differences of both operands.
437104486Ssamstatic __inline__ __m128i __DEFAULT_FN_ATTRS
438104486Ssam_mm_hsubs_epi16(__m128i __a, __m128i __b)
439104486Ssam{
440104486Ssam    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
441104486Ssam}
442113228Sjake
443113228Sjake/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
444113228Sjake///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
445113228Sjake///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
446113228Sjake///    saturated to 8000h.
447113228Sjake///
448113228Sjake/// \headerfile <x86intrin.h>
449113228Sjake///
450113228Sjake/// This intrinsic corresponds to the \c PHSUBSW instruction.
451113228Sjake///
452113228Sjake/// \param __a
453113228Sjake///    A 64-bit vector of [4 x i16] containing one of the source operands. The
454113228Sjake///    horizontal differences between the values are stored in the lower bits of
455113228Sjake///    the destination.
456113228Sjake/// \param __b
457113228Sjake///    A 64-bit vector of [4 x i16] containing one of the source operands. The
458113228Sjake///    horizontal differences between the values are stored in the upper bits of
459113228Sjake///    the destination.
460113228Sjake/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
461113228Sjake///    differences of both operands.
462113228Sjakestatic __inline__ __m64 __DEFAULT_FN_ATTRS
463113228Sjake_mm_hsubs_pi16(__m64 __a, __m64 __b)
464113228Sjake{
465113228Sjake    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
466113228Sjake}
467113472Ssimokawa
468113472Ssimokawa/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
469113472Ssimokawa///    values contained in the first source operand and packed 8-bit signed
470113472Ssimokawa///    integer values contained in the second source operand, adds pairs of
471113472Ssimokawa///    contiguous products with signed saturation, and writes the 16-bit sums to
472113472Ssimokawa///    the corresponding bits in the destination. For example, bits [7:0] of
473113472Ssimokawa///    both operands are multiplied, bits [15:8] of both operands are
474113472Ssimokawa///    multiplied, and the sum of both results is written to bits [15:0] of the
475113472Ssimokawa///    destination.
476113472Ssimokawa///
477113472Ssimokawa/// \headerfile <x86intrin.h>
478113472Ssimokawa///
479113472Ssimokawa/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
480113472Ssimokawa///
481113472Ssimokawa/// \param __a
482113472Ssimokawa///    A 128-bit integer vector containing the first source operand.
483113228Sjake/// \param __b
484113228Sjake///    A 128-bit integer vector containing the second source operand.
485113228Sjake/// \returns A 128-bit integer vector containing the sums of products of both
486113228Sjake///    operands:
487104486Ssam///    R0 := (__a0 * __b0) + (__a1 * __b1)
488113228Sjake///    R1 := (__a2 * __b2) + (__a3 * __b3)
489104486Ssam///    R2 := (__a4 * __b4) + (__a5 * __b5)
490104486Ssam///    R3 := (__a6 * __b6) + (__a7 * __b7)
491104486Ssam///    R4 := (__a8 * __b8) + (__a9 * __b9)
492104486Ssam///    R5 := (__a10 * __b10) + (__a11 * __b11)
493104486Ssam///    R6 := (__a12 * __b12) + (__a13 * __b13)
494104486Ssam///    R7 := (__a14 * __b14) + (__a15 * __b15)
495104486Ssamstatic __inline__ __m128i __DEFAULT_FN_ATTRS
496104486Ssam_mm_maddubs_epi16(__m128i __a, __m128i __b)
497104486Ssam{
498104486Ssam    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
499104486Ssam}
500104486Ssam
501104486Ssam/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
502104486Ssam///    values contained in the first source operand and packed 8-bit signed
503104486Ssam///    integer values contained in the second source operand, adds pairs of
504104486Ssam///    contiguous products with signed saturation, and writes the 16-bit sums to
505104486Ssam///    the corresponding bits in the destination. For example, bits [7:0] of
506104486Ssam///    both operands are multiplied, bits [15:8] of both operands are
507104486Ssam///    multiplied, and the sum of both results is written to bits [15:0] of the
508104486Ssam///    destination.
509104486Ssam///
510104486Ssam/// \headerfile <x86intrin.h>
511104486Ssam///
512104486Ssam/// This intrinsic corresponds to the \c PMADDUBSW instruction.
513104486Ssam///
514104486Ssam/// \param __a
515113228Sjake///    A 64-bit integer vector containing the first source operand.
516113228Sjake/// \param __b
517113228Sjake///    A 64-bit integer vector containing the second source operand.
518104486Ssam/// \returns A 64-bit integer vector containing the sums of products of both
519104486Ssam///    operands:
520104486Ssam///    R0 := (__a0 * __b0) + (__a1 * __b1)
521104486Ssam///    R1 := (__a2 * __b2) + (__a3 * __b3)
522104486Ssam///    R2 := (__a4 * __b4) + (__a5 * __b5)
523104486Ssam///    R3 := (__a6 * __b6) + (__a7 * __b7)
524104486Ssamstatic __inline__ __m64 __DEFAULT_FN_ATTRS
525104486Ssam_mm_maddubs_pi16(__m64 __a, __m64 __b)
526104486Ssam{
527113228Sjake    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
528104486Ssam}
529104486Ssam
530104486Ssam/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
531104486Ssam///    products to the 18 most significant bits by right-shifting, rounds the
532104486Ssam///    truncated value by adding 1, and writes bits [16:1] to the destination.
533104486Ssam///
534104486Ssam/// \headerfile <x86intrin.h>
535104486Ssam///
536104486Ssam/// This intrinsic corresponds to the \c VPMULHRSW instruction.
537104486Ssam///
538104486Ssam/// \param __a
539104486Ssam///    A 128-bit vector of [8 x i16] containing one of the source operands.
540104486Ssam/// \param __b
541104486Ssam///    A 128-bit vector of [8 x i16] containing one of the source operands.
542104486Ssam/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
543104486Ssam///    products of both operands.
544104486Ssamstatic __inline__ __m128i __DEFAULT_FN_ATTRS
545104486Ssam_mm_mulhrs_epi16(__m128i __a, __m128i __b)
546104486Ssam{
547104486Ssam    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
548104486Ssam}
549104486Ssam
550104486Ssam/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
551104486Ssam///    products to the 18 most significant bits by right-shifting, rounds the
552104486Ssam///    truncated value by adding 1, and writes bits [16:1] to the destination.
553104486Ssam///
554113459Ssimokawa/// \headerfile <x86intrin.h>
555113459Ssimokawa///
556104486Ssam/// This intrinsic corresponds to the \c PMULHRSW instruction.
557113459Ssimokawa///
558113459Ssimokawa/// \param __a
559113459Ssimokawa///    A 64-bit vector of [4 x i16] containing one of the source operands.
560113459Ssimokawa/// \param __b
561113459Ssimokawa///    A 64-bit vector of [4 x i16] containing one of the source operands.
562113459Ssimokawa/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
563113459Ssimokawa///    products of both operands.
564113459Ssimokawastatic __inline__ __m64 __DEFAULT_FN_ATTRS
565113459Ssimokawa_mm_mulhrs_pi16(__m64 __a, __m64 __b)
566113459Ssimokawa{
567113459Ssimokawa    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
568113459Ssimokawa}
569113492Smux
570113459Ssimokawa/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
571113459Ssimokawa///    destination or clears 8-bit values in the destination, as specified by
572113472Ssimokawa///    the second source operand.
573113472Ssimokawa///
574113472Ssimokawa/// \headerfile <x86intrin.h>
575113472Ssimokawa///
576113472Ssimokawa/// This intrinsic corresponds to the \c VPSHUFB instruction.
577113472Ssimokawa///
578113459Ssimokawa/// \param __a
579113459Ssimokawa///    A 128-bit integer vector containing the values to be copied.
580113459Ssimokawa/// \param __b
581113472Ssimokawa///    A 128-bit integer vector containing control bytes corresponding to
582113492Smux///    positions in the destination:
583113472Ssimokawa///    Bit 7:
584113459Ssimokawa///    1: Clear the corresponding byte in the destination.
585113459Ssimokawa///    0: Copy the selected source byte to the corresponding byte in the
586113459Ssimokawa///    destination.
587113459Ssimokawa///    Bits [6:4] Reserved.
588113459Ssimokawa///    Bits [3:0] select the source byte to be copied.
589113459Ssimokawa/// \returns A 128-bit integer vector containing the copied or cleared values.
590113459Ssimokawastatic __inline__ __m128i __DEFAULT_FN_ATTRS
591113459Ssimokawa_mm_shuffle_epi8(__m128i __a, __m128i __b)
592113459Ssimokawa{
593113459Ssimokawa    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
594104486Ssam}
595104486Ssam
596104486Ssam/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
597104486Ssam///    destination or clears 8-bit values in the destination, as specified by
598104486Ssam///    the second source operand.
599104486Ssam///
600104486Ssam/// \headerfile <x86intrin.h>
601104486Ssam///
602104486Ssam/// This intrinsic corresponds to the \c PSHUFB instruction.
603104486Ssam///
604104486Ssam/// \param __a
605104486Ssam///    A 64-bit integer vector containing the values to be copied.
606104486Ssam/// \param __b
607104486Ssam///    A 64-bit integer vector containing control bytes corresponding to
608104486Ssam///    positions in the destination:
609104486Ssam///    Bit 7:
610104486Ssam///    1: Clear the corresponding byte in the destination.
611104486Ssam///    0: Copy the selected source byte to the corresponding byte in the
612113472Ssimokawa///    destination.
613104486Ssam///    Bits [3:0] select the source byte to be copied.
614104486Ssam/// \returns A 64-bit integer vector containing the copied or cleared values.
615104486Ssamstatic __inline__ __m64 __DEFAULT_FN_ATTRS
616104486Ssam_mm_shuffle_pi8(__m64 __a, __m64 __b)
617113228Sjake{
618104486Ssam    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
619104486Ssam}
620104486Ssam
621110335Sharti/// \brief For each 8-bit integer in the first source operand, perform one of
622113228Sjake///    the following actions as specified by the second source operand: If the
623110335Sharti///    byte in the second source is negative, calculate the two's complement of
624110335Sharti///    the corresponding byte in the first source, and write that value to the
625110335Sharti///    destination. If the byte in the second source is positive, copy the
626110335Sharti///    corresponding byte from the first source to the destination. If the byte
627110335Sharti///    in the second source is zero, clear the corresponding byte in the
628110335Sharti///    destination.
629104486Ssam///
630104486Ssam/// \headerfile <x86intrin.h>
631104486Ssam///
632104486Ssam/// This intrinsic corresponds to the \c VPSIGNB instruction.
633104486Ssam///
634104486Ssam/// \param __a
635104486Ssam///    A 128-bit integer vector containing the values to be copied.
636104486Ssam/// \param __b
637104486Ssam///    A 128-bit integer vector containing control bytes corresponding to
638104486Ssam///    positions in the destination.
639104486Ssam/// \returns A 128-bit integer vector containing the resultant values.
640104486Ssamstatic __inline__ __m128i __DEFAULT_FN_ATTRS
641104486Ssam_mm_sign_epi8(__m128i __a, __m128i __b)
642104486Ssam{
643104486Ssam    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
644104486Ssam}
645104486Ssam
646104486Ssam/// \brief For each 16-bit integer in the first source operand, perform one of
647104486Ssam///    the following actions as specified by the second source operand: If the
648104486Ssam///    word in the second source is negative, calculate the two's complement of
649104486Ssam///    the corresponding word in the first source, and write that value to the
650104486Ssam///    destination. If the word in the second source is positive, copy the
651104486Ssam///    corresponding word from the first source to the destination. If the word
652104486Ssam///    in the second source is zero, clear the corresponding word in the
653113228Sjake///    destination.
654104486Ssam///
655104486Ssam/// \headerfile <x86intrin.h>
656104486Ssam///
657104486Ssam/// This intrinsic corresponds to the \c VPSIGNW instruction.
658104486Ssam///
659104486Ssam/// \param __a
660104486Ssam///    A 128-bit integer vector containing the values to be copied.
661104486Ssam/// \param __b
662104486Ssam///    A 128-bit integer vector containing control words corresponding to
663104486Ssam///    positions in the destination.
664113472Ssimokawa/// \returns A 128-bit integer vector containing the resultant values.
665104486Ssamstatic __inline__ __m128i __DEFAULT_FN_ATTRS
666104486Ssam_mm_sign_epi16(__m128i __a, __m128i __b)
667104486Ssam{
668104486Ssam    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
669104486Ssam}
670104486Ssam
671104486Ssam/// \brief For each 32-bit integer in the first source operand, perform one of
672104486Ssam///    the following actions as specified by the second source operand: If the
673104486Ssam///    doubleword in the second source is negative, calculate the two's
674104486Ssam///    complement of the corresponding word in the first source, and write that
675104486Ssam///    value to the destination. If the doubleword in the second source is
676104486Ssam///    positive, copy the corresponding word from the first source to the
677104486Ssam///    destination. If the doubleword in the second source is zero, clear the
678104486Ssam///    corresponding word in the destination.
679104486Ssam///
680104486Ssam/// \headerfile <x86intrin.h>
681104486Ssam///
682104486Ssam/// This intrinsic corresponds to the \c VPSIGND instruction.
683104486Ssam///
684104486Ssam/// \param __a
685104486Ssam///    A 128-bit integer vector containing the values to be copied.
686110335Sharti/// \param __b
687113228Sjake///    A 128-bit integer vector containing control doublewords corresponding to
688110335Sharti///    positions in the destination.
689110335Sharti/// \returns A 128-bit integer vector containing the resultant values.
690110335Shartistatic __inline__ __m128i __DEFAULT_FN_ATTRS
691110335Sharti_mm_sign_epi32(__m128i __a, __m128i __b)
692104486Ssam{
693110335Sharti    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
694110335Sharti}
695104486Ssam
696104486Ssam/// \brief For each 8-bit integer in the first source operand, perform one of
697104486Ssam///    the following actions as specified by the second source operand: If the
698104486Ssam///    byte in the second source is negative, calculate the two's complement of
699104486Ssam///    the corresponding byte in the first source, and write that value to the
700104486Ssam///    destination. If the byte in the second source is positive, copy the
701104486Ssam///    corresponding byte from the first source to the destination. If the byte
702104486Ssam///    in the second source is zero, clear the corresponding byte in the
703104486Ssam///    destination.
704104486Ssam///
705104486Ssam/// \headerfile <x86intrin.h>
706104486Ssam///
707104486Ssam/// This intrinsic corresponds to the \c PSIGNB instruction.
70832516Sgibbs///
70932516Sgibbs/// \param __a
71032516Sgibbs///    A 64-bit integer vector containing the values to be copied.
71132516Sgibbs/// \param __b
71232516Sgibbs///    A 64-bit integer vector containing control bytes corresponding to
71332516Sgibbs///    positions in the destination.
71432516Sgibbs/// \returns A 64-bit integer vector containing the resultant values.
71532516Sgibbsstatic __inline__ __m64 __DEFAULT_FN_ATTRS
71632516Sgibbs_mm_sign_pi8(__m64 __a, __m64 __b)
71732516Sgibbs{
71832516Sgibbs    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
71932516Sgibbs}
72032516Sgibbs
72132516Sgibbs/// \brief For each 16-bit integer in the first source operand, perform one of
722113347Smux///    the following actions as specified by the second source operand: If the
72332516Sgibbs///    word in the second source is negative, calculate the two's complement of
72432516Sgibbs///    the corresponding word in the first source, and write that value to the
72532516Sgibbs///    destination. If the word in the second source is positive, copy the
72632516Sgibbs///    corresponding word from the first source to the destination. If the word
72732516Sgibbs///    in the second source is zero, clear the corresponding word in the
72832516Sgibbs///    destination.
72932516Sgibbs///
73032516Sgibbs/// \headerfile <x86intrin.h>
73132516Sgibbs///
732113347Smux/// This intrinsic corresponds to the \c PSIGNW instruction.
73332516Sgibbs///
73432516Sgibbs/// \param __a
73532516Sgibbs///    A 64-bit integer vector containing the values to be copied.
73632516Sgibbs/// \param __b
73732516Sgibbs///    A 64-bit integer vector containing control words corresponding to
73832516Sgibbs///    positions in the destination.
739113347Smux/// \returns A 64-bit integer vector containing the resultant values.
74032516Sgibbsstatic __inline__ __m64 __DEFAULT_FN_ATTRS
741113347Smux_mm_sign_pi16(__m64 __a, __m64 __b)
74232516Sgibbs{
74332516Sgibbs    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
74432516Sgibbs}
74532516Sgibbs
74632516Sgibbs/// \brief For each 32-bit integer in the first source operand, perform one of
74732516Sgibbs///    the following actions as specified by the second source operand: If the
74832516Sgibbs///    doubleword in the second source is negative, calculate the two's
74932516Sgibbs///    complement of the corresponding doubleword in the first source, and
75032516Sgibbs///    write that value to the destination. If the doubleword in the second
75132516Sgibbs///    source is positive, copy the corresponding doubleword from the first
752112346Smux///    source to the destination. If the doubleword in the second source is
753112346Smux///    zero, clear the corresponding doubleword in the destination.
754112346Smux///
755112346Smux/// \headerfile <x86intrin.h>
756112346Smux///
757112346Smux/// This intrinsic corresponds to the \c PSIGND instruction.
758112346Smux///
759112346Smux/// \param __a
760112346Smux///    A 64-bit integer vector containing the values to be copied.
761112346Smux/// \param __b
762112346Smux///    A 64-bit integer vector containing two control doublewords corresponding
763112346Smux///    to positions in the destination.
764112346Smux/// \returns A 64-bit integer vector containing the resultant values.
765112346Smuxstatic __inline__ __m64 __DEFAULT_FN_ATTRS
766112346Smux_mm_sign_pi32(__m64 __a, __m64 __b)
76732516Sgibbs{
76832516Sgibbs    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
76932516Sgibbs}
77032516Sgibbs
77132516Sgibbs#undef __DEFAULT_FN_ATTRS
77232516Sgibbs
77332516Sgibbs#endif /* __TMMINTRIN_H */
77432516Sgibbs