1/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __AMMINTRIN_H
11#define __AMMINTRIN_H
12
13#include <pmmintrin.h>
14
/* Attribute set shared by every function defined in this file: always inlined,
 * marked nodebug, built for the "sse4a" target feature, and declared with a
 * minimum vector width of 128 bits. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4a"),          \
                 __min_vector_width__(128)))
17
/// Extracts a bit field from the lower 64 bits of a 128-bit integer vector.
///    The field starts at bit index \a idx and is \a len bits long.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
///    The vector whose lower 64 bits supply the bits to extract.
/// \param len
///    The field length, taken from bits [5:0]; all other bits are ignored.
///    A value of zero in bits [5:0] is interpreted as a length of 64.
/// \param idx
///    The index of the least significant bit of the field, taken from bits
///    [5:0]; all other bits are ignored. The result is undefined if the index
///    plus the length exceeds 64, or if the length is zero while the index is
///    non-zero. When both the length and the index are zero, bits [63:0] of
///    parameter \a x are extracted.
/// \returns A 128-bit integer vector whose lower 64 bits hold the bits
///    extracted from the source operand.
#define _mm_extracti_si64(x, len, idx)                                         \
  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), (char)(len),           \
                                  (char)(idx)))
45
46/// Extracts the specified bits from the lower 64 bits of the 128-bit
47///    integer vector operand at the index and of the length specified by
48///    \a __y.
49///
50/// \headerfile <x86intrin.h>
51///
52/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
53///
54/// \param __x
55///    The value from which bits are extracted.
56/// \param __y
57///    Specifies the index of the least significant bit at [13:8] and the
58///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
59///    length is interpreted as 64. If the sum of the index and length is
60///    greater than 64, the result is undefined. If the length and index are
61///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
62///    is zero but the index is non-zero, the result is undefined.
63/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
64///    from the source operand.
65static __inline__ __m128i __DEFAULT_FN_ATTRS
66_mm_extract_si64(__m128i __x, __m128i __y)
67{
68  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
69}
70
/// Inserts the \a len least significant bits of the source integer vector
///    \a y into the lower 64 bits of the destination integer vector \a x,
///    starting at bit index \a idx.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
///                          const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
///    The destination operand that receives the inserted bits. The replaced
///    bit field is described by the length \a len and by the index \a idx of
///    its least significant bit.
/// \param y
///    The source operand that supplies the bits. The bits taken are the
///    \a len least significant bits of operand \a y.
/// \param len
///    The field length, taken from bits [5:0]; all other bits are ignored.
///    A value of zero in bits [5:0] is interpreted as a length of 64.
/// \param idx
///    The index of the least significant bit of the field, taken from bits
///    [5:0]; all other bits are ignored. The result is undefined if the index
///    plus the length exceeds 64, or if the length is zero while the index is
///    non-zero. When both the length and the index are zero, bits [63:0] of
///    parameter \a y are inserted into parameter \a x.
/// \returns A 128-bit integer vector holding the original lower 64 bits of
///    destination operand \a x with the specified bit field replaced by the
///    lower bits of source operand \a y. The upper 64 bits of the return value
///    are undefined.
#define _mm_inserti_si64(x, y, len, idx)                                       \
  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x),                      \
                                    (__v2di)(__m128i)(y), (char)(len),         \
                                    (char)(idx)))
108
109/// Inserts bits of a specified length from the source integer vector
110///    \a __y into the lower 64 bits of the destination integer vector \a __x
111///    at the index and of the length specified by \a __y.
112///
113/// \headerfile <x86intrin.h>
114///
115/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
116///
117/// \param __x
118///    The destination operand where bits will be inserted. The inserted bits
119///    are defined by the length and by the index of the least significant bit
120///    specified by operand \a __y.
121/// \param __y
122///    The source operand containing the bits to be extracted. The extracted
123///    bits are the least significant bits of operand \a __y with length
124///    specified by bits [69:64]. These are inserted into the destination at the
125///    index specified by bits [77:72]; all other bits are ignored. If bits
126///    [69:64] are zero, the length is interpreted as 64. If the sum of the
127///    index and length is greater than 64, the result is undefined. If the
128///    length and index are both zero, bits [63:0] of parameter \a __y are
129///    inserted into parameter \a __x. If the length is zero but the index is
130///    non-zero, the result is undefined.
131/// \returns A 128-bit integer vector containing the original lower 64-bits of
132///    destination operand \a __x with the specified bitfields replaced by the
133///    lower bits of source operand \a __y. The upper 64 bits of the return
134///    value are undefined.
135static __inline__ __m128i __DEFAULT_FN_ATTRS
136_mm_insert_si64(__m128i __x, __m128i __y)
137{
138  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
139}
140
141/// Stores a 64-bit double-precision value in a 64-bit memory location.
142///    To minimize caching, the data is flagged as non-temporal (unlikely to be
143///    used again soon).
144///
145/// \headerfile <x86intrin.h>
146///
147/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
148///
149/// \param __p
150///    The 64-bit memory location used to store the register value.
151/// \param __a
152///    The 64-bit double-precision floating-point register value to be stored.
153static __inline__ void __DEFAULT_FN_ATTRS
154_mm_stream_sd(double *__p, __m128d __a)
155{
156  __builtin_ia32_movntsd(__p, (__v2df)__a);
157}
158
159/// Stores a 32-bit single-precision floating-point value in a 32-bit
160///    memory location. To minimize caching, the data is flagged as
161///    non-temporal (unlikely to be used again soon).
162///
163/// \headerfile <x86intrin.h>
164///
165/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
166///
167/// \param __p
168///    The 32-bit memory location used to store the register value.
169/// \param __a
170///    The 32-bit single-precision floating-point register value to be stored.
171static __inline__ void __DEFAULT_FN_ATTRS
172_mm_stream_ss(float *__p, __m128 __a)
173{
174  __builtin_ia32_movntss(__p, (__v4sf)__a);
175}
176
177#undef __DEFAULT_FN_ATTRS
178
179#endif /* __AMMINTRIN_H */
180