/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H

#ifndef __SSE4_1__
#error "SSE4.1 instruction set not enabled"
#else

#include <tmmintrin.h>

/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT    0x00
#define _MM_FROUND_TO_NEG_INF        0x01
#define _MM_FROUND_TO_POS_INF        0x02
#define _MM_FROUND_TO_ZERO           0x03
#define _MM_FROUND_CUR_DIRECTION     0x04

#define _MM_FROUND_RAISE_EXC         0x00
#define _MM_FROUND_NO_EXC            0x08

#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

#define _mm_round_ps(X, M)      __builtin_ia32_roundps((X), (M))
#define _mm_round_ss(X, Y, M)   __builtin_ia32_roundss((X), (Y), (M))
#define _mm_round_pd(X, M)      __builtin_ia32_roundpd((X), (M))
#define _mm_round_sd(X, Y, M)   __builtin_ia32_roundsd((X), (Y), (M))
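
/* Usage sketch (illustrative, not part of this header): rounding each lane of
 * a packed-float vector.  Assumes compilation with SSE4.1 enabled (e.g.
 * -msse4.1).  Lanes are listed low to high.
 *
 *   __m128 v = _mm_set_ps(3.7f, -1.2f, 0.5f, 2.0f);  // lanes: 2.0, 0.5, -1.2, 3.7
 *   __m128 f = _mm_floor_ps(v);                      // lanes: 2.0, 0.0, -2.0, 3.0
 *   __m128 t = _mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 */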

/* SSE4 Packed Blending Intrinsics.  */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_blend_pd (__m128d __V1, __m128d __V2, const int __M)
{
  return (__m128d) __builtin_ia32_blendpd ((__v2df)__V1, (__v2df)__V2, __M);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_blend_ps (__m128 __V1, __m128 __V2, const int __M)
{
  return (__m128) __builtin_ia32_blendps ((__v4sf)__V1, (__v4sf)__V2, __M);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
{
  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
                                            (__v2df)__M);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
{
  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
                                           (__v4sf)__M);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
{
  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
                                               (__v16qi)__M);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_blend_epi16 (__m128i __V1, __m128i __V2, const int __M)
{
  return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__V1, (__v8hi)__V2, __M);
}
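
/* Usage sketch (illustrative): the blendv forms select per lane on the sign
 * bit of each lane of __M, so a comparison result can drive the blend
 * directly.
 *
 *   __m128 mask = _mm_cmplt_ps(a, b);         // all-ones lanes where a < b
 *   __m128 min  = _mm_blendv_ps(b, a, mask);  // a where mask set, else b
 */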

/* SSE4 Dword Multiply Instructions.  */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
}
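
/* Usage note (illustrative): _mm_mullo_epi32 keeps the low 32 bits of all
 * four lane products, while _mm_mul_epi32 multiplies only the even-indexed
 * (0 and 2) signed lanes and widens those two products to 64 bits.
 *
 *   __m128i lo   = _mm_mullo_epi32(x, y);  // 4 x 32-bit truncated products
 *   __m128i wide = _mm_mul_epi32(x, y);    // 2 x 64-bit exact products
 */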

/* SSE4 Floating Point Dot Product Instructions.  */
#define _mm_dp_ps(X, Y, M) __builtin_ia32_dpps ((X), (Y), (M))
#define _mm_dp_pd(X, Y, M) __builtin_ia32_dppd ((X), (Y), (M))
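
/* Usage sketch (illustrative): in the _mm_dp_ps mask, the high nibble selects
 * which lane products enter the sum and the low nibble selects which result
 * lanes receive it.  0xF1 sums all four products into lane 0 only.
 *
 *   __m128 d   = _mm_dp_ps(a, b, 0xF1);
 *   float  dot = _mm_cvtss_f32(d);
 */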

/* SSE4 Streaming Load Hint Instruction.  */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_stream_load_si128 (__m128i *__V)
{
  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
}
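
/* Usage note (illustrative): MOVNTDQA requires a 16-byte-aligned address, and
 * its non-temporal hint only pays off on write-combining memory (e.g. mapped
 * device buffers); on ordinary write-back memory it behaves like a regular
 * aligned load.
 */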

/* SSE4 Packed Integer Min/Max Instructions.  */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminud128 ((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxud128 ((__v4si) __V1, (__v4si) __V2);
}

/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
#define _mm_extract_ps(X, N) (__extension__                      \
                              ({ union { int i; float f; } __t;  \
                                 __v4sf __a = (__v4sf)(X);       \
                                 __t.f = __a[N];                 \
                                 __t.i;}))

/* Miscellaneous insert and extract macros.  */
/* Extract a single-precision float from X at index N into D.  */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
                                                    (D) = __a[N]; }))

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return.  */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
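
/* Usage sketch (illustrative): _MM_MK_INSERTPS_NDX packs the insertps
 * immediate from a source index X (bits 7:6, selecting a lane of the second
 * operand), a destination index Y (bits 5:4), and a zero mask Z (bits 3:0).
 * Copying lane 2 of B into lane 0 of A without zeroing anything:
 *
 *   __m128 r = _mm_insert_ps(A, B, _MM_MK_INSERTPS_NDX(2, 0, 0x0));
 */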

/* Insert int into packed integer array at index.  */
#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                   __a[N] = I;               \
                                                   __a;}))
#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                    __a[N] = I;             \
                                                    __a;}))
#ifdef __x86_64__
#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                    __a[N] = I;             \
                                                    __a;}))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                 (unsigned char)__a[N];}))
#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                  (unsigned)__a[N];}))
#ifdef __x86_64__
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                  __a[N];}))
#endif /* __x86_64__ */
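
/* Usage sketch (illustrative): the byte and dword extracts return
 * zero-extended values, so an all-ones byte comes back as 255, not -1.
 *
 *   __m128i v = _mm_set1_epi8(-1);
 *   int b = _mm_extract_epi8(v, 3);           // 255
 *   __m128i w = _mm_insert_epi32(v, 42, 1);   // dword 1 replaced with 42
 */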

/* SSE4 128-bit Packed Integer Comparisons.  */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testz_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testc_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
}

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
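
/* Usage sketch (illustrative): PTEST sets ZF when (M & V) is all zeros and CF
 * when (~M & V) is all zeros; the helpers above expose those flags.
 *
 *   if (_mm_test_all_zeros(mask, v))
 *     handle_all_zero();   // hypothetical callback: no masked bit of v is set
 *   if (_mm_test_all_ones(v))
 *     handle_all_ones();   // hypothetical callback: every bit of v is 1
 */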

/* SSE4 64-bit Packed Integer Comparisons.  */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pcmpeqq((__v2di)__V1, (__v2di)__V2);
}

/* SSE4 Packed Integer Sign-Extension.  */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi16(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
}

/* SSE4 Packed Integer Zero-Extension.  */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi16(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu32_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
}
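
/* Usage sketch (illustrative): the pmovsx/pmovzx conversions widen the low
 * elements of their source.  Sign-extending the low eight signed bytes to
 * 16-bit lanes:
 *
 *   __m128i bytes = _mm_loadl_epi64((__m128i const *)p);  // low 8 bytes of p
 *   __m128i words = _mm_cvtepi8_epi16(bytes);             // 8 x 16-bit lanes
 */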

/* SSE4 Pack with Unsigned Saturation.  */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi32(__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}

/* SSE4 Multiple Packed Sums of Absolute Difference.  */
#define _mm_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw128((X), (Y), (M))
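
/* Usage sketch (illustrative): MPSADBW produces eight 16-bit sums of absolute
 * differences between a sliding 4-byte window of X and one fixed 4-byte group
 * of Y.  In the mask, bits 1:0 pick the 4-byte group of Y and bit 2 picks
 * whether the sliding window starts at byte 0 or byte 4 of X.
 *
 *   __m128i sads = _mm_mpsadbw_epu8(x, y, 0x0);  // 8 x 16-bit SADs
 */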

/* These definitions are normally in nmmintrin.h, but gcc puts them in here
   so we'll do the same.  */
#ifdef __SSE4_2__

/* These specify the type of data that we're comparing.  */
#define _SIDD_UBYTE_OPS                 0x00
#define _SIDD_UWORD_OPS                 0x01
#define _SIDD_SBYTE_OPS                 0x02
#define _SIDD_SWORD_OPS                 0x03

/* These specify the type of comparison operation.  */
#define _SIDD_CMP_EQUAL_ANY             0x00
#define _SIDD_CMP_RANGES                0x04
#define _SIDD_CMP_EQUAL_EACH            0x08
#define _SIDD_CMP_EQUAL_ORDERED         0x0c

/* These macros specify the polarity of the operation.  */
#define _SIDD_POSITIVE_POLARITY         0x00
#define _SIDD_NEGATIVE_POLARITY         0x10
#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30

/* These macros are used in _mm_cmpXstri() to specify the return.  */
#define _SIDD_LEAST_SIGNIFICANT         0x00
#define _SIDD_MOST_SIGNIFICANT          0x40

/* These macros are used in _mm_cmpXstrm() to specify the return.  */
#define _SIDD_BIT_MASK                  0x00
#define _SIDD_UNIT_MASK                 0x40

/* SSE4.2 Packed Comparison Intrinsics.  */
#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))

#define _mm_cmpestrm(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
#define _mm_cmpestri(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
#define _mm_cmpistra(A, B, M) \
     __builtin_ia32_pcmpistria128((A), (B), (M))
#define _mm_cmpistrc(A, B, M) \
     __builtin_ia32_pcmpistric128((A), (B), (M))
#define _mm_cmpistro(A, B, M) \
     __builtin_ia32_pcmpistrio128((A), (B), (M))
#define _mm_cmpistrs(A, B, M) \
     __builtin_ia32_pcmpistris128((A), (B), (M))
#define _mm_cmpistrz(A, B, M) \
     __builtin_ia32_pcmpistriz128((A), (B), (M))

#define _mm_cmpestra(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrc(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
#define _mm_cmpestro(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrs(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrz(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
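
/* Usage sketch (illustrative): locating the first NUL byte in a 16-byte block
 * with the implicit-length compare.  With _SIDD_CMP_EQUAL_EACH against a
 * zeroed operand, the result is the index of the terminator, or 16 if the
 * block contains none.
 *
 *   __m128i chunk = _mm_loadu_si128((__m128i const *)s);
 *   int idx = _mm_cmpistri(_mm_setzero_si128(), chunk,
 *                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
 */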

/* SSE4.2 Compare Packed Data -- Greater Than.  */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
{
  return __builtin_ia32_pcmpgtq((__v2di)__V1, (__v2di)__V2);
}

/* SSE4.2 Accumulate CRC32.  */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
  return __builtin_ia32_crc32qi(__C, __D);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
  return __builtin_ia32_crc32hi(__C, __D);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
  return __builtin_ia32_crc32si(__C, __D);
}

#ifdef __x86_64__
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
  return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
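
/* Usage sketch (illustrative): the CRC32 intrinsics accumulate a CRC-32C
 * (Castagnoli, polynomial 0x1EDC6F41) checksum one chunk at a time.
 *
 *   unsigned int crc = 0xFFFFFFFF;
 *   for (size_t i = 0; i < n; ++i)
 *     crc = _mm_crc32_u8(crc, buf[i]);
 */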

/* SSE4.2 Population Count.  */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_popcnt_u32(unsigned int __A)
{
  return __builtin_popcount(__A);
}

#ifdef __x86_64__
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_popcnt_u64(unsigned long long __A)
{
  return __builtin_popcountll(__A);
}
#endif /* __x86_64__ */

#endif /* __SSE4_2__ */
#endif /* __SSE4_1__ */

#endif /* _SMMINTRIN_H */