smmintrin.h revision 288943
/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H

#ifndef __SSE4_1__
#error "SSE4.1 instruction set not enabled"
#else

#include <tmmintrin.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT    0x00
#define _MM_FROUND_TO_NEG_INF        0x01
#define _MM_FROUND_TO_POS_INF        0x02
#define _MM_FROUND_TO_ZERO           0x03
#define _MM_FROUND_CUR_DIRECTION     0x04

#define _MM_FROUND_RAISE_EXC         0x00
#define _MM_FROUND_NO_EXC            0x08

#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

#define _mm_round_ps(X, M) __extension__ ({ \
  __m128 __X = (X); \
  (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })

#define _mm_round_ss(X, Y, M) __extension__ ({ \
  __m128 __X = (X); \
  __m128 __Y = (Y); \
  (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })

#define _mm_round_pd(X, M) __extension__ ({ \
  __m128d __X = (X); \
  (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })

#define _mm_round_sd(X, Y, M) __extension__ ({ \
  __m128d __X = (X); \
  __m128d __Y = (Y); \
  (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })
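
/* Example (illustrative only, assuming a local __m128 variable named v): the
 * convenience macros above simply expand to _mm_round_* calls with a fixed
 * rounding mode, so the following two expressions are equivalent:
 *
 *   __m128 a = _mm_floor_ps(v);
 *   __m128 b = _mm_round_ps(v, _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
 */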

/* SSE4 Packed Blending Intrinsics. */
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
  __m128d __V1 = (V1); \
  __m128d __V2 = (V2); \
  (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
                                   (((M) & 0x01) ? 2 : 0), \
                                   (((M) & 0x02) ? 3 : 1)); })

#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
  __m128 __V1 = (V1); \
  __m128 __V2 = (V2); \
  (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
                                  (((M) & 0x01) ? 4 : 0), \
                                  (((M) & 0x02) ? 5 : 1), \
                                  (((M) & 0x04) ? 6 : 2), \
                                  (((M) & 0x08) ? 7 : 3)); })

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
{
  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
                                            (__v2df)__M);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
{
  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
                                           (__v4sf)__M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
{
  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
                                               (__v16qi)__M);
}

#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
  __m128i __V1 = (V1); \
  __m128i __V2 = (V2); \
  (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
                                   (((M) & 0x01) ?  8 : 0), \
                                   (((M) & 0x02) ?  9 : 1), \
                                   (((M) & 0x04) ? 10 : 2), \
                                   (((M) & 0x08) ? 11 : 3), \
                                   (((M) & 0x10) ? 12 : 4), \
                                   (((M) & 0x20) ? 13 : 5), \
                                   (((M) & 0x40) ? 14 : 6), \
                                   (((M) & 0x80) ? 15 : 7)); })

/* SSE4 Dword Multiply Instructions. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
}

/* SSE4 Floating Point Dot Product Instructions. */
#define _mm_dp_ps(X, Y, M) __extension__ ({ \
  __m128 __X = (X); \
  __m128 __Y = (Y); \
  (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })

#define _mm_dp_pd(X, Y, M) __extension__ ({\
  __m128d __X = (X); \
  __m128d __Y = (Y); \
  (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })

/* SSE4 Streaming Load Hint Instruction. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_stream_load_si128 (__m128i *__V)
{
  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
}
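
/* Example (illustrative only, assuming local __m128 values a and b): for the
 * _mm_dp_ps macro above, the high four bits of the immediate select which
 * element pairs enter the dot product and the low four bits select which
 * result elements receive the sum (the rest are zeroed).  With 0x71,
 * elements 0-2 are multiplied and summed and only element 0 of the result is
 * written:
 *
 *   __m128 r = _mm_dp_ps(a, b, 0x71);
 *   // r[0] = a[0]*b[0] + a[1]*b[1] + a[2]*b[2];  r[1..3] = 0.0f
 */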

/* SSE4 Packed Integer Min/Max Instructions. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
}

/* SSE4 Insertion and Extraction from XMM Register Instructions. */
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
#define _mm_extract_ps(X, N) (__extension__ \
                      ({ union { int __i; float __f; } __t; \
                         __v4sf __a = (__v4sf)(X); \
                         __t.__f = __a[(N) & 3]; \
                         __t.__i;}))

/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
                                                     (D) = __a[N]; }))

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return. */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index. */
#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                   __a[(N) & 15] = (I); \
                                                   __a;}))
#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                    __a[(N) & 3] = (I); \
                                                    __a;}))
#ifdef __x86_64__
#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                    __a[(N) & 1] = (I); \
                                                    __a;}))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                 (int)(unsigned char) \
                                                     __a[(N) & 15];}))
#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                  __a[(N) & 3];}))
#ifdef __x86_64__
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                  __a[(N) & 1];}))
#endif /* __x86_64__ */
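
/* Example (illustrative only): _MM_MK_INSERTPS_NDX packs the insertps
 * immediate as (source index << 6) | (destination index << 4) | zero mask,
 * so _MM_MK_INSERTPS_NDX(2, 1, 0x8) evaluates to (2 << 6) | (1 << 4) | 0x8 =
 * 0x98, i.e. copy element 2 of the source into element 1 of the destination
 * and zero element 3.  Assuming local __m128 values dst and src:
 *
 *   __m128 r = _mm_insert_ps(dst, src, _MM_MK_INSERTPS_NDX(2, 1, 0x8));
 */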

/* SSE4 128-bit Packed Integer Comparisons. */
static __inline__ int __DEFAULT_FN_ATTRS
_mm_testz_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_testc_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_testnzc_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
}

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))

/* SSE4 64-bit Packed Integer Comparisons. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
{
  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
}

/* SSE4 Packed Integer Sign-Extension. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi16(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi32_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
}

/* SSE4 Packed Integer Zero-Extension. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi16(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu32_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
}

/* SSE4 Pack with Unsigned Saturation. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi32(__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}
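
/* Example (illustrative only, assuming a local __m128i value bytes): the
 * sign- and zero-extension intrinsics above widen the low elements of their
 * argument, so for a vector whose low four bytes are { -1, 2, -3, 4 }:
 *
 *   __m128i s = _mm_cvtepi8_epi32(bytes);  // {  -1, 2,  -3, 4 }
 *   __m128i u = _mm_cvtepu8_epi32(bytes);  // { 255, 2, 253, 4 }
 */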

/* SSE4 Multiple Packed Sums of Absolute Difference. */
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
  __m128i __X = (X); \
  __m128i __Y = (Y); \
  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_minpos_epu16(__m128i __V)
{
  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
}

/* These definitions are normally in nmmintrin.h, but gcc puts them in here
   so we'll do the same. */
#ifdef __SSE4_2__

/* These specify the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS                 0x00
#define _SIDD_UWORD_OPS                 0x01
#define _SIDD_SBYTE_OPS                 0x02
#define _SIDD_SWORD_OPS                 0x03

/* These specify the type of comparison operation. */
#define _SIDD_CMP_EQUAL_ANY             0x00
#define _SIDD_CMP_RANGES                0x04
#define _SIDD_CMP_EQUAL_EACH            0x08
#define _SIDD_CMP_EQUAL_ORDERED         0x0c

/* These macros specify the polarity of the operation. */
#define _SIDD_POSITIVE_POLARITY         0x00
#define _SIDD_NEGATIVE_POLARITY         0x10
#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30

/* These macros are used in _mm_cmpXstri() to specify the return. */
#define _SIDD_LEAST_SIGNIFICANT         0x00
#define _SIDD_MOST_SIGNIFICANT          0x40

/* These macros are used in _mm_cmpXstrm() to specify the return. */
#define _SIDD_BIT_MASK                  0x00
#define _SIDD_UNIT_MASK                 0x40

/* SSE4.2 Packed Comparison Intrinsics. */
#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))

#define _mm_cmpestrm(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
#define _mm_cmpestri(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
#define _mm_cmpistra(A, B, M) \
     __builtin_ia32_pcmpistria128((A), (B), (M))
#define _mm_cmpistrc(A, B, M) \
     __builtin_ia32_pcmpistric128((A), (B), (M))
#define _mm_cmpistro(A, B, M) \
     __builtin_ia32_pcmpistrio128((A), (B), (M))
#define _mm_cmpistrs(A, B, M) \
     __builtin_ia32_pcmpistris128((A), (B), (M))
#define _mm_cmpistrz(A, B, M) \
     __builtin_ia32_pcmpistriz128((A), (B), (M))

#define _mm_cmpestra(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrc(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
#define _mm_cmpestro(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrs(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrz(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))

/* SSE4.2 Compare Packed Data -- Greater Than. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
{
  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
}
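
/* Example (illustrative only, assuming __m128i values set and text holding
 * 16-byte strings): with unsigned-byte, equal-any comparison, _mm_cmpistri
 * returns the index of the first byte of the second operand that matches any
 * byte of the first operand, or 16 if no match is found before a terminating
 * zero byte:
 *
 *   int idx = _mm_cmpistri(set, text,
 *                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
 *                          _SIDD_LEAST_SIGNIFICANT);
 */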

/* SSE4.2 Accumulate CRC32. */
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
  return __builtin_ia32_crc32qi(__C, __D);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
  return __builtin_ia32_crc32hi(__C, __D);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
  return __builtin_ia32_crc32si(__C, __D);
}

#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
  return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */

#undef __DEFAULT_FN_ATTRS

#ifdef __POPCNT__
#include <popcntintrin.h>
#endif

#endif /* __SSE4_2__ */
#endif /* __SSE4_1__ */

#endif /* _SMMINTRIN_H */