avxintrin.h revision 212904
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a+b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a+b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a-b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a-b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_round_pd(__m256d v, const int m)
{
  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_round_ps(__m256 v, const int m)
{
  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
}

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
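/*
 * Usage sketch (illustrative only): computing y[i] = a[i]*x[i] + b[i] four
 * doubles per iteration with the arithmetic intrinsics above.  Assumes
 * 32-byte-aligned buffers and compilation with AVX enabled (e.g. -mavx);
 * the load/store intrinsics used here are defined further down this file.
 *
 *   void axpb(double *y, const double *a, const double *x,
 *             const double *b, int n)
 *   {
 *     int i;
 *     for (i = 0; i + 4 <= n; i += 4) {
 *       __m256d va = _mm256_load_pd(a + i);
 *       __m256d vx = _mm256_load_pd(x + i);
 *       __m256d vb = _mm256_load_pd(b + i);
 *       _mm256_store_pd(y + i, _mm256_add_pd(_mm256_mul_pd(va, vx), vb));
 *     }
 *   }
 */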
/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}
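/*
 * Usage sketch (illustrative): _mm256_andnot_pd computes ~a & b, so masking
 * with a vector of sign bits yields a branch-free absolute value.  Assumes
 * AVX is enabled; _mm256_set1_pd is defined later in this file.
 *
 *   __m256d vabs_pd(__m256d v)
 *   {
 *     return _mm256_andnot_pd(_mm256_set1_pd(-0.0), v);  // clear sign bits
 *   }
 */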
/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permute_pd(__m128d a, const int c)
{
  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute_pd(__m256d a, const int c)
{
  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permute_ps(__m128 a, const int c)
{
  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute_ps(__m256 a, const int c)
{
  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
{
  return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
}

/* Vector Blend */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blend_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blend_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}
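/*
 * Usage sketch (illustrative): each two-bit field of the permute2f128
 * immediate selects a 128-bit half (0-1 from a, 2-3 from b), so 0x01 swaps
 * the halves of a single vector.  Combined with hadd_pd this reduces four
 * doubles to one; the final lane read uses the same vector-subscript
 * extension the extract helpers below rely on.
 *
 *   double hsum_pd(__m256d v)
 *   {
 *     __m256d t = _mm256_hadd_pd(v, v);  // { v0+v1, v0+v1, v2+v3, v2+v3 }
 *     __m256d s = _mm256_add_pd(t, _mm256_permute2f128_pd(t, t, 0x01));
 *     return ((__v4df)s)[0];             // every lane now holds the sum
 *   }
 */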
/* Vector Dot Product */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_dp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c);
}

/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v8sf)(a), (__v8sf)(b), \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12))

#define _mm256_shuffle_pd(a, b, mask) \
        (__builtin_shufflevector((__v4df)(a), (__v4df)(b), \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6))
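/*
 * Usage sketch (illustrative): unlike the 128-bit _mm_shuffle_ps, the AVX
 * form applies the same four-element pattern within each 128-bit lane, so
 * elements never cross lanes:
 *
 *   __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
 *   __m256 b = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
 *   __m256 r = _mm256_shuffle_ps(a, b, 0x44);
 *   // r = { 0, 1, 8, 9, 4, 5, 12, 13 }
 */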
/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmp_pd(__m128d a, __m128d b, const int c)
{
  return (__m128d)__builtin_ia32_cmppd((__v2df)a, (__v2df)b, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmp_ps(__m128 a, __m128 b, const int c)
{
  return (__m128)__builtin_ia32_cmpps((__v4sf)a, (__v4sf)b, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cmp_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_cmppd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cmp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_cmpps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmp_sd(__m128d a, __m128d b, const int c)
{
  return (__m128d)__builtin_ia32_cmpsd((__v2df)a, (__v2df)b, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmp_ss(__m128 a, __m128 b, const int c)
{
  return (__m128)__builtin_ia32_cmpss((__v4sf)a, (__v4sf)b, c);
}
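/*
 * Usage sketch (illustrative): the cmp intrinsics produce all-ones or
 * all-zero lanes, which blendv above consumes directly as a select mask.
 * A branch-free lane-wise maximum (what _mm256_max_pd does in a single
 * instruction):
 *
 *   __m256d select_max(__m256d a, __m256d b)
 *   {
 *     __m256d mask = _mm256_cmp_pd(b, a, _CMP_GT_OS);
 *     return _mm256_blendv_pd(a, b, mask);   // b where b > a, else a
 *   }
 */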
/* Vector extract */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_pd(__m256d a, const int o)
{
  return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_ps(__m256 a, const int o)
{
  return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_si256(__m256i a, const int o)
{
  return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

/* Vector insert */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
{
  return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
{
  return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
{
  return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}
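/*
 * Usage sketch (illustrative): _mm256_cvtps_epi32 rounds according to the
 * current MXCSR mode (round-to-nearest-even by default), while the "tt"
 * variant always truncates toward zero:
 *
 *   __m256  v = _mm256_set1_ps(2.7f);
 *   __m256i r = _mm256_cvtps_epi32(v);    // 3 in every lane
 *   __m256i t = _mm256_cvttps_epi32(v);   // 2 in every lane
 */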
/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}
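/*
 * Usage sketch (illustrative): testz reports whether the sign bits of
 * (a & b) are all zero, and movemask packs the lane sign bits into an int,
 * so either turns a vector compare into a scalar branch.  `x' and `limit'
 * are hypothetical vectors:
 *
 *   __m256d lt = _mm256_cmp_pd(x, limit, _CMP_LT_OS);
 *   if (_mm256_movemask_pd(lt) == 0xf) {
 *     // all four lanes of x are below limit
 *   }
 *   if (!_mm256_testz_pd(lt, lt)) {
 *     // at least one lane compared true
 *   }
 */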
/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}
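/*
 * Usage sketch (illustrative): the plain load/store forms compile to
 * aligned moves and fault on addresses that are not 32-byte aligned; the
 * "u" forms accept any address, at some cost on older parts:
 *
 *   void copy8_floats(float *dst, const float *src)
 *   {
 *     _mm256_storeu_ps(dst, _mm256_loadu_ps(src));   // no alignment needed
 *   }
 */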
/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
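/*
 * Usage sketch (illustrative): maskload/maskstore touch only the lanes
 * whose mask element has its sign bit set; masked-off loads read as zero
 * and masked-off stores leave memory untouched.  Handling a loop tail of
 * n < 4 doubles at a hypothetical address p (the mask parameter here is
 * typed __m256d, so an integer mask is reinterpreted via the casts below):
 *
 *   __m256d m = _mm256_castsi256_pd(_mm256_setr_epi64x(
 *       n > 0 ? -1 : 0, n > 1 ? -1 : 0, n > 2 ? -1 : 0, 0));
 *   __m256d v = _mm256_maskload_pd(p, m);
 */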
/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}
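/*
 * Usage sketch (illustrative): the set* constructors take arguments from
 * the highest element down to the lowest, while the setr* ("reversed")
 * constructors take them in memory order, so these two calls build the
 * same vector:
 *
 *   __m256i a = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
 *   __m256i b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
 *   // in both, element 0 is 0 and element 7 is 7
 */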
/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}
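/*
 * Usage sketch (illustrative, for a hypothetical __m256 v): the same-width
 * casts only reinterpret bits and generate no instructions; the 128-to-256
 * casts in this implementation zero the upper half via the shuffle with a
 * zero vector above:
 *
 *   __m128 lo = _mm256_castps256_ps128(v);    // low four floats of v
 *   __m256 w  = _mm256_castps128_ps256(lo);   // upper four lanes zeroed
 */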