/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a+b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a+b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a-b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a-b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}
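/* Usage sketch (editorial addition, not part of the original header): the
 * arithmetic intrinsics above operate element-wise on whole 256-bit vectors.
 * The values below are illustrative only; assumes compilation with AVX
 * enabled (e.g. -mavx) and inclusion via <immintrin.h>.
 *
 *   __m256d x = _mm256_set1_pd(3.0);            // { 3, 3, 3, 3 }
 *   __m256d y = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
 *   __m256d s = _mm256_add_pd(x, y);            // { 4, 5, 6, 7 }
 *   __m256d m = _mm256_max_pd(x, y);            // { 3, 3, 3, 4 }
 */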
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

#define _mm256_round_pd(V, M) __extension__ ({ \
  __m256d __V = (V); \
  (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
  __m256 __V = (V); \
  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}
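/* Usage sketch (editorial addition, not part of the original header): note
 * that andnot complements its FIRST operand, so when clearing the sign bits
 * of a vector v (an assumed variable here), the mask goes first.
 * Illustrative only.
 *
 *   __m256 sign = _mm256_set1_ps(-0.0f);      // only the sign bit set
 *   __m256 absv = _mm256_andnot_ps(sign, v);  // fabsf of each element
 *   __m256 negv = _mm256_xor_ps(sign, v);     // negate each element
 */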
/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

#define _mm_permute_pd(A, C) __extension__ ({ \
  __m128d __A = (A); \
  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1); })

#define _mm256_permute_pd(A, C) __extension__ ({ \
  __m256d __A = (A); \
  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1, \
                                   2 + (((C) & 0x4) >> 2), \
                                   2 + (((C) & 0x8) >> 3)); })

#define _mm_permute_ps(A, C) __extension__ ({ \
  __m128 __A = (A); \
  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
                                  (C) & 0x3, ((C) & 0xc) >> 2, \
                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })

#define _mm256_permute_ps(A, C) __extension__ ({ \
  __m256 __A = (A); \
  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
                                  (C) & 0x3, ((C) & 0xc) >> 2, \
                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
                                  4 + (((C) & 0x03) >> 0), \
                                  4 + (((C) & 0x0c) >> 2), \
                                  4 + (((C) & 0x30) >> 4), \
                                  4 + (((C) & 0xc0) >> 6)); })

#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
                                   ((M) & 0x3) * 2, \
                                   ((M) & 0x3) * 2 + 1, \
                                   (((M) & 0x30) >> 4) * 2, \
                                   (((M) & 0x30) >> 4) * 2 + 1); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
                                  ((M) & 0x3) * 4, \
                                  ((M) & 0x3) * 4 + 1, \
                                  ((M) & 0x3) * 4 + 2, \
                                  ((M) & 0x3) * 4 + 3, \
                                  (((M) & 0x30) >> 4) * 4, \
                                  (((M) & 0x30) >> 4) * 4 + 1, \
                                  (((M) & 0x30) >> 4) * 4 + 2, \
                                  (((M) & 0x30) >> 4) * 4 + 3); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m256i __V2 = (V2); \
  (__m256i)__builtin_shufflevector((__v8si)__V1, (__v8si)__V2, \
                                   ((M) & 0x3) * 4, \
                                   ((M) & 0x3) * 4 + 1, \
                                   ((M) & 0x3) * 4 + 2, \
                                   ((M) & 0x3) * 4 + 3, \
                                   (((M) & 0x30) >> 4) * 4, \
                                   (((M) & 0x30) >> 4) * 4 + 1, \
                                   (((M) & 0x30) >> 4) * 4 + 2, \
                                   (((M) & 0x30) >> 4) * 4 + 3); })
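/* Usage sketch (editorial addition, not part of the original header): in
 * permute2f128, the low two bits of M pick the 128-bit lane for the result's
 * low half and bits 4-5 pick the lane for the high half (lanes 0/1 come from
 * V1, lanes 2/3 from V2). Note this shufflevector emulation ignores the
 * instruction's zeroing bits (bits 3 and 7 of the immediate).
 * Illustrative only.
 *
 *   __m256d swapped = _mm256_permute2f128_pd(x, x, 0x01); // swap halves of x
 *   __m256d lows    = _mm256_permute2f128_pd(x, y, 0x20); // { lo(x), lo(y) }
 */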
/* Vector Blend */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

/* Vector Dot Product */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                  (((mask) & 0x30) >> 4) + 8, \
                                  (((mask) & 0xc0) >> 6) + 8, \
                                  ((mask) & 0x3) + 4, \
                                  (((mask) & 0xc) >> 2) + 4, \
                                  (((mask) & 0x30) >> 4) + 12, \
                                  (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
                                   (mask) & 0x1, \
                                   (((mask) & 0x2) >> 1) + 4, \
                                   (((mask) & 0x4) >> 2) + 2, \
                                   (((mask) & 0x8) >> 3) + 6); })

/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
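/* Usage sketch (editorial addition, not part of the original header): a
 * compare produces an all-ones/all-zeros mask per element, which can drive a
 * variable blend or a movemask. blendv picks its SECOND value operand where
 * the mask is set. Illustrative only.
 *
 *   __m256d mask = _mm256_cmp_pd(x, y, _CMP_LT_OQ);  // x[i] < y[i] ?
 *   __m256d mind = _mm256_blendv_pd(y, x, mask);     // per-element minimum
 *   int bits     = _mm256_movemask_pd(mask);         // 4-bit summary
 */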
/* Vector extract */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
  __m256d __A = (A); \
  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })

#define _mm256_extractf128_ps(A, O) __extension__ ({ \
  __m256 __A = (A); \
  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })

#define _mm256_extractf128_si256(A, O) __extension__ ({ \
  __m256i __A = (A); \
  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

/* Vector insert */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m128d __V2 = (V2); \
  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })

#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m128 __V2 = (V2); \
  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })

#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m128i __V2 = (V2); \
  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
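/* Usage sketch (editorial addition, not part of the original header):
 * extractf128/insertf128 move one 128-bit half of a 256-bit vector, the
 * usual way to mix AVX code with existing SSE code. Illustrative only.
 *
 *   __m128 lo = _mm256_castps256_ps128(v);       // low half, no instruction
 *   __m128 hi = _mm256_extractf128_ps(v, 1);     // high half
 *   __m256 w  = _mm256_insertf128_ps(v, hi, 0);  // hi copied into low half
 */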
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
/* Note: the inserted value is a full 64-bit quantity; taking it as int
 * would silently truncate the upper 32 bits. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}

/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}
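/* Usage sketch (editorial addition, not part of the original header): the
 * 256-bit unpacks interleave WITHIN each 128-bit lane, not across the whole
 * register, mirroring their SSE counterparts lane by lane.
 * Illustrative only.
 *
 *   __m256d a  = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);  // { 0, 1, 2, 3 }
 *   __m256d b  = _mm256_set_pd(7.0, 6.0, 5.0, 4.0);  // { 4, 5, 6, 7 }
 *   __m256d lo = _mm256_unpacklo_pd(a, b);           // { 0, 4, 2, 6 }
 *   __m256d hi = _mm256_unpackhi_pd(a, b);           // { 1, 5, 3, 7 }
 */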
/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}
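/* Usage sketch (editorial addition, not part of the original header): the
 * pd/ps test forms examine only the elements' sign bits; testz returns 1
 * when every sign bit of (a & b) is clear, so testing a compare mask
 * against itself answers "did any element match?". Illustrative only.
 *
 *   __m256d m = _mm256_cmp_pd(x, y, _CMP_EQ_OQ);
 *   if (!_mm256_testz_pd(m, m)) {
 *     // at least one element of x equals the corresponding element of y
 *   }
 */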
/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  struct __loadu_pd {
    __m256d v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd*)p)->v;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  struct __loadu_ps {
    __m256 v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_ps*)p)->v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  struct __loadu_si256 {
    __m256i v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_si256*)p)->v;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}
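/* Usage sketch (editorial addition, not part of the original header): the
 * load/store forms require 32-byte alignment, while loadu/storeu accept any
 * address. 'out' is an assumed destination pointer. Illustrative only.
 *
 *   double buf[4] __attribute__((aligned(32))) = { 1.0, 2.0, 3.0, 4.0 };
 *   __m256d v = _mm256_load_pd(buf);   // pointer must be 32-byte aligned
 *   _mm256_storeu_pd(out, v);          // 'out' may be unaligned
 */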
/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
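/* Usage sketch (editorial addition, not part of the original header): the
 * mask vector gates each element by its sign bit; masked-off elements load
 * as zero, and masked-off stores leave memory untouched, so a partial tail
 * of an array can be handled without touching memory past the end.
 * 'p' is an assumed pointer. Illustrative only.
 *
 *   __m256d m = _mm256_castsi256_pd(
 *       _mm256_setr_epi64x(-1, -1, -1, 0));   // enable elements 0..2
 *   __m256d v = _mm256_maskload_pd(p, m);     // v[3] loads as 0.0
 *   _mm256_maskstore_pd(p, m, v);             // p[3] is not written
 */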
/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}
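/* Usage sketch (editorial addition, not part of the original header): set
 * takes its arguments from the highest element down to element 0, while
 * setr takes them in memory order; these two calls build the same vector.
 * Illustrative only.
 *
 *   __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 *   __m256i b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
 */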
/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}
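/* Usage sketch (editorial addition, not part of the original header): the
 * same-width casts are bit reinterpretations and compile to no code; the
 * 256->128 casts keep the low half, and this implementation of the 128->256
 * casts fills the upper half with zeros. Illustrative only.
 *
 *   __m256i bits = _mm256_castpd_si256(v);       // same 256 bits, new type
 *   __m128d low  = _mm256_castpd256_pd128(v);    // low 128 bits of v
 *   __m256d wide = _mm256_castpd128_pd256(low);  // low half of v, upper = 0
 */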
/* SIMD load ops (unaligned) */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128(float const *addr_hi, float const *addr_lo)
{
  struct __loadu_ps {
    __m128 v;
  } __attribute__((__packed__, __may_alias__));

  __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v);
  return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128d(double const *addr_hi, double const *addr_lo)
{
  struct __loadu_pd {
    __m128d v;
  } __attribute__((__packed__, __may_alias__));

  __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v);
  return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128i(__m128i const *addr_hi, __m128i const *addr_lo)
{
  struct __loadu_si128 {
    __m128i v;
  } __attribute__((packed, may_alias));
  __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v);
  return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1);
}

/* SIMD store ops (unaligned) */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128(float *addr_hi, float *addr_lo, __m256 a)
{
  __m128 v128;

  v128 = _mm256_castps256_ps128(a);
  __builtin_ia32_storeups(addr_lo, v128);
  v128 = _mm256_extractf128_ps(a, 1);
  __builtin_ia32_storeups(addr_hi, v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128d(double *addr_hi, double *addr_lo, __m256d a)
{
  __m128d v128;

  v128 = _mm256_castpd256_pd128(a);
  __builtin_ia32_storeupd(addr_lo, v128);
  v128 = _mm256_extractf128_pd(a, 1);
  __builtin_ia32_storeupd(addr_hi, v128);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128i(__m128i *addr_hi, __m128i *addr_lo, __m256i a)
{
  __m128i v128;

  v128 = _mm256_castsi256_si128(a);
  __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128);
  v128 = _mm256_extractf128_si256(a, 1);
  __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128);
}
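/* Usage sketch (editorial addition, not part of the original header):
 * loadu2/storeu2 gather and scatter the two 128-bit halves of a 256-bit
 * vector from/to two unrelated, possibly unaligned addresses; note that the
 * high-half pointer is the FIRST argument. 'hi_ptr' and 'lo_ptr' are
 * assumed pointers. Illustrative only.
 *
 *   __m256d v = _mm256_loadu2_m128d(hi_ptr, lo_ptr); // { lo[0..1], hi[0..1] }
 *   _mm256_storeu2_m128d(hi_ptr, lo_ptr, v);         // writes both halves back
 */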