emmintrin.h revision 252723
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#ifndef __SSE2__ 28#error "SSE2 instruction set not enabled" 29#else 30 31#include <xmmintrin.h> 32 33typedef double __m128d __attribute__((__vector_size__(16))); 34typedef long long __m128i __attribute__((__vector_size__(16))); 35 36/* Type defines. */ 37typedef double __v2df __attribute__ ((__vector_size__ (16))); 38typedef long long __v2di __attribute__ ((__vector_size__ (16))); 39typedef short __v8hi __attribute__((__vector_size__(16))); 40typedef char __v16qi __attribute__((__vector_size__(16))); 41 42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 43_mm_add_sd(__m128d __a, __m128d __b) 44{ 45 __a[0] += __b[0]; 46 return __a; 47} 48 49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 50_mm_add_pd(__m128d __a, __m128d __b) 51{ 52 return __a + __b; 53} 54 55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 56_mm_sub_sd(__m128d __a, __m128d __b) 57{ 58 __a[0] -= __b[0]; 59 return __a; 60} 61 62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 63_mm_sub_pd(__m128d __a, __m128d __b) 64{ 65 return __a - __b; 66} 67 68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 69_mm_mul_sd(__m128d __a, __m128d __b) 70{ 71 __a[0] *= __b[0]; 72 return __a; 73} 74 75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 76_mm_mul_pd(__m128d __a, __m128d __b) 77{ 78 return __a * __b; 79} 80 81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 82_mm_div_sd(__m128d __a, __m128d __b) 83{ 84 __a[0] /= __b[0]; 85 return __a; 86} 87 88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 89_mm_div_pd(__m128d __a, __m128d __b) 90{ 91 return __a / __b; 92} 93 94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 95_mm_sqrt_sd(__m128d __a, __m128d __b) 96{ 97 __m128d __c = __builtin_ia32_sqrtsd(__b); 98 return (__m128d) { __c[0], __a[1] }; 99} 100 101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 102_mm_sqrt_pd(__m128d __a) 103{ 104 return __builtin_ia32_sqrtpd(__a); 105} 106 107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 108_mm_min_sd(__m128d __a, __m128d __b) 109{ 110 return __builtin_ia32_minsd(__a, __b); 111} 112 113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 114_mm_min_pd(__m128d __a, __m128d __b) 115{ 116 return __builtin_ia32_minpd(__a, __b); 117} 118 119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 120_mm_max_sd(__m128d __a, __m128d __b) 121{ 122 return __builtin_ia32_maxsd(__a, __b); 123} 124 125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 126_mm_max_pd(__m128d __a, __m128d __b) 127{ 128 return __builtin_ia32_maxpd(__a, __b); 129} 130 131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 132_mm_and_pd(__m128d __a, __m128d __b) 133{ 134 return (__m128d)((__v4si)__a & (__v4si)__b); 135} 136 137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 138_mm_andnot_pd(__m128d __a, __m128d __b) 139{ 140 return (__m128d)(~(__v4si)__a & (__v4si)__b); 141} 142 143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 144_mm_or_pd(__m128d __a, __m128d __b) 145{ 146 return (__m128d)((__v4si)__a | (__v4si)__b); 147} 148 149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 150_mm_xor_pd(__m128d __a, __m128d __b) 151{ 152 return (__m128d)((__v4si)__a ^ (__v4si)__b); 153} 154 155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 156_mm_cmpeq_pd(__m128d __a, __m128d __b) 157{ 158 return (__m128d)__builtin_ia32_cmppd(__a, __b, 0); 159} 160 161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 162_mm_cmplt_pd(__m128d __a, __m128d __b) 163{ 164 return (__m128d)__builtin_ia32_cmppd(__a, __b, 1); 165} 166 167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 168_mm_cmple_pd(__m128d __a, __m128d __b) 169{ 170 return (__m128d)__builtin_ia32_cmppd(__a, __b, 2); 171} 172 173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 174_mm_cmpgt_pd(__m128d __a, __m128d __b) 175{ 176 return (__m128d)__builtin_ia32_cmppd(__b, __a, 1); 177} 178 179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 180_mm_cmpge_pd(__m128d __a, __m128d __b) 181{ 182 return (__m128d)__builtin_ia32_cmppd(__b, __a, 2); 183} 184 185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 186_mm_cmpord_pd(__m128d __a, __m128d __b) 187{ 188 return (__m128d)__builtin_ia32_cmppd(__a, __b, 7); 189} 190 191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 192_mm_cmpunord_pd(__m128d __a, __m128d __b) 193{ 194 return (__m128d)__builtin_ia32_cmppd(__a, __b, 3); 195} 196 197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 198_mm_cmpneq_pd(__m128d __a, __m128d __b) 199{ 200 return (__m128d)__builtin_ia32_cmppd(__a, __b, 4); 201} 202 203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 204_mm_cmpnlt_pd(__m128d __a, __m128d __b) 205{ 206 return (__m128d)__builtin_ia32_cmppd(__a, __b, 5); 207} 208 209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 210_mm_cmpnle_pd(__m128d __a, __m128d __b) 211{ 212 return (__m128d)__builtin_ia32_cmppd(__a, __b, 6); 213} 214 215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 216_mm_cmpngt_pd(__m128d __a, __m128d __b) 217{ 218 return (__m128d)__builtin_ia32_cmppd(__b, __a, 5); 219} 220 221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 222_mm_cmpnge_pd(__m128d __a, __m128d __b) 223{ 224 return (__m128d)__builtin_ia32_cmppd(__b, __a, 6); 225} 226 227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 228_mm_cmpeq_sd(__m128d __a, __m128d __b) 229{ 230 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0); 231} 232 233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 234_mm_cmplt_sd(__m128d __a, __m128d __b) 235{ 236 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1); 237} 238 239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 240_mm_cmple_sd(__m128d __a, __m128d __b) 241{ 242 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2); 243} 244 245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 246_mm_cmpgt_sd(__m128d __a, __m128d __b) 247{ 248 return (__m128d)__builtin_ia32_cmpsd(__b, __a, 1); 249} 250 251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 252_mm_cmpge_sd(__m128d __a, __m128d __b) 253{ 254 return (__m128d)__builtin_ia32_cmpsd(__b, __a, 2); 255} 256 257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 258_mm_cmpord_sd(__m128d __a, __m128d __b) 259{ 260 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7); 261} 262 263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 264_mm_cmpunord_sd(__m128d __a, __m128d __b) 265{ 266 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3); 267} 268 269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 270_mm_cmpneq_sd(__m128d __a, __m128d __b) 271{ 272 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4); 273} 274 275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 276_mm_cmpnlt_sd(__m128d __a, __m128d __b) 277{ 278 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5); 279} 280 281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 282_mm_cmpnle_sd(__m128d __a, __m128d __b) 283{ 284 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6); 285} 286 287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 288_mm_cmpngt_sd(__m128d __a, __m128d __b) 289{ 290 return (__m128d)__builtin_ia32_cmpsd(__b, __a, 5); 291} 292 293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 294_mm_cmpnge_sd(__m128d __a, __m128d __b) 295{ 296 return (__m128d)__builtin_ia32_cmpsd(__b, __a, 6); 297} 298 299static __inline__ int __attribute__((__always_inline__, __nodebug__)) 300_mm_comieq_sd(__m128d __a, __m128d __b) 301{ 302 return __builtin_ia32_comisdeq(__a, __b); 303} 304 305static __inline__ int __attribute__((__always_inline__, __nodebug__)) 306_mm_comilt_sd(__m128d __a, __m128d __b) 307{ 308 return __builtin_ia32_comisdlt(__a, __b); 309} 310 311static __inline__ int __attribute__((__always_inline__, __nodebug__)) 312_mm_comile_sd(__m128d __a, __m128d __b) 313{ 314 return __builtin_ia32_comisdle(__a, __b); 315} 316 317static __inline__ int __attribute__((__always_inline__, __nodebug__)) 318_mm_comigt_sd(__m128d __a, __m128d __b) 319{ 320 return __builtin_ia32_comisdgt(__a, __b); 321} 322 323static __inline__ int __attribute__((__always_inline__, __nodebug__)) 324_mm_comige_sd(__m128d __a, __m128d __b) 325{ 326 return __builtin_ia32_comisdge(__a, __b); 327} 328 329static __inline__ int __attribute__((__always_inline__, __nodebug__)) 330_mm_comineq_sd(__m128d __a, __m128d __b) 331{ 332 return __builtin_ia32_comisdneq(__a, __b); 333} 334 335static __inline__ int __attribute__((__always_inline__, __nodebug__)) 336_mm_ucomieq_sd(__m128d __a, __m128d __b) 337{ 338 return __builtin_ia32_ucomisdeq(__a, __b); 339} 340 341static __inline__ int __attribute__((__always_inline__, __nodebug__)) 342_mm_ucomilt_sd(__m128d __a, __m128d __b) 343{ 344 return __builtin_ia32_ucomisdlt(__a, __b); 345} 346 347static __inline__ int __attribute__((__always_inline__, __nodebug__)) 348_mm_ucomile_sd(__m128d __a, __m128d __b) 349{ 350 return __builtin_ia32_ucomisdle(__a, __b); 351} 352 353static __inline__ int __attribute__((__always_inline__, __nodebug__)) 354_mm_ucomigt_sd(__m128d __a, __m128d __b) 355{ 356 return __builtin_ia32_ucomisdgt(__a, __b); 357} 358 359static __inline__ int __attribute__((__always_inline__, __nodebug__)) 360_mm_ucomige_sd(__m128d __a, __m128d __b) 361{ 362 return __builtin_ia32_ucomisdge(__a, __b); 363} 364 365static __inline__ int __attribute__((__always_inline__, __nodebug__)) 366_mm_ucomineq_sd(__m128d __a, __m128d __b) 367{ 368 return __builtin_ia32_ucomisdneq(__a, __b); 369} 370 371static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 372_mm_cvtpd_ps(__m128d __a) 373{ 374 return __builtin_ia32_cvtpd2ps(__a); 375} 376 377static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 378_mm_cvtps_pd(__m128 __a) 379{ 380 return __builtin_ia32_cvtps2pd(__a); 381} 382 383static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 384_mm_cvtepi32_pd(__m128i __a) 385{ 386 return __builtin_ia32_cvtdq2pd((__v4si)__a); 387} 388 389static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 390_mm_cvtpd_epi32(__m128d __a) 391{ 392 return __builtin_ia32_cvtpd2dq(__a); 393} 394 395static __inline__ int __attribute__((__always_inline__, __nodebug__)) 396_mm_cvtsd_si32(__m128d __a) 397{ 398 return __builtin_ia32_cvtsd2si(__a); 399} 400 401static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 402_mm_cvtsd_ss(__m128 __a, __m128d __b) 403{ 404 __a[0] = __b[0]; 405 return __a; 406} 407 408static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 409_mm_cvtsi32_sd(__m128d __a, int __b) 410{ 411 __a[0] = __b; 412 return __a; 413} 414 415static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 416_mm_cvtss_sd(__m128d __a, __m128 __b) 417{ 418 __a[0] = __b[0]; 419 return __a; 420} 421 422static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 423_mm_cvttpd_epi32(__m128d __a) 424{ 425 return (__m128i)__builtin_ia32_cvttpd2dq(__a); 426} 427 428static __inline__ int __attribute__((__always_inline__, __nodebug__)) 429_mm_cvttsd_si32(__m128d __a) 430{ 431 return __a[0]; 432} 433 434static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 435_mm_cvtpd_pi32(__m128d __a) 436{ 437 return (__m64)__builtin_ia32_cvtpd2pi(__a); 438} 439 440static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 441_mm_cvttpd_pi32(__m128d __a) 442{ 443 return (__m64)__builtin_ia32_cvttpd2pi(__a); 444} 445 446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 447_mm_cvtpi32_pd(__m64 __a) 448{ 449 return __builtin_ia32_cvtpi2pd((__v2si)__a); 450} 451 452static __inline__ double __attribute__((__always_inline__, __nodebug__)) 453_mm_cvtsd_f64(__m128d __a) 454{ 455 return __a[0]; 456} 457 458static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 459_mm_load_pd(double const *__dp) 460{ 461 return *(__m128d*)__dp; 462} 463 464static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 465_mm_load1_pd(double const *__dp) 466{ 467 struct __mm_load1_pd_struct { 468 double __u; 469 } __attribute__((__packed__, __may_alias__)); 470 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 471 return (__m128d){ __u, __u }; 472} 473 474#define _mm_load_pd1(dp) _mm_load1_pd(dp) 475 476static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 477_mm_loadr_pd(double const *__dp) 478{ 479 __m128d __u = *(__m128d*)__dp; 480 return __builtin_shufflevector(__u, __u, 1, 0); 481} 482 483static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 484_mm_loadu_pd(double const *__dp) 485{ 486 struct __loadu_pd { 487 __m128d __v; 488 } __attribute__((packed, may_alias)); 489 return ((struct __loadu_pd*)__dp)->__v; 490} 491 492static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 493_mm_load_sd(double const *__dp) 494{ 495 struct __mm_load_sd_struct { 496 double __u; 497 } __attribute__((__packed__, __may_alias__)); 498 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 499 return (__m128d){ __u, 0 }; 500} 501 502static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 503_mm_loadh_pd(__m128d __a, double const *__dp) 504{ 505 struct __mm_loadh_pd_struct { 506 double __u; 507 } __attribute__((__packed__, __may_alias__)); 508 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 509 return (__m128d){ __a[0], __u }; 510} 511 512static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 513_mm_loadl_pd(__m128d __a, double const *__dp) 514{ 515 struct __mm_loadl_pd_struct { 516 double __u; 517 } __attribute__((__packed__, __may_alias__)); 518 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 519 return (__m128d){ __u, __a[1] }; 520} 521 522static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 523_mm_set_sd(double __w) 524{ 525 return (__m128d){ __w, 0 }; 526} 527 528static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 529_mm_set1_pd(double __w) 530{ 531 return (__m128d){ __w, __w }; 532} 533 534static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 535_mm_set_pd(double __w, double __x) 536{ 537 return (__m128d){ __x, __w }; 538} 539 540static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 541_mm_setr_pd(double __w, double __x) 542{ 543 return (__m128d){ __w, __x }; 544} 545 546static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 547_mm_setzero_pd(void) 548{ 549 return (__m128d){ 0, 0 }; 550} 551 552static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 553_mm_move_sd(__m128d __a, __m128d __b) 554{ 555 return (__m128d){ __b[0], __a[1] }; 556} 557 558static __inline__ void __attribute__((__always_inline__, __nodebug__)) 559_mm_store_sd(double *__dp, __m128d __a) 560{ 561 struct __mm_store_sd_struct { 562 double __u; 563 } __attribute__((__packed__, __may_alias__)); 564 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 565} 566 567static __inline__ void __attribute__((__always_inline__, __nodebug__)) 568_mm_store1_pd(double *__dp, __m128d __a) 569{ 570 struct __mm_store1_pd_struct { 571 double __u[2]; 572 } __attribute__((__packed__, __may_alias__)); 573 ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0]; 574 ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0]; 575} 576 577static __inline__ void __attribute__((__always_inline__, __nodebug__)) 578_mm_store_pd(double *__dp, __m128d __a) 579{ 580 *(__m128d *)__dp = __a; 581} 582 583static __inline__ void __attribute__((__always_inline__, __nodebug__)) 584_mm_storeu_pd(double *__dp, __m128d __a) 585{ 586 __builtin_ia32_storeupd(__dp, __a); 587} 588 589static __inline__ void __attribute__((__always_inline__, __nodebug__)) 590_mm_storer_pd(double *__dp, __m128d __a) 591{ 592 __a = __builtin_shufflevector(__a, __a, 1, 0); 593 *(__m128d *)__dp = __a; 594} 595 596static __inline__ void __attribute__((__always_inline__, __nodebug__)) 597_mm_storeh_pd(double *__dp, __m128d __a) 598{ 599 struct __mm_storeh_pd_struct { 600 double __u; 601 } __attribute__((__packed__, __may_alias__)); 602 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 603} 604 605static __inline__ void __attribute__((__always_inline__, __nodebug__)) 606_mm_storel_pd(double *__dp, __m128d __a) 607{ 608 struct __mm_storeh_pd_struct { 609 double __u; 610 } __attribute__((__packed__, __may_alias__)); 611 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 612} 613 614static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 615_mm_add_epi8(__m128i __a, __m128i __b) 616{ 617 return (__m128i)((__v16qi)__a + (__v16qi)__b); 618} 619 620static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 621_mm_add_epi16(__m128i __a, __m128i __b) 622{ 623 return (__m128i)((__v8hi)__a + (__v8hi)__b); 624} 625 626static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 627_mm_add_epi32(__m128i __a, __m128i __b) 628{ 629 return (__m128i)((__v4si)__a + (__v4si)__b); 630} 631 632static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 633_mm_add_si64(__m64 __a, __m64 __b) 634{ 635 return __a + __b; 636} 637 638static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 639_mm_add_epi64(__m128i __a, __m128i __b) 640{ 641 return __a + __b; 642} 643 644static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 645_mm_adds_epi8(__m128i __a, __m128i __b) 646{ 647 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 648} 649 650static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 651_mm_adds_epi16(__m128i __a, __m128i __b) 652{ 653 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 654} 655 656static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 657_mm_adds_epu8(__m128i __a, __m128i __b) 658{ 659 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 660} 661 662static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 663_mm_adds_epu16(__m128i __a, __m128i __b) 664{ 665 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 666} 667 668static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 669_mm_avg_epu8(__m128i __a, __m128i __b) 670{ 671 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 672} 673 674static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 675_mm_avg_epu16(__m128i __a, __m128i __b) 676{ 677 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 678} 679 680static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 681_mm_madd_epi16(__m128i __a, __m128i __b) 682{ 683 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 684} 685 686static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 687_mm_max_epi16(__m128i __a, __m128i __b) 688{ 689 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 690} 691 692static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 693_mm_max_epu8(__m128i __a, __m128i __b) 694{ 695 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 696} 697 698static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 699_mm_min_epi16(__m128i __a, __m128i __b) 700{ 701 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 702} 703 704static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 705_mm_min_epu8(__m128i __a, __m128i __b) 706{ 707 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 708} 709 710static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 711_mm_mulhi_epi16(__m128i __a, __m128i __b) 712{ 713 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 714} 715 716static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 717_mm_mulhi_epu16(__m128i __a, __m128i __b) 718{ 719 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 720} 721 722static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 723_mm_mullo_epi16(__m128i __a, __m128i __b) 724{ 725 return (__m128i)((__v8hi)__a * (__v8hi)__b); 726} 727 728static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 729_mm_mul_su32(__m64 __a, __m64 __b) 730{ 731 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 732} 733 734static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 735_mm_mul_epu32(__m128i __a, __m128i __b) 736{ 737 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 738} 739 740static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 741_mm_sad_epu8(__m128i __a, __m128i __b) 742{ 743 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 744} 745 746static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 747_mm_sub_epi8(__m128i __a, __m128i __b) 748{ 749 return (__m128i)((__v16qi)__a - (__v16qi)__b); 750} 751 752static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 753_mm_sub_epi16(__m128i __a, __m128i __b) 754{ 755 return (__m128i)((__v8hi)__a - (__v8hi)__b); 756} 757 758static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 759_mm_sub_epi32(__m128i __a, __m128i __b) 760{ 761 return (__m128i)((__v4si)__a - (__v4si)__b); 762} 763 764static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 765_mm_sub_si64(__m64 __a, __m64 __b) 766{ 767 return __a - __b; 768} 769 770static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 771_mm_sub_epi64(__m128i __a, __m128i __b) 772{ 773 return __a - __b; 774} 775 776static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 777_mm_subs_epi8(__m128i __a, __m128i __b) 778{ 779 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 780} 781 782static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 783_mm_subs_epi16(__m128i __a, __m128i __b) 784{ 785 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 786} 787 788static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 789_mm_subs_epu8(__m128i __a, __m128i __b) 790{ 791 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 792} 793 794static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 795_mm_subs_epu16(__m128i __a, __m128i __b) 796{ 797 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 798} 799 800static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 801_mm_and_si128(__m128i __a, __m128i __b) 802{ 803 return __a & __b; 804} 805 806static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 807_mm_andnot_si128(__m128i __a, __m128i __b) 808{ 809 return ~__a & __b; 810} 811 812static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 813_mm_or_si128(__m128i __a, __m128i __b) 814{ 815 return __a | __b; 816} 817 818static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 819_mm_xor_si128(__m128i __a, __m128i __b) 820{ 821 return __a ^ __b; 822} 823 824#define _mm_slli_si128(a, count) __extension__ ({ \ 825 __m128i __a = (a); \ 826 (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); }) 827 828static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 829_mm_slli_epi16(__m128i __a, int __count) 830{ 831 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 832} 833 834static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 835_mm_sll_epi16(__m128i __a, __m128i __count) 836{ 837 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 838} 839 840static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 841_mm_slli_epi32(__m128i __a, int __count) 842{ 843 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 844} 845 846static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 847_mm_sll_epi32(__m128i __a, __m128i __count) 848{ 849 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 850} 851 852static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 853_mm_slli_epi64(__m128i __a, int __count) 854{ 855 return __builtin_ia32_psllqi128(__a, __count); 856} 857 858static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 859_mm_sll_epi64(__m128i __a, __m128i __count) 860{ 861 return __builtin_ia32_psllq128(__a, __count); 862} 863 864static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 865_mm_srai_epi16(__m128i __a, int __count) 866{ 867 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 868} 869 870static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 871_mm_sra_epi16(__m128i __a, __m128i __count) 872{ 873 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 874} 875 876static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 877_mm_srai_epi32(__m128i __a, int __count) 878{ 879 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 880} 881 882static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 883_mm_sra_epi32(__m128i __a, __m128i __count) 884{ 885 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 886} 887 888 889#define _mm_srli_si128(a, count) __extension__ ({ \ 890 __m128i __a = (a); \ 891 (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); }) 892 893static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 894_mm_srli_epi16(__m128i __a, int __count) 895{ 896 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 897} 898 899static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 900_mm_srl_epi16(__m128i __a, __m128i __count) 901{ 902 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 903} 904 905static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 906_mm_srli_epi32(__m128i __a, int __count) 907{ 908 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 909} 910 911static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 912_mm_srl_epi32(__m128i __a, __m128i __count) 913{ 914 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 915} 916 917static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 918_mm_srli_epi64(__m128i __a, int __count) 919{ 920 return __builtin_ia32_psrlqi128(__a, __count); 921} 922 923static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 924_mm_srl_epi64(__m128i __a, __m128i __count) 925{ 926 return __builtin_ia32_psrlq128(__a, __count); 927} 928 929static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 930_mm_cmpeq_epi8(__m128i __a, __m128i __b) 931{ 932 return (__m128i)((__v16qi)__a == (__v16qi)__b); 933} 934 935static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 936_mm_cmpeq_epi16(__m128i __a, __m128i __b) 937{ 938 return (__m128i)((__v8hi)__a == (__v8hi)__b); 939} 940 941static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 942_mm_cmpeq_epi32(__m128i __a, __m128i __b) 943{ 944 return (__m128i)((__v4si)__a == (__v4si)__b); 945} 946 947static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 948_mm_cmpgt_epi8(__m128i __a, __m128i __b) 949{ 950 /* This function always performs a signed comparison, but __v16qi is a char 951 which may be signed or unsigned. */ 952 typedef signed char __v16qs __attribute__((__vector_size__(16))); 953 return (__m128i)((__v16qs)__a > (__v16qs)__b); 954} 955 956static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 957_mm_cmpgt_epi16(__m128i __a, __m128i __b) 958{ 959 return (__m128i)((__v8hi)__a > (__v8hi)__b); 960} 961 962static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 963_mm_cmpgt_epi32(__m128i __a, __m128i __b) 964{ 965 return (__m128i)((__v4si)__a > (__v4si)__b); 966} 967 968static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 969_mm_cmplt_epi8(__m128i __a, __m128i __b) 970{ 971 return _mm_cmpgt_epi8(__b, __a); 972} 973 974static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 975_mm_cmplt_epi16(__m128i __a, __m128i __b) 976{ 977 return _mm_cmpgt_epi16(__b, __a); 978} 979 980static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 981_mm_cmplt_epi32(__m128i __a, __m128i __b) 982{ 983 return _mm_cmpgt_epi32(__b, __a); 984} 985 986#ifdef __x86_64__ 987static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 988_mm_cvtsi64_sd(__m128d __a, long long __b) 989{ 990 __a[0] = __b; 991 return __a; 992} 993 994static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 995_mm_cvtsd_si64(__m128d __a) 996{ 997 return __builtin_ia32_cvtsd2si64(__a); 998} 999 1000static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 1001_mm_cvttsd_si64(__m128d __a) 1002{ 1003 return __a[0]; 1004} 1005#endif 1006 1007static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1008_mm_cvtepi32_ps(__m128i __a) 1009{ 1010 return __builtin_ia32_cvtdq2ps((__v4si)__a); 1011} 1012 1013static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1014_mm_cvtps_epi32(__m128 __a) 1015{ 1016 return (__m128i)__builtin_ia32_cvtps2dq(__a); 1017} 1018 1019static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1020_mm_cvttps_epi32(__m128 __a) 1021{ 1022 return (__m128i)__builtin_ia32_cvttps2dq(__a); 1023} 1024 1025static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1026_mm_cvtsi32_si128(int __a) 1027{ 1028 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 1029} 1030 1031#ifdef __x86_64__ 1032static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1033_mm_cvtsi64_si128(long long __a) 1034{ 1035 return (__m128i){ __a, 0 }; 1036} 1037#endif 1038 1039static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1040_mm_cvtsi128_si32(__m128i __a) 1041{ 1042 __v4si __b = (__v4si)__a; 1043 return __b[0]; 1044} 1045 1046#ifdef __x86_64__ 1047static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 1048_mm_cvtsi128_si64(__m128i __a) 1049{ 1050 return __a[0]; 1051} 1052#endif 1053 1054static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1055_mm_load_si128(__m128i const *__p) 1056{ 1057 return *__p; 1058} 1059 1060static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1061_mm_loadu_si128(__m128i const *__p) 1062{ 1063 struct __loadu_si128 { 1064 __m128i __v; 1065 } __attribute__((packed, may_alias)); 1066 return ((struct __loadu_si128*)__p)->__v; 1067} 1068 1069static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1070_mm_loadl_epi64(__m128i const *__p) 1071{ 1072 struct __mm_loadl_epi64_struct { 1073 long long __u; 1074 } __attribute__((__packed__, __may_alias__)); 1075 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 1076} 1077 1078static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1079_mm_set_epi64x(long long q1, long long q0) 1080{ 1081 return (__m128i){ q0, q1 }; 1082} 1083 1084static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1085_mm_set_epi64(__m64 q1, __m64 q0) 1086{ 1087 return (__m128i){ (long long)q0, (long long)q1 }; 1088} 1089 1090static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1091_mm_set_epi32(int i3, int i2, int i1, int i0) 1092{ 1093 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1094} 1095 1096static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1097_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1098{ 1099 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1100} 1101 1102static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1103_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1104{ 1105 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1106} 1107 1108static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1109_mm_set1_epi64x(long long __q) 1110{ 1111 return (__m128i){ __q, __q }; 1112} 1113 1114static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1115_mm_set1_epi64(__m64 __q) 1116{ 1117 return (__m128i){ (long long)__q, (long long)__q }; 1118} 1119 1120static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1121_mm_set1_epi32(int __i) 1122{ 1123 return (__m128i)(__v4si){ __i, __i, __i, __i }; 1124} 1125 1126static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1127_mm_set1_epi16(short __w) 1128{ 1129 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 1130} 1131 1132static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1133_mm_set1_epi8(char __b) 1134{ 1135 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 1136} 1137 1138static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1139_mm_setr_epi64(__m64 q0, __m64 q1) 1140{ 1141 return (__m128i){ (long long)q0, (long long)q1 }; 1142} 1143 1144static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1145_mm_setr_epi32(int i0, int i1, int i2, int i3) 1146{ 1147 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1148} 1149 1150static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1151_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1152{ 1153 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1154} 1155 1156static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1157_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1158{ 1159 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1160} 1161 1162static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1163_mm_setzero_si128(void) 1164{ 1165 return (__m128i){ 0LL, 0LL }; 1166} 1167 1168static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1169_mm_store_si128(__m128i *__p, __m128i __b) 1170{ 1171 *__p = __b; 1172} 1173 1174static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1175_mm_storeu_si128(__m128i *__p, __m128i __b) 1176{ 1177 __builtin_ia32_storedqu((char *)__p, (__v16qi)__b); 1178} 1179 1180static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1181_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 1182{ 1183 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 1184} 1185 1186static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1187_mm_storel_epi64(__m128i *__p, __m128i __a) 1188{ 1189 struct __mm_storel_epi64_struct { 1190 long long __u; 1191 } __attribute__((__packed__, __may_alias__)); 1192 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 1193} 1194 1195static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1196_mm_stream_pd(double *__p, __m128d __a) 1197{ 1198 __builtin_ia32_movntpd(__p, __a); 1199} 1200 1201static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1202_mm_stream_si128(__m128i *__p, __m128i __a) 1203{ 1204 __builtin_ia32_movntdq(__p, __a); 1205} 1206 1207static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1208_mm_stream_si32(int *__p, int __a) 1209{ 1210 __builtin_ia32_movnti(__p, __a); 1211} 1212 1213static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1214_mm_clflush(void const *__p) 1215{ 1216 __builtin_ia32_clflush(__p); 1217} 1218 1219static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1220_mm_lfence(void) 1221{ 1222 __builtin_ia32_lfence(); 1223} 1224 1225static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1226_mm_mfence(void) 1227{ 1228 __builtin_ia32_mfence(); 1229} 1230 1231static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1232_mm_packs_epi16(__m128i __a, __m128i __b) 1233{ 1234 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 1235} 1236 1237static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1238_mm_packs_epi32(__m128i __a, __m128i __b) 1239{ 1240 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 1241} 1242 1243static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1244_mm_packus_epi16(__m128i __a, __m128i __b) 1245{ 1246 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 1247} 1248 1249static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1250_mm_extract_epi16(__m128i __a, int __imm) 1251{ 1252 __v8hi __b = (__v8hi)__a; 1253 return (unsigned short)__b[__imm]; 1254} 1255 1256static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1257_mm_insert_epi16(__m128i __a, int __b, int __imm) 1258{ 1259 __v8hi __c = (__v8hi)__a; 1260 __c[__imm & 7] = __b; 1261 return (__m128i)__c; 1262} 1263 1264static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1265_mm_movemask_epi8(__m128i __a) 1266{ 1267 return __builtin_ia32_pmovmskb128((__v16qi)__a); 1268} 1269 1270#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 1271 __m128i __a = (a); \ 1272 (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \ 1273 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1274 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) 1275 1276#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 1277 __m128i __a = (a); \ 1278 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 1279 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1280 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 1281 4, 5, 6, 7); }) 1282 1283#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 1284 __m128i __a = (a); \ 1285 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 1286 0, 1, 2, 3, \ 1287 4 + (((imm) & 0x03) >> 0), \ 1288 4 + (((imm) & 0x0c) >> 2), \ 1289 4 + (((imm) & 0x30) >> 4), \ 1290 4 + (((imm) & 0xc0) >> 6)); }) 1291 1292static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1293_mm_unpackhi_epi8(__m128i __a, __m128i __b) 1294{ 1295 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1296} 1297 1298static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1299_mm_unpackhi_epi16(__m128i __a, __m128i __b) 1300{ 1301 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1302} 1303 1304static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1305_mm_unpackhi_epi32(__m128i __a, __m128i __b) 1306{ 1307 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 1308} 1309 1310static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1311_mm_unpackhi_epi64(__m128i __a, __m128i __b) 1312{ 1313 return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1); 1314} 1315 1316static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1317_mm_unpacklo_epi8(__m128i __a, __m128i __b) 1318{ 1319 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1320} 1321 1322static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1323_mm_unpacklo_epi16(__m128i __a, __m128i __b) 1324{ 1325 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1326} 1327 1328static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1329_mm_unpacklo_epi32(__m128i __a, __m128i __b) 1330{ 1331 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 1332} 1333 1334static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1335_mm_unpacklo_epi64(__m128i __a, __m128i __b) 1336{ 1337 return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0); 1338} 1339 1340static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 1341_mm_movepi64_pi64(__m128i __a) 1342{ 1343 return (__m64)__a[0]; 1344} 1345 1346static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1347_mm_movpi64_pi64(__m64 __a) 1348{ 1349 return (__m128i){ (long long)__a, 0 }; 1350} 1351 1352static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1353_mm_move_epi64(__m128i __a) 1354{ 1355 return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2); 1356} 1357 1358static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1359_mm_unpackhi_pd(__m128d __a, __m128d __b) 1360{ 1361 return __builtin_shufflevector(__a, __b, 1, 2+1); 1362} 1363 1364static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1365_mm_unpacklo_pd(__m128d __a, __m128d __b) 1366{ 1367 return __builtin_shufflevector(__a, __b, 0, 2+0); 1368} 1369 1370static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1371_mm_movemask_pd(__m128d __a) 1372{ 1373 return __builtin_ia32_movmskpd(__a); 1374} 1375 1376#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 1377 __m128d __a = (a); \ 1378 __m128d __b = (b); \ 1379 __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); }) 1380 1381static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1382_mm_castpd_ps(__m128d __a) 1383{ 1384 return (__m128)__a; 1385} 1386 1387static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1388_mm_castpd_si128(__m128d __a) 1389{ 1390 return (__m128i)__a; 1391} 1392 1393static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1394_mm_castps_pd(__m128 __a) 1395{ 1396 return (__m128d)__a; 1397} 1398 1399static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1400_mm_castps_si128(__m128 __a) 1401{ 1402 return (__m128i)__a; 1403} 1404 1405static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1406_mm_castsi128_ps(__m128i __a) 1407{ 1408 return (__m128)__a; 1409} 1410 1411static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1412_mm_castsi128_pd(__m128i __a) 1413{ 1414 return (__m128d)__a; 1415} 1416 1417static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1418_mm_pause(void) 1419{ 1420 __asm__ volatile ("pause"); 1421} 1422 1423#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1424 1425#endif /* __SSE2__ */ 1426 1427#endif /* __EMMINTRIN_H */ 1428