/* emmintrin.h revision 221345 */
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#ifndef __SSE2__ 28#error "SSE2 instruction set not enabled" 29#else 30 31#include <xmmintrin.h> 32 33typedef double __m128d __attribute__((__vector_size__(16))); 34typedef long long __m128i __attribute__((__vector_size__(16))); 35 36/* Type defines. 
*/ 37typedef double __v2df __attribute__ ((__vector_size__ (16))); 38typedef long long __v2di __attribute__ ((__vector_size__ (16))); 39typedef short __v8hi __attribute__((__vector_size__(16))); 40typedef char __v16qi __attribute__((__vector_size__(16))); 41 42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 43_mm_add_sd(__m128d a, __m128d b) 44{ 45 a[0] += b[0]; 46 return a; 47} 48 49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 50_mm_add_pd(__m128d a, __m128d b) 51{ 52 return a + b; 53} 54 55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 56_mm_sub_sd(__m128d a, __m128d b) 57{ 58 a[0] -= b[0]; 59 return a; 60} 61 62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 63_mm_sub_pd(__m128d a, __m128d b) 64{ 65 return a - b; 66} 67 68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 69_mm_mul_sd(__m128d a, __m128d b) 70{ 71 a[0] *= b[0]; 72 return a; 73} 74 75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 76_mm_mul_pd(__m128d a, __m128d b) 77{ 78 return a * b; 79} 80 81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 82_mm_div_sd(__m128d a, __m128d b) 83{ 84 a[0] /= b[0]; 85 return a; 86} 87 88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 89_mm_div_pd(__m128d a, __m128d b) 90{ 91 return a / b; 92} 93 94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 95_mm_sqrt_sd(__m128d a, __m128d b) 96{ 97 __m128d c = __builtin_ia32_sqrtsd(b); 98 return (__m128d) { c[0], a[1] }; 99} 100 101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 102_mm_sqrt_pd(__m128d a) 103{ 104 return __builtin_ia32_sqrtpd(a); 105} 106 107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 108_mm_min_sd(__m128d a, __m128d b) 109{ 110 return __builtin_ia32_minsd(a, b); 111} 112 113static __inline__ __m128d 
__attribute__((__always_inline__, __nodebug__)) 114_mm_min_pd(__m128d a, __m128d b) 115{ 116 return __builtin_ia32_minpd(a, b); 117} 118 119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 120_mm_max_sd(__m128d a, __m128d b) 121{ 122 return __builtin_ia32_maxsd(a, b); 123} 124 125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 126_mm_max_pd(__m128d a, __m128d b) 127{ 128 return __builtin_ia32_maxpd(a, b); 129} 130 131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 132_mm_and_pd(__m128d a, __m128d b) 133{ 134 return (__m128d)((__v4si)a & (__v4si)b); 135} 136 137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 138_mm_andnot_pd(__m128d a, __m128d b) 139{ 140 return (__m128d)(~(__v4si)a & (__v4si)b); 141} 142 143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 144_mm_or_pd(__m128d a, __m128d b) 145{ 146 return (__m128d)((__v4si)a | (__v4si)b); 147} 148 149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 150_mm_xor_pd(__m128d a, __m128d b) 151{ 152 return (__m128d)((__v4si)a ^ (__v4si)b); 153} 154 155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 156_mm_cmpeq_pd(__m128d a, __m128d b) 157{ 158 return (__m128d)__builtin_ia32_cmppd(a, b, 0); 159} 160 161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 162_mm_cmplt_pd(__m128d a, __m128d b) 163{ 164 return (__m128d)__builtin_ia32_cmppd(a, b, 1); 165} 166 167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 168_mm_cmple_pd(__m128d a, __m128d b) 169{ 170 return (__m128d)__builtin_ia32_cmppd(a, b, 2); 171} 172 173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 174_mm_cmpgt_pd(__m128d a, __m128d b) 175{ 176 return (__m128d)__builtin_ia32_cmppd(b, a, 1); 177} 178 179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 180_mm_cmpge_pd(__m128d a, __m128d b) 181{ 
182 return (__m128d)__builtin_ia32_cmppd(b, a, 2); 183} 184 185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 186_mm_cmpord_pd(__m128d a, __m128d b) 187{ 188 return (__m128d)__builtin_ia32_cmppd(a, b, 7); 189} 190 191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 192_mm_cmpunord_pd(__m128d a, __m128d b) 193{ 194 return (__m128d)__builtin_ia32_cmppd(a, b, 3); 195} 196 197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 198_mm_cmpneq_pd(__m128d a, __m128d b) 199{ 200 return (__m128d)__builtin_ia32_cmppd(a, b, 4); 201} 202 203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 204_mm_cmpnlt_pd(__m128d a, __m128d b) 205{ 206 return (__m128d)__builtin_ia32_cmppd(a, b, 5); 207} 208 209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 210_mm_cmpnle_pd(__m128d a, __m128d b) 211{ 212 return (__m128d)__builtin_ia32_cmppd(a, b, 6); 213} 214 215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 216_mm_cmpngt_pd(__m128d a, __m128d b) 217{ 218 return (__m128d)__builtin_ia32_cmppd(b, a, 5); 219} 220 221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 222_mm_cmpnge_pd(__m128d a, __m128d b) 223{ 224 return (__m128d)__builtin_ia32_cmppd(b, a, 6); 225} 226 227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 228_mm_cmpeq_sd(__m128d a, __m128d b) 229{ 230 return (__m128d)__builtin_ia32_cmpsd(a, b, 0); 231} 232 233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 234_mm_cmplt_sd(__m128d a, __m128d b) 235{ 236 return (__m128d)__builtin_ia32_cmpsd(a, b, 1); 237} 238 239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 240_mm_cmple_sd(__m128d a, __m128d b) 241{ 242 return (__m128d)__builtin_ia32_cmpsd(a, b, 2); 243} 244 245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 246_mm_cmpgt_sd(__m128d a, __m128d b) 247{ 248 return 
(__m128d)__builtin_ia32_cmpsd(b, a, 1); 249} 250 251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 252_mm_cmpge_sd(__m128d a, __m128d b) 253{ 254 return (__m128d)__builtin_ia32_cmpsd(b, a, 2); 255} 256 257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 258_mm_cmpord_sd(__m128d a, __m128d b) 259{ 260 return (__m128d)__builtin_ia32_cmpsd(a, b, 7); 261} 262 263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 264_mm_cmpunord_sd(__m128d a, __m128d b) 265{ 266 return (__m128d)__builtin_ia32_cmpsd(a, b, 3); 267} 268 269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 270_mm_cmpneq_sd(__m128d a, __m128d b) 271{ 272 return (__m128d)__builtin_ia32_cmpsd(a, b, 4); 273} 274 275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 276_mm_cmpnlt_sd(__m128d a, __m128d b) 277{ 278 return (__m128d)__builtin_ia32_cmpsd(a, b, 5); 279} 280 281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 282_mm_cmpnle_sd(__m128d a, __m128d b) 283{ 284 return (__m128d)__builtin_ia32_cmpsd(a, b, 6); 285} 286 287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 288_mm_cmpngt_sd(__m128d a, __m128d b) 289{ 290 return (__m128d)__builtin_ia32_cmpsd(b, a, 5); 291} 292 293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 294_mm_cmpnge_sd(__m128d a, __m128d b) 295{ 296 return (__m128d)__builtin_ia32_cmpsd(b, a, 6); 297} 298 299static __inline__ int __attribute__((__always_inline__, __nodebug__)) 300_mm_comieq_sd(__m128d a, __m128d b) 301{ 302 return __builtin_ia32_comisdeq(a, b); 303} 304 305static __inline__ int __attribute__((__always_inline__, __nodebug__)) 306_mm_comilt_sd(__m128d a, __m128d b) 307{ 308 return __builtin_ia32_comisdlt(a, b); 309} 310 311static __inline__ int __attribute__((__always_inline__, __nodebug__)) 312_mm_comile_sd(__m128d a, __m128d b) 313{ 314 return __builtin_ia32_comisdle(a, b); 315} 
316 317static __inline__ int __attribute__((__always_inline__, __nodebug__)) 318_mm_comigt_sd(__m128d a, __m128d b) 319{ 320 return __builtin_ia32_comisdgt(a, b); 321} 322 323static __inline__ int __attribute__((__always_inline__, __nodebug__)) 324_mm_comineq_sd(__m128d a, __m128d b) 325{ 326 return __builtin_ia32_comisdneq(a, b); 327} 328 329static __inline__ int __attribute__((__always_inline__, __nodebug__)) 330_mm_ucomieq_sd(__m128d a, __m128d b) 331{ 332 return __builtin_ia32_ucomisdeq(a, b); 333} 334 335static __inline__ int __attribute__((__always_inline__, __nodebug__)) 336_mm_ucomilt_sd(__m128d a, __m128d b) 337{ 338 return __builtin_ia32_ucomisdlt(a, b); 339} 340 341static __inline__ int __attribute__((__always_inline__, __nodebug__)) 342_mm_ucomile_sd(__m128d a, __m128d b) 343{ 344 return __builtin_ia32_ucomisdle(a, b); 345} 346 347static __inline__ int __attribute__((__always_inline__, __nodebug__)) 348_mm_ucomigt_sd(__m128d a, __m128d b) 349{ 350 return __builtin_ia32_ucomisdgt(a, b); 351} 352 353static __inline__ int __attribute__((__always_inline__, __nodebug__)) 354_mm_ucomineq_sd(__m128d a, __m128d b) 355{ 356 return __builtin_ia32_ucomisdneq(a, b); 357} 358 359static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 360_mm_cvtpd_ps(__m128d a) 361{ 362 return __builtin_ia32_cvtpd2ps(a); 363} 364 365static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 366_mm_cvtps_pd(__m128 a) 367{ 368 return __builtin_ia32_cvtps2pd(a); 369} 370 371static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 372_mm_cvtepi32_pd(__m128i a) 373{ 374 return __builtin_ia32_cvtdq2pd((__v4si)a); 375} 376 377static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 378_mm_cvtpd_epi32(__m128d a) 379{ 380 return __builtin_ia32_cvtpd2dq(a); 381} 382 383static __inline__ int __attribute__((__always_inline__, __nodebug__)) 384_mm_cvtsd_si32(__m128d a) 385{ 386 return __builtin_ia32_cvtsd2si(a); 387} 388 389static 
__inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 390_mm_cvtsd_ss(__m128 a, __m128d b) 391{ 392 a[0] = b[0]; 393 return a; 394} 395 396static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 397_mm_cvtsi32_sd(__m128d a, int b) 398{ 399 a[0] = b; 400 return a; 401} 402 403static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 404_mm_cvtss_sd(__m128d a, __m128 b) 405{ 406 a[0] = b[0]; 407 return a; 408} 409 410static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 411_mm_cvttpd_epi32(__m128d a) 412{ 413 return (__m128i)__builtin_ia32_cvttpd2dq(a); 414} 415 416static __inline__ int __attribute__((__always_inline__, __nodebug__)) 417_mm_cvttsd_si32(__m128d a) 418{ 419 return a[0]; 420} 421 422static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 423_mm_cvtpd_pi32(__m128d a) 424{ 425 return (__m64)__builtin_ia32_cvtpd2pi(a); 426} 427 428static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 429_mm_cvttpd_pi32(__m128d a) 430{ 431 return (__m64)__builtin_ia32_cvttpd2pi(a); 432} 433 434static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 435_mm_cvtpi32_pd(__m64 a) 436{ 437 return __builtin_ia32_cvtpi2pd((__v2si)a); 438} 439 440static __inline__ double __attribute__((__always_inline__, __nodebug__)) 441_mm_cvtsd_f64(__m128d a) 442{ 443 return a[0]; 444} 445 446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 447_mm_load_pd(double const *dp) 448{ 449 return *(__m128d*)dp; 450} 451 452static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 453_mm_load1_pd(double const *dp) 454{ 455 return (__m128d){ dp[0], dp[0] }; 456} 457 458#define _mm_load_pd1(dp) _mm_load1_pd(dp) 459 460static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 461_mm_loadr_pd(double const *dp) 462{ 463 return (__m128d){ dp[1], dp[0] }; 464} 465 466static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 
467_mm_loadu_pd(double const *dp) 468{ 469 return (__m128d){ dp[0], dp[1] }; 470} 471 472static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 473_mm_load_sd(double const *dp) 474{ 475 return (__m128d){ *dp, 0.0 }; 476} 477 478static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 479_mm_loadh_pd(__m128d a, double const *dp) 480{ 481 return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2); 482} 483 484static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 485_mm_loadl_pd(__m128d a, double const *dp) 486{ 487 return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1); 488} 489 490static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 491_mm_set_sd(double w) 492{ 493 return (__m128d){ w, 0 }; 494} 495 496static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 497_mm_set1_pd(double w) 498{ 499 return (__m128d){ w, w }; 500} 501 502static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 503_mm_set_pd(double w, double x) 504{ 505 return (__m128d){ x, w }; 506} 507 508static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 509_mm_setr_pd(double w, double x) 510{ 511 return (__m128d){ w, x }; 512} 513 514static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 515_mm_setzero_pd(void) 516{ 517 return (__m128d){ 0, 0 }; 518} 519 520static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 521_mm_move_sd(__m128d a, __m128d b) 522{ 523 return (__m128d){ b[0], a[1] }; 524} 525 526static __inline__ void __attribute__((__always_inline__, __nodebug__)) 527_mm_store_sd(double *dp, __m128d a) 528{ 529 dp[0] = a[0]; 530} 531 532static __inline__ void __attribute__((__always_inline__, __nodebug__)) 533_mm_store1_pd(double *dp, __m128d a) 534{ 535 dp[0] = a[0]; 536 dp[1] = a[0]; 537} 538 539static __inline__ void __attribute__((__always_inline__, __nodebug__)) 540_mm_store_pd(double *dp, __m128d a) 541{ 542 *(__m128d *)dp = a; 
543} 544 545static __inline__ void __attribute__((__always_inline__, __nodebug__)) 546_mm_storeu_pd(double *dp, __m128d a) 547{ 548 __builtin_ia32_storeupd(dp, a); 549} 550 551static __inline__ void __attribute__((__always_inline__, __nodebug__)) 552_mm_storer_pd(double *dp, __m128d a) 553{ 554 dp[0] = a[1]; 555 dp[1] = a[0]; 556} 557 558static __inline__ void __attribute__((__always_inline__, __nodebug__)) 559_mm_storeh_pd(double *dp, __m128d a) 560{ 561 dp[0] = a[1]; 562} 563 564static __inline__ void __attribute__((__always_inline__, __nodebug__)) 565_mm_storel_pd(double *dp, __m128d a) 566{ 567 dp[0] = a[0]; 568} 569 570static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 571_mm_add_epi8(__m128i a, __m128i b) 572{ 573 return (__m128i)((__v16qi)a + (__v16qi)b); 574} 575 576static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 577_mm_add_epi16(__m128i a, __m128i b) 578{ 579 return (__m128i)((__v8hi)a + (__v8hi)b); 580} 581 582static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 583_mm_add_epi32(__m128i a, __m128i b) 584{ 585 return (__m128i)((__v4si)a + (__v4si)b); 586} 587 588static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 589_mm_add_si64(__m64 a, __m64 b) 590{ 591 return a + b; 592} 593 594static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 595_mm_add_epi64(__m128i a, __m128i b) 596{ 597 return a + b; 598} 599 600static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 601_mm_adds_epi8(__m128i a, __m128i b) 602{ 603 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); 604} 605 606static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 607_mm_adds_epi16(__m128i a, __m128i b) 608{ 609 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); 610} 611 612static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 613_mm_adds_epu8(__m128i a, __m128i b) 614{ 615 return 
(__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); 616} 617 618static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 619_mm_adds_epu16(__m128i a, __m128i b) 620{ 621 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); 622} 623 624static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 625_mm_avg_epu8(__m128i a, __m128i b) 626{ 627 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); 628} 629 630static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 631_mm_avg_epu16(__m128i a, __m128i b) 632{ 633 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); 634} 635 636static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 637_mm_madd_epi16(__m128i a, __m128i b) 638{ 639 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); 640} 641 642static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 643_mm_max_epi16(__m128i a, __m128i b) 644{ 645 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); 646} 647 648static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 649_mm_max_epu8(__m128i a, __m128i b) 650{ 651 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); 652} 653 654static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 655_mm_min_epi16(__m128i a, __m128i b) 656{ 657 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); 658} 659 660static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 661_mm_min_epu8(__m128i a, __m128i b) 662{ 663 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); 664} 665 666static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 667_mm_mulhi_epi16(__m128i a, __m128i b) 668{ 669 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); 670} 671 672static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 673_mm_mulhi_epu16(__m128i a, __m128i b) 674{ 675 return 
(__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); 676} 677 678static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 679_mm_mullo_epi16(__m128i a, __m128i b) 680{ 681 return (__m128i)((__v8hi)a * (__v8hi)b); 682} 683 684static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 685_mm_mul_su32(__m64 a, __m64 b) 686{ 687 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b); 688} 689 690static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 691_mm_mul_epu32(__m128i a, __m128i b) 692{ 693 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); 694} 695 696static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 697_mm_sad_epu8(__m128i a, __m128i b) 698{ 699 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); 700} 701 702static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 703_mm_sub_epi8(__m128i a, __m128i b) 704{ 705 return (__m128i)((__v16qi)a - (__v16qi)b); 706} 707 708static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 709_mm_sub_epi16(__m128i a, __m128i b) 710{ 711 return (__m128i)((__v8hi)a - (__v8hi)b); 712} 713 714static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 715_mm_sub_epi32(__m128i a, __m128i b) 716{ 717 return (__m128i)((__v4si)a - (__v4si)b); 718} 719 720static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 721_mm_sub_si64(__m64 a, __m64 b) 722{ 723 return a - b; 724} 725 726static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 727_mm_sub_epi64(__m128i a, __m128i b) 728{ 729 return a - b; 730} 731 732static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 733_mm_subs_epi8(__m128i a, __m128i b) 734{ 735 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); 736} 737 738static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 739_mm_subs_epi16(__m128i a, __m128i b) 740{ 741 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, 
(__v8hi)b); 742} 743 744static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 745_mm_subs_epu8(__m128i a, __m128i b) 746{ 747 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); 748} 749 750static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 751_mm_subs_epu16(__m128i a, __m128i b) 752{ 753 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); 754} 755 756static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 757_mm_and_si128(__m128i a, __m128i b) 758{ 759 return a & b; 760} 761 762static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 763_mm_andnot_si128(__m128i a, __m128i b) 764{ 765 return ~a & b; 766} 767 768static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 769_mm_or_si128(__m128i a, __m128i b) 770{ 771 return a | b; 772} 773 774static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 775_mm_xor_si128(__m128i a, __m128i b) 776{ 777 return a ^ b; 778} 779 780#define _mm_slli_si128(VEC, IMM) \ 781 ((__m128i)__builtin_ia32_pslldqi128((__m128i)(VEC), (IMM)*8)) 782 783static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 784_mm_slli_epi16(__m128i a, int count) 785{ 786 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); 787} 788 789static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 790_mm_sll_epi16(__m128i a, __m128i count) 791{ 792 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); 793} 794 795static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 796_mm_slli_epi32(__m128i a, int count) 797{ 798 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); 799} 800 801static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 802_mm_sll_epi32(__m128i a, __m128i count) 803{ 804 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); 805} 806 807static __inline__ __m128i __attribute__((__always_inline__, 
__nodebug__)) 808_mm_slli_epi64(__m128i a, int count) 809{ 810 return __builtin_ia32_psllqi128(a, count); 811} 812 813static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 814_mm_sll_epi64(__m128i a, __m128i count) 815{ 816 return __builtin_ia32_psllq128(a, count); 817} 818 819static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 820_mm_srai_epi16(__m128i a, int count) 821{ 822 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); 823} 824 825static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 826_mm_sra_epi16(__m128i a, __m128i count) 827{ 828 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); 829} 830 831static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 832_mm_srai_epi32(__m128i a, int count) 833{ 834 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); 835} 836 837static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 838_mm_sra_epi32(__m128i a, __m128i count) 839{ 840 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); 841} 842 843 844#define _mm_srli_si128(VEC, IMM) \ 845 ((__m128i)__builtin_ia32_psrldqi128((__m128i)(VEC), (IMM)*8)) 846 847static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 848_mm_srli_epi16(__m128i a, int count) 849{ 850 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); 851} 852 853static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 854_mm_srl_epi16(__m128i a, __m128i count) 855{ 856 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); 857} 858 859static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 860_mm_srli_epi32(__m128i a, int count) 861{ 862 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); 863} 864 865static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 866_mm_srl_epi32(__m128i a, __m128i count) 867{ 868 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); 
869} 870 871static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 872_mm_srli_epi64(__m128i a, int count) 873{ 874 return __builtin_ia32_psrlqi128(a, count); 875} 876 877static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 878_mm_srl_epi64(__m128i a, __m128i count) 879{ 880 return __builtin_ia32_psrlq128(a, count); 881} 882 883static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 884_mm_cmpeq_epi8(__m128i a, __m128i b) 885{ 886 return (__m128i)((__v16qi)a == (__v16qi)b); 887} 888 889static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 890_mm_cmpeq_epi16(__m128i a, __m128i b) 891{ 892 return (__m128i)((__v8hi)a == (__v8hi)b); 893} 894 895static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 896_mm_cmpeq_epi32(__m128i a, __m128i b) 897{ 898 return (__m128i)((__v4si)a == (__v4si)b); 899} 900 901static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 902_mm_cmpgt_epi8(__m128i a, __m128i b) 903{ 904 return (__m128i)((__v16qi)a > (__v16qi)b); 905} 906 907static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 908_mm_cmpgt_epi16(__m128i a, __m128i b) 909{ 910 return (__m128i)((__v8hi)a > (__v8hi)b); 911} 912 913static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 914_mm_cmpgt_epi32(__m128i a, __m128i b) 915{ 916 return (__m128i)((__v4si)a > (__v4si)b); 917} 918 919static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 920_mm_cmplt_epi8(__m128i a, __m128i b) 921{ 922 return _mm_cmpgt_epi8(b,a); 923} 924 925static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 926_mm_cmplt_epi16(__m128i a, __m128i b) 927{ 928 return _mm_cmpgt_epi16(b,a); 929} 930 931static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 932_mm_cmplt_epi32(__m128i a, __m128i b) 933{ 934 return _mm_cmpgt_epi32(b,a); 935} 936 937#ifdef __x86_64__ 938static __inline__ __m128d 
__attribute__((__always_inline__, __nodebug__)) 939_mm_cvtsi64_sd(__m128d a, long long b) 940{ 941 a[0] = b; 942 return a; 943} 944 945static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 946_mm_cvtsd_si64(__m128d a) 947{ 948 return __builtin_ia32_cvtsd2si64(a); 949} 950 951static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 952_mm_cvttsd_si64(__m128d a) 953{ 954 return a[0]; 955} 956#endif 957 958static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 959_mm_cvtepi32_ps(__m128i a) 960{ 961 return __builtin_ia32_cvtdq2ps((__v4si)a); 962} 963 964static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 965_mm_cvtps_epi32(__m128 a) 966{ 967 return (__m128i)__builtin_ia32_cvtps2dq(a); 968} 969 970static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 971_mm_cvttps_epi32(__m128 a) 972{ 973 return (__m128i)__builtin_ia32_cvttps2dq(a); 974} 975 976static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 977_mm_cvtsi32_si128(int a) 978{ 979 return (__m128i)(__v4si){ a, 0, 0, 0 }; 980} 981 982#ifdef __x86_64__ 983static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 984_mm_cvtsi64_si128(long long a) 985{ 986 return (__m128i){ a, 0 }; 987} 988#endif 989 990static __inline__ int __attribute__((__always_inline__, __nodebug__)) 991_mm_cvtsi128_si32(__m128i a) 992{ 993 __v4si b = (__v4si)a; 994 return b[0]; 995} 996 997#ifdef __x86_64__ 998static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 999_mm_cvtsi128_si64(__m128i a) 1000{ 1001 return a[0]; 1002} 1003#endif 1004 1005static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1006_mm_load_si128(__m128i const *p) 1007{ 1008 return *p; 1009} 1010 1011static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1012_mm_loadu_si128(__m128i const *p) 1013{ 1014 return (__m128i)__builtin_ia32_loaddqu((char const *)p); 1015} 1016 1017static 
__inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1018_mm_loadl_epi64(__m128i const *p) 1019{ 1020 return (__m128i) { *(long long*)p, 0}; 1021} 1022 1023static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1024_mm_set_epi64x(long long q1, long long q0) 1025{ 1026 return (__m128i){ q0, q1 }; 1027} 1028 1029static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1030_mm_set_epi64(__m64 q1, __m64 q0) 1031{ 1032 return (__m128i){ (long long)q0, (long long)q1 }; 1033} 1034 1035static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1036_mm_set_epi32(int i3, int i2, int i1, int i0) 1037{ 1038 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1039} 1040 1041static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1042_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1043{ 1044 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1045} 1046 1047static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1048_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1049{ 1050 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1051} 1052 1053static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1054_mm_set1_epi64x(long long q) 1055{ 1056 return (__m128i){ q, q }; 1057} 1058 1059static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1060_mm_set1_epi64(__m64 q) 1061{ 1062 return (__m128i){ (long long)q, (long long)q }; 1063} 1064 1065static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1066_mm_set1_epi32(int i) 1067{ 1068 return (__m128i)(__v4si){ i, i, i, i }; 1069} 1070 1071static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1072_mm_set1_epi16(short w) 1073{ 1074 return (__m128i)(__v8hi){ w, w, w, 
w, w, w, w, w }; 1075} 1076 1077static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1078_mm_set1_epi8(char b) 1079{ 1080 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 1081} 1082 1083static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1084_mm_setr_epi64(__m64 q0, __m64 q1) 1085{ 1086 return (__m128i){ (long long)q0, (long long)q1 }; 1087} 1088 1089static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1090_mm_setr_epi32(int i0, int i1, int i2, int i3) 1091{ 1092 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1093} 1094 1095static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1096_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1097{ 1098 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1099} 1100 1101static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1102_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1103{ 1104 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1105} 1106 1107static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1108_mm_setzero_si128(void) 1109{ 1110 return (__m128i){ 0LL, 0LL }; 1111} 1112 1113static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1114_mm_store_si128(__m128i *p, __m128i b) 1115{ 1116 *p = b; 1117} 1118 1119static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1120_mm_storeu_si128(__m128i *p, __m128i b) 1121{ 1122 __builtin_ia32_storedqu((char *)p, (__v16qi)b); 1123} 1124 1125static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1126_mm_maskmoveu_si128(__m128i d, __m128i n, char *p) 1127{ 1128 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); 1129} 1130 1131static __inline__ void 
__attribute__((__always_inline__, __nodebug__)) 1132_mm_storel_epi64(__m128i *p, __m128i a) 1133{ 1134 __builtin_ia32_storelv4si((__v2si *)p, a); 1135} 1136 1137static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1138_mm_stream_pd(double *p, __m128d a) 1139{ 1140 __builtin_ia32_movntpd(p, a); 1141} 1142 1143static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1144_mm_stream_si128(__m128i *p, __m128i a) 1145{ 1146 __builtin_ia32_movntdq(p, a); 1147} 1148 1149static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1150_mm_stream_si32(int *p, int a) 1151{ 1152 __builtin_ia32_movnti(p, a); 1153} 1154 1155static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1156_mm_clflush(void const *p) 1157{ 1158 __builtin_ia32_clflush(p); 1159} 1160 1161static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1162_mm_lfence(void) 1163{ 1164 __builtin_ia32_lfence(); 1165} 1166 1167static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1168_mm_mfence(void) 1169{ 1170 __builtin_ia32_mfence(); 1171} 1172 1173static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1174_mm_packs_epi16(__m128i a, __m128i b) 1175{ 1176 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); 1177} 1178 1179static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1180_mm_packs_epi32(__m128i a, __m128i b) 1181{ 1182 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); 1183} 1184 1185static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1186_mm_packus_epi16(__m128i a, __m128i b) 1187{ 1188 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); 1189} 1190 1191static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1192_mm_extract_epi16(__m128i a, int imm) 1193{ 1194 __v8hi b = (__v8hi)a; 1195 return (unsigned short)b[imm]; 1196} 1197 1198static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 
1199_mm_insert_epi16(__m128i a, int b, int imm) 1200{ 1201 __v8hi c = (__v8hi)a; 1202 c[imm & 7] = b; 1203 return (__m128i)c; 1204} 1205 1206static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1207_mm_movemask_epi8(__m128i a) 1208{ 1209 return __builtin_ia32_pmovmskb128((__v16qi)a); 1210} 1211 1212#define _mm_shuffle_epi32(a, imm) \ 1213 ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) _mm_set1_epi32(0), \ 1214 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1215 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6)) 1216 1217 1218#define _mm_shufflelo_epi16(a, imm) \ 1219 ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \ 1220 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1221 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 1222 4, 5, 6, 7)) 1223#define _mm_shufflehi_epi16(a, imm) \ 1224 ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), 0, 1, 2, 3, \ 1225 4 + (((imm) & 0x03) >> 0), \ 1226 4 + (((imm) & 0x0c) >> 2), \ 1227 4 + (((imm) & 0x30) >> 4), \ 1228 4 + (((imm) & 0xc0) >> 6))) 1229 1230static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1231_mm_unpackhi_epi8(__m128i a, __m128i b) 1232{ 1233 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1234} 1235 1236static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1237_mm_unpackhi_epi16(__m128i a, __m128i b) 1238{ 1239 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1240} 1241 1242static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1243_mm_unpackhi_epi32(__m128i a, __m128i b) 1244{ 1245 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); 1246} 1247 1248static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1249_mm_unpackhi_epi64(__m128i a, __m128i b) 1250{ 1251 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); 1252} 1253 
1254static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1255_mm_unpacklo_epi8(__m128i a, __m128i b) 1256{ 1257 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1258} 1259 1260static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1261_mm_unpacklo_epi16(__m128i a, __m128i b) 1262{ 1263 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1264} 1265 1266static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1267_mm_unpacklo_epi32(__m128i a, __m128i b) 1268{ 1269 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); 1270} 1271 1272static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1273_mm_unpacklo_epi64(__m128i a, __m128i b) 1274{ 1275 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); 1276} 1277 1278static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 1279_mm_movepi64_pi64(__m128i a) 1280{ 1281 return (__m64)a[0]; 1282} 1283 1284static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1285_mm_movpi64_pi64(__m64 a) 1286{ 1287 return (__m128i){ (long long)a, 0 }; 1288} 1289 1290static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1291_mm_move_epi64(__m128i a) 1292{ 1293 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); 1294} 1295 1296static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1297_mm_unpackhi_pd(__m128d a, __m128d b) 1298{ 1299 return __builtin_shufflevector(a, b, 1, 2+1); 1300} 1301 1302static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1303_mm_unpacklo_pd(__m128d a, __m128d b) 1304{ 1305 return __builtin_shufflevector(a, b, 0, 2+0); 1306} 1307 1308static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1309_mm_movemask_pd(__m128d a) 1310{ 1311 return __builtin_ia32_movmskpd(a); 1312} 1313 1314#define 
_mm_shuffle_pd(a, b, i) \ 1315 (__builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, \ 1316 (((i) & 2) >> 1) + 2)) 1317 1318static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1319_mm_castpd_ps(__m128d in) 1320{ 1321 return (__m128)in; 1322} 1323 1324static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1325_mm_castpd_si128(__m128d in) 1326{ 1327 return (__m128i)in; 1328} 1329 1330static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1331_mm_castps_pd(__m128 in) 1332{ 1333 return (__m128d)in; 1334} 1335 1336static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1337_mm_castps_si128(__m128 in) 1338{ 1339 return (__m128i)in; 1340} 1341 1342static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1343_mm_castsi128_ps(__m128i in) 1344{ 1345 return (__m128)in; 1346} 1347 1348static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1349_mm_castsi128_pd(__m128i in) 1350{ 1351 return (__m128d)in; 1352} 1353 1354static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1355_mm_pause(void) 1356{ 1357 __asm__ volatile ("pause"); 1358} 1359 1360#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1361 1362#endif /* __SSE2__ */ 1363 1364#endif /* __EMMINTRIN_H */ 1365