/* emmintrin.h revision 223017 */
1254721Semaste/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2254721Semaste * 3254721Semaste * Permission is hereby granted, free of charge, to any person obtaining a copy 4254721Semaste * of this software and associated documentation files (the "Software"), to deal 5254721Semaste * in the Software without restriction, including without limitation the rights 6254721Semaste * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7254721Semaste * copies of the Software, and to permit persons to whom the Software is 8254721Semaste * furnished to do so, subject to the following conditions: 9254721Semaste * 10254721Semaste * The above copyright notice and this permission notice shall be included in 11254721Semaste * all copies or substantial portions of the Software. 12254721Semaste * 13254721Semaste * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14254721Semaste * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15254721Semaste * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16254721Semaste * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17254721Semaste * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18254721Semaste * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19254721Semaste * THE SOFTWARE. 
20254721Semaste * 21254721Semaste *===-----------------------------------------------------------------------=== 22254721Semaste */ 23254721Semaste 24254721Semaste#ifndef __EMMINTRIN_H 25254721Semaste#define __EMMINTRIN_H 26254721Semaste 27254721Semaste#ifndef __SSE2__ 28254721Semaste#error "SSE2 instruction set not enabled" 29254721Semaste#else 30254721Semaste 31254721Semaste#include <xmmintrin.h> 32254721Semaste 33254721Semastetypedef double __m128d __attribute__((__vector_size__(16))); 34254721Semastetypedef long long __m128i __attribute__((__vector_size__(16))); 35254721Semaste 36254721Semaste/* Type defines. */ 37254721Semastetypedef double __v2df __attribute__ ((__vector_size__ (16))); 38254721Semastetypedef long long __v2di __attribute__ ((__vector_size__ (16))); 39254721Semastetypedef short __v8hi __attribute__((__vector_size__(16))); 40254721Semastetypedef char __v16qi __attribute__((__vector_size__(16))); 41254721Semaste 42254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 43254721Semaste_mm_add_sd(__m128d a, __m128d b) 44254721Semaste{ 45254721Semaste a[0] += b[0]; 46254721Semaste return a; 47254721Semaste} 48254721Semaste 49254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 50254721Semaste_mm_add_pd(__m128d a, __m128d b) 51254721Semaste{ 52254721Semaste return a + b; 53254721Semaste} 54254721Semaste 55254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 56254721Semaste_mm_sub_sd(__m128d a, __m128d b) 57254721Semaste{ 58254721Semaste a[0] -= b[0]; 59254721Semaste return a; 60254721Semaste} 61254721Semaste 62254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 63254721Semaste_mm_sub_pd(__m128d a, __m128d b) 64254721Semaste{ 65254721Semaste return a - b; 66254721Semaste} 67254721Semaste 68254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 69254721Semaste_mm_mul_sd(__m128d a, __m128d 
b) 70254721Semaste{ 71254721Semaste a[0] *= b[0]; 72254721Semaste return a; 73254721Semaste} 74254721Semaste 75254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 76254721Semaste_mm_mul_pd(__m128d a, __m128d b) 77254721Semaste{ 78254721Semaste return a * b; 79254721Semaste} 80254721Semaste 81254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 82254721Semaste_mm_div_sd(__m128d a, __m128d b) 83254721Semaste{ 84254721Semaste a[0] /= b[0]; 85254721Semaste return a; 86254721Semaste} 87254721Semaste 88254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 89254721Semaste_mm_div_pd(__m128d a, __m128d b) 90254721Semaste{ 91254721Semaste return a / b; 92254721Semaste} 93254721Semaste 94254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 95254721Semaste_mm_sqrt_sd(__m128d a, __m128d b) 96254721Semaste{ 97254721Semaste __m128d c = __builtin_ia32_sqrtsd(b); 98254721Semaste return (__m128d) { c[0], a[1] }; 99254721Semaste} 100254721Semaste 101254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 102254721Semaste_mm_sqrt_pd(__m128d a) 103254721Semaste{ 104254721Semaste return __builtin_ia32_sqrtpd(a); 105254721Semaste} 106254721Semaste 107254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 108254721Semaste_mm_min_sd(__m128d a, __m128d b) 109254721Semaste{ 110254721Semaste return __builtin_ia32_minsd(a, b); 111254721Semaste} 112 113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 114_mm_min_pd(__m128d a, __m128d b) 115{ 116 return __builtin_ia32_minpd(a, b); 117} 118 119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 120_mm_max_sd(__m128d a, __m128d b) 121{ 122 return __builtin_ia32_maxsd(a, b); 123} 124 125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 126_mm_max_pd(__m128d a, __m128d b) 127{ 128 
return __builtin_ia32_maxpd(a, b); 129} 130 131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 132_mm_and_pd(__m128d a, __m128d b) 133{ 134 return (__m128d)((__v4si)a & (__v4si)b); 135} 136 137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 138_mm_andnot_pd(__m128d a, __m128d b) 139{ 140 return (__m128d)(~(__v4si)a & (__v4si)b); 141} 142 143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 144_mm_or_pd(__m128d a, __m128d b) 145{ 146 return (__m128d)((__v4si)a | (__v4si)b); 147} 148 149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 150_mm_xor_pd(__m128d a, __m128d b) 151{ 152 return (__m128d)((__v4si)a ^ (__v4si)b); 153} 154 155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 156_mm_cmpeq_pd(__m128d a, __m128d b) 157{ 158 return (__m128d)__builtin_ia32_cmppd(a, b, 0); 159} 160 161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 162_mm_cmplt_pd(__m128d a, __m128d b) 163{ 164 return (__m128d)__builtin_ia32_cmppd(a, b, 1); 165} 166 167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 168_mm_cmple_pd(__m128d a, __m128d b) 169{ 170 return (__m128d)__builtin_ia32_cmppd(a, b, 2); 171} 172 173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 174_mm_cmpgt_pd(__m128d a, __m128d b) 175{ 176 return (__m128d)__builtin_ia32_cmppd(b, a, 1); 177} 178 179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 180_mm_cmpge_pd(__m128d a, __m128d b) 181{ 182 return (__m128d)__builtin_ia32_cmppd(b, a, 2); 183} 184 185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 186_mm_cmpord_pd(__m128d a, __m128d b) 187{ 188 return (__m128d)__builtin_ia32_cmppd(a, b, 7); 189} 190 191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 192_mm_cmpunord_pd(__m128d a, __m128d b) 193{ 194 return (__m128d)__builtin_ia32_cmppd(a, b, 3); 195} 196 
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 198_mm_cmpneq_pd(__m128d a, __m128d b) 199{ 200 return (__m128d)__builtin_ia32_cmppd(a, b, 4); 201} 202 203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 204_mm_cmpnlt_pd(__m128d a, __m128d b) 205{ 206 return (__m128d)__builtin_ia32_cmppd(a, b, 5); 207} 208 209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 210_mm_cmpnle_pd(__m128d a, __m128d b) 211{ 212 return (__m128d)__builtin_ia32_cmppd(a, b, 6); 213} 214 215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 216_mm_cmpngt_pd(__m128d a, __m128d b) 217{ 218 return (__m128d)__builtin_ia32_cmppd(b, a, 5); 219} 220 221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 222_mm_cmpnge_pd(__m128d a, __m128d b) 223{ 224 return (__m128d)__builtin_ia32_cmppd(b, a, 6); 225} 226 227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 228_mm_cmpeq_sd(__m128d a, __m128d b) 229{ 230 return (__m128d)__builtin_ia32_cmpsd(a, b, 0); 231} 232 233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 234_mm_cmplt_sd(__m128d a, __m128d b) 235{ 236 return (__m128d)__builtin_ia32_cmpsd(a, b, 1); 237} 238 239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 240_mm_cmple_sd(__m128d a, __m128d b) 241{ 242 return (__m128d)__builtin_ia32_cmpsd(a, b, 2); 243} 244 245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 246_mm_cmpgt_sd(__m128d a, __m128d b) 247{ 248 return (__m128d)__builtin_ia32_cmpsd(b, a, 1); 249} 250 251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 252_mm_cmpge_sd(__m128d a, __m128d b) 253{ 254 return (__m128d)__builtin_ia32_cmpsd(b, a, 2); 255} 256 257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 258_mm_cmpord_sd(__m128d a, __m128d b) 259{ 260 return (__m128d)__builtin_ia32_cmpsd(a, b, 7); 261} 262 263static 
__inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 264_mm_cmpunord_sd(__m128d a, __m128d b) 265{ 266 return (__m128d)__builtin_ia32_cmpsd(a, b, 3); 267} 268 269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 270_mm_cmpneq_sd(__m128d a, __m128d b) 271{ 272 return (__m128d)__builtin_ia32_cmpsd(a, b, 4); 273} 274 275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 276_mm_cmpnlt_sd(__m128d a, __m128d b) 277{ 278 return (__m128d)__builtin_ia32_cmpsd(a, b, 5); 279} 280 281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 282_mm_cmpnle_sd(__m128d a, __m128d b) 283{ 284 return (__m128d)__builtin_ia32_cmpsd(a, b, 6); 285} 286 287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 288_mm_cmpngt_sd(__m128d a, __m128d b) 289{ 290 return (__m128d)__builtin_ia32_cmpsd(b, a, 5); 291} 292 293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 294_mm_cmpnge_sd(__m128d a, __m128d b) 295{ 296 return (__m128d)__builtin_ia32_cmpsd(b, a, 6); 297} 298 299static __inline__ int __attribute__((__always_inline__, __nodebug__)) 300_mm_comieq_sd(__m128d a, __m128d b) 301{ 302 return __builtin_ia32_comisdeq(a, b); 303} 304 305static __inline__ int __attribute__((__always_inline__, __nodebug__)) 306_mm_comilt_sd(__m128d a, __m128d b) 307{ 308 return __builtin_ia32_comisdlt(a, b); 309} 310 311static __inline__ int __attribute__((__always_inline__, __nodebug__)) 312_mm_comile_sd(__m128d a, __m128d b) 313{ 314 return __builtin_ia32_comisdle(a, b); 315} 316 317static __inline__ int __attribute__((__always_inline__, __nodebug__)) 318_mm_comigt_sd(__m128d a, __m128d b) 319{ 320 return __builtin_ia32_comisdgt(a, b); 321} 322 323static __inline__ int __attribute__((__always_inline__, __nodebug__)) 324_mm_comineq_sd(__m128d a, __m128d b) 325{ 326 return __builtin_ia32_comisdneq(a, b); 327} 328 329static __inline__ int __attribute__((__always_inline__, __nodebug__)) 
330_mm_ucomieq_sd(__m128d a, __m128d b) 331{ 332 return __builtin_ia32_ucomisdeq(a, b); 333} 334 335static __inline__ int __attribute__((__always_inline__, __nodebug__)) 336_mm_ucomilt_sd(__m128d a, __m128d b) 337{ 338 return __builtin_ia32_ucomisdlt(a, b); 339} 340 341static __inline__ int __attribute__((__always_inline__, __nodebug__)) 342_mm_ucomile_sd(__m128d a, __m128d b) 343{ 344 return __builtin_ia32_ucomisdle(a, b); 345} 346 347static __inline__ int __attribute__((__always_inline__, __nodebug__)) 348_mm_ucomigt_sd(__m128d a, __m128d b) 349{ 350 return __builtin_ia32_ucomisdgt(a, b); 351} 352 353static __inline__ int __attribute__((__always_inline__, __nodebug__)) 354_mm_ucomineq_sd(__m128d a, __m128d b) 355{ 356 return __builtin_ia32_ucomisdneq(a, b); 357} 358 359static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 360_mm_cvtpd_ps(__m128d a) 361{ 362 return __builtin_ia32_cvtpd2ps(a); 363} 364 365static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 366_mm_cvtps_pd(__m128 a) 367{ 368 return __builtin_ia32_cvtps2pd(a); 369} 370 371static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 372_mm_cvtepi32_pd(__m128i a) 373{ 374 return __builtin_ia32_cvtdq2pd((__v4si)a); 375} 376 377static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 378_mm_cvtpd_epi32(__m128d a) 379{ 380 return __builtin_ia32_cvtpd2dq(a); 381} 382 383static __inline__ int __attribute__((__always_inline__, __nodebug__)) 384_mm_cvtsd_si32(__m128d a) 385{ 386 return __builtin_ia32_cvtsd2si(a); 387} 388 389static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 390_mm_cvtsd_ss(__m128 a, __m128d b) 391{ 392 a[0] = b[0]; 393 return a; 394} 395 396static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 397_mm_cvtsi32_sd(__m128d a, int b) 398{ 399 a[0] = b; 400 return a; 401} 402 403static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 404_mm_cvtss_sd(__m128d a, __m128 
b) 405{ 406 a[0] = b[0]; 407 return a; 408} 409 410static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 411_mm_cvttpd_epi32(__m128d a) 412{ 413 return (__m128i)__builtin_ia32_cvttpd2dq(a); 414} 415 416static __inline__ int __attribute__((__always_inline__, __nodebug__)) 417_mm_cvttsd_si32(__m128d a) 418{ 419 return a[0]; 420} 421 422static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 423_mm_cvtpd_pi32(__m128d a) 424{ 425 return (__m64)__builtin_ia32_cvtpd2pi(a); 426} 427 428static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 429_mm_cvttpd_pi32(__m128d a) 430{ 431 return (__m64)__builtin_ia32_cvttpd2pi(a); 432} 433 434static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 435_mm_cvtpi32_pd(__m64 a) 436{ 437 return __builtin_ia32_cvtpi2pd((__v2si)a); 438} 439 440static __inline__ double __attribute__((__always_inline__, __nodebug__)) 441_mm_cvtsd_f64(__m128d a) 442{ 443 return a[0]; 444} 445 446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 447_mm_load_pd(double const *dp) 448{ 449 return *(__m128d*)dp; 450} 451 452static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 453_mm_load1_pd(double const *dp) 454{ 455 return (__m128d){ dp[0], dp[0] }; 456} 457 458#define _mm_load_pd1(dp) _mm_load1_pd(dp) 459 460static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 461_mm_loadr_pd(double const *dp) 462{ 463 return (__m128d){ dp[1], dp[0] }; 464} 465 466static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 467_mm_loadu_pd(double const *dp) 468{ 469 struct __loadu_pd { 470 __m128d v; 471 } __attribute__((packed, may_alias)); 472 return ((struct __loadu_pd*)dp)->v; 473} 474 475static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 476_mm_load_sd(double const *dp) 477{ 478 return (__m128d){ *dp, 0.0 }; 479} 480 481static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 
482_mm_loadh_pd(__m128d a, double const *dp) 483{ 484 return (__m128d){ a[0], *dp }; 485} 486 487static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 488_mm_loadl_pd(__m128d a, double const *dp) 489{ 490 return (__m128d){ *dp, a[1] }; 491} 492 493static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 494_mm_set_sd(double w) 495{ 496 return (__m128d){ w, 0 }; 497} 498 499static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 500_mm_set1_pd(double w) 501{ 502 return (__m128d){ w, w }; 503} 504 505static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 506_mm_set_pd(double w, double x) 507{ 508 return (__m128d){ x, w }; 509} 510 511static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 512_mm_setr_pd(double w, double x) 513{ 514 return (__m128d){ w, x }; 515} 516 517static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 518_mm_setzero_pd(void) 519{ 520 return (__m128d){ 0, 0 }; 521} 522 523static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 524_mm_move_sd(__m128d a, __m128d b) 525{ 526 return (__m128d){ b[0], a[1] }; 527} 528 529static __inline__ void __attribute__((__always_inline__, __nodebug__)) 530_mm_store_sd(double *dp, __m128d a) 531{ 532 dp[0] = a[0]; 533} 534 535static __inline__ void __attribute__((__always_inline__, __nodebug__)) 536_mm_store1_pd(double *dp, __m128d a) 537{ 538 dp[0] = a[0]; 539 dp[1] = a[0]; 540} 541 542static __inline__ void __attribute__((__always_inline__, __nodebug__)) 543_mm_store_pd(double *dp, __m128d a) 544{ 545 *(__m128d *)dp = a; 546} 547 548static __inline__ void __attribute__((__always_inline__, __nodebug__)) 549_mm_storeu_pd(double *dp, __m128d a) 550{ 551 __builtin_ia32_storeupd(dp, a); 552} 553 554static __inline__ void __attribute__((__always_inline__, __nodebug__)) 555_mm_storer_pd(double *dp, __m128d a) 556{ 557 dp[0] = a[1]; 558 dp[1] = a[0]; 559} 560 561static __inline__ void 
__attribute__((__always_inline__, __nodebug__)) 562_mm_storeh_pd(double *dp, __m128d a) 563{ 564 dp[0] = a[1]; 565} 566 567static __inline__ void __attribute__((__always_inline__, __nodebug__)) 568_mm_storel_pd(double *dp, __m128d a) 569{ 570 dp[0] = a[0]; 571} 572 573static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 574_mm_add_epi8(__m128i a, __m128i b) 575{ 576 return (__m128i)((__v16qi)a + (__v16qi)b); 577} 578 579static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 580_mm_add_epi16(__m128i a, __m128i b) 581{ 582 return (__m128i)((__v8hi)a + (__v8hi)b); 583} 584 585static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 586_mm_add_epi32(__m128i a, __m128i b) 587{ 588 return (__m128i)((__v4si)a + (__v4si)b); 589} 590 591static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 592_mm_add_si64(__m64 a, __m64 b) 593{ 594 return a + b; 595} 596 597static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 598_mm_add_epi64(__m128i a, __m128i b) 599{ 600 return a + b; 601} 602 603static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 604_mm_adds_epi8(__m128i a, __m128i b) 605{ 606 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); 607} 608 609static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 610_mm_adds_epi16(__m128i a, __m128i b) 611{ 612 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); 613} 614 615static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 616_mm_adds_epu8(__m128i a, __m128i b) 617{ 618 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); 619} 620 621static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 622_mm_adds_epu16(__m128i a, __m128i b) 623{ 624 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); 625} 626 627static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 628_mm_avg_epu8(__m128i a, __m128i b) 
629{ 630 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); 631} 632 633static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 634_mm_avg_epu16(__m128i a, __m128i b) 635{ 636 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); 637} 638 639static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 640_mm_madd_epi16(__m128i a, __m128i b) 641{ 642 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); 643} 644 645static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 646_mm_max_epi16(__m128i a, __m128i b) 647{ 648 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); 649} 650 651static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 652_mm_max_epu8(__m128i a, __m128i b) 653{ 654 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); 655} 656 657static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 658_mm_min_epi16(__m128i a, __m128i b) 659{ 660 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); 661} 662 663static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 664_mm_min_epu8(__m128i a, __m128i b) 665{ 666 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); 667} 668 669static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 670_mm_mulhi_epi16(__m128i a, __m128i b) 671{ 672 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); 673} 674 675static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 676_mm_mulhi_epu16(__m128i a, __m128i b) 677{ 678 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); 679} 680 681static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 682_mm_mullo_epi16(__m128i a, __m128i b) 683{ 684 return (__m128i)((__v8hi)a * (__v8hi)b); 685} 686 687static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 688_mm_mul_su32(__m64 a, __m64 b) 689{ 690 return 
__builtin_ia32_pmuludq((__v2si)a, (__v2si)b); 691} 692 693static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 694_mm_mul_epu32(__m128i a, __m128i b) 695{ 696 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); 697} 698 699static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 700_mm_sad_epu8(__m128i a, __m128i b) 701{ 702 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); 703} 704 705static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 706_mm_sub_epi8(__m128i a, __m128i b) 707{ 708 return (__m128i)((__v16qi)a - (__v16qi)b); 709} 710 711static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 712_mm_sub_epi16(__m128i a, __m128i b) 713{ 714 return (__m128i)((__v8hi)a - (__v8hi)b); 715} 716 717static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 718_mm_sub_epi32(__m128i a, __m128i b) 719{ 720 return (__m128i)((__v4si)a - (__v4si)b); 721} 722 723static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 724_mm_sub_si64(__m64 a, __m64 b) 725{ 726 return a - b; 727} 728 729static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 730_mm_sub_epi64(__m128i a, __m128i b) 731{ 732 return a - b; 733} 734 735static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 736_mm_subs_epi8(__m128i a, __m128i b) 737{ 738 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); 739} 740 741static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 742_mm_subs_epi16(__m128i a, __m128i b) 743{ 744 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); 745} 746 747static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 748_mm_subs_epu8(__m128i a, __m128i b) 749{ 750 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); 751} 752 753static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 754_mm_subs_epu16(__m128i a, __m128i b) 755{ 756 return 
(__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); 757} 758 759static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 760_mm_and_si128(__m128i a, __m128i b) 761{ 762 return a & b; 763} 764 765static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 766_mm_andnot_si128(__m128i a, __m128i b) 767{ 768 return ~a & b; 769} 770 771static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 772_mm_or_si128(__m128i a, __m128i b) 773{ 774 return a | b; 775} 776 777static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 778_mm_xor_si128(__m128i a, __m128i b) 779{ 780 return a ^ b; 781} 782 783#define _mm_slli_si128(VEC, IMM) \ 784 ((__m128i)__builtin_ia32_pslldqi128((__m128i)(VEC), (IMM)*8)) 785 786static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 787_mm_slli_epi16(__m128i a, int count) 788{ 789 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); 790} 791 792static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 793_mm_sll_epi16(__m128i a, __m128i count) 794{ 795 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); 796} 797 798static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 799_mm_slli_epi32(__m128i a, int count) 800{ 801 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); 802} 803 804static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 805_mm_sll_epi32(__m128i a, __m128i count) 806{ 807 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); 808} 809 810static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 811_mm_slli_epi64(__m128i a, int count) 812{ 813 return __builtin_ia32_psllqi128(a, count); 814} 815 816static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 817_mm_sll_epi64(__m128i a, __m128i count) 818{ 819 return __builtin_ia32_psllq128(a, count); 820} 821 822static __inline__ __m128i __attribute__((__always_inline__, 
__nodebug__)) 823_mm_srai_epi16(__m128i a, int count) 824{ 825 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); 826} 827 828static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 829_mm_sra_epi16(__m128i a, __m128i count) 830{ 831 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); 832} 833 834static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 835_mm_srai_epi32(__m128i a, int count) 836{ 837 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); 838} 839 840static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 841_mm_sra_epi32(__m128i a, __m128i count) 842{ 843 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); 844} 845 846 847#define _mm_srli_si128(VEC, IMM) \ 848 ((__m128i)__builtin_ia32_psrldqi128((__m128i)(VEC), (IMM)*8)) 849 850static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 851_mm_srli_epi16(__m128i a, int count) 852{ 853 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); 854} 855 856static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 857_mm_srl_epi16(__m128i a, __m128i count) 858{ 859 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); 860} 861 862static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 863_mm_srli_epi32(__m128i a, int count) 864{ 865 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); 866} 867 868static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 869_mm_srl_epi32(__m128i a, __m128i count) 870{ 871 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); 872} 873 874static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 875_mm_srli_epi64(__m128i a, int count) 876{ 877 return __builtin_ia32_psrlqi128(a, count); 878} 879 880static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 881_mm_srl_epi64(__m128i a, __m128i count) 882{ 883 return __builtin_ia32_psrlq128(a, count); 
884} 885 886static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 887_mm_cmpeq_epi8(__m128i a, __m128i b) 888{ 889 return (__m128i)((__v16qi)a == (__v16qi)b); 890} 891 892static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 893_mm_cmpeq_epi16(__m128i a, __m128i b) 894{ 895 return (__m128i)((__v8hi)a == (__v8hi)b); 896} 897 898static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 899_mm_cmpeq_epi32(__m128i a, __m128i b) 900{ 901 return (__m128i)((__v4si)a == (__v4si)b); 902} 903 904static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 905_mm_cmpgt_epi8(__m128i a, __m128i b) 906{ 907 return (__m128i)((__v16qi)a > (__v16qi)b); 908} 909 910static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 911_mm_cmpgt_epi16(__m128i a, __m128i b) 912{ 913 return (__m128i)((__v8hi)a > (__v8hi)b); 914} 915 916static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 917_mm_cmpgt_epi32(__m128i a, __m128i b) 918{ 919 return (__m128i)((__v4si)a > (__v4si)b); 920} 921 922static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 923_mm_cmplt_epi8(__m128i a, __m128i b) 924{ 925 return _mm_cmpgt_epi8(b,a); 926} 927 928static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 929_mm_cmplt_epi16(__m128i a, __m128i b) 930{ 931 return _mm_cmpgt_epi16(b,a); 932} 933 934static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 935_mm_cmplt_epi32(__m128i a, __m128i b) 936{ 937 return _mm_cmpgt_epi32(b,a); 938} 939 940#ifdef __x86_64__ 941static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 942_mm_cvtsi64_sd(__m128d a, long long b) 943{ 944 a[0] = b; 945 return a; 946} 947 948static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 949_mm_cvtsd_si64(__m128d a) 950{ 951 return __builtin_ia32_cvtsd2si64(a); 952} 953 954static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 
955_mm_cvttsd_si64(__m128d a) 956{ 957 return a[0]; 958} 959#endif 960 961static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 962_mm_cvtepi32_ps(__m128i a) 963{ 964 return __builtin_ia32_cvtdq2ps((__v4si)a); 965} 966 967static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 968_mm_cvtps_epi32(__m128 a) 969{ 970 return (__m128i)__builtin_ia32_cvtps2dq(a); 971} 972 973static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 974_mm_cvttps_epi32(__m128 a) 975{ 976 return (__m128i)__builtin_ia32_cvttps2dq(a); 977} 978 979static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 980_mm_cvtsi32_si128(int a) 981{ 982 return (__m128i)(__v4si){ a, 0, 0, 0 }; 983} 984 985#ifdef __x86_64__ 986static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 987_mm_cvtsi64_si128(long long a) 988{ 989 return (__m128i){ a, 0 }; 990} 991#endif 992 993static __inline__ int __attribute__((__always_inline__, __nodebug__)) 994_mm_cvtsi128_si32(__m128i a) 995{ 996 __v4si b = (__v4si)a; 997 return b[0]; 998} 999 1000#ifdef __x86_64__ 1001static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 1002_mm_cvtsi128_si64(__m128i a) 1003{ 1004 return a[0]; 1005} 1006#endif 1007 1008static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1009_mm_load_si128(__m128i const *p) 1010{ 1011 return *p; 1012} 1013 1014static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1015_mm_loadu_si128(__m128i const *p) 1016{ 1017 struct __loadu_si128 { 1018 __m128i v; 1019 } __attribute__((packed, may_alias)); 1020 return ((struct __loadu_si128*)p)->v; 1021} 1022 1023static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1024_mm_loadl_epi64(__m128i const *p) 1025{ 1026 return (__m128i) { *(long long*)p, 0}; 1027} 1028 1029static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1030_mm_set_epi64x(long long q1, long long q0) 1031{ 1032 return 
(__m128i){ q0, q1 }; 1033} 1034 1035static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1036_mm_set_epi64(__m64 q1, __m64 q0) 1037{ 1038 return (__m128i){ (long long)q0, (long long)q1 }; 1039} 1040 1041static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1042_mm_set_epi32(int i3, int i2, int i1, int i0) 1043{ 1044 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1045} 1046 1047static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1048_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1049{ 1050 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1051} 1052 1053static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1054_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1055{ 1056 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1057} 1058 1059static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1060_mm_set1_epi64x(long long q) 1061{ 1062 return (__m128i){ q, q }; 1063} 1064 1065static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1066_mm_set1_epi64(__m64 q) 1067{ 1068 return (__m128i){ (long long)q, (long long)q }; 1069} 1070 1071static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1072_mm_set1_epi32(int i) 1073{ 1074 return (__m128i)(__v4si){ i, i, i, i }; 1075} 1076 1077static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1078_mm_set1_epi16(short w) 1079{ 1080 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w }; 1081} 1082 1083static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1084_mm_set1_epi8(char b) 1085{ 1086 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 1087} 1088 1089static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 
1090_mm_setr_epi64(__m64 q0, __m64 q1) 1091{ 1092 return (__m128i){ (long long)q0, (long long)q1 }; 1093} 1094 1095static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1096_mm_setr_epi32(int i0, int i1, int i2, int i3) 1097{ 1098 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1099} 1100 1101static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1102_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1103{ 1104 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1105} 1106 1107static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1108_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1109{ 1110 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1111} 1112 1113static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1114_mm_setzero_si128(void) 1115{ 1116 return (__m128i){ 0LL, 0LL }; 1117} 1118 1119static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1120_mm_store_si128(__m128i *p, __m128i b) 1121{ 1122 *p = b; 1123} 1124 1125static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1126_mm_storeu_si128(__m128i *p, __m128i b) 1127{ 1128 __builtin_ia32_storedqu((char *)p, (__v16qi)b); 1129} 1130 1131static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1132_mm_maskmoveu_si128(__m128i d, __m128i n, char *p) 1133{ 1134 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); 1135} 1136 1137static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1138_mm_storel_epi64(__m128i *p, __m128i a) 1139{ 1140 __builtin_ia32_storelv4si((__v2si *)p, a); 1141} 1142 1143static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1144_mm_stream_pd(double *p, __m128d a) 1145{ 1146 __builtin_ia32_movntpd(p, a); 1147} 1148 1149static 
__inline__ void __attribute__((__always_inline__, __nodebug__)) 1150_mm_stream_si128(__m128i *p, __m128i a) 1151{ 1152 __builtin_ia32_movntdq(p, a); 1153} 1154 1155static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1156_mm_stream_si32(int *p, int a) 1157{ 1158 __builtin_ia32_movnti(p, a); 1159} 1160 1161static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1162_mm_clflush(void const *p) 1163{ 1164 __builtin_ia32_clflush(p); 1165} 1166 1167static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1168_mm_lfence(void) 1169{ 1170 __builtin_ia32_lfence(); 1171} 1172 1173static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1174_mm_mfence(void) 1175{ 1176 __builtin_ia32_mfence(); 1177} 1178 1179static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1180_mm_packs_epi16(__m128i a, __m128i b) 1181{ 1182 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); 1183} 1184 1185static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1186_mm_packs_epi32(__m128i a, __m128i b) 1187{ 1188 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); 1189} 1190 1191static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1192_mm_packus_epi16(__m128i a, __m128i b) 1193{ 1194 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); 1195} 1196 1197static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1198_mm_extract_epi16(__m128i a, int imm) 1199{ 1200 __v8hi b = (__v8hi)a; 1201 return (unsigned short)b[imm]; 1202} 1203 1204static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1205_mm_insert_epi16(__m128i a, int b, int imm) 1206{ 1207 __v8hi c = (__v8hi)a; 1208 c[imm & 7] = b; 1209 return (__m128i)c; 1210} 1211 1212static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1213_mm_movemask_epi8(__m128i a) 1214{ 1215 return __builtin_ia32_pmovmskb128((__v16qi)a); 1216} 1217 1218#define 
_mm_shuffle_epi32(a, imm) \ 1219 ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) _mm_set1_epi32(0), \ 1220 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1221 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6)) 1222 1223 1224#define _mm_shufflelo_epi16(a, imm) \ 1225 ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \ 1226 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1227 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 1228 4, 5, 6, 7)) 1229#define _mm_shufflehi_epi16(a, imm) \ 1230 ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), 0, 1, 2, 3, \ 1231 4 + (((imm) & 0x03) >> 0), \ 1232 4 + (((imm) & 0x0c) >> 2), \ 1233 4 + (((imm) & 0x30) >> 4), \ 1234 4 + (((imm) & 0xc0) >> 6))) 1235 1236static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1237_mm_unpackhi_epi8(__m128i a, __m128i b) 1238{ 1239 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1240} 1241 1242static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1243_mm_unpackhi_epi16(__m128i a, __m128i b) 1244{ 1245 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1246} 1247 1248static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1249_mm_unpackhi_epi32(__m128i a, __m128i b) 1250{ 1251 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); 1252} 1253 1254static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1255_mm_unpackhi_epi64(__m128i a, __m128i b) 1256{ 1257 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); 1258} 1259 1260static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1261_mm_unpacklo_epi8(__m128i a, __m128i b) 1262{ 1263 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1264} 1265 1266static __inline__ __m128i 
__attribute__((__always_inline__, __nodebug__)) 1267_mm_unpacklo_epi16(__m128i a, __m128i b) 1268{ 1269 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1270} 1271 1272static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1273_mm_unpacklo_epi32(__m128i a, __m128i b) 1274{ 1275 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); 1276} 1277 1278static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1279_mm_unpacklo_epi64(__m128i a, __m128i b) 1280{ 1281 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); 1282} 1283 1284static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 1285_mm_movepi64_pi64(__m128i a) 1286{ 1287 return (__m64)a[0]; 1288} 1289 1290static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1291_mm_movpi64_pi64(__m64 a) 1292{ 1293 return (__m128i){ (long long)a, 0 }; 1294} 1295 1296static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1297_mm_move_epi64(__m128i a) 1298{ 1299 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); 1300} 1301 1302static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1303_mm_unpackhi_pd(__m128d a, __m128d b) 1304{ 1305 return __builtin_shufflevector(a, b, 1, 2+1); 1306} 1307 1308static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1309_mm_unpacklo_pd(__m128d a, __m128d b) 1310{ 1311 return __builtin_shufflevector(a, b, 0, 2+0); 1312} 1313 1314static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1315_mm_movemask_pd(__m128d a) 1316{ 1317 return __builtin_ia32_movmskpd(a); 1318} 1319 1320#define _mm_shuffle_pd(a, b, i) \ 1321 (__builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, \ 1322 (((i) & 2) >> 1) + 2)) 1323 1324static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1325_mm_castpd_ps(__m128d in) 1326{ 1327 return (__m128)in; 1328} 1329 1330static __inline__ __m128i 
__attribute__((__always_inline__, __nodebug__)) 1331_mm_castpd_si128(__m128d in) 1332{ 1333 return (__m128i)in; 1334} 1335 1336static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1337_mm_castps_pd(__m128 in) 1338{ 1339 return (__m128d)in; 1340} 1341 1342static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1343_mm_castps_si128(__m128 in) 1344{ 1345 return (__m128i)in; 1346} 1347 1348static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1349_mm_castsi128_ps(__m128i in) 1350{ 1351 return (__m128)in; 1352} 1353 1354static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1355_mm_castsi128_pd(__m128i in) 1356{ 1357 return (__m128d)in; 1358} 1359 1360static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1361_mm_pause(void) 1362{ 1363 __asm__ volatile ("pause"); 1364} 1365 1366#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1367 1368#endif /* __SSE2__ */ 1369 1370#endif /* __EMMINTRIN_H */ 1371