/* xmmintrin.h, revision 204643 */
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __XMMINTRIN_H 25#define __XMMINTRIN_H 26 27#ifndef __SSE__ 28#error "SSE instruction set not enabled" 29#else 30 31#include <mmintrin.h> 32 33typedef float __v4sf __attribute__((__vector_size__(16))); 34typedef float __m128 __attribute__((__vector_size__(16))); 35 36#include <mm_malloc.h> 37 38static inline __m128 __attribute__((__always_inline__, __nodebug__)) 39_mm_add_ss(__m128 a, __m128 b) 40{ 41 a[0] += b[0]; 42 return a; 43} 44 45static inline __m128 __attribute__((__always_inline__, __nodebug__)) 46_mm_add_ps(__m128 a, __m128 b) 47{ 48 return a + b; 49} 50 51static inline __m128 __attribute__((__always_inline__, __nodebug__)) 52_mm_sub_ss(__m128 a, __m128 b) 53{ 54 a[0] -= b[0]; 55 return a; 56} 57 58static inline __m128 __attribute__((__always_inline__, __nodebug__)) 59_mm_sub_ps(__m128 a, __m128 b) 60{ 61 return a - b; 62} 63 64static inline __m128 __attribute__((__always_inline__, __nodebug__)) 65_mm_mul_ss(__m128 a, __m128 b) 66{ 67 a[0] *= b[0]; 68 return a; 69} 70 71static inline __m128 __attribute__((__always_inline__, __nodebug__)) 72_mm_mul_ps(__m128 a, __m128 b) 73{ 74 return a * b; 75} 76 77static inline __m128 __attribute__((__always_inline__, __nodebug__)) 78_mm_div_ss(__m128 a, __m128 b) 79{ 80 a[0] /= b[0]; 81 return a; 82} 83 84static inline __m128 __attribute__((__always_inline__, __nodebug__)) 85_mm_div_ps(__m128 a, __m128 b) 86{ 87 return a / b; 88} 89 90static inline __m128 __attribute__((__always_inline__, __nodebug__)) 91_mm_sqrt_ss(__m128 a) 92{ 93 return __builtin_ia32_sqrtss(a); 94} 95 96static inline __m128 __attribute__((__always_inline__, __nodebug__)) 97_mm_sqrt_ps(__m128 a) 98{ 99 return __builtin_ia32_sqrtps(a); 100} 101 102static inline __m128 __attribute__((__always_inline__, __nodebug__)) 103_mm_rcp_ss(__m128 a) 104{ 105 return __builtin_ia32_rcpss(a); 106} 107 108static inline __m128 __attribute__((__always_inline__, 
__nodebug__)) 109_mm_rcp_ps(__m128 a) 110{ 111 return __builtin_ia32_rcpps(a); 112} 113 114static inline __m128 __attribute__((__always_inline__, __nodebug__)) 115_mm_rsqrt_ss(__m128 a) 116{ 117 return __builtin_ia32_rsqrtss(a); 118} 119 120static inline __m128 __attribute__((__always_inline__, __nodebug__)) 121_mm_rsqrt_ps(__m128 a) 122{ 123 return __builtin_ia32_rsqrtps(a); 124} 125 126static inline __m128 __attribute__((__always_inline__, __nodebug__)) 127_mm_min_ss(__m128 a, __m128 b) 128{ 129 return __builtin_ia32_minss(a, b); 130} 131 132static inline __m128 __attribute__((__always_inline__, __nodebug__)) 133_mm_min_ps(__m128 a, __m128 b) 134{ 135 return __builtin_ia32_minps(a, b); 136} 137 138static inline __m128 __attribute__((__always_inline__, __nodebug__)) 139_mm_max_ss(__m128 a, __m128 b) 140{ 141 return __builtin_ia32_maxss(a, b); 142} 143 144static inline __m128 __attribute__((__always_inline__, __nodebug__)) 145_mm_max_ps(__m128 a, __m128 b) 146{ 147 return __builtin_ia32_maxps(a, b); 148} 149 150static inline __m128 __attribute__((__always_inline__, __nodebug__)) 151_mm_and_ps(__m128 a, __m128 b) 152{ 153 typedef int __v4si __attribute__((__vector_size__(16))); 154 return (__m128)((__v4si)a & (__v4si)b); 155} 156 157static inline __m128 __attribute__((__always_inline__, __nodebug__)) 158_mm_andnot_ps(__m128 a, __m128 b) 159{ 160 typedef int __v4si __attribute__((__vector_size__(16))); 161 return (__m128)(~(__v4si)a & (__v4si)b); 162} 163 164static inline __m128 __attribute__((__always_inline__, __nodebug__)) 165_mm_or_ps(__m128 a, __m128 b) 166{ 167 typedef int __v4si __attribute__((__vector_size__(16))); 168 return (__m128)((__v4si)a | (__v4si)b); 169} 170 171static inline __m128 __attribute__((__always_inline__, __nodebug__)) 172_mm_xor_ps(__m128 a, __m128 b) 173{ 174 typedef int __v4si __attribute__((__vector_size__(16))); 175 return (__m128)((__v4si)a ^ (__v4si)b); 176} 177 178static inline __m128 __attribute__((__always_inline__, __nodebug__)) 
179_mm_cmpeq_ss(__m128 a, __m128 b) 180{ 181 return (__m128)__builtin_ia32_cmpss(a, b, 0); 182} 183 184static inline __m128 __attribute__((__always_inline__, __nodebug__)) 185_mm_cmpeq_ps(__m128 a, __m128 b) 186{ 187 return (__m128)__builtin_ia32_cmpps(a, b, 0); 188} 189 190static inline __m128 __attribute__((__always_inline__, __nodebug__)) 191_mm_cmplt_ss(__m128 a, __m128 b) 192{ 193 return (__m128)__builtin_ia32_cmpss(a, b, 1); 194} 195 196static inline __m128 __attribute__((__always_inline__, __nodebug__)) 197_mm_cmplt_ps(__m128 a, __m128 b) 198{ 199 return (__m128)__builtin_ia32_cmpps(a, b, 1); 200} 201 202static inline __m128 __attribute__((__always_inline__, __nodebug__)) 203_mm_cmple_ss(__m128 a, __m128 b) 204{ 205 return (__m128)__builtin_ia32_cmpss(a, b, 2); 206} 207 208static inline __m128 __attribute__((__always_inline__, __nodebug__)) 209_mm_cmple_ps(__m128 a, __m128 b) 210{ 211 return (__m128)__builtin_ia32_cmpps(a, b, 2); 212} 213 214static inline __m128 __attribute__((__always_inline__, __nodebug__)) 215_mm_cmpgt_ss(__m128 a, __m128 b) 216{ 217 return (__m128)__builtin_ia32_cmpss(b, a, 1); 218} 219 220static inline __m128 __attribute__((__always_inline__, __nodebug__)) 221_mm_cmpgt_ps(__m128 a, __m128 b) 222{ 223 return (__m128)__builtin_ia32_cmpps(b, a, 1); 224} 225 226static inline __m128 __attribute__((__always_inline__, __nodebug__)) 227_mm_cmpge_ss(__m128 a, __m128 b) 228{ 229 return (__m128)__builtin_ia32_cmpss(b, a, 2); 230} 231 232static inline __m128 __attribute__((__always_inline__, __nodebug__)) 233_mm_cmpge_ps(__m128 a, __m128 b) 234{ 235 return (__m128)__builtin_ia32_cmpps(b, a, 2); 236} 237 238static inline __m128 __attribute__((__always_inline__, __nodebug__)) 239_mm_cmpneq_ss(__m128 a, __m128 b) 240{ 241 return (__m128)__builtin_ia32_cmpss(a, b, 4); 242} 243 244static inline __m128 __attribute__((__always_inline__, __nodebug__)) 245_mm_cmpneq_ps(__m128 a, __m128 b) 246{ 247 return (__m128)__builtin_ia32_cmpps(a, b, 4); 248} 249 
250static inline __m128 __attribute__((__always_inline__, __nodebug__)) 251_mm_cmpnlt_ss(__m128 a, __m128 b) 252{ 253 return (__m128)__builtin_ia32_cmpss(a, b, 5); 254} 255 256static inline __m128 __attribute__((__always_inline__, __nodebug__)) 257_mm_cmpnlt_ps(__m128 a, __m128 b) 258{ 259 return (__m128)__builtin_ia32_cmpps(a, b, 5); 260} 261 262static inline __m128 __attribute__((__always_inline__, __nodebug__)) 263_mm_cmpnle_ss(__m128 a, __m128 b) 264{ 265 return (__m128)__builtin_ia32_cmpss(a, b, 6); 266} 267 268static inline __m128 __attribute__((__always_inline__, __nodebug__)) 269_mm_cmpnle_ps(__m128 a, __m128 b) 270{ 271 return (__m128)__builtin_ia32_cmpps(a, b, 6); 272} 273 274static inline __m128 __attribute__((__always_inline__, __nodebug__)) 275_mm_cmpngt_ss(__m128 a, __m128 b) 276{ 277 return (__m128)__builtin_ia32_cmpss(b, a, 5); 278} 279 280static inline __m128 __attribute__((__always_inline__, __nodebug__)) 281_mm_cmpngt_ps(__m128 a, __m128 b) 282{ 283 return (__m128)__builtin_ia32_cmpps(b, a, 5); 284} 285 286static inline __m128 __attribute__((__always_inline__, __nodebug__)) 287_mm_cmpnge_ss(__m128 a, __m128 b) 288{ 289 return (__m128)__builtin_ia32_cmpss(b, a, 6); 290} 291 292static inline __m128 __attribute__((__always_inline__, __nodebug__)) 293_mm_cmpnge_ps(__m128 a, __m128 b) 294{ 295 return (__m128)__builtin_ia32_cmpps(b, a, 6); 296} 297 298static inline __m128 __attribute__((__always_inline__, __nodebug__)) 299_mm_cmpord_ss(__m128 a, __m128 b) 300{ 301 return (__m128)__builtin_ia32_cmpss(a, b, 7); 302} 303 304static inline __m128 __attribute__((__always_inline__, __nodebug__)) 305_mm_cmpord_ps(__m128 a, __m128 b) 306{ 307 return (__m128)__builtin_ia32_cmpps(a, b, 7); 308} 309 310static inline __m128 __attribute__((__always_inline__, __nodebug__)) 311_mm_cmpunord_ss(__m128 a, __m128 b) 312{ 313 return (__m128)__builtin_ia32_cmpss(a, b, 3); 314} 315 316static inline __m128 __attribute__((__always_inline__, __nodebug__)) 
317_mm_cmpunord_ps(__m128 a, __m128 b) 318{ 319 return (__m128)__builtin_ia32_cmpps(a, b, 3); 320} 321 322static inline int __attribute__((__always_inline__, __nodebug__)) 323_mm_comieq_ss(__m128 a, __m128 b) 324{ 325 return __builtin_ia32_comieq(a, b); 326} 327 328static inline int __attribute__((__always_inline__, __nodebug__)) 329_mm_comilt_ss(__m128 a, __m128 b) 330{ 331 return __builtin_ia32_comilt(a, b); 332} 333 334static inline int __attribute__((__always_inline__, __nodebug__)) 335_mm_comile_ss(__m128 a, __m128 b) 336{ 337 return __builtin_ia32_comile(a, b); 338} 339 340static inline int __attribute__((__always_inline__, __nodebug__)) 341_mm_comigt_ss(__m128 a, __m128 b) 342{ 343 return __builtin_ia32_comigt(a, b); 344} 345 346static inline int __attribute__((__always_inline__, __nodebug__)) 347_mm_comige_ss(__m128 a, __m128 b) 348{ 349 return __builtin_ia32_comige(a, b); 350} 351 352static inline int __attribute__((__always_inline__, __nodebug__)) 353_mm_comineq_ss(__m128 a, __m128 b) 354{ 355 return __builtin_ia32_comineq(a, b); 356} 357 358static inline int __attribute__((__always_inline__, __nodebug__)) 359_mm_ucomieq_ss(__m128 a, __m128 b) 360{ 361 return __builtin_ia32_ucomieq(a, b); 362} 363 364static inline int __attribute__((__always_inline__, __nodebug__)) 365_mm_ucomilt_ss(__m128 a, __m128 b) 366{ 367 return __builtin_ia32_ucomilt(a, b); 368} 369 370static inline int __attribute__((__always_inline__, __nodebug__)) 371_mm_ucomile_ss(__m128 a, __m128 b) 372{ 373 return __builtin_ia32_ucomile(a, b); 374} 375 376static inline int __attribute__((__always_inline__, __nodebug__)) 377_mm_ucomigt_ss(__m128 a, __m128 b) 378{ 379 return __builtin_ia32_ucomigt(a, b); 380} 381 382static inline int __attribute__((__always_inline__, __nodebug__)) 383_mm_ucomige_ss(__m128 a, __m128 b) 384{ 385 return __builtin_ia32_ucomige(a, b); 386} 387 388static inline int __attribute__((__always_inline__, __nodebug__)) 389_mm_ucomineq_ss(__m128 a, __m128 b) 390{ 391 return 
__builtin_ia32_ucomineq(a, b); 392} 393 394static inline int __attribute__((__always_inline__, __nodebug__)) 395_mm_cvtss_si32(__m128 a) 396{ 397 return __builtin_ia32_cvtss2si(a); 398} 399 400static inline int __attribute__((__always_inline__, __nodebug__)) 401_mm_cvt_ss2si(__m128 a) 402{ 403 return _mm_cvtss_si32(a); 404} 405 406#ifdef __x86_64__ 407 408static inline long long __attribute__((__always_inline__, __nodebug__)) 409_mm_cvtss_si64(__m128 a) 410{ 411 return __builtin_ia32_cvtss2si64(a); 412} 413 414#endif 415 416static inline __m64 __attribute__((__always_inline__, __nodebug__)) 417_mm_cvtps_pi32(__m128 a) 418{ 419 return (__m64)__builtin_ia32_cvtps2pi(a); 420} 421 422static inline int __attribute__((__always_inline__, __nodebug__)) 423_mm_cvttss_si32(__m128 a) 424{ 425 return a[0]; 426} 427 428static inline int __attribute__((__always_inline__, __nodebug__)) 429_mm_cvtt_ss2si(__m128 a) 430{ 431 return _mm_cvttss_si32(a); 432} 433 434static inline long long __attribute__((__always_inline__, __nodebug__)) 435_mm_cvttss_si64(__m128 a) 436{ 437 return a[0]; 438} 439 440static inline __m64 __attribute__((__always_inline__, __nodebug__)) 441_mm_cvttps_pi32(__m128 a) 442{ 443 return (__m64)__builtin_ia32_cvttps2pi(a); 444} 445 446static inline __m128 __attribute__((__always_inline__, __nodebug__)) 447_mm_cvtsi32_ss(__m128 a, int b) 448{ 449 a[0] = b; 450 return a; 451} 452 453#ifdef __x86_64__ 454 455static inline __m128 __attribute__((__always_inline__, __nodebug__)) 456_mm_cvtsi64_ss(__m128 a, long long b) 457{ 458 a[0] = b; 459 return a; 460} 461 462#endif 463 464static inline __m128 __attribute__((__always_inline__, __nodebug__)) 465_mm_cvtpi32_ps(__m128 a, __m64 b) 466{ 467 return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 468} 469 470static inline float __attribute__((__always_inline__, __nodebug__)) 471_mm_cvtss_f32(__m128 a) 472{ 473 return a[0]; 474} 475 476static inline __m128 __attribute__((__always_inline__, __nodebug__)) 477_mm_loadh_pi(__m128 a, 
const __m64 *p) 478{ 479 __m128 b; 480 b[0] = *(float*)p; 481 b[1] = *((float*)p+1); 482 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 483} 484 485static inline __m128 __attribute__((__always_inline__, __nodebug__)) 486_mm_loadl_pi(__m128 a, const __m64 *p) 487{ 488 __m128 b; 489 b[0] = *(float*)p; 490 b[1] = *((float*)p+1); 491 return __builtin_shufflevector(a, b, 4, 5, 2, 3); 492} 493 494static inline __m128 __attribute__((__always_inline__, __nodebug__)) 495_mm_load_ss(const float *p) 496{ 497 return (__m128){ *p, 0, 0, 0 }; 498} 499 500static inline __m128 __attribute__((__always_inline__, __nodebug__)) 501_mm_load1_ps(const float *p) 502{ 503 return (__m128){ *p, *p, *p, *p }; 504} 505 506#define _mm_load_ps1(p) _mm_load1_ps(p) 507 508static inline __m128 __attribute__((__always_inline__, __nodebug__)) 509_mm_load_ps(const float *p) 510{ 511 return *(__m128*)p; 512} 513 514static inline __m128 __attribute__((__always_inline__, __nodebug__)) 515_mm_loadu_ps(const float *p) 516{ 517 return __builtin_ia32_loadups(p); 518} 519 520static inline __m128 __attribute__((__always_inline__, __nodebug__)) 521_mm_loadr_ps(const float *p) 522{ 523 __m128 a = _mm_load_ps(p); 524 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 525} 526 527static inline __m128 __attribute__((__always_inline__, __nodebug__)) 528_mm_set_ss(float w) 529{ 530 return (__m128){ w, 0, 0, 0 }; 531} 532 533static inline __m128 __attribute__((__always_inline__, __nodebug__)) 534_mm_set1_ps(float w) 535{ 536 return (__m128){ w, w, w, w }; 537} 538 539// Microsoft specific. 
540static inline __m128 __attribute__((__always_inline__, __nodebug__)) 541_mm_set_ps1(float w) 542{ 543 return _mm_set1_ps(w); 544} 545 546static inline __m128 __attribute__((__always_inline__, __nodebug__)) 547_mm_set_ps(float z, float y, float x, float w) 548{ 549 return (__m128){ w, x, y, z }; 550} 551 552static inline __m128 __attribute__((__always_inline__, __nodebug__)) 553_mm_setr_ps(float z, float y, float x, float w) 554{ 555 return (__m128){ z, y, x, w }; 556} 557 558static inline __m128 __attribute__((__always_inline__)) 559_mm_setzero_ps(void) 560{ 561 return (__m128){ 0, 0, 0, 0 }; 562} 563 564static inline void __attribute__((__always_inline__)) 565_mm_storeh_pi(__m64 *p, __m128 a) 566{ 567 __builtin_ia32_storehps((__v2si *)p, a); 568} 569 570static inline void __attribute__((__always_inline__)) 571_mm_storel_pi(__m64 *p, __m128 a) 572{ 573 __builtin_ia32_storelps((__v2si *)p, a); 574} 575 576static inline void __attribute__((__always_inline__)) 577_mm_store_ss(float *p, __m128 a) 578{ 579 *p = a[0]; 580} 581 582static inline void __attribute__((__always_inline__, __nodebug__)) 583_mm_storeu_ps(float *p, __m128 a) 584{ 585 __builtin_ia32_storeups(p, a); 586} 587 588static inline void __attribute__((__always_inline__, __nodebug__)) 589_mm_store1_ps(float *p, __m128 a) 590{ 591 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 592 _mm_storeu_ps(p, a); 593} 594 595static inline void __attribute__((__always_inline__, __nodebug__)) 596_mm_store_ps(float *p, __m128 a) 597{ 598 *(__m128 *)p = a; 599} 600 601static inline void __attribute__((__always_inline__, __nodebug__)) 602_mm_storer_ps(float *p, __m128 a) 603{ 604 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 605 _mm_store_ps(p, a); 606} 607 608#define _MM_HINT_T0 1 609#define _MM_HINT_T1 2 610#define _MM_HINT_T2 3 611#define _MM_HINT_NTA 0 612 613/* FIXME: We have to #define this because "sel" must be a constant integer, and 614 Sema doesn't do any form of constant propagation yet. 
*/ 615 616#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) 617 618static inline void __attribute__((__always_inline__, __nodebug__)) 619_mm_stream_pi(__m64 *p, __m64 a) 620{ 621 __builtin_ia32_movntq(p, a); 622} 623 624static inline void __attribute__((__always_inline__, __nodebug__)) 625_mm_stream_ps(float *p, __m128 a) 626{ 627 __builtin_ia32_movntps(p, a); 628} 629 630static inline void __attribute__((__always_inline__, __nodebug__)) 631_mm_sfence(void) 632{ 633 __builtin_ia32_sfence(); 634} 635 636static inline int __attribute__((__always_inline__, __nodebug__)) 637_mm_extract_pi16(__m64 a, int n) 638{ 639 __v4hi b = (__v4hi)a; 640 return (unsigned short)b[n & 3]; 641} 642 643static inline __m64 __attribute__((__always_inline__, __nodebug__)) 644_mm_insert_pi16(__m64 a, int d, int n) 645{ 646 __v4hi b = (__v4hi)a; 647 b[n & 3] = d; 648 return (__m64)b; 649} 650 651static inline __m64 __attribute__((__always_inline__, __nodebug__)) 652_mm_max_pi16(__m64 a, __m64 b) 653{ 654 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 655} 656 657static inline __m64 __attribute__((__always_inline__, __nodebug__)) 658_mm_max_pu8(__m64 a, __m64 b) 659{ 660 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 661} 662 663static inline __m64 __attribute__((__always_inline__, __nodebug__)) 664_mm_min_pi16(__m64 a, __m64 b) 665{ 666 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 667} 668 669static inline __m64 __attribute__((__always_inline__, __nodebug__)) 670_mm_min_pu8(__m64 a, __m64 b) 671{ 672 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 673} 674 675static inline int __attribute__((__always_inline__, __nodebug__)) 676_mm_movemask_pi8(__m64 a) 677{ 678 return __builtin_ia32_pmovmskb((__v8qi)a); 679} 680 681static inline __m64 __attribute__((__always_inline__, __nodebug__)) 682_mm_mulhi_pu16(__m64 a, __m64 b) 683{ 684 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 685} 686 687#define _mm_shuffle_pi16(a, 
n) \ 688 ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \ 689 (n) & 0x3, ((n) & 0xc) >> 2, \ 690 ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6)) 691 692static inline void __attribute__((__always_inline__, __nodebug__)) 693_mm_maskmove_si64(__m64 d, __m64 n, char *p) 694{ 695 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 696} 697 698static inline __m64 __attribute__((__always_inline__, __nodebug__)) 699_mm_avg_pu8(__m64 a, __m64 b) 700{ 701 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 702} 703 704static inline __m64 __attribute__((__always_inline__, __nodebug__)) 705_mm_avg_pu16(__m64 a, __m64 b) 706{ 707 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 708} 709 710static inline __m64 __attribute__((__always_inline__, __nodebug__)) 711_mm_sad_pu8(__m64 a, __m64 b) 712{ 713 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 714} 715 716static inline unsigned int __attribute__((__always_inline__, __nodebug__)) 717_mm_getcsr(void) 718{ 719 return __builtin_ia32_stmxcsr(); 720} 721 722static inline void __attribute__((__always_inline__, __nodebug__)) 723_mm_setcsr(unsigned int i) 724{ 725 __builtin_ia32_ldmxcsr(i); 726} 727 728#define _mm_shuffle_ps(a, b, mask) \ 729 (__builtin_shufflevector(a, b, (mask) & 0x3, ((mask) & 0xc) >> 2, \ 730 (((mask) & 0x30) >> 4) + 4, \ 731 (((mask) & 0xc0) >> 6) + 4)) 732 733static inline __m128 __attribute__((__always_inline__, __nodebug__)) 734_mm_unpackhi_ps(__m128 a, __m128 b) 735{ 736 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 737} 738 739static inline __m128 __attribute__((__always_inline__, __nodebug__)) 740_mm_unpacklo_ps(__m128 a, __m128 b) 741{ 742 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 743} 744 745static inline __m128 __attribute__((__always_inline__, __nodebug__)) 746_mm_move_ss(__m128 a, __m128 b) 747{ 748 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 749} 750 751static inline __m128 __attribute__((__always_inline__, __nodebug__)) 752_mm_movehl_ps(__m128 a, __m128 b) 
753{ 754 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 755} 756 757static inline __m128 __attribute__((__always_inline__, __nodebug__)) 758_mm_movelh_ps(__m128 a, __m128 b) 759{ 760 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 761} 762 763static inline __m128 __attribute__((__always_inline__, __nodebug__)) 764_mm_cvtpi16_ps(__m64 a) 765{ 766 __m64 b, c; 767 __m128 r; 768 769 b = _mm_setzero_si64(); 770 b = _mm_cmpgt_pi16(b, a); 771 c = _mm_unpackhi_pi16(a, b); 772 r = _mm_setzero_ps(); 773 r = _mm_cvtpi32_ps(r, c); 774 r = _mm_movelh_ps(r, r); 775 c = _mm_unpacklo_pi16(a, b); 776 r = _mm_cvtpi32_ps(r, c); 777 778 return r; 779} 780 781static inline __m128 __attribute__((__always_inline__, __nodebug__)) 782_mm_cvtpu16_ps(__m64 a) 783{ 784 __m64 b, c; 785 __m128 r; 786 787 b = _mm_setzero_si64(); 788 c = _mm_unpackhi_pi16(a, b); 789 r = _mm_setzero_ps(); 790 r = _mm_cvtpi32_ps(r, c); 791 r = _mm_movelh_ps(r, r); 792 c = _mm_unpacklo_pi16(a, b); 793 r = _mm_cvtpi32_ps(r, c); 794 795 return r; 796} 797 798static inline __m128 __attribute__((__always_inline__, __nodebug__)) 799_mm_cvtpi8_ps(__m64 a) 800{ 801 __m64 b; 802 803 b = _mm_setzero_si64(); 804 b = _mm_cmpgt_pi8(b, a); 805 b = _mm_unpacklo_pi8(a, b); 806 807 return _mm_cvtpi16_ps(b); 808} 809 810static inline __m128 __attribute__((__always_inline__, __nodebug__)) 811_mm_cvtpu8_ps(__m64 a) 812{ 813 __m64 b; 814 815 b = _mm_setzero_si64(); 816 b = _mm_unpacklo_pi8(a, b); 817 818 return _mm_cvtpi16_ps(b); 819} 820 821static inline __m128 __attribute__((__always_inline__, __nodebug__)) 822_mm_cvtpi32x2_ps(__m64 a, __m64 b) 823{ 824 __m128 c; 825 826 c = _mm_setzero_ps(); 827 c = _mm_cvtpi32_ps(c, b); 828 c = _mm_movelh_ps(c, c); 829 830 return _mm_cvtpi32_ps(c, a); 831} 832 833static inline __m64 __attribute__((__always_inline__, __nodebug__)) 834_mm_cvtps_pi16(__m128 a) 835{ 836 __m64 b, c; 837 838 b = _mm_cvtps_pi32(a); 839 a = _mm_movehl_ps(a, a); 840 c = _mm_cvtps_pi32(a); 841 842 return 
_mm_packs_pi16(b, c); 843} 844 845static inline __m64 __attribute__((__always_inline__, __nodebug__)) 846_mm_cvtps_pi8(__m128 a) 847{ 848 __m64 b, c; 849 850 b = _mm_cvtps_pi16(a); 851 c = _mm_setzero_si64(); 852 853 return _mm_packs_pi16(b, c); 854} 855 856static inline int __attribute__((__always_inline__, __nodebug__)) 857_mm_movemask_ps(__m128 a) 858{ 859 return __builtin_ia32_movmskps(a); 860} 861 862#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 863 864#define _MM_EXCEPT_INVALID (0x0001) 865#define _MM_EXCEPT_DENORM (0x0002) 866#define _MM_EXCEPT_DIV_ZERO (0x0004) 867#define _MM_EXCEPT_OVERFLOW (0x0008) 868#define _MM_EXCEPT_UNDERFLOW (0x0010) 869#define _MM_EXCEPT_INEXACT (0x0020) 870#define _MM_EXCEPT_MASK (0x003f) 871 872#define _MM_MASK_INVALID (0x0080) 873#define _MM_MASK_DENORM (0x0100) 874#define _MM_MASK_DIV_ZERO (0x0200) 875#define _MM_MASK_OVERFLOW (0x0400) 876#define _MM_MASK_UNDERFLOW (0x0800) 877#define _MM_MASK_INEXACT (0x1000) 878#define _MM_MASK_MASK (0x1f80) 879 880#define _MM_ROUND_NEAREST (0x0000) 881#define _MM_ROUND_DOWN (0x2000) 882#define _MM_ROUND_UP (0x4000) 883#define _MM_ROUND_TOWARD_ZERO (0x6000) 884#define _MM_ROUND_MASK (0x6000) 885 886#define _MM_FLUSH_ZERO_MASK (0x8000) 887#define _MM_FLUSH_ZERO_ON (0x8000) 888#define _MM_FLUSH_ZERO_OFF (0x8000) 889 890#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 891#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 892#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 893#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 894 895#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 896#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 897#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 898#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 899 
900#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 901do { \ 902 __m128 tmp3, tmp2, tmp1, tmp0; \ 903 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 904 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 905 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 906 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 907 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 908 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 909 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 910 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 911} while (0) 912 913/* Ugly hack for backwards-compatibility (compatible with gcc) */ 914#ifdef __SSE2__ 915#include <emmintrin.h> 916#endif 917 918#endif /* __SSE__ */ 919 920#endif /* __XMMINTRIN_H */ 921