/* xmmintrin.h revision 193576 */
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __XMMINTRIN_H 25#define __XMMINTRIN_H 26 27#ifndef __SSE__ 28#error "SSE instruction set not enabled" 29#else 30 31#include <mmintrin.h> 32 33typedef float __v4sf __attribute__((__vector_size__(16))); 34typedef float __m128 __attribute__((__vector_size__(16))); 35 36#include <mm_malloc.h> 37 38static inline __m128 __attribute__((__always_inline__, __nodebug__)) 39_mm_add_ss(__m128 a, __m128 b) 40{ 41 a[0] += b[0]; 42 return a; 43} 44 45static inline __m128 __attribute__((__always_inline__, __nodebug__)) 46_mm_add_ps(__m128 a, __m128 b) 47{ 48 return a + b; 49} 50 51static inline __m128 __attribute__((__always_inline__, __nodebug__)) 52_mm_sub_ss(__m128 a, __m128 b) 53{ 54 a[0] -= b[0]; 55 return a; 56} 57 58static inline __m128 __attribute__((__always_inline__, __nodebug__)) 59_mm_sub_ps(__m128 a, __m128 b) 60{ 61 return a - b; 62} 63 64static inline __m128 __attribute__((__always_inline__, __nodebug__)) 65_mm_mul_ss(__m128 a, __m128 b) 66{ 67 a[0] *= b[0]; 68 return a; 69} 70 71static inline __m128 __attribute__((__always_inline__, __nodebug__)) 72_mm_mul_ps(__m128 a, __m128 b) 73{ 74 return a * b; 75} 76 77static inline __m128 __attribute__((__always_inline__, __nodebug__)) 78_mm_div_ss(__m128 a, __m128 b) 79{ 80 a[0] /= b[0]; 81 return a; 82} 83 84static inline __m128 __attribute__((__always_inline__, __nodebug__)) 85_mm_div_ps(__m128 a, __m128 b) 86{ 87 return a / b; 88} 89 90static inline __m128 __attribute__((__always_inline__, __nodebug__)) 91_mm_sqrt_ss(__m128 a) 92{ 93 return __builtin_ia32_sqrtss(a); 94} 95 96static inline __m128 __attribute__((__always_inline__, __nodebug__)) 97_mm_sqrt_ps(__m128 a) 98{ 99 return __builtin_ia32_sqrtps(a); 100} 101 102static inline __m128 __attribute__((__always_inline__, __nodebug__)) 103_mm_rcp_ss(__m128 a) 104{ 105 return __builtin_ia32_rcpss(a); 106} 107 108static inline __m128 __attribute__((__always_inline__, 
__nodebug__)) 109_mm_rcp_ps(__m128 a) 110{ 111 return __builtin_ia32_rcpps(a); 112} 113 114static inline __m128 __attribute__((__always_inline__, __nodebug__)) 115_mm_rsqrt_ss(__m128 a) 116{ 117 return __builtin_ia32_rsqrtss(a); 118} 119 120static inline __m128 __attribute__((__always_inline__, __nodebug__)) 121_mm_rsqrt_ps(__m128 a) 122{ 123 return __builtin_ia32_rsqrtps(a); 124} 125 126static inline __m128 __attribute__((__always_inline__, __nodebug__)) 127_mm_min_ss(__m128 a, __m128 b) 128{ 129 return __builtin_ia32_minss(a, b); 130} 131 132static inline __m128 __attribute__((__always_inline__, __nodebug__)) 133_mm_min_ps(__m128 a, __m128 b) 134{ 135 return __builtin_ia32_minps(a, b); 136} 137 138static inline __m128 __attribute__((__always_inline__, __nodebug__)) 139_mm_max_ss(__m128 a, __m128 b) 140{ 141 return __builtin_ia32_maxss(a, b); 142} 143 144static inline __m128 __attribute__((__always_inline__, __nodebug__)) 145_mm_max_ps(__m128 a, __m128 b) 146{ 147 return __builtin_ia32_maxps(a, b); 148} 149 150static inline __m128 __attribute__((__always_inline__, __nodebug__)) 151_mm_and_ps(__m128 a, __m128 b) 152{ 153 typedef int __v4si __attribute__((__vector_size__(16))); 154 return (__m128)((__v4si)a & (__v4si)b); 155} 156 157static inline __m128 __attribute__((__always_inline__, __nodebug__)) 158_mm_andnot_ps(__m128 a, __m128 b) 159{ 160 typedef int __v4si __attribute__((__vector_size__(16))); 161 return (__m128)(~(__v4si)a & (__v4si)b); 162} 163 164static inline __m128 __attribute__((__always_inline__, __nodebug__)) 165_mm_or_ps(__m128 a, __m128 b) 166{ 167 typedef int __v4si __attribute__((__vector_size__(16))); 168 return (__m128)((__v4si)a | (__v4si)b); 169} 170 171static inline __m128 __attribute__((__always_inline__, __nodebug__)) 172_mm_xor_ps(__m128 a, __m128 b) 173{ 174 typedef int __v4si __attribute__((__vector_size__(16))); 175 return (__m128)((__v4si)a ^ ~(__v4si)b); 176} 177 178static inline __m128 __attribute__((__always_inline__, __nodebug__)) 
179_mm_cmpeq_ss(__m128 a, __m128 b) 180{ 181 return (__m128)__builtin_ia32_cmpss(a, b, 0); 182} 183 184static inline __m128 __attribute__((__always_inline__, __nodebug__)) 185_mm_cmpeq_ps(__m128 a, __m128 b) 186{ 187 return (__m128)__builtin_ia32_cmpps(a, b, 0); 188} 189 190static inline __m128 __attribute__((__always_inline__, __nodebug__)) 191_mm_cmplt_ss(__m128 a, __m128 b) 192{ 193 return (__m128)__builtin_ia32_cmpss(a, b, 1); 194} 195 196static inline __m128 __attribute__((__always_inline__, __nodebug__)) 197_mm_cmplt_ps(__m128 a, __m128 b) 198{ 199 return (__m128)__builtin_ia32_cmpps(a, b, 1); 200} 201 202static inline __m128 __attribute__((__always_inline__, __nodebug__)) 203_mm_cmple_ss(__m128 a, __m128 b) 204{ 205 return (__m128)__builtin_ia32_cmpss(a, b, 2); 206} 207 208static inline __m128 __attribute__((__always_inline__, __nodebug__)) 209_mm_cmple_ps(__m128 a, __m128 b) 210{ 211 return (__m128)__builtin_ia32_cmpps(a, b, 2); 212} 213 214static inline __m128 __attribute__((__always_inline__, __nodebug__)) 215_mm_cmpgt_ss(__m128 a, __m128 b) 216{ 217 return (__m128)__builtin_ia32_cmpss(b, a, 1); 218} 219 220static inline __m128 __attribute__((__always_inline__, __nodebug__)) 221_mm_cmpgt_ps(__m128 a, __m128 b) 222{ 223 return (__m128)__builtin_ia32_cmpps(b, a, 1); 224} 225 226static inline __m128 __attribute__((__always_inline__, __nodebug__)) 227_mm_cmpge_ss(__m128 a, __m128 b) 228{ 229 return (__m128)__builtin_ia32_cmpss(b, a, 2); 230} 231 232static inline __m128 __attribute__((__always_inline__, __nodebug__)) 233_mm_cmpge_ps(__m128 a, __m128 b) 234{ 235 return (__m128)__builtin_ia32_cmpps(b, a, 2); 236} 237 238static inline __m128 __attribute__((__always_inline__, __nodebug__)) 239_mm_cmpneq_ss(__m128 a, __m128 b) 240{ 241 return (__m128)__builtin_ia32_cmpss(a, b, 4); 242} 243 244static inline __m128 __attribute__((__always_inline__, __nodebug__)) 245_mm_cmpneq_ps(__m128 a, __m128 b) 246{ 247 return (__m128)__builtin_ia32_cmpps(a, b, 4); 248} 249 
250static inline __m128 __attribute__((__always_inline__, __nodebug__)) 251_mm_cmpnlt_ss(__m128 a, __m128 b) 252{ 253 return (__m128)__builtin_ia32_cmpss(a, b, 5); 254} 255 256static inline __m128 __attribute__((__always_inline__, __nodebug__)) 257_mm_cmpnlt_ps(__m128 a, __m128 b) 258{ 259 return (__m128)__builtin_ia32_cmpps(a, b, 5); 260} 261 262static inline __m128 __attribute__((__always_inline__, __nodebug__)) 263_mm_cmpnle_ss(__m128 a, __m128 b) 264{ 265 return (__m128)__builtin_ia32_cmpss(a, b, 6); 266} 267 268static inline __m128 __attribute__((__always_inline__, __nodebug__)) 269_mm_cmpnle_ps(__m128 a, __m128 b) 270{ 271 return (__m128)__builtin_ia32_cmpps(a, b, 6); 272} 273 274static inline __m128 __attribute__((__always_inline__, __nodebug__)) 275_mm_cmpngt_ss(__m128 a, __m128 b) 276{ 277 return (__m128)__builtin_ia32_cmpss(b, a, 5); 278} 279 280static inline __m128 __attribute__((__always_inline__, __nodebug__)) 281_mm_cmpngt_ps(__m128 a, __m128 b) 282{ 283 return (__m128)__builtin_ia32_cmpps(b, a, 5); 284} 285 286static inline __m128 __attribute__((__always_inline__, __nodebug__)) 287_mm_cmpnge_ss(__m128 a, __m128 b) 288{ 289 return (__m128)__builtin_ia32_cmpss(b, a, 6); 290} 291 292static inline __m128 __attribute__((__always_inline__, __nodebug__)) 293_mm_cmpnge_ps(__m128 a, __m128 b) 294{ 295 return (__m128)__builtin_ia32_cmpps(b, a, 6); 296} 297 298static inline __m128 __attribute__((__always_inline__, __nodebug__)) 299_mm_cmpord_ss(__m128 a, __m128 b) 300{ 301 return (__m128)__builtin_ia32_cmpss(a, b, 7); 302} 303 304static inline __m128 __attribute__((__always_inline__, __nodebug__)) 305_mm_cmpord_ps(__m128 a, __m128 b) 306{ 307 return (__m128)__builtin_ia32_cmpps(a, b, 7); 308} 309 310static inline __m128 __attribute__((__always_inline__, __nodebug__)) 311_mm_cmpunord_ss(__m128 a, __m128 b) 312{ 313 return (__m128)__builtin_ia32_cmpss(a, b, 3); 314} 315 316static inline __m128 __attribute__((__always_inline__, __nodebug__)) 
317_mm_cmpunord_ps(__m128 a, __m128 b) 318{ 319 return (__m128)__builtin_ia32_cmpps(a, b, 3); 320} 321 322static inline int __attribute__((__always_inline__, __nodebug__)) 323_mm_comieq_ss(__m128 a, __m128 b) 324{ 325 return __builtin_ia32_comieq(a, b); 326} 327 328static inline int __attribute__((__always_inline__, __nodebug__)) 329_mm_comilt_ss(__m128 a, __m128 b) 330{ 331 return __builtin_ia32_comilt(a, b); 332} 333 334static inline int __attribute__((__always_inline__, __nodebug__)) 335_mm_comile_ss(__m128 a, __m128 b) 336{ 337 return __builtin_ia32_comile(a, b); 338} 339 340static inline int __attribute__((__always_inline__, __nodebug__)) 341_mm_comigt_ss(__m128 a, __m128 b) 342{ 343 return __builtin_ia32_comigt(a, b); 344} 345 346static inline int __attribute__((__always_inline__, __nodebug__)) 347_mm_comige_ss(__m128 a, __m128 b) 348{ 349 return __builtin_ia32_comige(a, b); 350} 351 352static inline int __attribute__((__always_inline__, __nodebug__)) 353_mm_comineq_ss(__m128 a, __m128 b) 354{ 355 return __builtin_ia32_comineq(a, b); 356} 357 358static inline int __attribute__((__always_inline__, __nodebug__)) 359_mm_ucomieq_ss(__m128 a, __m128 b) 360{ 361 return __builtin_ia32_ucomieq(a, b); 362} 363 364static inline int __attribute__((__always_inline__, __nodebug__)) 365_mm_ucomilt_ss(__m128 a, __m128 b) 366{ 367 return __builtin_ia32_ucomilt(a, b); 368} 369 370static inline int __attribute__((__always_inline__, __nodebug__)) 371_mm_ucomile_ss(__m128 a, __m128 b) 372{ 373 return __builtin_ia32_ucomile(a, b); 374} 375 376static inline int __attribute__((__always_inline__, __nodebug__)) 377_mm_ucomigt_ss(__m128 a, __m128 b) 378{ 379 return __builtin_ia32_ucomigt(a, b); 380} 381 382static inline int __attribute__((__always_inline__, __nodebug__)) 383_mm_ucomige_ss(__m128 a, __m128 b) 384{ 385 return __builtin_ia32_ucomige(a, b); 386} 387 388static inline int __attribute__((__always_inline__, __nodebug__)) 389_mm_ucomineq_ss(__m128 a, __m128 b) 390{ 391 return 
__builtin_ia32_ucomineq(a, b); 392} 393 394static inline int __attribute__((__always_inline__, __nodebug__)) 395_mm_cvtss_si32(__m128 a) 396{ 397 return __builtin_ia32_cvtss2si(a); 398} 399 400#ifdef __x86_64__ 401 402static inline long long __attribute__((__always_inline__, __nodebug__)) 403_mm_cvtss_si64(__m128 a) 404{ 405 return __builtin_ia32_cvtss2si64(a); 406} 407 408#endif 409 410static inline __m64 __attribute__((__always_inline__, __nodebug__)) 411_mm_cvtps_pi32(__m128 a) 412{ 413 return (__m64)__builtin_ia32_cvtps2pi(a); 414} 415 416static inline int __attribute__((__always_inline__, __nodebug__)) 417_mm_cvttss_si32(__m128 a) 418{ 419 return a[0]; 420} 421 422static inline long long __attribute__((__always_inline__, __nodebug__)) 423_mm_cvttss_si64(__m128 a) 424{ 425 return a[0]; 426} 427 428static inline __m64 __attribute__((__always_inline__, __nodebug__)) 429_mm_cvttps_pi32(__m128 a) 430{ 431 return (__m64)__builtin_ia32_cvttps2pi(a); 432} 433 434static inline __m128 __attribute__((__always_inline__, __nodebug__)) 435_mm_cvtsi32_ss(__m128 a, int b) 436{ 437 a[0] = b; 438 return a; 439} 440 441#ifdef __x86_64__ 442 443static inline __m128 __attribute__((__always_inline__, __nodebug__)) 444_mm_cvtsi64_ss(__m128 a, long long b) 445{ 446 a[0] = b; 447 return a; 448} 449 450#endif 451 452static inline __m128 __attribute__((__always_inline__, __nodebug__)) 453_mm_cvtpi32_ps(__m128 a, __m64 b) 454{ 455 return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 456} 457 458static inline float __attribute__((__always_inline__, __nodebug__)) 459_mm_cvtss_f32(__m128 a) 460{ 461 return a[0]; 462} 463 464static inline __m128 __attribute__((__always_inline__, __nodebug__)) 465_mm_loadh_pi(__m128 a, __m64 const *p) 466{ 467 return __builtin_ia32_loadhps(a, (__v2si *)p); 468} 469 470static inline __m128 __attribute__((__always_inline__, __nodebug__)) 471_mm_loadl_pi(__m128 a, __m64 const *p) 472{ 473#if 0 474 // FIXME: This should work, but gives really crappy code at the moment 
475 __m128 b; 476 b[0] = *(float*)p; 477 b[1] = *((float*)p+1); 478 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 479#endif 480 return __builtin_ia32_loadlps(a, (__v2si *)p); 481} 482 483static inline __m128 __attribute__((__always_inline__, __nodebug__)) 484_mm_load_ss(float *p) 485{ 486 return (__m128){ *p, 0, 0, 0 }; 487} 488 489static inline __m128 __attribute__((__always_inline__, __nodebug__)) 490_mm_load1_ps(float *p) 491{ 492 return (__m128){ *p, *p, *p, *p }; 493} 494 495#define _mm_load_ps1(p) _mm_load1_ps(p) 496 497static inline __m128 __attribute__((__always_inline__, __nodebug__)) 498_mm_load_ps(float *p) 499{ 500 return *(__m128*)p; 501} 502 503static inline __m128 __attribute__((__always_inline__, __nodebug__)) 504_mm_loadu_ps(float *p) 505{ 506 return __builtin_ia32_loadups(p); 507} 508 509static inline __m128 __attribute__((__always_inline__, __nodebug__)) 510_mm_loadr_ps(float *p) 511{ 512 __m128 a = _mm_load_ps(p); 513 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 514} 515 516static inline __m128 __attribute__((__always_inline__, __nodebug__)) 517_mm_set_ss(float w) 518{ 519 return (__m128){ w, 0, 0, 0 }; 520} 521 522static inline __m128 __attribute__((__always_inline__, __nodebug__)) 523_mm_set1_ps(float w) 524{ 525 return (__m128){ w, w, w, w }; 526} 527 528// Microsoft specific. 
529static inline __m128 __attribute__((__always_inline__, __nodebug__)) 530_mm_set_ps1(float w) 531{ 532 return _mm_set1_ps(w); 533} 534 535static inline __m128 __attribute__((__always_inline__, __nodebug__)) 536_mm_set_ps(float z, float y, float x, float w) 537{ 538 return (__m128){ w, x, y, z }; 539} 540 541static inline __m128 __attribute__((__always_inline__, __nodebug__)) 542_mm_setr_ps(float z, float y, float x, float w) 543{ 544 return (__m128){ z, y, x, w }; 545} 546 547static inline __m128 __attribute__((__always_inline__)) 548_mm_setzero_ps(void) 549{ 550 return (__m128){ 0, 0, 0, 0 }; 551} 552 553static inline void __attribute__((__always_inline__)) 554_mm_storeh_pi(__m64 *p, __m128 a) 555{ 556 __builtin_ia32_storehps((__v2si *)p, a); 557} 558 559static inline void __attribute__((__always_inline__)) 560_mm_storel_pi(__m64 *p, __m128 a) 561{ 562 __builtin_ia32_storelps((__v2si *)p, a); 563} 564 565static inline void __attribute__((__always_inline__)) 566_mm_store_ss(float *p, __m128 a) 567{ 568 *p = a[0]; 569} 570 571static inline void __attribute__((__always_inline__, __nodebug__)) 572_mm_storeu_ps(float *p, __m128 a) 573{ 574 __builtin_ia32_storeups(p, a); 575} 576 577static inline void __attribute__((__always_inline__, __nodebug__)) 578_mm_store1_ps(float *p, __m128 a) 579{ 580 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 581 _mm_storeu_ps(p, a); 582} 583 584static inline void __attribute__((__always_inline__, __nodebug__)) 585_mm_store_ps(float *p, __m128 a) 586{ 587 *(__m128 *)p = a; 588} 589 590static inline void __attribute__((__always_inline__, __nodebug__)) 591_mm_storer_ps(float *p, __m128 a) 592{ 593 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 594 _mm_store_ps(p, a); 595} 596 597#define _MM_HINT_T0 1 598#define _MM_HINT_T1 2 599#define _MM_HINT_T2 3 600#define _MM_HINT_NTA 0 601 602/* FIXME: We have to #define this because "sel" must be a constant integer, and 603 Sema doesn't do any form of constant propagation yet. 
*/ 604 605#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) 606 607static inline void __attribute__((__always_inline__, __nodebug__)) 608_mm_stream_pi(__m64 *p, __m64 a) 609{ 610 __builtin_ia32_movntq(p, a); 611} 612 613static inline void __attribute__((__always_inline__, __nodebug__)) 614_mm_stream_ps(float *p, __m128 a) 615{ 616 __builtin_ia32_movntps(p, a); 617} 618 619static inline void __attribute__((__always_inline__, __nodebug__)) 620_mm_sfence(void) 621{ 622 __builtin_ia32_sfence(); 623} 624 625static inline int __attribute__((__always_inline__, __nodebug__)) 626_mm_extract_pi16(__m64 a, int n) 627{ 628 __v4hi b = (__v4hi)a; 629 return (unsigned short)b[n & 3]; 630} 631 632static inline __m64 __attribute__((__always_inline__, __nodebug__)) 633_mm_insert_pi16(__m64 a, int d, int n) 634{ 635 __v4hi b = (__v4hi)a; 636 b[n & 3] = d; 637 return (__m64)b; 638} 639 640static inline __m64 __attribute__((__always_inline__, __nodebug__)) 641_mm_max_pi16(__m64 a, __m64 b) 642{ 643 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 644} 645 646static inline __m64 __attribute__((__always_inline__, __nodebug__)) 647_mm_max_pu8(__m64 a, __m64 b) 648{ 649 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 650} 651 652static inline __m64 __attribute__((__always_inline__, __nodebug__)) 653_mm_min_pi16(__m64 a, __m64 b) 654{ 655 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 656} 657 658static inline __m64 __attribute__((__always_inline__, __nodebug__)) 659_mm_min_pu8(__m64 a, __m64 b) 660{ 661 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 662} 663 664static inline int __attribute__((__always_inline__, __nodebug__)) 665_mm_movemask_pi8(__m64 a) 666{ 667 return __builtin_ia32_pmovmskb((__v8qi)a); 668} 669 670static inline __m64 __attribute__((__always_inline__, __nodebug__)) 671_mm_mulhi_pu16(__m64 a, __m64 b) 672{ 673 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 674} 675 676#define _mm_shuffle_pi16(a, 
n) \ 677 ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \ 678 (n) & 0x3, ((n) & 0xc) >> 2, \ 679 ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6)) 680 681static inline void __attribute__((__always_inline__, __nodebug__)) 682_mm_maskmove_si64(__m64 d, __m64 n, char *p) 683{ 684 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 685} 686 687static inline __m64 __attribute__((__always_inline__, __nodebug__)) 688_mm_avg_pu8(__m64 a, __m64 b) 689{ 690 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 691} 692 693static inline __m64 __attribute__((__always_inline__, __nodebug__)) 694_mm_avg_pu16(__m64 a, __m64 b) 695{ 696 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 697} 698 699static inline __m64 __attribute__((__always_inline__, __nodebug__)) 700_mm_sad_pu8(__m64 a, __m64 b) 701{ 702 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 703} 704 705static inline unsigned int __attribute__((__always_inline__, __nodebug__)) 706_mm_getcsr(void) 707{ 708 return __builtin_ia32_stmxcsr(); 709} 710 711static inline void __attribute__((__always_inline__, __nodebug__)) 712_mm_setcsr(unsigned int i) 713{ 714 __builtin_ia32_ldmxcsr(i); 715} 716 717#define _mm_shuffle_ps(a, b, mask) \ 718 (__builtin_shufflevector(a, b, (mask) & 0x3, ((mask) & 0xc) >> 2, \ 719 (((mask) & 0x30) >> 4) + 4, \ 720 (((mask) & 0xc0) >> 6) + 4)) 721 722static inline __m128 __attribute__((__always_inline__, __nodebug__)) 723_mm_unpackhi_ps(__m128 a, __m128 b) 724{ 725 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 726} 727 728static inline __m128 __attribute__((__always_inline__, __nodebug__)) 729_mm_unpacklo_ps(__m128 a, __m128 b) 730{ 731 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 732} 733 734static inline __m128 __attribute__((__always_inline__, __nodebug__)) 735_mm_move_ss(__m128 a, __m128 b) 736{ 737 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 738} 739 740static inline __m128 __attribute__((__always_inline__, __nodebug__)) 741_mm_movehl_ps(__m128 a, __m128 b) 
742{ 743 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 744} 745 746static inline __m128 __attribute__((__always_inline__, __nodebug__)) 747_mm_movelh_ps(__m128 a, __m128 b) 748{ 749 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 750} 751 752static inline __m128 __attribute__((__always_inline__, __nodebug__)) 753_mm_cvtpi16_ps(__m64 a) 754{ 755 __m64 b, c; 756 __m128 r; 757 758 b = _mm_setzero_si64(); 759 b = _mm_cmpgt_pi16(b, a); 760 c = _mm_unpackhi_pi16(a, b); 761 r = _mm_setzero_ps(); 762 r = _mm_cvtpi32_ps(r, c); 763 r = _mm_movelh_ps(r, r); 764 c = _mm_unpacklo_pi16(a, b); 765 r = _mm_cvtpi32_ps(r, c); 766 767 return r; 768} 769 770static inline __m128 __attribute__((__always_inline__, __nodebug__)) 771_mm_cvtpu16_ps(__m64 a) 772{ 773 __m64 b, c; 774 __m128 r; 775 776 b = _mm_setzero_si64(); 777 c = _mm_unpackhi_pi16(a, b); 778 r = _mm_setzero_ps(); 779 r = _mm_cvtpi32_ps(r, c); 780 r = _mm_movelh_ps(r, r); 781 c = _mm_unpacklo_pi16(a, b); 782 r = _mm_cvtpi32_ps(r, c); 783 784 return r; 785} 786 787static inline __m128 __attribute__((__always_inline__, __nodebug__)) 788_mm_cvtpi8_ps(__m64 a) 789{ 790 __m64 b; 791 792 b = _mm_setzero_si64(); 793 b = _mm_cmpgt_pi8(b, a); 794 b = _mm_unpacklo_pi8(a, b); 795 796 return _mm_cvtpi16_ps(b); 797} 798 799static inline __m128 __attribute__((__always_inline__, __nodebug__)) 800_mm_cvtpu8_ps(__m64 a) 801{ 802 __m64 b; 803 804 b = _mm_setzero_si64(); 805 b = _mm_unpacklo_pi8(a, b); 806 807 return _mm_cvtpi16_ps(b); 808} 809 810static inline __m128 __attribute__((__always_inline__, __nodebug__)) 811_mm_cvtpi32x2_ps(__m64 a, __m64 b) 812{ 813 __m128 c; 814 815 c = _mm_setzero_ps(); 816 c = _mm_cvtpi32_ps(c, b); 817 c = _mm_movelh_ps(c, c); 818 819 return _mm_cvtpi32_ps(c, a); 820} 821 822static inline __m64 __attribute__((__always_inline__, __nodebug__)) 823_mm_cvtps_pi16(__m128 a) 824{ 825 __m64 b, c; 826 827 b = _mm_cvtps_pi32(a); 828 a = _mm_movehl_ps(a, a); 829 c = _mm_cvtps_pi32(a); 830 831 return 
_mm_packs_pi16(b, c); 832} 833 834static inline __m64 __attribute__((__always_inline__, __nodebug__)) 835_mm_cvtps_pi8(__m128 a) 836{ 837 __m64 b, c; 838 839 b = _mm_cvtps_pi16(a); 840 c = _mm_setzero_si64(); 841 842 return _mm_packs_pi16(b, c); 843} 844 845static inline int __attribute__((__always_inline__, __nodebug__)) 846_mm_movemask_ps(__m128 a) 847{ 848 return __builtin_ia32_movmskps(a); 849} 850 851#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 852 853#define _MM_EXCEPT_INVALID (0x0001) 854#define _MM_EXCEPT_DENORM (0x0002) 855#define _MM_EXCEPT_DIV_ZERO (0x0004) 856#define _MM_EXCEPT_OVERFLOW (0x0008) 857#define _MM_EXCEPT_UNDERFLOW (0x0010) 858#define _MM_EXCEPT_INEXACT (0x0020) 859#define _MM_EXCEPT_MASK (0x003f) 860 861#define _MM_MASK_INVALID (0x0080) 862#define _MM_MASK_DENORM (0x0100) 863#define _MM_MASK_DIV_ZERO (0x0200) 864#define _MM_MASK_OVERFLOW (0x0400) 865#define _MM_MASK_UNDERFLOW (0x0800) 866#define _MM_MASK_INEXACT (0x1000) 867#define _MM_MASK_MASK (0x1f80) 868 869#define _MM_ROUND_NEAREST (0x0000) 870#define _MM_ROUND_DOWN (0x2000) 871#define _MM_ROUND_UP (0x4000) 872#define _MM_ROUND_TOWARD_ZERO (0x6000) 873#define _MM_ROUND_MASK (0x6000) 874 875#define _MM_FLUSH_ZERO_MASK (0x8000) 876#define _MM_FLUSH_ZERO_ON (0x8000) 877#define _MM_FLUSH_ZERO_OFF (0x8000) 878 879#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 880#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 881#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 882#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 883 884#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 885#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 886#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 887#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 888 
889#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 890do { \ 891 __m128 tmp3, tmp2, tmp1, tmp0; \ 892 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 893 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 894 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 895 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 896 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 897 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 898 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 899 (row3) = _mm_movelh_ps(tmp3, tmp1); \ 900} while (0) 901 902#include <emmintrin.h> 903 904#endif /* __SSE__ */ 905 906#endif /* __XMMINTRIN_H */ 907