/* xmmintrin.h revision 212904 */
1264391Snwhitehorn/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2264391Snwhitehorn * 3264925Simp * Permission is hereby granted, free of charge, to any person obtaining a copy 4264391Snwhitehorn * of this software and associated documentation files (the "Software"), to deal 5264391Snwhitehorn * in the Software without restriction, including without limitation the rights 6264391Snwhitehorn * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7264391Snwhitehorn * copies of the Software, and to permit persons to whom the Software is 8264391Snwhitehorn * furnished to do so, subject to the following conditions: 9264391Snwhitehorn * 10264391Snwhitehorn * The above copyright notice and this permission notice shall be included in 11264391Snwhitehorn * all copies or substantial portions of the Software. 12264391Snwhitehorn * 13264391Snwhitehorn * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14264391Snwhitehorn * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15264391Snwhitehorn * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16264391Snwhitehorn * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17264391Snwhitehorn * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18264391Snwhitehorn * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19264391Snwhitehorn * THE SOFTWARE. 
20280950Sandrew * 21280950Sandrew *===-----------------------------------------------------------------------=== 22264391Snwhitehorn */ 23264391Snwhitehorn 24264391Snwhitehorn#ifndef __XMMINTRIN_H 25264403Snwhitehorn#define __XMMINTRIN_H 26281237Semaste 27281237Semaste#ifndef __SSE__ 28281237Semaste#error "SSE instruction set not enabled" 29264391Snwhitehorn#else 30264391Snwhitehorn 31264975Snwhitehorn#include <mmintrin.h> 32264391Snwhitehorn 33264391Snwhitehorntypedef int __v4si __attribute__((__vector_size__(16))); 34280950Sandrewtypedef float __v4sf __attribute__((__vector_size__(16))); 35281027Sandrewtypedef float __m128 __attribute__((__vector_size__(16))); 36264391Snwhitehorn 37281027Sandrew#include <mm_malloc.h> 38281027Sandrew 39281027Sandrewstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 40281027Sandrew_mm_add_ss(__m128 a, __m128 b) 41281238Semaste{ 42281156Sandrew a[0] += b[0]; 43281238Semaste return a; 44281238Semaste} 45281156Sandrew 46281156Sandrewstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 47281156Sandrew_mm_add_ps(__m128 a, __m128 b) 48281156Sandrew{ 49281156Sandrew return a + b; 50264391Snwhitehorn} 51264391Snwhitehorn 52264391Snwhitehornstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 53264391Snwhitehorn_mm_sub_ss(__m128 a, __m128 b) 54264391Snwhitehorn{ 55264391Snwhitehorn a[0] -= b[0]; 56264391Snwhitehorn return a; 57280950Sandrew} 58264391Snwhitehorn 59281156Sandrewstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 60281156Sandrew_mm_sub_ps(__m128 a, __m128 b) 61264391Snwhitehorn{ 62264391Snwhitehorn return a - b; 63264391Snwhitehorn} 64264391Snwhitehorn 65264391Snwhitehornstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 66264391Snwhitehorn_mm_mul_ss(__m128 a, __m128 b) 67264391Snwhitehorn{ 68281156Sandrew a[0] *= b[0]; 69264391Snwhitehorn return a; 70264391Snwhitehorn} 71276146Semaste 72264391Snwhitehornstatic __inline__ 
__m128 __attribute__((__always_inline__, __nodebug__)) 73264414Snwhitehorn_mm_mul_ps(__m128 a, __m128 b) 74264414Snwhitehorn{ 75264975Snwhitehorn return a * b; 76264975Snwhitehorn} 77264975Snwhitehorn 78264975Snwhitehornstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 79264975Snwhitehorn_mm_div_ss(__m128 a, __m128 b) 80264975Snwhitehorn{ 81264975Snwhitehorn a[0] /= b[0]; 82281156Sandrew return a; 83281156Sandrew} 84264975Snwhitehorn 85264975Snwhitehornstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 86264975Snwhitehorn_mm_div_ps(__m128 a, __m128 b) 87281117Sandrew{ 88264975Snwhitehorn return a / b; 89264391Snwhitehorn} 90264391Snwhitehorn 91264391Snwhitehornstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 92264391Snwhitehorn_mm_sqrt_ss(__m128 a) 93281027Sandrew{ 94264391Snwhitehorn return __builtin_ia32_sqrtss(a); 95264391Snwhitehorn} 96281117Sandrew 97264391Snwhitehornstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 98264391Snwhitehorn_mm_sqrt_ps(__m128 a) 99264391Snwhitehorn{ 100264391Snwhitehorn return __builtin_ia32_sqrtps(a); 101264391Snwhitehorn} 102264391Snwhitehorn 103281027Sandrewstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 104_mm_rcp_ss(__m128 a) 105{ 106 return __builtin_ia32_rcpss(a); 107} 108 109static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 110_mm_rcp_ps(__m128 a) 111{ 112 return __builtin_ia32_rcpps(a); 113} 114 115static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 116_mm_rsqrt_ss(__m128 a) 117{ 118 return __builtin_ia32_rsqrtss(a); 119} 120 121static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 122_mm_rsqrt_ps(__m128 a) 123{ 124 return __builtin_ia32_rsqrtps(a); 125} 126 127static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 128_mm_min_ss(__m128 a, __m128 b) 129{ 130 return __builtin_ia32_minss(a, b); 131} 132 133static __inline__ __m128 
__attribute__((__always_inline__, __nodebug__)) 134_mm_min_ps(__m128 a, __m128 b) 135{ 136 return __builtin_ia32_minps(a, b); 137} 138 139static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 140_mm_max_ss(__m128 a, __m128 b) 141{ 142 return __builtin_ia32_maxss(a, b); 143} 144 145static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 146_mm_max_ps(__m128 a, __m128 b) 147{ 148 return __builtin_ia32_maxps(a, b); 149} 150 151static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 152_mm_and_ps(__m128 a, __m128 b) 153{ 154 return (__m128)((__v4si)a & (__v4si)b); 155} 156 157static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 158_mm_andnot_ps(__m128 a, __m128 b) 159{ 160 return (__m128)(~(__v4si)a & (__v4si)b); 161} 162 163static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 164_mm_or_ps(__m128 a, __m128 b) 165{ 166 return (__m128)((__v4si)a | (__v4si)b); 167} 168 169static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 170_mm_xor_ps(__m128 a, __m128 b) 171{ 172 return (__m128)((__v4si)a ^ (__v4si)b); 173} 174 175static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 176_mm_cmpeq_ss(__m128 a, __m128 b) 177{ 178 return (__m128)__builtin_ia32_cmpss(a, b, 0); 179} 180 181static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 182_mm_cmpeq_ps(__m128 a, __m128 b) 183{ 184 return (__m128)__builtin_ia32_cmpps(a, b, 0); 185} 186 187static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 188_mm_cmplt_ss(__m128 a, __m128 b) 189{ 190 return (__m128)__builtin_ia32_cmpss(a, b, 1); 191} 192 193static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 194_mm_cmplt_ps(__m128 a, __m128 b) 195{ 196 return (__m128)__builtin_ia32_cmpps(a, b, 1); 197} 198 199static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 200_mm_cmple_ss(__m128 a, __m128 b) 201{ 202 return (__m128)__builtin_ia32_cmpss(a, 
b, 2); 203} 204 205static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 206_mm_cmple_ps(__m128 a, __m128 b) 207{ 208 return (__m128)__builtin_ia32_cmpps(a, b, 2); 209} 210 211static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 212_mm_cmpgt_ss(__m128 a, __m128 b) 213{ 214 return (__m128)__builtin_ia32_cmpss(b, a, 1); 215} 216 217static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 218_mm_cmpgt_ps(__m128 a, __m128 b) 219{ 220 return (__m128)__builtin_ia32_cmpps(b, a, 1); 221} 222 223static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 224_mm_cmpge_ss(__m128 a, __m128 b) 225{ 226 return (__m128)__builtin_ia32_cmpss(b, a, 2); 227} 228 229static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 230_mm_cmpge_ps(__m128 a, __m128 b) 231{ 232 return (__m128)__builtin_ia32_cmpps(b, a, 2); 233} 234 235static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 236_mm_cmpneq_ss(__m128 a, __m128 b) 237{ 238 return (__m128)__builtin_ia32_cmpss(a, b, 4); 239} 240 241static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 242_mm_cmpneq_ps(__m128 a, __m128 b) 243{ 244 return (__m128)__builtin_ia32_cmpps(a, b, 4); 245} 246 247static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 248_mm_cmpnlt_ss(__m128 a, __m128 b) 249{ 250 return (__m128)__builtin_ia32_cmpss(a, b, 5); 251} 252 253static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 254_mm_cmpnlt_ps(__m128 a, __m128 b) 255{ 256 return (__m128)__builtin_ia32_cmpps(a, b, 5); 257} 258 259static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 260_mm_cmpnle_ss(__m128 a, __m128 b) 261{ 262 return (__m128)__builtin_ia32_cmpss(a, b, 6); 263} 264 265static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 266_mm_cmpnle_ps(__m128 a, __m128 b) 267{ 268 return (__m128)__builtin_ia32_cmpps(a, b, 6); 269} 270 271static __inline__ __m128 
__attribute__((__always_inline__, __nodebug__)) 272_mm_cmpngt_ss(__m128 a, __m128 b) 273{ 274 return (__m128)__builtin_ia32_cmpss(b, a, 5); 275} 276 277static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 278_mm_cmpngt_ps(__m128 a, __m128 b) 279{ 280 return (__m128)__builtin_ia32_cmpps(b, a, 5); 281} 282 283static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 284_mm_cmpnge_ss(__m128 a, __m128 b) 285{ 286 return (__m128)__builtin_ia32_cmpss(b, a, 6); 287} 288 289static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 290_mm_cmpnge_ps(__m128 a, __m128 b) 291{ 292 return (__m128)__builtin_ia32_cmpps(b, a, 6); 293} 294 295static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 296_mm_cmpord_ss(__m128 a, __m128 b) 297{ 298 return (__m128)__builtin_ia32_cmpss(a, b, 7); 299} 300 301static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 302_mm_cmpord_ps(__m128 a, __m128 b) 303{ 304 return (__m128)__builtin_ia32_cmpps(a, b, 7); 305} 306 307static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 308_mm_cmpunord_ss(__m128 a, __m128 b) 309{ 310 return (__m128)__builtin_ia32_cmpss(a, b, 3); 311} 312 313static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 314_mm_cmpunord_ps(__m128 a, __m128 b) 315{ 316 return (__m128)__builtin_ia32_cmpps(a, b, 3); 317} 318 319static __inline__ int __attribute__((__always_inline__, __nodebug__)) 320_mm_comieq_ss(__m128 a, __m128 b) 321{ 322 return __builtin_ia32_comieq(a, b); 323} 324 325static __inline__ int __attribute__((__always_inline__, __nodebug__)) 326_mm_comilt_ss(__m128 a, __m128 b) 327{ 328 return __builtin_ia32_comilt(a, b); 329} 330 331static __inline__ int __attribute__((__always_inline__, __nodebug__)) 332_mm_comile_ss(__m128 a, __m128 b) 333{ 334 return __builtin_ia32_comile(a, b); 335} 336 337static __inline__ int __attribute__((__always_inline__, __nodebug__)) 338_mm_comigt_ss(__m128 a, __m128 b) 339{ 
340 return __builtin_ia32_comigt(a, b); 341} 342 343static __inline__ int __attribute__((__always_inline__, __nodebug__)) 344_mm_comige_ss(__m128 a, __m128 b) 345{ 346 return __builtin_ia32_comige(a, b); 347} 348 349static __inline__ int __attribute__((__always_inline__, __nodebug__)) 350_mm_comineq_ss(__m128 a, __m128 b) 351{ 352 return __builtin_ia32_comineq(a, b); 353} 354 355static __inline__ int __attribute__((__always_inline__, __nodebug__)) 356_mm_ucomieq_ss(__m128 a, __m128 b) 357{ 358 return __builtin_ia32_ucomieq(a, b); 359} 360 361static __inline__ int __attribute__((__always_inline__, __nodebug__)) 362_mm_ucomilt_ss(__m128 a, __m128 b) 363{ 364 return __builtin_ia32_ucomilt(a, b); 365} 366 367static __inline__ int __attribute__((__always_inline__, __nodebug__)) 368_mm_ucomile_ss(__m128 a, __m128 b) 369{ 370 return __builtin_ia32_ucomile(a, b); 371} 372 373static __inline__ int __attribute__((__always_inline__, __nodebug__)) 374_mm_ucomigt_ss(__m128 a, __m128 b) 375{ 376 return __builtin_ia32_ucomigt(a, b); 377} 378 379static __inline__ int __attribute__((__always_inline__, __nodebug__)) 380_mm_ucomige_ss(__m128 a, __m128 b) 381{ 382 return __builtin_ia32_ucomige(a, b); 383} 384 385static __inline__ int __attribute__((__always_inline__, __nodebug__)) 386_mm_ucomineq_ss(__m128 a, __m128 b) 387{ 388 return __builtin_ia32_ucomineq(a, b); 389} 390 391static __inline__ int __attribute__((__always_inline__, __nodebug__)) 392_mm_cvtss_si32(__m128 a) 393{ 394 return __builtin_ia32_cvtss2si(a); 395} 396 397static __inline__ int __attribute__((__always_inline__, __nodebug__)) 398_mm_cvt_ss2si(__m128 a) 399{ 400 return _mm_cvtss_si32(a); 401} 402 403#ifdef __x86_64__ 404 405static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 406_mm_cvtss_si64(__m128 a) 407{ 408 return __builtin_ia32_cvtss2si64(a); 409} 410 411#endif 412 413static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 414_mm_cvtps_pi32(__m128 a) 415{ 416 return 
(__m64)__builtin_ia32_cvtps2pi(a); 417} 418 419static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 420_mm_cvt_ps2pi(__m128 a) 421{ 422 return _mm_cvtps_pi32(a); 423} 424 425static __inline__ int __attribute__((__always_inline__, __nodebug__)) 426_mm_cvttss_si32(__m128 a) 427{ 428 return a[0]; 429} 430 431static __inline__ int __attribute__((__always_inline__, __nodebug__)) 432_mm_cvtt_ss2si(__m128 a) 433{ 434 return _mm_cvttss_si32(a); 435} 436 437static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 438_mm_cvttss_si64(__m128 a) 439{ 440 return a[0]; 441} 442 443static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 444_mm_cvttps_pi32(__m128 a) 445{ 446 return (__m64)__builtin_ia32_cvttps2pi(a); 447} 448 449static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 450_mm_cvtt_ps2pi(__m128 a) 451{ 452 return _mm_cvttps_pi32(a); 453} 454 455static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 456_mm_cvtsi32_ss(__m128 a, int b) 457{ 458 a[0] = b; 459 return a; 460} 461 462static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 463_mm_cvt_si2ss(__m128 a, int b) 464{ 465 return _mm_cvtsi32_ss(a, b); 466} 467 468#ifdef __x86_64__ 469 470static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 471_mm_cvtsi64_ss(__m128 a, long long b) 472{ 473 a[0] = b; 474 return a; 475} 476 477#endif 478 479static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 480_mm_cvtpi32_ps(__m128 a, __m64 b) 481{ 482 return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 483} 484 485static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 486_mm_cvt_pi2ps(__m128 a, __m64 b) 487{ 488 return _mm_cvtpi32_ps(a, b); 489} 490 491static __inline__ float __attribute__((__always_inline__, __nodebug__)) 492_mm_cvtss_f32(__m128 a) 493{ 494 return a[0]; 495} 496 497static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 498_mm_loadh_pi(__m128 a, 
const __m64 *p) 499{ 500 __m128 b; 501 b[0] = *(float*)p; 502 b[1] = *((float*)p+1); 503 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 504} 505 506static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 507_mm_loadl_pi(__m128 a, const __m64 *p) 508{ 509 __m128 b; 510 b[0] = *(float*)p; 511 b[1] = *((float*)p+1); 512 return __builtin_shufflevector(a, b, 4, 5, 2, 3); 513} 514 515static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 516_mm_load_ss(const float *p) 517{ 518 return (__m128){ *p, 0, 0, 0 }; 519} 520 521static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 522_mm_load1_ps(const float *p) 523{ 524 return (__m128){ *p, *p, *p, *p }; 525} 526 527#define _mm_load_ps1(p) _mm_load1_ps(p) 528 529static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 530_mm_load_ps(const float *p) 531{ 532 return *(__m128*)p; 533} 534 535static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 536_mm_loadu_ps(const float *p) 537{ 538 return __builtin_ia32_loadups(p); 539} 540 541static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 542_mm_loadr_ps(const float *p) 543{ 544 __m128 a = _mm_load_ps(p); 545 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 546} 547 548static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 549_mm_set_ss(float w) 550{ 551 return (__m128){ w, 0, 0, 0 }; 552} 553 554static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 555_mm_set1_ps(float w) 556{ 557 return (__m128){ w, w, w, w }; 558} 559 560// Microsoft specific. 
561static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 562_mm_set_ps1(float w) 563{ 564 return _mm_set1_ps(w); 565} 566 567static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 568_mm_set_ps(float z, float y, float x, float w) 569{ 570 return (__m128){ w, x, y, z }; 571} 572 573static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 574_mm_setr_ps(float z, float y, float x, float w) 575{ 576 return (__m128){ z, y, x, w }; 577} 578 579static __inline__ __m128 __attribute__((__always_inline__)) 580_mm_setzero_ps(void) 581{ 582 return (__m128){ 0, 0, 0, 0 }; 583} 584 585static __inline__ void __attribute__((__always_inline__)) 586_mm_storeh_pi(__m64 *p, __m128 a) 587{ 588 __builtin_ia32_storehps((__v2si *)p, a); 589} 590 591static __inline__ void __attribute__((__always_inline__)) 592_mm_storel_pi(__m64 *p, __m128 a) 593{ 594 __builtin_ia32_storelps((__v2si *)p, a); 595} 596 597static __inline__ void __attribute__((__always_inline__)) 598_mm_store_ss(float *p, __m128 a) 599{ 600 *p = a[0]; 601} 602 603static __inline__ void __attribute__((__always_inline__, __nodebug__)) 604_mm_storeu_ps(float *p, __m128 a) 605{ 606 __builtin_ia32_storeups(p, a); 607} 608 609static __inline__ void __attribute__((__always_inline__, __nodebug__)) 610_mm_store1_ps(float *p, __m128 a) 611{ 612 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 613 _mm_storeu_ps(p, a); 614} 615 616static __inline__ void __attribute__((__always_inline__, __nodebug__)) 617_mm_store_ps1(float *p, __m128 a) 618{ 619 return _mm_store1_ps(p, a); 620} 621 622static __inline__ void __attribute__((__always_inline__, __nodebug__)) 623_mm_store_ps(float *p, __m128 a) 624{ 625 *(__m128 *)p = a; 626} 627 628static __inline__ void __attribute__((__always_inline__, __nodebug__)) 629_mm_storer_ps(float *p, __m128 a) 630{ 631 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 632 _mm_store_ps(p, a); 633} 634 635#define _MM_HINT_T0 3 636#define _MM_HINT_T1 2 637#define 
_MM_HINT_T2 1 638#define _MM_HINT_NTA 0 639 640/* FIXME: We have to #define this because "sel" must be a constant integer, and 641 Sema doesn't do any form of constant propagation yet. */ 642 643#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, sel)) 644 645static __inline__ void __attribute__((__always_inline__, __nodebug__)) 646_mm_stream_pi(__m64 *p, __m64 a) 647{ 648 __builtin_ia32_movntq(p, a); 649} 650 651static __inline__ void __attribute__((__always_inline__, __nodebug__)) 652_mm_stream_ps(float *p, __m128 a) 653{ 654 __builtin_ia32_movntps(p, a); 655} 656 657static __inline__ void __attribute__((__always_inline__, __nodebug__)) 658_mm_sfence(void) 659{ 660 __builtin_ia32_sfence(); 661} 662 663static __inline__ int __attribute__((__always_inline__, __nodebug__)) 664_mm_extract_pi16(__m64 a, int n) 665{ 666 __v4hi b = (__v4hi)a; 667 return (unsigned short)b[n & 3]; 668} 669 670static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 671_mm_insert_pi16(__m64 a, int d, int n) 672{ 673 __v4hi b = (__v4hi)a; 674 b[n & 3] = d; 675 return (__m64)b; 676} 677 678static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 679_mm_max_pi16(__m64 a, __m64 b) 680{ 681 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 682} 683 684static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 685_mm_max_pu8(__m64 a, __m64 b) 686{ 687 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 688} 689 690static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 691_mm_min_pi16(__m64 a, __m64 b) 692{ 693 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 694} 695 696static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 697_mm_min_pu8(__m64 a, __m64 b) 698{ 699 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 700} 701 702static __inline__ int __attribute__((__always_inline__, __nodebug__)) 703_mm_movemask_pi8(__m64 a) 704{ 705 return 
__builtin_ia32_pmovmskb((__v8qi)a); 706} 707 708static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 709_mm_mulhi_pu16(__m64 a, __m64 b) 710{ 711 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 712} 713 714#define _mm_shuffle_pi16(a, n) \ 715 ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \ 716 (n) & 0x3, ((n) & 0xc) >> 2, \ 717 ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6)) 718 719static __inline__ void __attribute__((__always_inline__, __nodebug__)) 720_mm_maskmove_si64(__m64 d, __m64 n, char *p) 721{ 722 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 723} 724 725static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 726_mm_avg_pu8(__m64 a, __m64 b) 727{ 728 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 729} 730 731static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 732_mm_avg_pu16(__m64 a, __m64 b) 733{ 734 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 735} 736 737static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 738_mm_sad_pu8(__m64 a, __m64 b) 739{ 740 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 741} 742 743static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 744_mm_getcsr(void) 745{ 746 return __builtin_ia32_stmxcsr(); 747} 748 749static __inline__ void __attribute__((__always_inline__, __nodebug__)) 750_mm_setcsr(unsigned int i) 751{ 752 __builtin_ia32_ldmxcsr(i); 753} 754 755#define _mm_shuffle_ps(a, b, mask) \ 756 (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b), \ 757 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 758 (((mask) & 0x30) >> 4) + 4, \ 759 (((mask) & 0xc0) >> 6) + 4)) 760 761static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 762_mm_unpackhi_ps(__m128 a, __m128 b) 763{ 764 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 765} 766 767static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 768_mm_unpacklo_ps(__m128 a, __m128 b) 769{ 770 return 
__builtin_shufflevector(a, b, 0, 4, 1, 5); 771} 772 773static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 774_mm_move_ss(__m128 a, __m128 b) 775{ 776 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 777} 778 779static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 780_mm_movehl_ps(__m128 a, __m128 b) 781{ 782 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 783} 784 785static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 786_mm_movelh_ps(__m128 a, __m128 b) 787{ 788 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 789} 790 791static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 792_mm_cvtpi16_ps(__m64 a) 793{ 794 __m64 b, c; 795 __m128 r; 796 797 b = _mm_setzero_si64(); 798 b = _mm_cmpgt_pi16(b, a); 799 c = _mm_unpackhi_pi16(a, b); 800 r = _mm_setzero_ps(); 801 r = _mm_cvtpi32_ps(r, c); 802 r = _mm_movelh_ps(r, r); 803 c = _mm_unpacklo_pi16(a, b); 804 r = _mm_cvtpi32_ps(r, c); 805 806 return r; 807} 808 809static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 810_mm_cvtpu16_ps(__m64 a) 811{ 812 __m64 b, c; 813 __m128 r; 814 815 b = _mm_setzero_si64(); 816 c = _mm_unpackhi_pi16(a, b); 817 r = _mm_setzero_ps(); 818 r = _mm_cvtpi32_ps(r, c); 819 r = _mm_movelh_ps(r, r); 820 c = _mm_unpacklo_pi16(a, b); 821 r = _mm_cvtpi32_ps(r, c); 822 823 return r; 824} 825 826static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 827_mm_cvtpi8_ps(__m64 a) 828{ 829 __m64 b; 830 831 b = _mm_setzero_si64(); 832 b = _mm_cmpgt_pi8(b, a); 833 b = _mm_unpacklo_pi8(a, b); 834 835 return _mm_cvtpi16_ps(b); 836} 837 838static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 839_mm_cvtpu8_ps(__m64 a) 840{ 841 __m64 b; 842 843 b = _mm_setzero_si64(); 844 b = _mm_unpacklo_pi8(a, b); 845 846 return _mm_cvtpi16_ps(b); 847} 848 849static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 850_mm_cvtpi32x2_ps(__m64 a, __m64 b) 851{ 852 __m128 c; 853 
854 c = _mm_setzero_ps(); 855 c = _mm_cvtpi32_ps(c, b); 856 c = _mm_movelh_ps(c, c); 857 858 return _mm_cvtpi32_ps(c, a); 859} 860 861static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 862_mm_cvtps_pi16(__m128 a) 863{ 864 __m64 b, c; 865 866 b = _mm_cvtps_pi32(a); 867 a = _mm_movehl_ps(a, a); 868 c = _mm_cvtps_pi32(a); 869 870 return _mm_packs_pi16(b, c); 871} 872 873static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 874_mm_cvtps_pi8(__m128 a) 875{ 876 __m64 b, c; 877 878 b = _mm_cvtps_pi16(a); 879 c = _mm_setzero_si64(); 880 881 return _mm_packs_pi16(b, c); 882} 883 884static __inline__ int __attribute__((__always_inline__, __nodebug__)) 885_mm_movemask_ps(__m128 a) 886{ 887 return __builtin_ia32_movmskps(a); 888} 889 890#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 891 892#define _MM_EXCEPT_INVALID (0x0001) 893#define _MM_EXCEPT_DENORM (0x0002) 894#define _MM_EXCEPT_DIV_ZERO (0x0004) 895#define _MM_EXCEPT_OVERFLOW (0x0008) 896#define _MM_EXCEPT_UNDERFLOW (0x0010) 897#define _MM_EXCEPT_INEXACT (0x0020) 898#define _MM_EXCEPT_MASK (0x003f) 899 900#define _MM_MASK_INVALID (0x0080) 901#define _MM_MASK_DENORM (0x0100) 902#define _MM_MASK_DIV_ZERO (0x0200) 903#define _MM_MASK_OVERFLOW (0x0400) 904#define _MM_MASK_UNDERFLOW (0x0800) 905#define _MM_MASK_INEXACT (0x1000) 906#define _MM_MASK_MASK (0x1f80) 907 908#define _MM_ROUND_NEAREST (0x0000) 909#define _MM_ROUND_DOWN (0x2000) 910#define _MM_ROUND_UP (0x4000) 911#define _MM_ROUND_TOWARD_ZERO (0x6000) 912#define _MM_ROUND_MASK (0x6000) 913 914#define _MM_FLUSH_ZERO_MASK (0x8000) 915#define _MM_FLUSH_ZERO_ON (0x8000) 916#define _MM_FLUSH_ZERO_OFF (0x8000) 917 918#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 919#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 920#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 921#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 922 923#define 
_MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 924#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 925#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 926#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 927 928#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 929do { \ 930 __m128 tmp3, tmp2, tmp1, tmp0; \ 931 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 932 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 933 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 934 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 935 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 936 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 937 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 938 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 939} while (0) 940 941/* Aliases for compatibility. */ 942#define _m_pextrw _mm_extract_pi16 943#define _m_pinsrw _mm_insert_pi16 944#define _m_pmaxsw _mm_max_pi16 945#define _m_pmaxub _mm_max_pu8 946#define _m_pminsw _mm_min_pi16 947#define _m_pminub _mm_min_pu8 948#define _m_pmovmskb _mm_movemask_pi8 949#define _m_pmulhuw _mm_mulhi_pu16 950#define _m_pshufw _mm_shuffle_pi16 951#define _m_maskmovq _mm_maskmove_si64 952#define _m_pavgb _mm_avg_pu8 953#define _m_pavgw _mm_avg_pu16 954#define _m_psadbw _mm_sad_pu8 955#define _m_ _mm_ 956#define _m_ _mm_ 957 958/* Ugly hack for backwards-compatibility (compatible with gcc) */ 959#ifdef __SSE2__ 960#include <emmintrin.h> 961#endif 962 963#endif /* __SSE__ */ 964 965#endif /* __XMMINTRIN_H */ 966