xmmintrin.h revision 205408
1193326Sed/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2193326Sed * 3193326Sed * Permission is hereby granted, free of charge, to any person obtaining a copy 4193326Sed * of this software and associated documentation files (the "Software"), to deal 5193326Sed * in the Software without restriction, including without limitation the rights 6193326Sed * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7193326Sed * copies of the Software, and to permit persons to whom the Software is 8193326Sed * furnished to do so, subject to the following conditions: 9193326Sed * 10193326Sed * The above copyright notice and this permission notice shall be included in 11193326Sed * all copies or substantial portions of the Software. 12193326Sed * 13193326Sed * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14193326Sed * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15193326Sed * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16193326Sed * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17193326Sed * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18193326Sed * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19193326Sed * THE SOFTWARE. 20193326Sed * 21193326Sed *===-----------------------------------------------------------------------=== 22193326Sed */ 23193326Sed 24193326Sed#ifndef __XMMINTRIN_H 25193326Sed#define __XMMINTRIN_H 26193326Sed 27193326Sed#ifndef __SSE__ 28193326Sed#error "SSE instruction set not enabled" 29193326Sed#else 30193326Sed 31193326Sed#include <mmintrin.h> 32193326Sed 33205408Srdivackytypedef int __v4si __attribute__((__vector_size__(16))); 34193326Sedtypedef float __v4sf __attribute__((__vector_size__(16))); 35193326Sedtypedef float __m128 __attribute__((__vector_size__(16))); 36193326Sed 37193326Sed#include <mm_malloc.h> 38193326Sed 39193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 40193326Sed_mm_add_ss(__m128 a, __m128 b) 41193326Sed{ 42193576Sed a[0] += b[0]; 43193576Sed return a; 44193326Sed} 45193326Sed 46193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 47193326Sed_mm_add_ps(__m128 a, __m128 b) 48193326Sed{ 49193326Sed return a + b; 50193326Sed} 51193326Sed 52193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 53193326Sed_mm_sub_ss(__m128 a, __m128 b) 54193326Sed{ 55193576Sed a[0] -= b[0]; 56193576Sed return a; 57193326Sed} 58193326Sed 59193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 60193326Sed_mm_sub_ps(__m128 a, __m128 b) 61193326Sed{ 62193326Sed return a - b; 63193326Sed} 64193326Sed 65193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 66193326Sed_mm_mul_ss(__m128 a, __m128 b) 67193326Sed{ 68193576Sed a[0] *= b[0]; 69193576Sed return a; 70193326Sed} 71193326Sed 72193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 73193326Sed_mm_mul_ps(__m128 a, __m128 b) 74193326Sed{ 75193326Sed return a * b; 76193326Sed} 77193326Sed 78193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 79193326Sed_mm_div_ss(__m128 a, __m128 b) 80193326Sed{ 81193576Sed a[0] /= b[0]; 82193576Sed return a; 83193326Sed} 84193326Sed 85193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 86193326Sed_mm_div_ps(__m128 a, __m128 b) 87193326Sed{ 88193326Sed return a / b; 89193326Sed} 90193326Sed 91193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 92193326Sed_mm_sqrt_ss(__m128 a) 93193326Sed{ 94193326Sed return __builtin_ia32_sqrtss(a); 95193326Sed} 96193326Sed 97193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 98193326Sed_mm_sqrt_ps(__m128 a) 99193326Sed{ 100193326Sed return __builtin_ia32_sqrtps(a); 101193326Sed} 102193326Sed 103193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 104193326Sed_mm_rcp_ss(__m128 a) 105193326Sed{ 106193326Sed return __builtin_ia32_rcpss(a); 107193326Sed} 108193326Sed 109193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 110193326Sed_mm_rcp_ps(__m128 a) 111193326Sed{ 112193326Sed return __builtin_ia32_rcpps(a); 113193326Sed} 114193326Sed 115193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 116193326Sed_mm_rsqrt_ss(__m128 a) 117193326Sed{ 118193326Sed return __builtin_ia32_rsqrtss(a); 119193326Sed} 120193326Sed 121193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 122193326Sed_mm_rsqrt_ps(__m128 a) 123193326Sed{ 124193326Sed return __builtin_ia32_rsqrtps(a); 125193326Sed} 126193326Sed 127193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 128193326Sed_mm_min_ss(__m128 a, __m128 b) 129193326Sed{ 130193326Sed return __builtin_ia32_minss(a, b); 131193326Sed} 132193326Sed 133193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 134193326Sed_mm_min_ps(__m128 a, __m128 b) 135193326Sed{ 136193326Sed return __builtin_ia32_minps(a, b); 137193326Sed} 138193326Sed 139193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 140193326Sed_mm_max_ss(__m128 a, __m128 b) 141193326Sed{ 142193326Sed return __builtin_ia32_maxss(a, b); 143193326Sed} 144193326Sed 145193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 146193326Sed_mm_max_ps(__m128 a, __m128 b) 147193326Sed{ 148193326Sed return __builtin_ia32_maxps(a, b); 149193326Sed} 150193326Sed 151193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 152193326Sed_mm_and_ps(__m128 a, __m128 b) 153193326Sed{ 154193576Sed return (__m128)((__v4si)a & (__v4si)b); 155193326Sed} 156193326Sed 157193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 158193326Sed_mm_andnot_ps(__m128 a, __m128 b) 159193326Sed{ 160193576Sed return (__m128)(~(__v4si)a & (__v4si)b); 161193326Sed} 162193326Sed 163193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 164193326Sed_mm_or_ps(__m128 a, __m128 b) 165193326Sed{ 166193576Sed return (__m128)((__v4si)a | (__v4si)b); 167193326Sed} 168193326Sed 169193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 170193326Sed_mm_xor_ps(__m128 a, __m128 b) 171193326Sed{ 172202379Srdivacky return (__m128)((__v4si)a ^ (__v4si)b); 173193326Sed} 174193326Sed 175193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 176193326Sed_mm_cmpeq_ss(__m128 a, __m128 b) 177193326Sed{ 178193326Sed return (__m128)__builtin_ia32_cmpss(a, b, 0); 179193326Sed} 180193326Sed 181193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 182193326Sed_mm_cmpeq_ps(__m128 a, __m128 b) 183193326Sed{ 184193326Sed return (__m128)__builtin_ia32_cmpps(a, b, 0); 185193326Sed} 186193326Sed 187193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 188193326Sed_mm_cmplt_ss(__m128 a, __m128 b) 189193326Sed{ 190193326Sed return (__m128)__builtin_ia32_cmpss(a, b, 1); 191193326Sed} 192193326Sed 193193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 194193326Sed_mm_cmplt_ps(__m128 a, __m128 b) 195193326Sed{ 196193326Sed return (__m128)__builtin_ia32_cmpps(a, b, 1); 197193326Sed} 198193326Sed 199193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 200193326Sed_mm_cmple_ss(__m128 a, __m128 b) 201193326Sed{ 202193326Sed return (__m128)__builtin_ia32_cmpss(a, b, 2); 203193326Sed} 204193326Sed 205193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 206193326Sed_mm_cmple_ps(__m128 a, __m128 b) 207193326Sed{ 208193326Sed return (__m128)__builtin_ia32_cmpps(a, b, 2); 209193326Sed} 210193326Sed 211193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 212193326Sed_mm_cmpgt_ss(__m128 a, __m128 b) 213193326Sed{ 214193326Sed return (__m128)__builtin_ia32_cmpss(b, a, 1); 215193326Sed} 216193326Sed 217193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 218193326Sed_mm_cmpgt_ps(__m128 a, __m128 b) 219193326Sed{ 220193326Sed return (__m128)__builtin_ia32_cmpps(b, a, 1); 221193326Sed} 222193326Sed 223193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 224193326Sed_mm_cmpge_ss(__m128 a, __m128 b) 225193326Sed{ 226193326Sed return (__m128)__builtin_ia32_cmpss(b, a, 2); 227193326Sed} 228193326Sed 229193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 230193326Sed_mm_cmpge_ps(__m128 a, __m128 b) 231193326Sed{ 232193326Sed return (__m128)__builtin_ia32_cmpps(b, a, 2); 233193326Sed} 234193326Sed 235193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 236193326Sed_mm_cmpneq_ss(__m128 a, __m128 b) 237193326Sed{ 238193326Sed return (__m128)__builtin_ia32_cmpss(a, b, 4); 239193326Sed} 240193326Sed 241193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 242193326Sed_mm_cmpneq_ps(__m128 a, __m128 b) 243193326Sed{ 244193326Sed return (__m128)__builtin_ia32_cmpps(a, b, 4); 245193326Sed} 246193326Sed 247193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 248193326Sed_mm_cmpnlt_ss(__m128 a, __m128 b) 249193326Sed{ 250193326Sed return (__m128)__builtin_ia32_cmpss(a, b, 5); 251193326Sed} 252193326Sed 253193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 254193326Sed_mm_cmpnlt_ps(__m128 a, __m128 b) 255193326Sed{ 256193326Sed return (__m128)__builtin_ia32_cmpps(a, b, 5); 257193326Sed} 258193326Sed 259193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 260193326Sed_mm_cmpnle_ss(__m128 a, __m128 b) 261193326Sed{ 262193326Sed return (__m128)__builtin_ia32_cmpss(a, b, 6); 263193326Sed} 264193326Sed 265193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 266193326Sed_mm_cmpnle_ps(__m128 a, __m128 b) 267193326Sed{ 268193326Sed return (__m128)__builtin_ia32_cmpps(a, b, 6); 269193326Sed} 270193326Sed 271193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 272193326Sed_mm_cmpngt_ss(__m128 a, __m128 b) 273193326Sed{ 274193326Sed return (__m128)__builtin_ia32_cmpss(b, a, 5); 275193326Sed} 276193326Sed 277193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 278193326Sed_mm_cmpngt_ps(__m128 a, __m128 b) 279193326Sed{ 280193326Sed return (__m128)__builtin_ia32_cmpps(b, a, 5); 281193326Sed} 282193326Sed 283193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 284193326Sed_mm_cmpnge_ss(__m128 a, __m128 b) 285193326Sed{ 286193326Sed return (__m128)__builtin_ia32_cmpss(b, a, 6); 287193326Sed} 288193326Sed 289193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 290193326Sed_mm_cmpnge_ps(__m128 a, __m128 b) 291193326Sed{ 292193326Sed return (__m128)__builtin_ia32_cmpps(b, a, 6); 293193326Sed} 294193326Sed 295193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 296193326Sed_mm_cmpord_ss(__m128 a, __m128 b) 297193326Sed{ 298193326Sed return (__m128)__builtin_ia32_cmpss(a, b, 7); 299193326Sed} 300193326Sed 301193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 302193326Sed_mm_cmpord_ps(__m128 a, __m128 b) 303193326Sed{ 304193326Sed return (__m128)__builtin_ia32_cmpps(a, b, 7); 305193326Sed} 306193326Sed 307193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 308193326Sed_mm_cmpunord_ss(__m128 a, __m128 b) 309193326Sed{ 310193326Sed return (__m128)__builtin_ia32_cmpss(a, b, 3); 311193326Sed} 312193326Sed 313193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 314193326Sed_mm_cmpunord_ps(__m128 a, __m128 b) 315193326Sed{ 316193326Sed return (__m128)__builtin_ia32_cmpps(a, b, 3); 317193326Sed} 318193326Sed 319193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 320193326Sed_mm_comieq_ss(__m128 a, __m128 b) 321193326Sed{ 322193326Sed return __builtin_ia32_comieq(a, b); 323193326Sed} 324193326Sed 325193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 326193326Sed_mm_comilt_ss(__m128 a, __m128 b) 327193326Sed{ 328193326Sed return __builtin_ia32_comilt(a, b); 329193326Sed} 330193326Sed 331193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 332193326Sed_mm_comile_ss(__m128 a, __m128 b) 333193326Sed{ 334193326Sed return __builtin_ia32_comile(a, b); 335193326Sed} 336193326Sed 337193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 338193326Sed_mm_comigt_ss(__m128 a, __m128 b) 339193326Sed{ 340193326Sed return __builtin_ia32_comigt(a, b); 341193326Sed} 342193326Sed 343193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 344193326Sed_mm_comige_ss(__m128 a, __m128 b) 345193326Sed{ 346193326Sed return __builtin_ia32_comige(a, b); 347193326Sed} 348193326Sed 349193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 350193326Sed_mm_comineq_ss(__m128 a, __m128 b) 351193326Sed{ 352193326Sed return __builtin_ia32_comineq(a, b); 353193326Sed} 354193326Sed 355193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 356193326Sed_mm_ucomieq_ss(__m128 a, __m128 b) 357193326Sed{ 358193326Sed return __builtin_ia32_ucomieq(a, b); 359193326Sed} 360193326Sed 361193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 362193326Sed_mm_ucomilt_ss(__m128 a, __m128 b) 363193326Sed{ 364193326Sed return __builtin_ia32_ucomilt(a, b); 365193326Sed} 366193326Sed 367193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 368193326Sed_mm_ucomile_ss(__m128 a, __m128 b) 369193326Sed{ 370193326Sed return __builtin_ia32_ucomile(a, b); 371193326Sed} 372193326Sed 373193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 374193326Sed_mm_ucomigt_ss(__m128 a, __m128 b) 375193326Sed{ 376193326Sed return __builtin_ia32_ucomigt(a, b); 377193326Sed} 378193326Sed 379193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 380193326Sed_mm_ucomige_ss(__m128 a, __m128 b) 381193326Sed{ 382193326Sed return __builtin_ia32_ucomige(a, b); 383193326Sed} 384193326Sed 385193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 386193326Sed_mm_ucomineq_ss(__m128 a, __m128 b) 387193326Sed{ 388193326Sed return __builtin_ia32_ucomineq(a, b); 389193326Sed} 390193326Sed 391193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 392193326Sed_mm_cvtss_si32(__m128 a) 393193326Sed{ 394193326Sed return __builtin_ia32_cvtss2si(a); 395193326Sed} 396193326Sed 397204643Srdivackystatic inline int __attribute__((__always_inline__, __nodebug__)) 398204643Srdivacky_mm_cvt_ss2si(__m128 a) 399204643Srdivacky{ 400204643Srdivacky return _mm_cvtss_si32(a); 401204643Srdivacky} 402204643Srdivacky 403193576Sed#ifdef __x86_64__ 404193576Sed 405193326Sedstatic inline long long __attribute__((__always_inline__, __nodebug__)) 406193326Sed_mm_cvtss_si64(__m128 a) 407193326Sed{ 408193326Sed return __builtin_ia32_cvtss2si64(a); 409193326Sed} 410193326Sed 411193576Sed#endif 412193576Sed 413193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 414193326Sed_mm_cvtps_pi32(__m128 a) 415193326Sed{ 416193326Sed return (__m64)__builtin_ia32_cvtps2pi(a); 417193326Sed} 418193326Sed 419193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 420193326Sed_mm_cvttss_si32(__m128 a) 421193326Sed{ 422193576Sed return a[0]; 423193326Sed} 424193326Sed 425204643Srdivackystatic inline int __attribute__((__always_inline__, __nodebug__)) 426204643Srdivacky_mm_cvtt_ss2si(__m128 a) 427204643Srdivacky{ 428204643Srdivacky return _mm_cvttss_si32(a); 429204643Srdivacky} 430204643Srdivacky 431193326Sedstatic inline long long __attribute__((__always_inline__, __nodebug__)) 432193326Sed_mm_cvttss_si64(__m128 a) 433193326Sed{ 434193576Sed return a[0]; 435193326Sed} 436193326Sed 437193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 438193326Sed_mm_cvttps_pi32(__m128 a) 439193326Sed{ 440193326Sed return (__m64)__builtin_ia32_cvttps2pi(a); 441193326Sed} 442193326Sed 443193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 444193326Sed_mm_cvtsi32_ss(__m128 a, int b) 445193326Sed{ 446193576Sed a[0] = b; 447193576Sed return a; 448193326Sed} 449193326Sed 450193326Sed#ifdef __x86_64__ 451193326Sed 452193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 453193326Sed_mm_cvtsi64_ss(__m128 a, long long b) 454193326Sed{ 455193576Sed a[0] = b; 456193576Sed return a; 457193326Sed} 458193326Sed 459193326Sed#endif 460193326Sed 461193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 462193326Sed_mm_cvtpi32_ps(__m128 a, __m64 b) 463193326Sed{ 464193326Sed return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 465193326Sed} 466193326Sed 467193326Sedstatic inline float __attribute__((__always_inline__, __nodebug__)) 468193326Sed_mm_cvtss_f32(__m128 a) 469193326Sed{ 470193326Sed return a[0]; 471193326Sed} 472193326Sed 473193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 474203955Srdivacky_mm_loadh_pi(__m128 a, const __m64 *p) 475193326Sed{ 476193631Sed __m128 b; 477193631Sed b[0] = *(float*)p; 478193631Sed b[1] = *((float*)p+1); 479193631Sed return __builtin_shufflevector(a, b, 0, 1, 4, 5); 480193326Sed} 481193326Sed 482193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 483203955Srdivacky_mm_loadl_pi(__m128 a, const __m64 *p) 484193326Sed{ 485193576Sed __m128 b; 486193576Sed b[0] = *(float*)p; 487193576Sed b[1] = *((float*)p+1); 488193631Sed return __builtin_shufflevector(a, b, 4, 5, 2, 3); 489193326Sed} 490193326Sed 491193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 492203955Srdivacky_mm_load_ss(const float *p) 493193326Sed{ 494193326Sed return (__m128){ *p, 0, 0, 0 }; 495193326Sed} 496193326Sed 497193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 498203955Srdivacky_mm_load1_ps(const float *p) 499193326Sed{ 500193326Sed return (__m128){ *p, *p, *p, *p }; 501193326Sed} 502193326Sed 503193326Sed#define _mm_load_ps1(p) _mm_load1_ps(p) 504193326Sed 505193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 506203955Srdivacky_mm_load_ps(const float *p) 507193326Sed{ 508193326Sed return *(__m128*)p; 509193326Sed} 510193326Sed 511193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 512203955Srdivacky_mm_loadu_ps(const float *p) 513193326Sed{ 514193326Sed return __builtin_ia32_loadups(p); 515193326Sed} 516193326Sed 517193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 518203955Srdivacky_mm_loadr_ps(const float *p) 519193326Sed{ 520193326Sed __m128 a = _mm_load_ps(p); 521193326Sed return __builtin_shufflevector(a, a, 3, 2, 1, 0); 522193326Sed} 523193326Sed 524193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 525193326Sed_mm_set_ss(float w) 526193326Sed{ 527193326Sed return (__m128){ w, 0, 0, 0 }; 528193326Sed} 529193326Sed 530193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 531193326Sed_mm_set1_ps(float w) 532193326Sed{ 533193326Sed return (__m128){ w, w, w, w }; 534193326Sed} 535193326Sed 536193326Sed// Microsoft specific. 537193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 538193326Sed_mm_set_ps1(float w) 539193326Sed{ 540193326Sed return _mm_set1_ps(w); 541193326Sed} 542193326Sed 543193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 544193326Sed_mm_set_ps(float z, float y, float x, float w) 545193326Sed{ 546193326Sed return (__m128){ w, x, y, z }; 547193326Sed} 548193326Sed 549193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 550193326Sed_mm_setr_ps(float z, float y, float x, float w) 551193326Sed{ 552193326Sed return (__m128){ z, y, x, w }; 553193326Sed} 554193326Sed 555193326Sedstatic inline __m128 __attribute__((__always_inline__)) 556193326Sed_mm_setzero_ps(void) 557193326Sed{ 558193326Sed return (__m128){ 0, 0, 0, 0 }; 559193326Sed} 560193326Sed 561193326Sedstatic inline void __attribute__((__always_inline__)) 562193326Sed_mm_storeh_pi(__m64 *p, __m128 a) 563193326Sed{ 564193326Sed __builtin_ia32_storehps((__v2si *)p, a); 565193326Sed} 566193326Sed 567193326Sedstatic inline void __attribute__((__always_inline__)) 568193326Sed_mm_storel_pi(__m64 *p, __m128 a) 569193326Sed{ 570193326Sed __builtin_ia32_storelps((__v2si *)p, a); 571193326Sed} 572193326Sed 573193326Sedstatic inline void __attribute__((__always_inline__)) 574193326Sed_mm_store_ss(float *p, __m128 a) 575193326Sed{ 576193326Sed *p = a[0]; 577193326Sed} 578193326Sed 579193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__)) 580193326Sed_mm_storeu_ps(float *p, __m128 a) 581193326Sed{ 582193326Sed __builtin_ia32_storeups(p, a); 583193326Sed} 584193326Sed 585193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__)) 586193326Sed_mm_store1_ps(float *p, __m128 a) 587193326Sed{ 588193326Sed a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 589193326Sed _mm_storeu_ps(p, a); 590193326Sed} 591193326Sed 592193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__)) 593193326Sed_mm_store_ps(float *p, __m128 a) 594193326Sed{ 595193326Sed *(__m128 *)p = a; 596193326Sed} 597193326Sed 598193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__)) 599193326Sed_mm_storer_ps(float *p, __m128 a) 600193326Sed{ 601193326Sed a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 602193326Sed _mm_store_ps(p, a); 603193326Sed} 604193326Sed 605193326Sed#define _MM_HINT_T0 1 606193326Sed#define _MM_HINT_T1 2 607193326Sed#define _MM_HINT_T2 3 608193326Sed#define _MM_HINT_NTA 0 609193326Sed 610193326Sed/* FIXME: We have to #define this because "sel" must be a constant integer, and 611193326Sed Sema doesn't do any form of constant propagation yet. */ 612193326Sed 613193326Sed#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) 614193326Sed 615193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__)) 616193326Sed_mm_stream_pi(__m64 *p, __m64 a) 617193326Sed{ 618193326Sed __builtin_ia32_movntq(p, a); 619193326Sed} 620193326Sed 621193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__)) 622193326Sed_mm_stream_ps(float *p, __m128 a) 623193326Sed{ 624193326Sed __builtin_ia32_movntps(p, a); 625193326Sed} 626193326Sed 627193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__)) 628193326Sed_mm_sfence(void) 629193326Sed{ 630193326Sed __builtin_ia32_sfence(); 631193326Sed} 632193326Sed 633193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 634193326Sed_mm_extract_pi16(__m64 a, int n) 635193326Sed{ 636193326Sed __v4hi b = (__v4hi)a; 637193576Sed return (unsigned short)b[n & 3]; 638193326Sed} 639193326Sed 640193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 641193326Sed_mm_insert_pi16(__m64 a, int d, int n) 642193326Sed{ 643193576Sed __v4hi b = (__v4hi)a; 644193576Sed b[n & 3] = d; 645193576Sed return (__m64)b; 646193326Sed} 647193326Sed 648193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 649193326Sed_mm_max_pi16(__m64 a, __m64 b) 650193326Sed{ 651193326Sed return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 652193326Sed} 653193326Sed 654193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 655193326Sed_mm_max_pu8(__m64 a, __m64 b) 656193326Sed{ 657193326Sed return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 658193326Sed} 659193326Sed 660193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 661193326Sed_mm_min_pi16(__m64 a, __m64 b) 662193326Sed{ 663193326Sed return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 664193326Sed} 665193326Sed 666193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 667193326Sed_mm_min_pu8(__m64 a, __m64 b) 668193326Sed{ 669193326Sed return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 670193326Sed} 671193326Sed 672193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 673193326Sed_mm_movemask_pi8(__m64 a) 674193326Sed{ 675193326Sed return __builtin_ia32_pmovmskb((__v8qi)a); 676193326Sed} 677193326Sed 678193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 679193326Sed_mm_mulhi_pu16(__m64 a, __m64 b) 680193326Sed{ 681193326Sed return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 682193326Sed} 683193326Sed 684193576Sed#define _mm_shuffle_pi16(a, n) \ 685193576Sed ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \ 686193576Sed (n) & 0x3, ((n) & 0xc) >> 2, \ 687193576Sed ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6)) 688193326Sed 689193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__)) 690193326Sed_mm_maskmove_si64(__m64 d, __m64 n, char *p) 691193326Sed{ 692193326Sed __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 693193326Sed} 694193326Sed 695193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 696193326Sed_mm_avg_pu8(__m64 a, __m64 b) 697193326Sed{ 698193326Sed return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 699193326Sed} 700193326Sed 701193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 702193326Sed_mm_avg_pu16(__m64 a, __m64 b) 703193326Sed{ 704193326Sed return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 705193326Sed} 706193326Sed 707193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 708193326Sed_mm_sad_pu8(__m64 a, __m64 b) 709193326Sed{ 710193326Sed return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 711193326Sed} 712193326Sed 713193326Sedstatic inline unsigned int __attribute__((__always_inline__, __nodebug__)) 714193326Sed_mm_getcsr(void) 715193326Sed{ 716193326Sed return __builtin_ia32_stmxcsr(); 717193326Sed} 718193326Sed 719193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__)) 720193326Sed_mm_setcsr(unsigned int i) 721193326Sed{ 722193326Sed __builtin_ia32_ldmxcsr(i); 723193326Sed} 724193326Sed 725193576Sed#define _mm_shuffle_ps(a, b, mask) \ 726193576Sed (__builtin_shufflevector(a, b, (mask) & 0x3, ((mask) & 0xc) >> 2, \ 727193576Sed (((mask) & 0x30) >> 4) + 4, \ 728193576Sed (((mask) & 0xc0) >> 6) + 4)) 729193326Sed 730193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 731193326Sed_mm_unpackhi_ps(__m128 a, __m128 b) 732193326Sed{ 733193326Sed return __builtin_shufflevector(a, b, 2, 6, 3, 7); 734193326Sed} 735193326Sed 736193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 737193326Sed_mm_unpacklo_ps(__m128 a, __m128 b) 738193326Sed{ 739193326Sed return __builtin_shufflevector(a, b, 0, 4, 1, 5); 740193326Sed} 741193326Sed 742193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 743193326Sed_mm_move_ss(__m128 a, __m128 b) 744193326Sed{ 745193326Sed return __builtin_shufflevector(a, b, 4, 1, 2, 3); 746193326Sed} 747193326Sed 748193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 749193326Sed_mm_movehl_ps(__m128 a, __m128 b) 750193326Sed{ 751193326Sed return __builtin_shufflevector(a, b, 6, 7, 2, 3); 752193326Sed} 753193326Sed 754193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 755193326Sed_mm_movelh_ps(__m128 a, __m128 b) 756193326Sed{ 757193326Sed return __builtin_shufflevector(a, b, 0, 1, 4, 5); 758193326Sed} 759193326Sed 760193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 761193326Sed_mm_cvtpi16_ps(__m64 a) 762193326Sed{ 763193326Sed __m64 b, c; 764193326Sed __m128 r; 765193326Sed 766193326Sed b = _mm_setzero_si64(); 767193326Sed b = _mm_cmpgt_pi16(b, a); 768193326Sed c = _mm_unpackhi_pi16(a, b); 769193326Sed r = _mm_setzero_ps(); 770193326Sed r = _mm_cvtpi32_ps(r, c); 771193326Sed r = _mm_movelh_ps(r, r); 772193326Sed c = _mm_unpacklo_pi16(a, b); 773193326Sed r = _mm_cvtpi32_ps(r, c); 774193326Sed 775193326Sed return r; 776193326Sed} 777193326Sed 778193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 779193326Sed_mm_cvtpu16_ps(__m64 a) 780193326Sed{ 781193326Sed __m64 b, c; 782193326Sed __m128 r; 783193326Sed 784193326Sed b = _mm_setzero_si64(); 785193326Sed c = _mm_unpackhi_pi16(a, b); 786193326Sed r = _mm_setzero_ps(); 787193326Sed r = _mm_cvtpi32_ps(r, c); 788193326Sed r = _mm_movelh_ps(r, r); 789193326Sed c = _mm_unpacklo_pi16(a, b); 790193326Sed r = _mm_cvtpi32_ps(r, c); 791193326Sed 792193326Sed return r; 793193326Sed} 794193326Sed 795193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 796193326Sed_mm_cvtpi8_ps(__m64 a) 797193326Sed{ 798193326Sed __m64 b; 799193326Sed 800193326Sed b = _mm_setzero_si64(); 801193326Sed b = _mm_cmpgt_pi8(b, a); 802193326Sed b = _mm_unpacklo_pi8(a, b); 803193326Sed 804193326Sed return _mm_cvtpi16_ps(b); 805193326Sed} 806193326Sed 807193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 808193326Sed_mm_cvtpu8_ps(__m64 a) 809193326Sed{ 810193326Sed __m64 b; 811193326Sed 812193326Sed b = _mm_setzero_si64(); 813193326Sed b = _mm_unpacklo_pi8(a, b); 814193326Sed 815193326Sed return _mm_cvtpi16_ps(b); 816193326Sed} 817193326Sed 818193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 819193326Sed_mm_cvtpi32x2_ps(__m64 a, __m64 b) 820193326Sed{ 821193326Sed __m128 c; 822193326Sed 823193326Sed c = _mm_setzero_ps(); 824193326Sed c = _mm_cvtpi32_ps(c, b); 825193326Sed c = _mm_movelh_ps(c, c); 826193326Sed 827193326Sed return _mm_cvtpi32_ps(c, a); 828193326Sed} 829193326Sed 830193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 831193326Sed_mm_cvtps_pi16(__m128 a) 832193326Sed{ 833193326Sed __m64 b, c; 834193326Sed 835193326Sed b = _mm_cvtps_pi32(a); 836193326Sed a = _mm_movehl_ps(a, a); 837193326Sed c = _mm_cvtps_pi32(a); 838193326Sed 839193326Sed return _mm_packs_pi16(b, c); 840193326Sed} 841193326Sed 842193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 843193326Sed_mm_cvtps_pi8(__m128 a) 844193326Sed{ 845193326Sed __m64 b, c; 846193326Sed 847193326Sed b = _mm_cvtps_pi16(a); 848193326Sed c = _mm_setzero_si64(); 849193326Sed 850193326Sed return _mm_packs_pi16(b, c); 851193326Sed} 852193326Sed 853193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__)) 854193326Sed_mm_movemask_ps(__m128 a) 855193326Sed{ 856193326Sed return __builtin_ia32_movmskps(a); 857193326Sed} 858193326Sed 859193326Sed#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 860193326Sed 861193326Sed#define _MM_EXCEPT_INVALID (0x0001) 862193326Sed#define _MM_EXCEPT_DENORM (0x0002) 863193326Sed#define _MM_EXCEPT_DIV_ZERO (0x0004) 864193326Sed#define _MM_EXCEPT_OVERFLOW (0x0008) 865193326Sed#define _MM_EXCEPT_UNDERFLOW (0x0010) 866193326Sed#define _MM_EXCEPT_INEXACT (0x0020) 867193326Sed#define _MM_EXCEPT_MASK (0x003f) 868193326Sed 869193326Sed#define _MM_MASK_INVALID (0x0080) 870193326Sed#define _MM_MASK_DENORM (0x0100) 871193326Sed#define _MM_MASK_DIV_ZERO (0x0200) 872193326Sed#define _MM_MASK_OVERFLOW (0x0400) 873193326Sed#define _MM_MASK_UNDERFLOW (0x0800) 874193326Sed#define _MM_MASK_INEXACT (0x1000) 875193326Sed#define _MM_MASK_MASK (0x1f80) 876193326Sed 877193326Sed#define _MM_ROUND_NEAREST (0x0000) 878193326Sed#define _MM_ROUND_DOWN (0x2000) 879193326Sed#define _MM_ROUND_UP (0x4000) 880193326Sed#define _MM_ROUND_TOWARD_ZERO (0x6000) 881193326Sed#define _MM_ROUND_MASK (0x6000) 882193326Sed 883193326Sed#define _MM_FLUSH_ZERO_MASK (0x8000) 884193326Sed#define _MM_FLUSH_ZERO_ON (0x8000) 885193326Sed#define _MM_FLUSH_ZERO_OFF (0x8000) 886193326Sed 887193326Sed#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 888193326Sed#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 889193326Sed#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 890193326Sed#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 891193326Sed 892193326Sed#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 893193326Sed#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 894193326Sed#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 895193326Sed#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 896193326Sed 897193326Sed#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 898193326Seddo { \ 899193326Sed __m128 tmp3, tmp2, tmp1, tmp0; \ 900193326Sed tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 901193326Sed tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 902193326Sed tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 903193326Sed tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 904193326Sed (row0) = _mm_movelh_ps(tmp0, tmp2); \ 905193326Sed (row1) = _mm_movehl_ps(tmp2, tmp0); \ 906193326Sed (row2) = _mm_movelh_ps(tmp1, tmp3); \ 907203955Srdivacky (row3) = _mm_movehl_ps(tmp3, tmp1); \ 908193326Sed} while (0) 909193326Sed 910194179Sed/* Ugly hack for backwards-compatibility (compatible with gcc) */ 911194179Sed#ifdef __SSE2__ 912193326Sed#include <emmintrin.h> 913194179Sed#endif 914193326Sed 915193326Sed#endif /* __SSE__ */ 916193326Sed 917193326Sed#endif /* __XMMINTRIN_H */ 918