/* xmmintrin.h — revision 218893 */
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

/* MMX intrinsics (__m64 etc.); the SSE integer intrinsics below build on them. */
#include <mmintrin.h>

/* 128-bit vector types: 4 x i32 and 4 x float, plus the public __m128 type. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

// This header should only be included in a hosted environment as it depends on
// a standard library to provide allocation routines.
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* Scalar add (ADDSS): a[0] + b[0]; upper three elements are passed through
   from a.  All the *_ss arithmetic below follows the same pattern. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

/* Packed add (ADDPS): element-wise a + b. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

/* Scalar subtract (SUBSS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

/* Packed subtract (SUBPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

/* Scalar multiply (MULSS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

/* Packed multiply (MULPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

/* Scalar divide (DIVSS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

/* Packed divide (DIVPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

/* Scalar square root (SQRTSS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

/* Packed square root (SQRTPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

/* Scalar reciprocal approximation (RCPSS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

/* Packed reciprocal approximation (RCPPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

/* Scalar reciprocal square-root approximation (RSQRTSS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

/* Packed reciprocal square-root approximation (RSQRTPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

/* Scalar minimum (MINSS): min of the low elements, upper three from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

/* Packed minimum (MINPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

/* Scalar maximum (MAXSS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

/* Packed maximum (MAXPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

/* Bitwise AND (ANDPS): done on the integer view of the vectors. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

/* Bitwise AND-NOT (ANDNPS): (~a) & b — note a, not b, is complemented. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

/* Bitwise OR (ORPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

/* Bitwise XOR (XORPS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}

/* Comparisons below use the CMPSS/CMPPS immediate predicate encoding:
   0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD.
   There is no GT/GE predicate, so the gt/ge (and ngt/nge) forms swap the
   operands and use LT/LE (NLT/NLE) instead. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

/* a > b computed as b < a (operands swapped). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

/* a >= b computed as b <= a (operands swapped). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

/* not(a > b) computed as not(b < a) (operands swapped, NLT predicate). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

/* not(a >= b) computed as not(b <= a) (operands swapped, NLE predicate). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

/* Ordered: true where neither operand element is NaN. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

/* Unordered: true where either operand element is NaN. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}

/* COMISS-based scalar comparisons returning 0/1. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

/* UCOMISS-based variants of the above. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

/* Convert the low float to int32 using the current rounding mode (CVTSS2SI). */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

/* Legacy alias for _mm_cvtss_si32. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

/* Convert the low float to int64 (CVTSS2SI, 64-bit form; x86-64 only). */
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

/* Convert the low two floats to two packed int32 in an __m64 (CVTPS2PI). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

/* Legacy alias for _mm_cvtps_pi32. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 a)
{
  return _mm_cvtps_pi32(a);
}

/* Convert the low float to int32 with truncation — the C float->int
   conversion truncates, so a plain element read suffices. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

/* Legacy alias for _mm_cvttss_si32. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

/* Convert the low float to int64 with truncation (via C conversion). */
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

/* Convert the low two floats to two packed int32 with truncation (CVTTPS2PI). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

/* Legacy alias for _mm_cvttps_pi32. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 a)
{
  return _mm_cvttps_pi32(a);
}

/* Replace the low element of a with (float)b; upper three unchanged. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

/* Legacy alias for _mm_cvtsi32_ss. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 a, int b)
{
  return _mm_cvtsi32_ss(a, b);
}

#ifdef __x86_64__

/* Replace the low element of a with (float)b (int64 source; x86-64 only). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

/* Convert two packed int32 from b to floats in the low half of a (CVTPI2PS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

/* Legacy alias for _mm_cvtpi32_ps. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 a, __m64 b)
{
  return _mm_cvtpi32_ps(a, b);
}

/* Extract the low element as a float. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

/* Load two floats from p into the HIGH half of the result; the low half
   comes from a (MOVHPS).  b's upper elements are never selected by the
   shuffle, so leaving them uninitialized is harmless. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}
509193326Sed 510206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 511203955Srdivacky_mm_loadl_pi(__m128 a, const __m64 *p) 512193326Sed{ 513193576Sed __m128 b; 514193576Sed b[0] = *(float*)p; 515193576Sed b[1] = *((float*)p+1); 516193631Sed return __builtin_shufflevector(a, b, 4, 5, 2, 3); 517193326Sed} 518193326Sed 519206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 520203955Srdivacky_mm_load_ss(const float *p) 521193326Sed{ 522193326Sed return (__m128){ *p, 0, 0, 0 }; 523193326Sed} 524193326Sed 525206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 526203955Srdivacky_mm_load1_ps(const float *p) 527193326Sed{ 528193326Sed return (__m128){ *p, *p, *p, *p }; 529193326Sed} 530193326Sed 531193326Sed#define _mm_load_ps1(p) _mm_load1_ps(p) 532193326Sed 533206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 534203955Srdivacky_mm_load_ps(const float *p) 535193326Sed{ 536193326Sed return *(__m128*)p; 537193326Sed} 538193326Sed 539206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 540203955Srdivacky_mm_loadu_ps(const float *p) 541193326Sed{ 542193326Sed return __builtin_ia32_loadups(p); 543193326Sed} 544193326Sed 545206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 546203955Srdivacky_mm_loadr_ps(const float *p) 547193326Sed{ 548193326Sed __m128 a = _mm_load_ps(p); 549193326Sed return __builtin_shufflevector(a, a, 3, 2, 1, 0); 550193326Sed} 551193326Sed 552206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 553193326Sed_mm_set_ss(float w) 554193326Sed{ 555193326Sed return (__m128){ w, 0, 0, 0 }; 556193326Sed} 557193326Sed 558206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 559193326Sed_mm_set1_ps(float w) 560193326Sed{ 561193326Sed return (__m128){ w, w, w, w }; 562193326Sed} 
563193326Sed 564193326Sed// Microsoft specific. 565206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 566193326Sed_mm_set_ps1(float w) 567193326Sed{ 568193326Sed return _mm_set1_ps(w); 569193326Sed} 570193326Sed 571206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 572193326Sed_mm_set_ps(float z, float y, float x, float w) 573193326Sed{ 574193326Sed return (__m128){ w, x, y, z }; 575193326Sed} 576193326Sed 577206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 578193326Sed_mm_setr_ps(float z, float y, float x, float w) 579193326Sed{ 580193326Sed return (__m128){ z, y, x, w }; 581193326Sed} 582193326Sed 583206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__)) 584193326Sed_mm_setzero_ps(void) 585193326Sed{ 586193326Sed return (__m128){ 0, 0, 0, 0 }; 587193326Sed} 588193326Sed 589206084Srdivackystatic __inline__ void __attribute__((__always_inline__)) 590193326Sed_mm_storeh_pi(__m64 *p, __m128 a) 591193326Sed{ 592193326Sed __builtin_ia32_storehps((__v2si *)p, a); 593193326Sed} 594193326Sed 595206084Srdivackystatic __inline__ void __attribute__((__always_inline__)) 596193326Sed_mm_storel_pi(__m64 *p, __m128 a) 597193326Sed{ 598193326Sed __builtin_ia32_storelps((__v2si *)p, a); 599193326Sed} 600193326Sed 601206084Srdivackystatic __inline__ void __attribute__((__always_inline__)) 602193326Sed_mm_store_ss(float *p, __m128 a) 603193326Sed{ 604193326Sed *p = a[0]; 605193326Sed} 606193326Sed 607206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 608193326Sed_mm_storeu_ps(float *p, __m128 a) 609193326Sed{ 610193326Sed __builtin_ia32_storeups(p, a); 611193326Sed} 612193326Sed 613206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 614193326Sed_mm_store1_ps(float *p, __m128 a) 615193326Sed{ 616193326Sed a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 617193326Sed _mm_storeu_ps(p, a); 
618193326Sed} 619193326Sed 620206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 621212904Sdim_mm_store_ps1(float *p, __m128 a) 622212904Sdim{ 623212904Sdim return _mm_store1_ps(p, a); 624212904Sdim} 625212904Sdim 626212904Sdimstatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 627193326Sed_mm_store_ps(float *p, __m128 a) 628193326Sed{ 629193326Sed *(__m128 *)p = a; 630193326Sed} 631193326Sed 632206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 633193326Sed_mm_storer_ps(float *p, __m128 a) 634193326Sed{ 635193326Sed a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 636193326Sed _mm_store_ps(p, a); 637193326Sed} 638193326Sed 639212904Sdim#define _MM_HINT_T0 3 640193326Sed#define _MM_HINT_T1 2 641212904Sdim#define _MM_HINT_T2 1 642193326Sed#define _MM_HINT_NTA 0 643193326Sed 644210299Sed/* FIXME: We have to #define this because "sel" must be a constant integer, and 645193326Sed Sema doesn't do any form of constant propagation yet. 
*/ 646193326Sed 647210299Sed#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, sel)) 648193326Sed 649206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 650193326Sed_mm_stream_pi(__m64 *p, __m64 a) 651193326Sed{ 652193326Sed __builtin_ia32_movntq(p, a); 653193326Sed} 654193326Sed 655206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 656193326Sed_mm_stream_ps(float *p, __m128 a) 657193326Sed{ 658193326Sed __builtin_ia32_movntps(p, a); 659193326Sed} 660193326Sed 661206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 662193326Sed_mm_sfence(void) 663193326Sed{ 664193326Sed __builtin_ia32_sfence(); 665193326Sed} 666193326Sed 667206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 668193326Sed_mm_extract_pi16(__m64 a, int n) 669193326Sed{ 670193326Sed __v4hi b = (__v4hi)a; 671193576Sed return (unsigned short)b[n & 3]; 672193326Sed} 673193326Sed 674206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 675193326Sed_mm_insert_pi16(__m64 a, int d, int n) 676193326Sed{ 677193576Sed __v4hi b = (__v4hi)a; 678193576Sed b[n & 3] = d; 679193576Sed return (__m64)b; 680193326Sed} 681193326Sed 682206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 683193326Sed_mm_max_pi16(__m64 a, __m64 b) 684193326Sed{ 685193326Sed return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 686193326Sed} 687193326Sed 688206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 689193326Sed_mm_max_pu8(__m64 a, __m64 b) 690193326Sed{ 691193326Sed return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 692193326Sed} 693193326Sed 694206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 695193326Sed_mm_min_pi16(__m64 a, __m64 b) 696193326Sed{ 697193326Sed return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 698193326Sed} 
699193326Sed 700206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 701193326Sed_mm_min_pu8(__m64 a, __m64 b) 702193326Sed{ 703193326Sed return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 704193326Sed} 705193326Sed 706206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 707193326Sed_mm_movemask_pi8(__m64 a) 708193326Sed{ 709193326Sed return __builtin_ia32_pmovmskb((__v8qi)a); 710193326Sed} 711193326Sed 712206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 713193326Sed_mm_mulhi_pu16(__m64 a, __m64 b) 714193326Sed{ 715193326Sed return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 716193326Sed} 717193326Sed 718193576Sed#define _mm_shuffle_pi16(a, n) \ 719218893Sdim ((__m64)__builtin_ia32_pshufw(a, n)) 720193326Sed 721206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 722193326Sed_mm_maskmove_si64(__m64 d, __m64 n, char *p) 723193326Sed{ 724193326Sed __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 725193326Sed} 726193326Sed 727206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 728193326Sed_mm_avg_pu8(__m64 a, __m64 b) 729193326Sed{ 730193326Sed return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 731193326Sed} 732193326Sed 733206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 734193326Sed_mm_avg_pu16(__m64 a, __m64 b) 735193326Sed{ 736193326Sed return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 737193326Sed} 738193326Sed 739206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 740193326Sed_mm_sad_pu8(__m64 a, __m64 b) 741193326Sed{ 742193326Sed return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 743193326Sed} 744193326Sed 745206084Srdivackystatic __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 746193326Sed_mm_getcsr(void) 747193326Sed{ 748193326Sed return 
__builtin_ia32_stmxcsr(); 749193326Sed} 750193326Sed 751206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 752193326Sed_mm_setcsr(unsigned int i) 753193326Sed{ 754193326Sed __builtin_ia32_ldmxcsr(i); 755193326Sed} 756193326Sed 757193576Sed#define _mm_shuffle_ps(a, b, mask) \ 758210299Sed (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b), \ 759208600Srdivacky (mask) & 0x3, ((mask) & 0xc) >> 2, \ 760193576Sed (((mask) & 0x30) >> 4) + 4, \ 761193576Sed (((mask) & 0xc0) >> 6) + 4)) 762193326Sed 763206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 764193326Sed_mm_unpackhi_ps(__m128 a, __m128 b) 765193326Sed{ 766193326Sed return __builtin_shufflevector(a, b, 2, 6, 3, 7); 767193326Sed} 768193326Sed 769206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 770193326Sed_mm_unpacklo_ps(__m128 a, __m128 b) 771193326Sed{ 772193326Sed return __builtin_shufflevector(a, b, 0, 4, 1, 5); 773193326Sed} 774193326Sed 775206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 776193326Sed_mm_move_ss(__m128 a, __m128 b) 777193326Sed{ 778193326Sed return __builtin_shufflevector(a, b, 4, 1, 2, 3); 779193326Sed} 780193326Sed 781206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 782193326Sed_mm_movehl_ps(__m128 a, __m128 b) 783193326Sed{ 784193326Sed return __builtin_shufflevector(a, b, 6, 7, 2, 3); 785193326Sed} 786193326Sed 787206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 788193326Sed_mm_movelh_ps(__m128 a, __m128 b) 789193326Sed{ 790193326Sed return __builtin_shufflevector(a, b, 0, 1, 4, 5); 791193326Sed} 792193326Sed 793206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 794193326Sed_mm_cvtpi16_ps(__m64 a) 795193326Sed{ 796193326Sed __m64 b, c; 797193326Sed __m128 r; 798193326Sed 799193326Sed b = _mm_setzero_si64(); 800193326Sed b = 
_mm_cmpgt_pi16(b, a); 801193326Sed c = _mm_unpackhi_pi16(a, b); 802193326Sed r = _mm_setzero_ps(); 803193326Sed r = _mm_cvtpi32_ps(r, c); 804193326Sed r = _mm_movelh_ps(r, r); 805193326Sed c = _mm_unpacklo_pi16(a, b); 806193326Sed r = _mm_cvtpi32_ps(r, c); 807193326Sed 808193326Sed return r; 809193326Sed} 810193326Sed 811206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 812193326Sed_mm_cvtpu16_ps(__m64 a) 813193326Sed{ 814193326Sed __m64 b, c; 815193326Sed __m128 r; 816193326Sed 817193326Sed b = _mm_setzero_si64(); 818193326Sed c = _mm_unpackhi_pi16(a, b); 819193326Sed r = _mm_setzero_ps(); 820193326Sed r = _mm_cvtpi32_ps(r, c); 821193326Sed r = _mm_movelh_ps(r, r); 822193326Sed c = _mm_unpacklo_pi16(a, b); 823193326Sed r = _mm_cvtpi32_ps(r, c); 824193326Sed 825193326Sed return r; 826193326Sed} 827193326Sed 828206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 829193326Sed_mm_cvtpi8_ps(__m64 a) 830193326Sed{ 831193326Sed __m64 b; 832193326Sed 833193326Sed b = _mm_setzero_si64(); 834193326Sed b = _mm_cmpgt_pi8(b, a); 835193326Sed b = _mm_unpacklo_pi8(a, b); 836193326Sed 837193326Sed return _mm_cvtpi16_ps(b); 838193326Sed} 839193326Sed 840206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 841193326Sed_mm_cvtpu8_ps(__m64 a) 842193326Sed{ 843193326Sed __m64 b; 844193326Sed 845193326Sed b = _mm_setzero_si64(); 846193326Sed b = _mm_unpacklo_pi8(a, b); 847193326Sed 848193326Sed return _mm_cvtpi16_ps(b); 849193326Sed} 850193326Sed 851206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 852193326Sed_mm_cvtpi32x2_ps(__m64 a, __m64 b) 853193326Sed{ 854193326Sed __m128 c; 855193326Sed 856193326Sed c = _mm_setzero_ps(); 857193326Sed c = _mm_cvtpi32_ps(c, b); 858193326Sed c = _mm_movelh_ps(c, c); 859193326Sed 860193326Sed return _mm_cvtpi32_ps(c, a); 861193326Sed} 862193326Sed 863206084Srdivackystatic __inline__ __m64 
__attribute__((__always_inline__, __nodebug__)) 864193326Sed_mm_cvtps_pi16(__m128 a) 865193326Sed{ 866193326Sed __m64 b, c; 867193326Sed 868193326Sed b = _mm_cvtps_pi32(a); 869193326Sed a = _mm_movehl_ps(a, a); 870193326Sed c = _mm_cvtps_pi32(a); 871193326Sed 872193326Sed return _mm_packs_pi16(b, c); 873193326Sed} 874193326Sed 875206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 876193326Sed_mm_cvtps_pi8(__m128 a) 877193326Sed{ 878193326Sed __m64 b, c; 879193326Sed 880193326Sed b = _mm_cvtps_pi16(a); 881193326Sed c = _mm_setzero_si64(); 882193326Sed 883193326Sed return _mm_packs_pi16(b, c); 884193326Sed} 885193326Sed 886206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 887193326Sed_mm_movemask_ps(__m128 a) 888193326Sed{ 889193326Sed return __builtin_ia32_movmskps(a); 890193326Sed} 891193326Sed 892193326Sed#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 893193326Sed 894193326Sed#define _MM_EXCEPT_INVALID (0x0001) 895193326Sed#define _MM_EXCEPT_DENORM (0x0002) 896193326Sed#define _MM_EXCEPT_DIV_ZERO (0x0004) 897193326Sed#define _MM_EXCEPT_OVERFLOW (0x0008) 898193326Sed#define _MM_EXCEPT_UNDERFLOW (0x0010) 899193326Sed#define _MM_EXCEPT_INEXACT (0x0020) 900193326Sed#define _MM_EXCEPT_MASK (0x003f) 901193326Sed 902193326Sed#define _MM_MASK_INVALID (0x0080) 903193326Sed#define _MM_MASK_DENORM (0x0100) 904193326Sed#define _MM_MASK_DIV_ZERO (0x0200) 905193326Sed#define _MM_MASK_OVERFLOW (0x0400) 906193326Sed#define _MM_MASK_UNDERFLOW (0x0800) 907193326Sed#define _MM_MASK_INEXACT (0x1000) 908193326Sed#define _MM_MASK_MASK (0x1f80) 909193326Sed 910193326Sed#define _MM_ROUND_NEAREST (0x0000) 911193326Sed#define _MM_ROUND_DOWN (0x2000) 912193326Sed#define _MM_ROUND_UP (0x4000) 913193326Sed#define _MM_ROUND_TOWARD_ZERO (0x6000) 914193326Sed#define _MM_ROUND_MASK (0x6000) 915193326Sed 916193326Sed#define _MM_FLUSH_ZERO_MASK (0x8000) 917193326Sed#define _MM_FLUSH_ZERO_ON (0x8000) 
/* Flush-to-zero disabled is the ABSENCE of the FTZ bit, i.e. 0x0000.
   The previous value (0x8000) was identical to _MM_FLUSH_ZERO_ON, so
   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) wrongly enabled FTZ. */
#define _MM_FLUSH_ZERO_OFF (0x0000)

/* Read/modify helpers for the MXCSR fields defined above. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

/* In-register transpose of the 4x4 float matrix row0..row3. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
*/ 944212904Sdim#define _m_pextrw _mm_extract_pi16 945212904Sdim#define _m_pinsrw _mm_insert_pi16 946212904Sdim#define _m_pmaxsw _mm_max_pi16 947212904Sdim#define _m_pmaxub _mm_max_pu8 948212904Sdim#define _m_pminsw _mm_min_pi16 949212904Sdim#define _m_pminub _mm_min_pu8 950212904Sdim#define _m_pmovmskb _mm_movemask_pi8 951212904Sdim#define _m_pmulhuw _mm_mulhi_pu16 952212904Sdim#define _m_pshufw _mm_shuffle_pi16 953212904Sdim#define _m_maskmovq _mm_maskmove_si64 954212904Sdim#define _m_pavgb _mm_avg_pu8 955212904Sdim#define _m_pavgw _mm_avg_pu16 956212904Sdim#define _m_psadbw _mm_sad_pu8 957212904Sdim#define _m_ _mm_ 958212904Sdim#define _m_ _mm_ 959212904Sdim 960194179Sed/* Ugly hack for backwards-compatibility (compatible with gcc) */ 961194179Sed#ifdef __SSE2__ 962193326Sed#include <emmintrin.h> 963194179Sed#endif 964193326Sed 965193326Sed#endif /* __SSE__ */ 966193326Sed 967193326Sed#endif /* __XMMINTRIN_H */ 968