1193326Sed/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2193326Sed * 3193326Sed * Permission is hereby granted, free of charge, to any person obtaining a copy 4193326Sed * of this software and associated documentation files (the "Software"), to deal 5193326Sed * in the Software without restriction, including without limitation the rights 6193326Sed * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7193326Sed * copies of the Software, and to permit persons to whom the Software is 8193326Sed * furnished to do so, subject to the following conditions: 9193326Sed * 10193326Sed * The above copyright notice and this permission notice shall be included in 11193326Sed * all copies or substantial portions of the Software. 12193326Sed * 13193326Sed * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14193326Sed * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15193326Sed * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16193326Sed * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17193326Sed * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18193326Sed * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19193326Sed * THE SOFTWARE. 20193326Sed * 21193326Sed *===-----------------------------------------------------------------------=== 22193326Sed */ 23193326Sed 24193326Sed#ifndef __XMMINTRIN_H 25193326Sed#define __XMMINTRIN_H 26193326Sed 27193326Sed#ifndef __SSE__ 28193326Sed#error "SSE instruction set not enabled" 29193326Sed#else 30193326Sed 31193326Sed#include <mmintrin.h> 32193326Sed 33205408Srdivackytypedef int __v4si __attribute__((__vector_size__(16))); 34193326Sedtypedef float __v4sf __attribute__((__vector_size__(16))); 35193326Sedtypedef float __m128 __attribute__((__vector_size__(16))); 36193326Sed 37218893Sdim// This header should only be included in a hosted environment as it depends on 38218893Sdim// a standard library to provide allocation routines. 39218893Sdim#if __STDC_HOSTED__ 40193326Sed#include <mm_malloc.h> 41218893Sdim#endif 42193326Sed 43206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 44249423Sdim_mm_add_ss(__m128 __a, __m128 __b) 45193326Sed{ 46249423Sdim __a[0] += __b[0]; 47249423Sdim return __a; 48193326Sed} 49193326Sed 50206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 51249423Sdim_mm_add_ps(__m128 __a, __m128 __b) 52193326Sed{ 53249423Sdim return __a + __b; 54193326Sed} 55193326Sed 56206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 57249423Sdim_mm_sub_ss(__m128 __a, __m128 __b) 58193326Sed{ 59249423Sdim __a[0] -= __b[0]; 60249423Sdim return __a; 61193326Sed} 62193326Sed 63206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 64249423Sdim_mm_sub_ps(__m128 __a, __m128 __b) 65193326Sed{ 66249423Sdim return __a - __b; 67193326Sed} 68193326Sed 69206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 70249423Sdim_mm_mul_ss(__m128 __a, __m128 __b) 71193326Sed{ 72249423Sdim __a[0] *= __b[0]; 73249423Sdim return __a; 74193326Sed} 75193326Sed 76206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 77249423Sdim_mm_mul_ps(__m128 __a, __m128 __b) 78193326Sed{ 79249423Sdim return __a * __b; 80193326Sed} 81193326Sed 82206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 83249423Sdim_mm_div_ss(__m128 __a, __m128 __b) 84193326Sed{ 85249423Sdim __a[0] /= __b[0]; 86249423Sdim return __a; 87193326Sed} 88193326Sed 89206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 90249423Sdim_mm_div_ps(__m128 __a, __m128 __b) 91193326Sed{ 92249423Sdim return __a / __b; 93193326Sed} 94193326Sed 95206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 96249423Sdim_mm_sqrt_ss(__m128 __a) 97193326Sed{ 98249423Sdim __m128 __c = __builtin_ia32_sqrtss(__a); 99249423Sdim return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 100193326Sed} 101193326Sed 102206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 103249423Sdim_mm_sqrt_ps(__m128 __a) 104193326Sed{ 105249423Sdim return __builtin_ia32_sqrtps(__a); 106193326Sed} 107193326Sed 108206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 109249423Sdim_mm_rcp_ss(__m128 __a) 110193326Sed{ 111249423Sdim __m128 __c = __builtin_ia32_rcpss(__a); 112249423Sdim return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 113193326Sed} 114193326Sed 115206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 116249423Sdim_mm_rcp_ps(__m128 __a) 117193326Sed{ 118249423Sdim return __builtin_ia32_rcpps(__a); 119193326Sed} 120193326Sed 121206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 122249423Sdim_mm_rsqrt_ss(__m128 __a) 123193326Sed{ 124249423Sdim __m128 __c = __builtin_ia32_rsqrtss(__a); 125249423Sdim return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 126193326Sed} 127193326Sed 128206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 129249423Sdim_mm_rsqrt_ps(__m128 __a) 130193326Sed{ 131249423Sdim return __builtin_ia32_rsqrtps(__a); 132193326Sed} 133193326Sed 134206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 135249423Sdim_mm_min_ss(__m128 __a, __m128 __b) 136193326Sed{ 137249423Sdim return __builtin_ia32_minss(__a, __b); 138193326Sed} 139193326Sed 140206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 141249423Sdim_mm_min_ps(__m128 __a, __m128 __b) 142193326Sed{ 143249423Sdim return __builtin_ia32_minps(__a, __b); 144193326Sed} 145193326Sed 146206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 147249423Sdim_mm_max_ss(__m128 __a, __m128 __b) 148193326Sed{ 149249423Sdim return __builtin_ia32_maxss(__a, __b); 150193326Sed} 151193326Sed 152206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 153249423Sdim_mm_max_ps(__m128 __a, __m128 __b) 154193326Sed{ 155249423Sdim return __builtin_ia32_maxps(__a, __b); 156193326Sed} 157193326Sed 158206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 159249423Sdim_mm_and_ps(__m128 __a, __m128 __b) 160193326Sed{ 161249423Sdim return (__m128)((__v4si)__a & (__v4si)__b); 162193326Sed} 163193326Sed 164206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 165249423Sdim_mm_andnot_ps(__m128 __a, __m128 __b) 166193326Sed{ 167249423Sdim return (__m128)(~(__v4si)__a & (__v4si)__b); 168193326Sed} 169193326Sed 170206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 171249423Sdim_mm_or_ps(__m128 __a, __m128 __b) 172193326Sed{ 173249423Sdim return (__m128)((__v4si)__a | (__v4si)__b); 174193326Sed} 175193326Sed 176206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 177249423Sdim_mm_xor_ps(__m128 __a, __m128 __b) 178193326Sed{ 179249423Sdim return (__m128)((__v4si)__a ^ (__v4si)__b); 180193326Sed} 181193326Sed 182206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 183249423Sdim_mm_cmpeq_ss(__m128 __a, __m128 __b) 184193326Sed{ 185249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 0); 186193326Sed} 187193326Sed 188206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 189249423Sdim_mm_cmpeq_ps(__m128 __a, __m128 __b) 190193326Sed{ 191249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 0); 192193326Sed} 193193326Sed 194206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 195249423Sdim_mm_cmplt_ss(__m128 __a, __m128 __b) 196193326Sed{ 197249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 1); 198193326Sed} 199193326Sed 200206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 201249423Sdim_mm_cmplt_ps(__m128 __a, __m128 __b) 202193326Sed{ 203249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 1); 204193326Sed} 205193326Sed 206206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 207249423Sdim_mm_cmple_ss(__m128 __a, __m128 __b) 208193326Sed{ 209249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 2); 210193326Sed} 211193326Sed 212206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 213249423Sdim_mm_cmple_ps(__m128 __a, __m128 __b) 214193326Sed{ 215249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 2); 216193326Sed} 217193326Sed 218206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 219249423Sdim_mm_cmpgt_ss(__m128 __a, __m128 __b) 220193326Sed{ 221249423Sdim return (__m128)__builtin_ia32_cmpss(__b, __a, 1); 222193326Sed} 223193326Sed 224206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 225249423Sdim_mm_cmpgt_ps(__m128 __a, __m128 __b) 226193326Sed{ 227249423Sdim return (__m128)__builtin_ia32_cmpps(__b, __a, 1); 228193326Sed} 229193326Sed 230206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 231249423Sdim_mm_cmpge_ss(__m128 __a, __m128 __b) 232193326Sed{ 233249423Sdim return (__m128)__builtin_ia32_cmpss(__b, __a, 2); 234193326Sed} 235193326Sed 236206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 237249423Sdim_mm_cmpge_ps(__m128 __a, __m128 __b) 238193326Sed{ 239249423Sdim return (__m128)__builtin_ia32_cmpps(__b, __a, 2); 240193326Sed} 241193326Sed 242206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 243249423Sdim_mm_cmpneq_ss(__m128 __a, __m128 __b) 244193326Sed{ 245249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 4); 246193326Sed} 247193326Sed 248206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 249249423Sdim_mm_cmpneq_ps(__m128 __a, __m128 __b) 250193326Sed{ 251249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 4); 252193326Sed} 253193326Sed 254206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 255249423Sdim_mm_cmpnlt_ss(__m128 __a, __m128 __b) 256193326Sed{ 257249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 5); 258193326Sed} 259193326Sed 260206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 261249423Sdim_mm_cmpnlt_ps(__m128 __a, __m128 __b) 262193326Sed{ 263249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 5); 264193326Sed} 265193326Sed 266206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 267249423Sdim_mm_cmpnle_ss(__m128 __a, __m128 __b) 268193326Sed{ 269249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 6); 270193326Sed} 271193326Sed 272206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 273249423Sdim_mm_cmpnle_ps(__m128 __a, __m128 __b) 274193326Sed{ 275249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 6); 276193326Sed} 277193326Sed 278206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 279249423Sdim_mm_cmpngt_ss(__m128 __a, __m128 __b) 280193326Sed{ 281249423Sdim return (__m128)__builtin_ia32_cmpss(__b, __a, 5); 282193326Sed} 283193326Sed 284206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 285249423Sdim_mm_cmpngt_ps(__m128 __a, __m128 __b) 286193326Sed{ 287249423Sdim return (__m128)__builtin_ia32_cmpps(__b, __a, 5); 288193326Sed} 289193326Sed 290206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 291249423Sdim_mm_cmpnge_ss(__m128 __a, __m128 __b) 292193326Sed{ 293249423Sdim return (__m128)__builtin_ia32_cmpss(__b, __a, 6); 294193326Sed} 295193326Sed 296206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 297249423Sdim_mm_cmpnge_ps(__m128 __a, __m128 __b) 298193326Sed{ 299249423Sdim return (__m128)__builtin_ia32_cmpps(__b, __a, 6); 300193326Sed} 301193326Sed 302206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 303249423Sdim_mm_cmpord_ss(__m128 __a, __m128 __b) 304193326Sed{ 305249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 7); 306193326Sed} 307193326Sed 308206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 309249423Sdim_mm_cmpord_ps(__m128 __a, __m128 __b) 310193326Sed{ 311249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 7); 312193326Sed} 313193326Sed 314206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 315249423Sdim_mm_cmpunord_ss(__m128 __a, __m128 __b) 316193326Sed{ 317249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 3); 318193326Sed} 319193326Sed 320206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 321249423Sdim_mm_cmpunord_ps(__m128 __a, __m128 __b) 322193326Sed{ 323249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 3); 324193326Sed} 325193326Sed 326206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 327249423Sdim_mm_comieq_ss(__m128 __a, __m128 __b) 328193326Sed{ 329249423Sdim return __builtin_ia32_comieq(__a, __b); 330193326Sed} 331193326Sed 332206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 333249423Sdim_mm_comilt_ss(__m128 __a, __m128 __b) 334193326Sed{ 335249423Sdim return __builtin_ia32_comilt(__a, __b); 336193326Sed} 337193326Sed 338206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 339249423Sdim_mm_comile_ss(__m128 __a, __m128 __b) 340193326Sed{ 341249423Sdim return __builtin_ia32_comile(__a, __b); 342193326Sed} 343193326Sed 344206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 345249423Sdim_mm_comigt_ss(__m128 __a, __m128 __b) 346193326Sed{ 347249423Sdim return __builtin_ia32_comigt(__a, __b); 348193326Sed} 349193326Sed 350206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 351249423Sdim_mm_comige_ss(__m128 __a, __m128 __b) 352193326Sed{ 353249423Sdim return __builtin_ia32_comige(__a, __b); 354193326Sed} 355193326Sed 356206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 357249423Sdim_mm_comineq_ss(__m128 __a, __m128 __b) 358193326Sed{ 359249423Sdim return __builtin_ia32_comineq(__a, __b); 360193326Sed} 361193326Sed 362206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 363249423Sdim_mm_ucomieq_ss(__m128 __a, __m128 __b) 364193326Sed{ 365249423Sdim return __builtin_ia32_ucomieq(__a, __b); 366193326Sed} 367193326Sed 368206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 369249423Sdim_mm_ucomilt_ss(__m128 __a, __m128 __b) 370193326Sed{ 371249423Sdim return __builtin_ia32_ucomilt(__a, __b); 372193326Sed} 373193326Sed 374206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 375249423Sdim_mm_ucomile_ss(__m128 __a, __m128 __b) 376193326Sed{ 377249423Sdim return __builtin_ia32_ucomile(__a, __b); 378193326Sed} 379193326Sed 380206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 381249423Sdim_mm_ucomigt_ss(__m128 __a, __m128 __b) 382193326Sed{ 383249423Sdim return __builtin_ia32_ucomigt(__a, __b); 384193326Sed} 385193326Sed 386206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 387249423Sdim_mm_ucomige_ss(__m128 __a, __m128 __b) 388193326Sed{ 389249423Sdim return __builtin_ia32_ucomige(__a, __b); 390193326Sed} 391193326Sed 392206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 393249423Sdim_mm_ucomineq_ss(__m128 __a, __m128 __b) 394193326Sed{ 395249423Sdim return __builtin_ia32_ucomineq(__a, __b); 396193326Sed} 397193326Sed 398206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 399249423Sdim_mm_cvtss_si32(__m128 __a) 400193326Sed{ 401249423Sdim return __builtin_ia32_cvtss2si(__a); 402193326Sed} 403193326Sed 404206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 405249423Sdim_mm_cvt_ss2si(__m128 __a) 406204643Srdivacky{ 407249423Sdim return _mm_cvtss_si32(__a); 408204643Srdivacky} 409204643Srdivacky 410193576Sed#ifdef __x86_64__ 411193576Sed 412206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__)) 413249423Sdim_mm_cvtss_si64(__m128 __a) 414193326Sed{ 415249423Sdim return __builtin_ia32_cvtss2si64(__a); 416193326Sed} 417193326Sed 418193576Sed#endif 419193576Sed 420206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 421249423Sdim_mm_cvtps_pi32(__m128 __a) 422193326Sed{ 423249423Sdim return (__m64)__builtin_ia32_cvtps2pi(__a); 424193326Sed} 425193326Sed 426212904Sdimstatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 427249423Sdim_mm_cvt_ps2pi(__m128 __a) 428212904Sdim{ 429249423Sdim return _mm_cvtps_pi32(__a); 430212904Sdim} 431212904Sdim 432206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 433249423Sdim_mm_cvttss_si32(__m128 __a) 434193326Sed{ 435249423Sdim return __a[0]; 436193326Sed} 437193326Sed 438206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 439249423Sdim_mm_cvtt_ss2si(__m128 __a) 440204643Srdivacky{ 441249423Sdim return _mm_cvttss_si32(__a); 442204643Srdivacky} 443204643Srdivacky 444206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__)) 445249423Sdim_mm_cvttss_si64(__m128 __a) 446193326Sed{ 447249423Sdim return __a[0]; 448193326Sed} 449193326Sed 450206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 451249423Sdim_mm_cvttps_pi32(__m128 __a) 452193326Sed{ 453249423Sdim return (__m64)__builtin_ia32_cvttps2pi(__a); 454193326Sed} 455193326Sed 456212904Sdimstatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 457249423Sdim_mm_cvtt_ps2pi(__m128 __a) 458212904Sdim{ 459249423Sdim return _mm_cvttps_pi32(__a); 460212904Sdim} 461212904Sdim 462206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 463249423Sdim_mm_cvtsi32_ss(__m128 __a, int __b) 464193326Sed{ 465249423Sdim __a[0] = __b; 466249423Sdim return __a; 467193326Sed} 468193326Sed 469212904Sdimstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 470249423Sdim_mm_cvt_si2ss(__m128 __a, int __b) 471212904Sdim{ 472249423Sdim return _mm_cvtsi32_ss(__a, __b); 473212904Sdim} 474212904Sdim 475193326Sed#ifdef __x86_64__ 476193326Sed 477206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 478249423Sdim_mm_cvtsi64_ss(__m128 __a, long long __b) 479193326Sed{ 480249423Sdim __a[0] = __b; 481249423Sdim return __a; 482193326Sed} 483193326Sed 484193326Sed#endif 485193326Sed 486206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 487249423Sdim_mm_cvtpi32_ps(__m128 __a, __m64 __b) 488193326Sed{ 489249423Sdim return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b); 490193326Sed} 491193326Sed 492212904Sdimstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 493249423Sdim_mm_cvt_pi2ps(__m128 __a, __m64 __b) 494212904Sdim{ 495249423Sdim return _mm_cvtpi32_ps(__a, __b); 496212904Sdim} 497212904Sdim 498206084Srdivackystatic __inline__ float __attribute__((__always_inline__, __nodebug__)) 499249423Sdim_mm_cvtss_f32(__m128 __a) 500193326Sed{ 501249423Sdim return __a[0]; 502193326Sed} 503193326Sed 504206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 505249423Sdim_mm_loadh_pi(__m128 __a, const __m64 *__p) 506193326Sed{ 507226633Sdim typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 508226633Sdim struct __mm_loadh_pi_struct { 509249423Sdim __mm_loadh_pi_v2f32 __u; 510226633Sdim } __attribute__((__packed__, __may_alias__)); 511249423Sdim __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u; 512249423Sdim __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 513249423Sdim return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 514193326Sed} 515193326Sed 516206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 517249423Sdim_mm_loadl_pi(__m128 __a, const __m64 *__p) 518193326Sed{ 519226633Sdim typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 520226633Sdim struct __mm_loadl_pi_struct { 521249423Sdim __mm_loadl_pi_v2f32 __u; 522226633Sdim } __attribute__((__packed__, __may_alias__)); 523249423Sdim __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u; 524249423Sdim __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 525249423Sdim return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 526193326Sed} 527193326Sed 528206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 529249423Sdim_mm_load_ss(const float *__p) 530193326Sed{ 531226633Sdim struct __mm_load_ss_struct { 532249423Sdim float __u; 533226633Sdim } __attribute__((__packed__, __may_alias__)); 534249423Sdim float __u = ((struct __mm_load_ss_struct*)__p)->__u; 535249423Sdim return (__m128){ __u, 0, 0, 0 }; 536193326Sed} 537193326Sed 538206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 539249423Sdim_mm_load1_ps(const float *__p) 540193326Sed{ 541226633Sdim struct __mm_load1_ps_struct { 542249423Sdim float __u; 543226633Sdim } __attribute__((__packed__, __may_alias__)); 544249423Sdim float __u = ((struct __mm_load1_ps_struct*)__p)->__u; 545249423Sdim return (__m128){ __u, __u, __u, __u }; 546193326Sed} 547193326Sed 548193326Sed#define _mm_load_ps1(p) _mm_load1_ps(p) 549193326Sed 550206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 551249423Sdim_mm_load_ps(const float *__p) 552193326Sed{ 553249423Sdim return *(__m128*)__p; 554193326Sed} 555193326Sed 556206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 557249423Sdim_mm_loadu_ps(const float *__p) 558193326Sed{ 559223017Sdim struct __loadu_ps { 560249423Sdim __m128 __v; 561226633Sdim } __attribute__((__packed__, __may_alias__)); 562249423Sdim return ((struct __loadu_ps*)__p)->__v; 563193326Sed} 564193326Sed 565206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 566249423Sdim_mm_loadr_ps(const float *__p) 567193326Sed{ 568249423Sdim __m128 __a = _mm_load_ps(__p); 569249423Sdim return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); 570193326Sed} 571193326Sed 572206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 573249423Sdim_mm_set_ss(float __w) 574193326Sed{ 575249423Sdim return (__m128){ __w, 0, 0, 0 }; 576193326Sed} 577193326Sed 578206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 579249423Sdim_mm_set1_ps(float __w) 580193326Sed{ 581249423Sdim return (__m128){ __w, __w, __w, __w }; 582193326Sed} 583193326Sed 584193326Sed// Microsoft specific. 585206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 586249423Sdim_mm_set_ps1(float __w) 587193326Sed{ 588249423Sdim return _mm_set1_ps(__w); 589193326Sed} 590193326Sed 591206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 592249423Sdim_mm_set_ps(float __z, float __y, float __x, float __w) 593193326Sed{ 594249423Sdim return (__m128){ __w, __x, __y, __z }; 595193326Sed} 596193326Sed 597206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 598249423Sdim_mm_setr_ps(float __z, float __y, float __x, float __w) 599193326Sed{ 600249423Sdim return (__m128){ __z, __y, __x, __w }; 601193326Sed} 602193326Sed 603206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__)) 604193326Sed_mm_setzero_ps(void) 605193326Sed{ 606193326Sed return (__m128){ 0, 0, 0, 0 }; 607193326Sed} 608193326Sed 609206084Srdivackystatic __inline__ void __attribute__((__always_inline__)) 610249423Sdim_mm_storeh_pi(__m64 *__p, __m128 __a) 611193326Sed{ 612249423Sdim __builtin_ia32_storehps((__v2si *)__p, __a); 613193326Sed} 614193326Sed 615206084Srdivackystatic __inline__ void __attribute__((__always_inline__)) 616249423Sdim_mm_storel_pi(__m64 *__p, __m128 __a) 617193326Sed{ 618249423Sdim __builtin_ia32_storelps((__v2si *)__p, __a); 619193326Sed} 620193326Sed 621206084Srdivackystatic __inline__ void __attribute__((__always_inline__)) 622249423Sdim_mm_store_ss(float *__p, __m128 __a) 623193326Sed{ 624226633Sdim struct __mm_store_ss_struct { 625249423Sdim float __u; 626226633Sdim } __attribute__((__packed__, __may_alias__)); 627249423Sdim ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 628193326Sed} 629193326Sed 630206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 631249423Sdim_mm_storeu_ps(float *__p, __m128 __a) 632193326Sed{ 633249423Sdim __builtin_ia32_storeups(__p, __a); 634193326Sed} 635193326Sed 636206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 637249423Sdim_mm_store1_ps(float *__p, __m128 __a) 638193326Sed{ 639249423Sdim __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0); 640249423Sdim _mm_storeu_ps(__p, __a); 641193326Sed} 642193326Sed 643206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 644249423Sdim_mm_store_ps1(float *__p, __m128 __a) 645212904Sdim{ 646249423Sdim return _mm_store1_ps(__p, __a); 647212904Sdim} 648212904Sdim 649212904Sdimstatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 650249423Sdim_mm_store_ps(float *__p, __m128 __a) 651193326Sed{ 652249423Sdim *(__m128 *)__p = __a; 653193326Sed} 654193326Sed 655206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 656249423Sdim_mm_storer_ps(float *__p, __m128 __a) 657193326Sed{ 658249423Sdim __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0); 659249423Sdim _mm_store_ps(__p, __a); 660193326Sed} 661193326Sed 662212904Sdim#define _MM_HINT_T0 3 663193326Sed#define _MM_HINT_T1 2 664212904Sdim#define _MM_HINT_T2 1 665193326Sed#define _MM_HINT_NTA 0 666193326Sed 667210299Sed/* FIXME: We have to #define this because "sel" must be a constant integer, and 668193326Sed Sema doesn't do any form of constant propagation yet. */ 669193326Sed 670234353Sdim#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) 671193326Sed 672206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 673249423Sdim_mm_stream_pi(__m64 *__p, __m64 __a) 674193326Sed{ 675249423Sdim __builtin_ia32_movntq(__p, __a); 676193326Sed} 677193326Sed 678206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 679249423Sdim_mm_stream_ps(float *__p, __m128 __a) 680193326Sed{ 681249423Sdim __builtin_ia32_movntps(__p, __a); 682193326Sed} 683193326Sed 684206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 685193326Sed_mm_sfence(void) 686193326Sed{ 687193326Sed __builtin_ia32_sfence(); 688193326Sed} 689193326Sed 690206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 691249423Sdim_mm_extract_pi16(__m64 __a, int __n) 692193326Sed{ 693249423Sdim __v4hi __b = (__v4hi)__a; 694249423Sdim return (unsigned short)__b[__n & 3]; 695193326Sed} 696193326Sed 697206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 698249423Sdim_mm_insert_pi16(__m64 __a, int __d, int __n) 699193326Sed{ 700249423Sdim __v4hi __b = (__v4hi)__a; 701249423Sdim __b[__n & 3] = __d; 702249423Sdim return (__m64)__b; 703193326Sed} 704193326Sed 705206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 706249423Sdim_mm_max_pi16(__m64 __a, __m64 __b) 707193326Sed{ 708249423Sdim return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 709193326Sed} 710193326Sed 711206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 712249423Sdim_mm_max_pu8(__m64 __a, __m64 __b) 713193326Sed{ 714249423Sdim return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); 715193326Sed} 716193326Sed 717206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 718249423Sdim_mm_min_pi16(__m64 __a, __m64 __b) 719193326Sed{ 720249423Sdim return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 721193326Sed} 722193326Sed 723206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 724249423Sdim_mm_min_pu8(__m64 __a, __m64 __b) 725193326Sed{ 726249423Sdim return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 727193326Sed} 728193326Sed 729206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 730249423Sdim_mm_movemask_pi8(__m64 __a) 731193326Sed{ 732249423Sdim return __builtin_ia32_pmovmskb((__v8qi)__a); 733193326Sed} 734193326Sed 735206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 736249423Sdim_mm_mulhi_pu16(__m64 __a, __m64 __b) 737193326Sed{ 738249423Sdim return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 739193326Sed} 740193326Sed 741234353Sdim#define _mm_shuffle_pi16(a, n) __extension__ ({ \ 742234353Sdim __m64 __a = (a); \ 743234353Sdim (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); }) 744193326Sed 745206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 746249423Sdim_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 747193326Sed{ 748249423Sdim __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 749193326Sed} 750193326Sed 751206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 752249423Sdim_mm_avg_pu8(__m64 __a, __m64 __b) 753193326Sed{ 754249423Sdim return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 755193326Sed} 756193326Sed 757206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 758249423Sdim_mm_avg_pu16(__m64 __a, __m64 __b) 759193326Sed{ 760249423Sdim return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 761193326Sed} 762193326Sed 763206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 764249423Sdim_mm_sad_pu8(__m64 __a, __m64 __b) 765193326Sed{ 766249423Sdim return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 767193326Sed} 768193326Sed 769206084Srdivackystatic __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 770193326Sed_mm_getcsr(void) 771193326Sed{ 772193326Sed return __builtin_ia32_stmxcsr(); 773193326Sed} 774193326Sed 775206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 776249423Sdim_mm_setcsr(unsigned int __i) 777193326Sed{ 778249423Sdim __builtin_ia32_ldmxcsr(__i); 779193326Sed} 780193326Sed 781234353Sdim#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ 782234353Sdim __m128 __a = (a); \ 783234353Sdim __m128 __b = (b); \ 784234353Sdim (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \ 785234353Sdim (mask) & 0x3, ((mask) & 0xc) >> 2, \ 786234353Sdim (((mask) & 0x30) >> 4) + 4, \ 787234353Sdim (((mask) & 0xc0) >> 6) + 4); }) 788193326Sed 789206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 790249423Sdim_mm_unpackhi_ps(__m128 __a, __m128 __b) 791193326Sed{ 792249423Sdim return __builtin_shufflevector(__a, __b, 2, 6, 3, 7); 793193326Sed} 794193326Sed 795206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 796249423Sdim_mm_unpacklo_ps(__m128 __a, __m128 __b) 797193326Sed{ 798249423Sdim return __builtin_shufflevector(__a, __b, 0, 4, 1, 5); 799193326Sed} 800193326Sed 801206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 802249423Sdim_mm_move_ss(__m128 __a, __m128 __b) 803193326Sed{ 804249423Sdim return __builtin_shufflevector(__a, __b, 4, 1, 2, 3); 805193326Sed} 806193326Sed 807206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 808249423Sdim_mm_movehl_ps(__m128 __a, __m128 __b) 809193326Sed{ 810249423Sdim return __builtin_shufflevector(__a, __b, 6, 7, 2, 3); 811193326Sed} 812193326Sed 813206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 814249423Sdim_mm_movelh_ps(__m128 __a, __m128 __b) 815193326Sed{ 816249423Sdim return __builtin_shufflevector(__a, __b, 0, 1, 4, 5); 817193326Sed} 818193326Sed 819206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 820249423Sdim_mm_cvtpi16_ps(__m64 __a) 821193326Sed{ 822249423Sdim __m64 __b, __c; 823249423Sdim __m128 __r; 824193326Sed 825249423Sdim __b = _mm_setzero_si64(); 826249423Sdim __b = _mm_cmpgt_pi16(__b, __a); 827249423Sdim __c = _mm_unpackhi_pi16(__a, __b); 828249423Sdim __r = _mm_setzero_ps(); 829249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 830249423Sdim __r = _mm_movelh_ps(__r, __r); 831249423Sdim __c = _mm_unpacklo_pi16(__a, __b); 832249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 833193326Sed 834249423Sdim return __r; 835193326Sed} 836193326Sed 837206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 838249423Sdim_mm_cvtpu16_ps(__m64 __a) 839193326Sed{ 840249423Sdim __m64 __b, __c; 841249423Sdim __m128 __r; 842193326Sed 843249423Sdim __b = _mm_setzero_si64(); 844249423Sdim __c = _mm_unpackhi_pi16(__a, __b); 845249423Sdim __r = _mm_setzero_ps(); 846249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 847249423Sdim __r = _mm_movelh_ps(__r, __r); 848249423Sdim __c = _mm_unpacklo_pi16(__a, __b); 849249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 850193326Sed 851249423Sdim return __r; 852193326Sed} 853193326Sed 854206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 855249423Sdim_mm_cvtpi8_ps(__m64 __a) 856193326Sed{ 857249423Sdim __m64 __b; 858193326Sed 859249423Sdim __b = _mm_setzero_si64(); 860249423Sdim __b = _mm_cmpgt_pi8(__b, __a); 861249423Sdim __b = _mm_unpacklo_pi8(__a, __b); 862193326Sed 863249423Sdim return _mm_cvtpi16_ps(__b); 864193326Sed} 865193326Sed 866206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 867249423Sdim_mm_cvtpu8_ps(__m64 __a) 868193326Sed{ 869249423Sdim __m64 __b; 870193326Sed 871249423Sdim __b = _mm_setzero_si64(); 872249423Sdim __b = _mm_unpacklo_pi8(__a, __b); 873193326Sed 874249423Sdim return _mm_cvtpi16_ps(__b); 875193326Sed} 876193326Sed 877206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 878249423Sdim_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 879193326Sed{ 880249423Sdim __m128 __c; 881193326Sed 882249423Sdim __c = _mm_setzero_ps(); 883249423Sdim __c = _mm_cvtpi32_ps(__c, __b); 884249423Sdim __c = _mm_movelh_ps(__c, __c); 885193326Sed 886249423Sdim return _mm_cvtpi32_ps(__c, __a); 887193326Sed} 888193326Sed 889206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 890249423Sdim_mm_cvtps_pi16(__m128 __a) 891193326Sed{ 892249423Sdim __m64 __b, __c; 893193326Sed 894249423Sdim __b = _mm_cvtps_pi32(__a); 895249423Sdim __a = _mm_movehl_ps(__a, __a); 896249423Sdim __c = _mm_cvtps_pi32(__a); 897193326Sed 898249423Sdim return _mm_packs_pi16(__b, __c); 899193326Sed} 900193326Sed 901206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 902249423Sdim_mm_cvtps_pi8(__m128 __a) 903193326Sed{ 904249423Sdim __m64 __b, __c; 905193326Sed 906249423Sdim __b = _mm_cvtps_pi16(__a); 907249423Sdim __c = _mm_setzero_si64(); 908193326Sed 909249423Sdim return _mm_packs_pi16(__b, __c); 910193326Sed} 911193326Sed 912206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 913249423Sdim_mm_movemask_ps(__m128 __a) 914193326Sed{ 915249423Sdim return __builtin_ia32_movmskps(__a); 916193326Sed} 917193326Sed 918193326Sed#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 919193326Sed 920193326Sed#define _MM_EXCEPT_INVALID (0x0001) 921193326Sed#define _MM_EXCEPT_DENORM (0x0002) 922193326Sed#define _MM_EXCEPT_DIV_ZERO (0x0004) 923193326Sed#define _MM_EXCEPT_OVERFLOW (0x0008) 924193326Sed#define _MM_EXCEPT_UNDERFLOW (0x0010) 925193326Sed#define _MM_EXCEPT_INEXACT (0x0020) 926193326Sed#define _MM_EXCEPT_MASK (0x003f) 927193326Sed 928193326Sed#define _MM_MASK_INVALID (0x0080) 929193326Sed#define _MM_MASK_DENORM (0x0100) 930193326Sed#define _MM_MASK_DIV_ZERO (0x0200) 931193326Sed#define _MM_MASK_OVERFLOW (0x0400) 932193326Sed#define _MM_MASK_UNDERFLOW (0x0800) 933193326Sed#define _MM_MASK_INEXACT (0x1000) 934193326Sed#define _MM_MASK_MASK (0x1f80) 935193326Sed 936193326Sed#define _MM_ROUND_NEAREST (0x0000) 937193326Sed#define _MM_ROUND_DOWN (0x2000) 938193326Sed#define _MM_ROUND_UP (0x4000) 939193326Sed#define _MM_ROUND_TOWARD_ZERO (0x6000) 940193326Sed#define _MM_ROUND_MASK (0x6000) 941193326Sed 942193326Sed#define _MM_FLUSH_ZERO_MASK (0x8000) 943193326Sed#define _MM_FLUSH_ZERO_ON (0x8000) 944234353Sdim#define _MM_FLUSH_ZERO_OFF (0x0000) 945193326Sed 946193326Sed#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 947193326Sed#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 948193326Sed#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 949193326Sed#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 950193326Sed 951193326Sed#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 952193326Sed#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 953193326Sed#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 954193326Sed#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 955193326Sed 956193326Sed#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 957193326Seddo { \ 958193326Sed __m128 tmp3, tmp2, tmp1, tmp0; \ 959193326Sed tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 960193326Sed tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 961193326Sed tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 962193326Sed tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 963193326Sed (row0) = _mm_movelh_ps(tmp0, tmp2); \ 964193326Sed (row1) = _mm_movehl_ps(tmp2, tmp0); \ 965193326Sed (row2) = _mm_movelh_ps(tmp1, tmp3); \ 966203955Srdivacky (row3) = _mm_movehl_ps(tmp3, tmp1); \ 967193326Sed} while (0) 968193326Sed 969212904Sdim/* Aliases for compatibility. */ 970212904Sdim#define _m_pextrw _mm_extract_pi16 971212904Sdim#define _m_pinsrw _mm_insert_pi16 972212904Sdim#define _m_pmaxsw _mm_max_pi16 973212904Sdim#define _m_pmaxub _mm_max_pu8 974212904Sdim#define _m_pminsw _mm_min_pi16 975212904Sdim#define _m_pminub _mm_min_pu8 976212904Sdim#define _m_pmovmskb _mm_movemask_pi8 977212904Sdim#define _m_pmulhuw _mm_mulhi_pu16 978212904Sdim#define _m_pshufw _mm_shuffle_pi16 979212904Sdim#define _m_maskmovq _mm_maskmove_si64 980212904Sdim#define _m_pavgb _mm_avg_pu8 981212904Sdim#define _m_pavgw _mm_avg_pu16 982212904Sdim#define _m_psadbw _mm_sad_pu8 983212904Sdim#define _m_ _mm_ 984212904Sdim#define _m_ _mm_ 985212904Sdim 986249423Sdim#if !__has_feature(modules) 987194179Sed/* Ugly hack for backwards-compatibility (compatible with gcc) */ 988194179Sed#ifdef __SSE2__ 989193326Sed#include <emmintrin.h> 990194179Sed#endif 991249423Sdim#endif 992193326Sed 993193326Sed#endif /* __SSE__ */ 994193326Sed 995193326Sed#endif /* __XMMINTRIN_H */ 996