/* xmmintrin.h revision 266674 */
1193326Sed/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2193326Sed * 3193326Sed * Permission is hereby granted, free of charge, to any person obtaining a copy 4193326Sed * of this software and associated documentation files (the "Software"), to deal 5193326Sed * in the Software without restriction, including without limitation the rights 6193326Sed * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7193326Sed * copies of the Software, and to permit persons to whom the Software is 8193326Sed * furnished to do so, subject to the following conditions: 9193326Sed * 10193326Sed * The above copyright notice and this permission notice shall be included in 11193326Sed * all copies or substantial portions of the Software. 12193326Sed * 13193326Sed * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14193326Sed * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15193326Sed * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16193326Sed * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17193326Sed * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18193326Sed * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19193326Sed * THE SOFTWARE. 
20193326Sed * 21193326Sed *===-----------------------------------------------------------------------=== 22193326Sed */ 23193326Sed 24193326Sed#ifndef __XMMINTRIN_H 25193326Sed#define __XMMINTRIN_H 26193326Sed 27193326Sed#ifndef __SSE__ 28193326Sed#error "SSE instruction set not enabled" 29193326Sed#else 30193326Sed 31193326Sed#include <mmintrin.h> 32193326Sed 33205408Srdivackytypedef int __v4si __attribute__((__vector_size__(16))); 34193326Sedtypedef float __v4sf __attribute__((__vector_size__(16))); 35193326Sedtypedef float __m128 __attribute__((__vector_size__(16))); 36193326Sed 37218893Sdim// This header should only be included in a hosted environment as it depends on 38218893Sdim// a standard library to provide allocation routines. 39218893Sdim#if __STDC_HOSTED__ 40193326Sed#include <mm_malloc.h> 41218893Sdim#endif 42193326Sed 43206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 44249423Sdim_mm_add_ss(__m128 __a, __m128 __b) 45193326Sed{ 46249423Sdim __a[0] += __b[0]; 47249423Sdim return __a; 48193326Sed} 49193326Sed 50206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 51249423Sdim_mm_add_ps(__m128 __a, __m128 __b) 52193326Sed{ 53249423Sdim return __a + __b; 54193326Sed} 55193326Sed 56206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 57249423Sdim_mm_sub_ss(__m128 __a, __m128 __b) 58193326Sed{ 59249423Sdim __a[0] -= __b[0]; 60249423Sdim return __a; 61193326Sed} 62193326Sed 63206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 64249423Sdim_mm_sub_ps(__m128 __a, __m128 __b) 65193326Sed{ 66249423Sdim return __a - __b; 67193326Sed} 68193326Sed 69206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 70249423Sdim_mm_mul_ss(__m128 __a, __m128 __b) 71193326Sed{ 72249423Sdim __a[0] *= __b[0]; 73249423Sdim return __a; 74193326Sed} 75193326Sed 76206084Srdivackystatic __inline__ __m128 
__attribute__((__always_inline__, __nodebug__)) 77249423Sdim_mm_mul_ps(__m128 __a, __m128 __b) 78193326Sed{ 79249423Sdim return __a * __b; 80193326Sed} 81193326Sed 82206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 83249423Sdim_mm_div_ss(__m128 __a, __m128 __b) 84193326Sed{ 85249423Sdim __a[0] /= __b[0]; 86249423Sdim return __a; 87193326Sed} 88193326Sed 89206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 90249423Sdim_mm_div_ps(__m128 __a, __m128 __b) 91193326Sed{ 92249423Sdim return __a / __b; 93193326Sed} 94193326Sed 95206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 96249423Sdim_mm_sqrt_ss(__m128 __a) 97193326Sed{ 98249423Sdim __m128 __c = __builtin_ia32_sqrtss(__a); 99249423Sdim return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 100193326Sed} 101193326Sed 102206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 103249423Sdim_mm_sqrt_ps(__m128 __a) 104193326Sed{ 105249423Sdim return __builtin_ia32_sqrtps(__a); 106193326Sed} 107193326Sed 108206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 109249423Sdim_mm_rcp_ss(__m128 __a) 110193326Sed{ 111249423Sdim __m128 __c = __builtin_ia32_rcpss(__a); 112249423Sdim return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 113193326Sed} 114193326Sed 115206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 116249423Sdim_mm_rcp_ps(__m128 __a) 117193326Sed{ 118249423Sdim return __builtin_ia32_rcpps(__a); 119193326Sed} 120193326Sed 121206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 122249423Sdim_mm_rsqrt_ss(__m128 __a) 123193326Sed{ 124249423Sdim __m128 __c = __builtin_ia32_rsqrtss(__a); 125249423Sdim return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 126193326Sed} 127193326Sed 128206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 
129249423Sdim_mm_rsqrt_ps(__m128 __a) 130193326Sed{ 131249423Sdim return __builtin_ia32_rsqrtps(__a); 132193326Sed} 133193326Sed 134206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 135249423Sdim_mm_min_ss(__m128 __a, __m128 __b) 136193326Sed{ 137249423Sdim return __builtin_ia32_minss(__a, __b); 138193326Sed} 139193326Sed 140206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 141249423Sdim_mm_min_ps(__m128 __a, __m128 __b) 142193326Sed{ 143249423Sdim return __builtin_ia32_minps(__a, __b); 144193326Sed} 145193326Sed 146206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 147249423Sdim_mm_max_ss(__m128 __a, __m128 __b) 148193326Sed{ 149249423Sdim return __builtin_ia32_maxss(__a, __b); 150193326Sed} 151193326Sed 152206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 153249423Sdim_mm_max_ps(__m128 __a, __m128 __b) 154193326Sed{ 155249423Sdim return __builtin_ia32_maxps(__a, __b); 156193326Sed} 157193326Sed 158206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 159249423Sdim_mm_and_ps(__m128 __a, __m128 __b) 160193326Sed{ 161249423Sdim return (__m128)((__v4si)__a & (__v4si)__b); 162193326Sed} 163193326Sed 164206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 165249423Sdim_mm_andnot_ps(__m128 __a, __m128 __b) 166193326Sed{ 167249423Sdim return (__m128)(~(__v4si)__a & (__v4si)__b); 168193326Sed} 169193326Sed 170206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 171249423Sdim_mm_or_ps(__m128 __a, __m128 __b) 172193326Sed{ 173249423Sdim return (__m128)((__v4si)__a | (__v4si)__b); 174193326Sed} 175193326Sed 176206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 177249423Sdim_mm_xor_ps(__m128 __a, __m128 __b) 178193326Sed{ 179249423Sdim return (__m128)((__v4si)__a ^ (__v4si)__b); 
180193326Sed} 181193326Sed 182206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 183249423Sdim_mm_cmpeq_ss(__m128 __a, __m128 __b) 184193326Sed{ 185249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 0); 186193326Sed} 187193326Sed 188206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 189249423Sdim_mm_cmpeq_ps(__m128 __a, __m128 __b) 190193326Sed{ 191249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 0); 192193326Sed} 193193326Sed 194206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 195249423Sdim_mm_cmplt_ss(__m128 __a, __m128 __b) 196193326Sed{ 197249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 1); 198193326Sed} 199193326Sed 200206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 201249423Sdim_mm_cmplt_ps(__m128 __a, __m128 __b) 202193326Sed{ 203249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 1); 204193326Sed} 205193326Sed 206206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 207249423Sdim_mm_cmple_ss(__m128 __a, __m128 __b) 208193326Sed{ 209249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 2); 210193326Sed} 211193326Sed 212206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 213249423Sdim_mm_cmple_ps(__m128 __a, __m128 __b) 214193326Sed{ 215249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 2); 216193326Sed} 217193326Sed 218206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 219249423Sdim_mm_cmpgt_ss(__m128 __a, __m128 __b) 220193326Sed{ 221261991Sdim return (__m128)__builtin_shufflevector(__a, 222261991Sdim __builtin_ia32_cmpss(__b, __a, 1), 223261991Sdim 4, 1, 2, 3); 224193326Sed} 225193326Sed 226206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 227249423Sdim_mm_cmpgt_ps(__m128 __a, __m128 __b) 228193326Sed{ 229249423Sdim return 
(__m128)__builtin_ia32_cmpps(__b, __a, 1); 230193326Sed} 231193326Sed 232206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 233249423Sdim_mm_cmpge_ss(__m128 __a, __m128 __b) 234193326Sed{ 235261991Sdim return (__m128)__builtin_shufflevector(__a, 236261991Sdim __builtin_ia32_cmpss(__b, __a, 2), 237261991Sdim 4, 1, 2, 3); 238193326Sed} 239193326Sed 240206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 241249423Sdim_mm_cmpge_ps(__m128 __a, __m128 __b) 242193326Sed{ 243249423Sdim return (__m128)__builtin_ia32_cmpps(__b, __a, 2); 244193326Sed} 245193326Sed 246206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 247249423Sdim_mm_cmpneq_ss(__m128 __a, __m128 __b) 248193326Sed{ 249249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 4); 250193326Sed} 251193326Sed 252206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 253249423Sdim_mm_cmpneq_ps(__m128 __a, __m128 __b) 254193326Sed{ 255249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 4); 256193326Sed} 257193326Sed 258206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 259249423Sdim_mm_cmpnlt_ss(__m128 __a, __m128 __b) 260193326Sed{ 261249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 5); 262193326Sed} 263193326Sed 264206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 265249423Sdim_mm_cmpnlt_ps(__m128 __a, __m128 __b) 266193326Sed{ 267249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 5); 268193326Sed} 269193326Sed 270206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 271249423Sdim_mm_cmpnle_ss(__m128 __a, __m128 __b) 272193326Sed{ 273249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 6); 274193326Sed} 275193326Sed 276206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 277249423Sdim_mm_cmpnle_ps(__m128 __a, 
__m128 __b) 278193326Sed{ 279249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 6); 280193326Sed} 281193326Sed 282206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 283249423Sdim_mm_cmpngt_ss(__m128 __a, __m128 __b) 284193326Sed{ 285261991Sdim return (__m128)__builtin_shufflevector(__a, 286261991Sdim __builtin_ia32_cmpss(__b, __a, 5), 287261991Sdim 4, 1, 2, 3); 288193326Sed} 289193326Sed 290206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 291249423Sdim_mm_cmpngt_ps(__m128 __a, __m128 __b) 292193326Sed{ 293249423Sdim return (__m128)__builtin_ia32_cmpps(__b, __a, 5); 294193326Sed} 295193326Sed 296206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 297249423Sdim_mm_cmpnge_ss(__m128 __a, __m128 __b) 298193326Sed{ 299261991Sdim return (__m128)__builtin_shufflevector(__a, 300261991Sdim __builtin_ia32_cmpss(__b, __a, 6), 301261991Sdim 4, 1, 2, 3); 302193326Sed} 303193326Sed 304206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 305249423Sdim_mm_cmpnge_ps(__m128 __a, __m128 __b) 306193326Sed{ 307249423Sdim return (__m128)__builtin_ia32_cmpps(__b, __a, 6); 308193326Sed} 309193326Sed 310206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 311249423Sdim_mm_cmpord_ss(__m128 __a, __m128 __b) 312193326Sed{ 313249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 7); 314193326Sed} 315193326Sed 316206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 317249423Sdim_mm_cmpord_ps(__m128 __a, __m128 __b) 318193326Sed{ 319249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 7); 320193326Sed} 321193326Sed 322206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 323249423Sdim_mm_cmpunord_ss(__m128 __a, __m128 __b) 324193326Sed{ 325249423Sdim return (__m128)__builtin_ia32_cmpss(__a, __b, 3); 326193326Sed} 327193326Sed 
328206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 329249423Sdim_mm_cmpunord_ps(__m128 __a, __m128 __b) 330193326Sed{ 331249423Sdim return (__m128)__builtin_ia32_cmpps(__a, __b, 3); 332193326Sed} 333193326Sed 334206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 335249423Sdim_mm_comieq_ss(__m128 __a, __m128 __b) 336193326Sed{ 337249423Sdim return __builtin_ia32_comieq(__a, __b); 338193326Sed} 339193326Sed 340206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 341249423Sdim_mm_comilt_ss(__m128 __a, __m128 __b) 342193326Sed{ 343249423Sdim return __builtin_ia32_comilt(__a, __b); 344193326Sed} 345193326Sed 346206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 347249423Sdim_mm_comile_ss(__m128 __a, __m128 __b) 348193326Sed{ 349249423Sdim return __builtin_ia32_comile(__a, __b); 350193326Sed} 351193326Sed 352206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 353249423Sdim_mm_comigt_ss(__m128 __a, __m128 __b) 354193326Sed{ 355249423Sdim return __builtin_ia32_comigt(__a, __b); 356193326Sed} 357193326Sed 358206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 359249423Sdim_mm_comige_ss(__m128 __a, __m128 __b) 360193326Sed{ 361249423Sdim return __builtin_ia32_comige(__a, __b); 362193326Sed} 363193326Sed 364206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 365249423Sdim_mm_comineq_ss(__m128 __a, __m128 __b) 366193326Sed{ 367249423Sdim return __builtin_ia32_comineq(__a, __b); 368193326Sed} 369193326Sed 370206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 371249423Sdim_mm_ucomieq_ss(__m128 __a, __m128 __b) 372193326Sed{ 373249423Sdim return __builtin_ia32_ucomieq(__a, __b); 374193326Sed} 375193326Sed 376206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 
377249423Sdim_mm_ucomilt_ss(__m128 __a, __m128 __b) 378193326Sed{ 379249423Sdim return __builtin_ia32_ucomilt(__a, __b); 380193326Sed} 381193326Sed 382206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 383249423Sdim_mm_ucomile_ss(__m128 __a, __m128 __b) 384193326Sed{ 385249423Sdim return __builtin_ia32_ucomile(__a, __b); 386193326Sed} 387193326Sed 388206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 389249423Sdim_mm_ucomigt_ss(__m128 __a, __m128 __b) 390193326Sed{ 391249423Sdim return __builtin_ia32_ucomigt(__a, __b); 392193326Sed} 393193326Sed 394206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 395249423Sdim_mm_ucomige_ss(__m128 __a, __m128 __b) 396193326Sed{ 397249423Sdim return __builtin_ia32_ucomige(__a, __b); 398193326Sed} 399193326Sed 400206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 401249423Sdim_mm_ucomineq_ss(__m128 __a, __m128 __b) 402193326Sed{ 403249423Sdim return __builtin_ia32_ucomineq(__a, __b); 404193326Sed} 405193326Sed 406206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 407249423Sdim_mm_cvtss_si32(__m128 __a) 408193326Sed{ 409249423Sdim return __builtin_ia32_cvtss2si(__a); 410193326Sed} 411193326Sed 412206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 413249423Sdim_mm_cvt_ss2si(__m128 __a) 414204643Srdivacky{ 415249423Sdim return _mm_cvtss_si32(__a); 416204643Srdivacky} 417204643Srdivacky 418193576Sed#ifdef __x86_64__ 419193576Sed 420206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__)) 421249423Sdim_mm_cvtss_si64(__m128 __a) 422193326Sed{ 423249423Sdim return __builtin_ia32_cvtss2si64(__a); 424193326Sed} 425193326Sed 426193576Sed#endif 427193576Sed 428206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 429249423Sdim_mm_cvtps_pi32(__m128 __a) 430193326Sed{ 
431249423Sdim return (__m64)__builtin_ia32_cvtps2pi(__a); 432193326Sed} 433193326Sed 434212904Sdimstatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 435249423Sdim_mm_cvt_ps2pi(__m128 __a) 436212904Sdim{ 437249423Sdim return _mm_cvtps_pi32(__a); 438212904Sdim} 439212904Sdim 440206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 441249423Sdim_mm_cvttss_si32(__m128 __a) 442193326Sed{ 443249423Sdim return __a[0]; 444193326Sed} 445193326Sed 446206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 447249423Sdim_mm_cvtt_ss2si(__m128 __a) 448204643Srdivacky{ 449249423Sdim return _mm_cvttss_si32(__a); 450204643Srdivacky} 451204643Srdivacky 452206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__)) 453249423Sdim_mm_cvttss_si64(__m128 __a) 454193326Sed{ 455249423Sdim return __a[0]; 456193326Sed} 457193326Sed 458206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 459249423Sdim_mm_cvttps_pi32(__m128 __a) 460193326Sed{ 461249423Sdim return (__m64)__builtin_ia32_cvttps2pi(__a); 462193326Sed} 463193326Sed 464212904Sdimstatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 465249423Sdim_mm_cvtt_ps2pi(__m128 __a) 466212904Sdim{ 467249423Sdim return _mm_cvttps_pi32(__a); 468212904Sdim} 469212904Sdim 470206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 471249423Sdim_mm_cvtsi32_ss(__m128 __a, int __b) 472193326Sed{ 473249423Sdim __a[0] = __b; 474249423Sdim return __a; 475193326Sed} 476193326Sed 477212904Sdimstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 478249423Sdim_mm_cvt_si2ss(__m128 __a, int __b) 479212904Sdim{ 480249423Sdim return _mm_cvtsi32_ss(__a, __b); 481212904Sdim} 482212904Sdim 483193326Sed#ifdef __x86_64__ 484193326Sed 485206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 
486249423Sdim_mm_cvtsi64_ss(__m128 __a, long long __b) 487193326Sed{ 488249423Sdim __a[0] = __b; 489249423Sdim return __a; 490193326Sed} 491193326Sed 492193326Sed#endif 493193326Sed 494206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 495249423Sdim_mm_cvtpi32_ps(__m128 __a, __m64 __b) 496193326Sed{ 497249423Sdim return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b); 498193326Sed} 499193326Sed 500212904Sdimstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 501249423Sdim_mm_cvt_pi2ps(__m128 __a, __m64 __b) 502212904Sdim{ 503249423Sdim return _mm_cvtpi32_ps(__a, __b); 504212904Sdim} 505212904Sdim 506206084Srdivackystatic __inline__ float __attribute__((__always_inline__, __nodebug__)) 507249423Sdim_mm_cvtss_f32(__m128 __a) 508193326Sed{ 509249423Sdim return __a[0]; 510193326Sed} 511193326Sed 512206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 513249423Sdim_mm_loadh_pi(__m128 __a, const __m64 *__p) 514193326Sed{ 515226633Sdim typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 516226633Sdim struct __mm_loadh_pi_struct { 517249423Sdim __mm_loadh_pi_v2f32 __u; 518226633Sdim } __attribute__((__packed__, __may_alias__)); 519249423Sdim __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u; 520249423Sdim __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 521249423Sdim return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 522193326Sed} 523193326Sed 524206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 525249423Sdim_mm_loadl_pi(__m128 __a, const __m64 *__p) 526193326Sed{ 527226633Sdim typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 528226633Sdim struct __mm_loadl_pi_struct { 529249423Sdim __mm_loadl_pi_v2f32 __u; 530226633Sdim } __attribute__((__packed__, __may_alias__)); 531249423Sdim __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u; 532249423Sdim __m128 __bb = 
__builtin_shufflevector(__b, __b, 0, 1, 0, 1); 533249423Sdim return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 534193326Sed} 535193326Sed 536206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 537249423Sdim_mm_load_ss(const float *__p) 538193326Sed{ 539226633Sdim struct __mm_load_ss_struct { 540249423Sdim float __u; 541226633Sdim } __attribute__((__packed__, __may_alias__)); 542249423Sdim float __u = ((struct __mm_load_ss_struct*)__p)->__u; 543249423Sdim return (__m128){ __u, 0, 0, 0 }; 544193326Sed} 545193326Sed 546206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 547249423Sdim_mm_load1_ps(const float *__p) 548193326Sed{ 549226633Sdim struct __mm_load1_ps_struct { 550249423Sdim float __u; 551226633Sdim } __attribute__((__packed__, __may_alias__)); 552249423Sdim float __u = ((struct __mm_load1_ps_struct*)__p)->__u; 553249423Sdim return (__m128){ __u, __u, __u, __u }; 554193326Sed} 555193326Sed 556193326Sed#define _mm_load_ps1(p) _mm_load1_ps(p) 557193326Sed 558206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 559249423Sdim_mm_load_ps(const float *__p) 560193326Sed{ 561249423Sdim return *(__m128*)__p; 562193326Sed} 563193326Sed 564206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 565249423Sdim_mm_loadu_ps(const float *__p) 566193326Sed{ 567223017Sdim struct __loadu_ps { 568249423Sdim __m128 __v; 569226633Sdim } __attribute__((__packed__, __may_alias__)); 570249423Sdim return ((struct __loadu_ps*)__p)->__v; 571193326Sed} 572193326Sed 573206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 574249423Sdim_mm_loadr_ps(const float *__p) 575193326Sed{ 576249423Sdim __m128 __a = _mm_load_ps(__p); 577249423Sdim return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); 578193326Sed} 579193326Sed 580206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 
581249423Sdim_mm_set_ss(float __w) 582193326Sed{ 583249423Sdim return (__m128){ __w, 0, 0, 0 }; 584193326Sed} 585193326Sed 586206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 587249423Sdim_mm_set1_ps(float __w) 588193326Sed{ 589249423Sdim return (__m128){ __w, __w, __w, __w }; 590193326Sed} 591193326Sed 592193326Sed// Microsoft specific. 593206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 594249423Sdim_mm_set_ps1(float __w) 595193326Sed{ 596249423Sdim return _mm_set1_ps(__w); 597193326Sed} 598193326Sed 599206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 600249423Sdim_mm_set_ps(float __z, float __y, float __x, float __w) 601193326Sed{ 602249423Sdim return (__m128){ __w, __x, __y, __z }; 603193326Sed} 604193326Sed 605206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 606249423Sdim_mm_setr_ps(float __z, float __y, float __x, float __w) 607193326Sed{ 608249423Sdim return (__m128){ __z, __y, __x, __w }; 609193326Sed} 610193326Sed 611206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__)) 612193326Sed_mm_setzero_ps(void) 613193326Sed{ 614193326Sed return (__m128){ 0, 0, 0, 0 }; 615193326Sed} 616193326Sed 617206084Srdivackystatic __inline__ void __attribute__((__always_inline__)) 618249423Sdim_mm_storeh_pi(__m64 *__p, __m128 __a) 619193326Sed{ 620249423Sdim __builtin_ia32_storehps((__v2si *)__p, __a); 621193326Sed} 622193326Sed 623206084Srdivackystatic __inline__ void __attribute__((__always_inline__)) 624249423Sdim_mm_storel_pi(__m64 *__p, __m128 __a) 625193326Sed{ 626249423Sdim __builtin_ia32_storelps((__v2si *)__p, __a); 627193326Sed} 628193326Sed 629206084Srdivackystatic __inline__ void __attribute__((__always_inline__)) 630249423Sdim_mm_store_ss(float *__p, __m128 __a) 631193326Sed{ 632226633Sdim struct __mm_store_ss_struct { 633249423Sdim float __u; 634226633Sdim } __attribute__((__packed__, 
__may_alias__)); 635249423Sdim ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 636193326Sed} 637193326Sed 638206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 639249423Sdim_mm_storeu_ps(float *__p, __m128 __a) 640193326Sed{ 641249423Sdim __builtin_ia32_storeups(__p, __a); 642193326Sed} 643193326Sed 644206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 645249423Sdim_mm_store1_ps(float *__p, __m128 __a) 646193326Sed{ 647249423Sdim __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0); 648249423Sdim _mm_storeu_ps(__p, __a); 649193326Sed} 650193326Sed 651206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 652249423Sdim_mm_store_ps1(float *__p, __m128 __a) 653212904Sdim{ 654249423Sdim return _mm_store1_ps(__p, __a); 655212904Sdim} 656212904Sdim 657212904Sdimstatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 658249423Sdim_mm_store_ps(float *__p, __m128 __a) 659193326Sed{ 660249423Sdim *(__m128 *)__p = __a; 661193326Sed} 662193326Sed 663206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 664249423Sdim_mm_storer_ps(float *__p, __m128 __a) 665193326Sed{ 666249423Sdim __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0); 667249423Sdim _mm_store_ps(__p, __a); 668193326Sed} 669193326Sed 670212904Sdim#define _MM_HINT_T0 3 671193326Sed#define _MM_HINT_T1 2 672212904Sdim#define _MM_HINT_T2 1 673193326Sed#define _MM_HINT_NTA 0 674193326Sed 675210299Sed/* FIXME: We have to #define this because "sel" must be a constant integer, and 676193326Sed Sema doesn't do any form of constant propagation yet. 
*/ 677193326Sed 678234353Sdim#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) 679193326Sed 680206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 681249423Sdim_mm_stream_pi(__m64 *__p, __m64 __a) 682193326Sed{ 683249423Sdim __builtin_ia32_movntq(__p, __a); 684193326Sed} 685193326Sed 686206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 687249423Sdim_mm_stream_ps(float *__p, __m128 __a) 688193326Sed{ 689249423Sdim __builtin_ia32_movntps(__p, __a); 690193326Sed} 691193326Sed 692206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 693193326Sed_mm_sfence(void) 694193326Sed{ 695193326Sed __builtin_ia32_sfence(); 696193326Sed} 697193326Sed 698206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 699249423Sdim_mm_extract_pi16(__m64 __a, int __n) 700193326Sed{ 701249423Sdim __v4hi __b = (__v4hi)__a; 702249423Sdim return (unsigned short)__b[__n & 3]; 703193326Sed} 704193326Sed 705206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 706249423Sdim_mm_insert_pi16(__m64 __a, int __d, int __n) 707193326Sed{ 708249423Sdim __v4hi __b = (__v4hi)__a; 709249423Sdim __b[__n & 3] = __d; 710249423Sdim return (__m64)__b; 711193326Sed} 712193326Sed 713206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 714249423Sdim_mm_max_pi16(__m64 __a, __m64 __b) 715193326Sed{ 716249423Sdim return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 717193326Sed} 718193326Sed 719206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 720249423Sdim_mm_max_pu8(__m64 __a, __m64 __b) 721193326Sed{ 722249423Sdim return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); 723193326Sed} 724193326Sed 725206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 726249423Sdim_mm_min_pi16(__m64 __a, __m64 __b) 727193326Sed{ 
728249423Sdim return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 729193326Sed} 730193326Sed 731206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 732249423Sdim_mm_min_pu8(__m64 __a, __m64 __b) 733193326Sed{ 734249423Sdim return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 735193326Sed} 736193326Sed 737206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 738249423Sdim_mm_movemask_pi8(__m64 __a) 739193326Sed{ 740249423Sdim return __builtin_ia32_pmovmskb((__v8qi)__a); 741193326Sed} 742193326Sed 743206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 744249423Sdim_mm_mulhi_pu16(__m64 __a, __m64 __b) 745193326Sed{ 746249423Sdim return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 747193326Sed} 748193326Sed 749234353Sdim#define _mm_shuffle_pi16(a, n) __extension__ ({ \ 750234353Sdim __m64 __a = (a); \ 751234353Sdim (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); }) 752193326Sed 753206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 754249423Sdim_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 755193326Sed{ 756249423Sdim __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 757193326Sed} 758193326Sed 759206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 760249423Sdim_mm_avg_pu8(__m64 __a, __m64 __b) 761193326Sed{ 762249423Sdim return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 763193326Sed} 764193326Sed 765206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 766249423Sdim_mm_avg_pu16(__m64 __a, __m64 __b) 767193326Sed{ 768249423Sdim return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 769193326Sed} 770193326Sed 771206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 772249423Sdim_mm_sad_pu8(__m64 __a, __m64 __b) 773193326Sed{ 774249423Sdim return 
(__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 775193326Sed} 776193326Sed 777206084Srdivackystatic __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 778193326Sed_mm_getcsr(void) 779193326Sed{ 780193326Sed return __builtin_ia32_stmxcsr(); 781193326Sed} 782193326Sed 783206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__)) 784249423Sdim_mm_setcsr(unsigned int __i) 785193326Sed{ 786249423Sdim __builtin_ia32_ldmxcsr(__i); 787193326Sed} 788193326Sed 789234353Sdim#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ 790234353Sdim __m128 __a = (a); \ 791234353Sdim __m128 __b = (b); \ 792234353Sdim (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \ 793234353Sdim (mask) & 0x3, ((mask) & 0xc) >> 2, \ 794234353Sdim (((mask) & 0x30) >> 4) + 4, \ 795234353Sdim (((mask) & 0xc0) >> 6) + 4); }) 796193326Sed 797206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 798249423Sdim_mm_unpackhi_ps(__m128 __a, __m128 __b) 799193326Sed{ 800249423Sdim return __builtin_shufflevector(__a, __b, 2, 6, 3, 7); 801193326Sed} 802193326Sed 803206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 804249423Sdim_mm_unpacklo_ps(__m128 __a, __m128 __b) 805193326Sed{ 806249423Sdim return __builtin_shufflevector(__a, __b, 0, 4, 1, 5); 807193326Sed} 808193326Sed 809206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 810249423Sdim_mm_move_ss(__m128 __a, __m128 __b) 811193326Sed{ 812249423Sdim return __builtin_shufflevector(__a, __b, 4, 1, 2, 3); 813193326Sed} 814193326Sed 815206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 816249423Sdim_mm_movehl_ps(__m128 __a, __m128 __b) 817193326Sed{ 818249423Sdim return __builtin_shufflevector(__a, __b, 6, 7, 2, 3); 819193326Sed} 820193326Sed 821206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 
822249423Sdim_mm_movelh_ps(__m128 __a, __m128 __b) 823193326Sed{ 824249423Sdim return __builtin_shufflevector(__a, __b, 0, 1, 4, 5); 825193326Sed} 826193326Sed 827206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 828249423Sdim_mm_cvtpi16_ps(__m64 __a) 829193326Sed{ 830249423Sdim __m64 __b, __c; 831249423Sdim __m128 __r; 832193326Sed 833249423Sdim __b = _mm_setzero_si64(); 834249423Sdim __b = _mm_cmpgt_pi16(__b, __a); 835249423Sdim __c = _mm_unpackhi_pi16(__a, __b); 836249423Sdim __r = _mm_setzero_ps(); 837249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 838249423Sdim __r = _mm_movelh_ps(__r, __r); 839249423Sdim __c = _mm_unpacklo_pi16(__a, __b); 840249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 841193326Sed 842249423Sdim return __r; 843193326Sed} 844193326Sed 845206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 846249423Sdim_mm_cvtpu16_ps(__m64 __a) 847193326Sed{ 848249423Sdim __m64 __b, __c; 849249423Sdim __m128 __r; 850193326Sed 851249423Sdim __b = _mm_setzero_si64(); 852249423Sdim __c = _mm_unpackhi_pi16(__a, __b); 853249423Sdim __r = _mm_setzero_ps(); 854249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 855249423Sdim __r = _mm_movelh_ps(__r, __r); 856249423Sdim __c = _mm_unpacklo_pi16(__a, __b); 857249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 858193326Sed 859249423Sdim return __r; 860193326Sed} 861193326Sed 862206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 863249423Sdim_mm_cvtpi8_ps(__m64 __a) 864193326Sed{ 865249423Sdim __m64 __b; 866193326Sed 867249423Sdim __b = _mm_setzero_si64(); 868249423Sdim __b = _mm_cmpgt_pi8(__b, __a); 869249423Sdim __b = _mm_unpacklo_pi8(__a, __b); 870193326Sed 871249423Sdim return _mm_cvtpi16_ps(__b); 872193326Sed} 873193326Sed 874206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 875249423Sdim_mm_cvtpu8_ps(__m64 __a) 876193326Sed{ 877249423Sdim __m64 __b; 878193326Sed 879249423Sdim __b = 
_mm_setzero_si64(); 880249423Sdim __b = _mm_unpacklo_pi8(__a, __b); 881193326Sed 882249423Sdim return _mm_cvtpi16_ps(__b); 883193326Sed} 884193326Sed 885206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 886249423Sdim_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 887193326Sed{ 888249423Sdim __m128 __c; 889193326Sed 890249423Sdim __c = _mm_setzero_ps(); 891249423Sdim __c = _mm_cvtpi32_ps(__c, __b); 892249423Sdim __c = _mm_movelh_ps(__c, __c); 893193326Sed 894249423Sdim return _mm_cvtpi32_ps(__c, __a); 895193326Sed} 896193326Sed 897206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 898249423Sdim_mm_cvtps_pi16(__m128 __a) 899193326Sed{ 900249423Sdim __m64 __b, __c; 901193326Sed 902249423Sdim __b = _mm_cvtps_pi32(__a); 903249423Sdim __a = _mm_movehl_ps(__a, __a); 904249423Sdim __c = _mm_cvtps_pi32(__a); 905193326Sed 906266674Sdim return _mm_packs_pi32(__b, __c); 907193326Sed} 908193326Sed 909206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 910249423Sdim_mm_cvtps_pi8(__m128 __a) 911193326Sed{ 912249423Sdim __m64 __b, __c; 913193326Sed 914249423Sdim __b = _mm_cvtps_pi16(__a); 915249423Sdim __c = _mm_setzero_si64(); 916193326Sed 917249423Sdim return _mm_packs_pi16(__b, __c); 918193326Sed} 919193326Sed 920206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 921249423Sdim_mm_movemask_ps(__m128 __a) 922193326Sed{ 923249423Sdim return __builtin_ia32_movmskps(__a); 924193326Sed} 925193326Sed 926193326Sed#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 927193326Sed 928193326Sed#define _MM_EXCEPT_INVALID (0x0001) 929193326Sed#define _MM_EXCEPT_DENORM (0x0002) 930193326Sed#define _MM_EXCEPT_DIV_ZERO (0x0004) 931193326Sed#define _MM_EXCEPT_OVERFLOW (0x0008) 932193326Sed#define _MM_EXCEPT_UNDERFLOW (0x0010) 933193326Sed#define _MM_EXCEPT_INEXACT (0x0020) 934193326Sed#define _MM_EXCEPT_MASK (0x003f) 935193326Sed 
936193326Sed#define _MM_MASK_INVALID (0x0080) 937193326Sed#define _MM_MASK_DENORM (0x0100) 938193326Sed#define _MM_MASK_DIV_ZERO (0x0200) 939193326Sed#define _MM_MASK_OVERFLOW (0x0400) 940193326Sed#define _MM_MASK_UNDERFLOW (0x0800) 941193326Sed#define _MM_MASK_INEXACT (0x1000) 942193326Sed#define _MM_MASK_MASK (0x1f80) 943193326Sed 944193326Sed#define _MM_ROUND_NEAREST (0x0000) 945193326Sed#define _MM_ROUND_DOWN (0x2000) 946193326Sed#define _MM_ROUND_UP (0x4000) 947193326Sed#define _MM_ROUND_TOWARD_ZERO (0x6000) 948193326Sed#define _MM_ROUND_MASK (0x6000) 949193326Sed 950193326Sed#define _MM_FLUSH_ZERO_MASK (0x8000) 951193326Sed#define _MM_FLUSH_ZERO_ON (0x8000) 952234353Sdim#define _MM_FLUSH_ZERO_OFF (0x0000) 953193326Sed 954193326Sed#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 955193326Sed#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 956193326Sed#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 957193326Sed#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 958193326Sed 959193326Sed#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 960193326Sed#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 961193326Sed#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 962193326Sed#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 963193326Sed 964193326Sed#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 965193326Seddo { \ 966193326Sed __m128 tmp3, tmp2, tmp1, tmp0; \ 967193326Sed tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 968193326Sed tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 969193326Sed tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 970193326Sed tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 971193326Sed (row0) = _mm_movelh_ps(tmp0, tmp2); \ 972193326Sed (row1) = _mm_movehl_ps(tmp2, tmp0); \ 973193326Sed (row2) = _mm_movelh_ps(tmp1, tmp3); \ 974203955Srdivacky 
(row3) = _mm_movehl_ps(tmp3, tmp1); \ 975193326Sed} while (0) 976193326Sed 977212904Sdim/* Aliases for compatibility. */ 978212904Sdim#define _m_pextrw _mm_extract_pi16 979212904Sdim#define _m_pinsrw _mm_insert_pi16 980212904Sdim#define _m_pmaxsw _mm_max_pi16 981212904Sdim#define _m_pmaxub _mm_max_pu8 982212904Sdim#define _m_pminsw _mm_min_pi16 983212904Sdim#define _m_pminub _mm_min_pu8 984212904Sdim#define _m_pmovmskb _mm_movemask_pi8 985212904Sdim#define _m_pmulhuw _mm_mulhi_pu16 986212904Sdim#define _m_pshufw _mm_shuffle_pi16 987212904Sdim#define _m_maskmovq _mm_maskmove_si64 988212904Sdim#define _m_pavgb _mm_avg_pu8 989212904Sdim#define _m_pavgw _mm_avg_pu16 990212904Sdim#define _m_psadbw _mm_sad_pu8 991212904Sdim#define _m_ _mm_ 992212904Sdim#define _m_ _mm_ 993212904Sdim 994194179Sed/* Ugly hack for backwards-compatibility (compatible with gcc) */ 995194179Sed#ifdef __SSE2__ 996193326Sed#include <emmintrin.h> 997194179Sed#endif 998193326Sed 999193326Sed#endif /* __SSE__ */ 1000193326Sed 1001193326Sed#endif /* __XMMINTRIN_H */ 1002