/* xmmintrin.h revision 194179 */
167754Smsmith/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 267754Smsmith * 367754Smsmith * Permission is hereby granted, free of charge, to any person obtaining a copy 4123315Snjl * of this software and associated documentation files (the "Software"), to deal 567754Smsmith * in the Software without restriction, including without limitation the rights 667754Smsmith * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 767754Smsmith * copies of the Software, and to permit persons to whom the Software is 867754Smsmith * furnished to do so, subject to the following conditions: 967754Smsmith * 1067754Smsmith * The above copyright notice and this permission notice shall be included in 1167754Smsmith * all copies or substantial portions of the Software. 12114237Snjl * 1370243Smsmith * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1467754Smsmith * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1567754Smsmith * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1667754Smsmith * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1767754Smsmith * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1867754Smsmith * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 1967754Smsmith * THE SOFTWARE. 
2067754Smsmith * 2167754Smsmith *===-----------------------------------------------------------------------=== 2267754Smsmith */ 2367754Smsmith 2467754Smsmith#ifndef __XMMINTRIN_H 2567754Smsmith#define __XMMINTRIN_H 2667754Smsmith 2767754Smsmith#ifndef __SSE__ 2867754Smsmith#error "SSE instruction set not enabled" 2967754Smsmith#else 3067754Smsmith 3167754Smsmith#include <mmintrin.h> 3267754Smsmith 3367754Smsmithtypedef float __v4sf __attribute__((__vector_size__(16))); 3467754Smsmithtypedef float __m128 __attribute__((__vector_size__(16))); 3567754Smsmith 3667754Smsmith#include <mm_malloc.h> 3767754Smsmith 3867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 3967754Smsmith_mm_add_ss(__m128 a, __m128 b) 4067754Smsmith{ 4167754Smsmith a[0] += b[0]; 4267754Smsmith return a; 4367754Smsmith} 4467754Smsmith 4567754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 4667754Smsmith_mm_add_ps(__m128 a, __m128 b) 4767754Smsmith{ 4867754Smsmith return a + b; 4967754Smsmith} 5067754Smsmith 5167754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 5267754Smsmith_mm_sub_ss(__m128 a, __m128 b) 5367754Smsmith{ 5467754Smsmith a[0] -= b[0]; 5567754Smsmith return a; 5667754Smsmith} 5767754Smsmith 5867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 5967754Smsmith_mm_sub_ps(__m128 a, __m128 b) 6067754Smsmith{ 6167754Smsmith return a - b; 6267754Smsmith} 6367754Smsmith 6467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 6567754Smsmith_mm_mul_ss(__m128 a, __m128 b) 6667754Smsmith{ 6767754Smsmith a[0] *= b[0]; 6867754Smsmith return a; 6967754Smsmith} 7067754Smsmith 7167754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 7267754Smsmith_mm_mul_ps(__m128 a, __m128 b) 7367754Smsmith{ 7467754Smsmith return a * b; 7567754Smsmith} 7667754Smsmith 7767754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 
7867754Smsmith_mm_div_ss(__m128 a, __m128 b) 7967754Smsmith{ 8067754Smsmith a[0] /= b[0]; 8167754Smsmith return a; 8267754Smsmith} 8367754Smsmith 8467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 8567754Smsmith_mm_div_ps(__m128 a, __m128 b) 8667754Smsmith{ 8767754Smsmith return a / b; 8867754Smsmith} 8967754Smsmith 9067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 9167754Smsmith_mm_sqrt_ss(__m128 a) 9267754Smsmith{ 9367754Smsmith return __builtin_ia32_sqrtss(a); 9467754Smsmith} 9567754Smsmith 9667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 9767754Smsmith_mm_sqrt_ps(__m128 a) 9867754Smsmith{ 9967754Smsmith return __builtin_ia32_sqrtps(a); 10067754Smsmith} 10167754Smsmith 10267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 10367754Smsmith_mm_rcp_ss(__m128 a) 10467754Smsmith{ 10567754Smsmith return __builtin_ia32_rcpss(a); 10667754Smsmith} 10767754Smsmith 10867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 10967754Smsmith_mm_rcp_ps(__m128 a) 11067754Smsmith{ 11167754Smsmith return __builtin_ia32_rcpps(a); 11267754Smsmith} 11367754Smsmith 11467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 11567754Smsmith_mm_rsqrt_ss(__m128 a) 11667754Smsmith{ 11767754Smsmith return __builtin_ia32_rsqrtss(a); 11867754Smsmith} 11967754Smsmith 12067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 12167754Smsmith_mm_rsqrt_ps(__m128 a) 12267754Smsmith{ 12377424Smsmith return __builtin_ia32_rsqrtps(a); 12491116Smsmith} 12567754Smsmith 12667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 12767754Smsmith_mm_min_ss(__m128 a, __m128 b) 12867754Smsmith{ 12967754Smsmith return __builtin_ia32_minss(a, b); 13067754Smsmith} 13167754Smsmith 132107325Siwasakistatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 
13367754Smsmith_mm_min_ps(__m128 a, __m128 b) 13477424Smsmith{ 13567754Smsmith return __builtin_ia32_minps(a, b); 13667754Smsmith} 13767754Smsmith 138114237Snjlstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 139114237Snjl_mm_max_ss(__m128 a, __m128 b) 140107325Siwasaki{ 14167754Smsmith return __builtin_ia32_maxss(a, b); 14267754Smsmith} 14367754Smsmith 14467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 14567754Smsmith_mm_max_ps(__m128 a, __m128 b) 14667754Smsmith{ 14767754Smsmith return __builtin_ia32_maxps(a, b); 14867754Smsmith} 14967754Smsmith 15067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 15167754Smsmith_mm_and_ps(__m128 a, __m128 b) 15267754Smsmith{ 15367754Smsmith typedef int __v4si __attribute__((__vector_size__(16))); 15467754Smsmith return (__m128)((__v4si)a & (__v4si)b); 15567754Smsmith} 15667754Smsmith 15767754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 15891116Smsmith_mm_andnot_ps(__m128 a, __m128 b) 15967754Smsmith{ 16067754Smsmith typedef int __v4si __attribute__((__vector_size__(16))); 16167754Smsmith return (__m128)(~(__v4si)a & (__v4si)b); 16267754Smsmith} 16367754Smsmith 16491116Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 16567754Smsmith_mm_or_ps(__m128 a, __m128 b) 16671867Smsmith{ 167102550Siwasaki typedef int __v4si __attribute__((__vector_size__(16))); 16882367Smsmith return (__m128)((__v4si)a | (__v4si)b); 16967754Smsmith} 170114237Snjl 17177424Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 17291116Smsmith_mm_xor_ps(__m128 a, __m128 b) 17371867Smsmith{ 17471867Smsmith typedef int __v4si __attribute__((__vector_size__(16))); 175123315Snjl return (__m128)((__v4si)a ^ ~(__v4si)b); 17691116Smsmith} 17771867Smsmith 17880062Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 17971867Smsmith_mm_cmpeq_ss(__m128 a, __m128 b) 18067754Smsmith{ 
18171867Smsmith return (__m128)__builtin_ia32_cmpss(a, b, 0); 18267754Smsmith} 18367754Smsmith 184114237Snjlstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 185107325Siwasaki_mm_cmpeq_ps(__m128 a, __m128 b) 18667754Smsmith{ 18767754Smsmith return (__m128)__builtin_ia32_cmpps(a, b, 0); 18867754Smsmith} 18967754Smsmith 19067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 19167754Smsmith_mm_cmplt_ss(__m128 a, __m128 b) 19299146Siwasaki{ 19367754Smsmith return (__m128)__builtin_ia32_cmpss(a, b, 1); 19467754Smsmith} 195107325Siwasaki 19667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 19783174Smsmith_mm_cmplt_ps(__m128 a, __m128 b) 198123315Snjl{ 199117521Snjl return (__m128)__builtin_ia32_cmpps(a, b, 1); 200123315Snjl} 20167754Smsmith 20267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 20367754Smsmith_mm_cmple_ss(__m128 a, __m128 b) 20467754Smsmith{ 20567754Smsmith return (__m128)__builtin_ia32_cmpss(a, b, 2); 20667754Smsmith} 20767754Smsmith 20867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 20967754Smsmith_mm_cmple_ps(__m128 a, __m128 b) 21067754Smsmith{ 21167754Smsmith return (__m128)__builtin_ia32_cmpps(a, b, 2); 21267754Smsmith} 21367754Smsmith 21467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 21567754Smsmith_mm_cmpgt_ss(__m128 a, __m128 b) 21667754Smsmith{ 21767754Smsmith return (__m128)__builtin_ia32_cmpss(b, a, 1); 21867754Smsmith} 21967754Smsmith 22067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 22167754Smsmith_mm_cmpgt_ps(__m128 a, __m128 b) 222107325Siwasaki{ 22367754Smsmith return (__m128)__builtin_ia32_cmpps(b, a, 1); 224117521Snjl} 225123315Snjl 226117521Snjlstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 227123315Snjl_mm_cmpge_ss(__m128 a, __m128 b) 22867754Smsmith{ 22967754Smsmith return (__m128)__builtin_ia32_cmpss(b, 
a, 2); 23067754Smsmith} 23167754Smsmith 23267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 23367754Smsmith_mm_cmpge_ps(__m128 a, __m128 b) 23467754Smsmith{ 23567754Smsmith return (__m128)__builtin_ia32_cmpps(b, a, 2); 23667754Smsmith} 23767754Smsmith 238107325Siwasakistatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 23967754Smsmith_mm_cmpneq_ss(__m128 a, __m128 b) 24077424Smsmith{ 24167754Smsmith return (__m128)__builtin_ia32_cmpss(a, b, 4); 24267754Smsmith} 24367754Smsmith 24467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 245107325Siwasaki_mm_cmpneq_ps(__m128 a, __m128 b) 24667754Smsmith{ 24767754Smsmith return (__m128)__builtin_ia32_cmpps(a, b, 4); 24867754Smsmith} 24967754Smsmith 25067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 25167754Smsmith_mm_cmpnlt_ss(__m128 a, __m128 b) 25267754Smsmith{ 25367754Smsmith return (__m128)__builtin_ia32_cmpss(a, b, 5); 25467754Smsmith} 25567754Smsmith 25667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 25767754Smsmith_mm_cmpnlt_ps(__m128 a, __m128 b) 25869450Smsmith{ 25967754Smsmith return (__m128)__builtin_ia32_cmpps(a, b, 5); 26067754Smsmith} 26167754Smsmith 26291116Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 26367754Smsmith_mm_cmpnle_ss(__m128 a, __m128 b) 26467754Smsmith{ 26567754Smsmith return (__m128)__builtin_ia32_cmpss(a, b, 6); 26667754Smsmith} 26767754Smsmith 26867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 26991116Smsmith_mm_cmpnle_ps(__m128 a, __m128 b) 27067754Smsmith{ 27167754Smsmith return (__m128)__builtin_ia32_cmpps(a, b, 6); 27291116Smsmith} 27367754Smsmith 27467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 275114237Snjl_mm_cmpngt_ss(__m128 a, __m128 b) 276107325Siwasaki{ 27767754Smsmith return (__m128)__builtin_ia32_cmpss(b, a, 5); 278107325Siwasaki} 
27967754Smsmith 280107325Siwasakistatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 281107325Siwasaki_mm_cmpngt_ps(__m128 a, __m128 b) 282107325Siwasaki{ 283107325Siwasaki return (__m128)__builtin_ia32_cmpps(b, a, 5); 28467754Smsmith} 285107325Siwasaki 286107325Siwasakistatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 287107325Siwasaki_mm_cmpnge_ss(__m128 a, __m128 b) 288107325Siwasaki{ 289107325Siwasaki return (__m128)__builtin_ia32_cmpss(b, a, 6); 29067754Smsmith} 29167754Smsmith 29267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 29367754Smsmith_mm_cmpnge_ps(__m128 a, __m128 b) 29467754Smsmith{ 29587031Smsmith return (__m128)__builtin_ia32_cmpps(b, a, 6); 29667754Smsmith} 29767754Smsmith 29867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 29967754Smsmith_mm_cmpord_ss(__m128 a, __m128 b) 30067754Smsmith{ 30167754Smsmith return (__m128)__builtin_ia32_cmpss(a, b, 7); 30267754Smsmith} 30391116Smsmith 30487031Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 30591116Smsmith_mm_cmpord_ps(__m128 a, __m128 b) 30687031Smsmith{ 30787031Smsmith return (__m128)__builtin_ia32_cmpps(a, b, 7); 30867754Smsmith} 30967754Smsmith 31067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 31167754Smsmith_mm_cmpunord_ss(__m128 a, __m128 b) 31267754Smsmith{ 31367754Smsmith return (__m128)__builtin_ia32_cmpss(a, b, 3); 31467754Smsmith} 31567754Smsmith 31667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 31767754Smsmith_mm_cmpunord_ps(__m128 a, __m128 b) 31867754Smsmith{ 31991116Smsmith return (__m128)__builtin_ia32_cmpps(a, b, 3); 32067754Smsmith} 32167754Smsmith 32267754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 32367754Smsmith_mm_comieq_ss(__m128 a, __m128 b) 32467754Smsmith{ 32567754Smsmith return __builtin_ia32_comieq(a, b); 32667754Smsmith} 32767754Smsmith 
32867754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 32967754Smsmith_mm_comilt_ss(__m128 a, __m128 b) 33067754Smsmith{ 33167754Smsmith return __builtin_ia32_comilt(a, b); 33267754Smsmith} 33367754Smsmith 334107325Siwasakistatic inline int __attribute__((__always_inline__, __nodebug__)) 335107325Siwasaki_mm_comile_ss(__m128 a, __m128 b) 33667754Smsmith{ 33767754Smsmith return __builtin_ia32_comile(a, b); 33867754Smsmith} 33977424Smsmith 34067754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 34167754Smsmith_mm_comigt_ss(__m128 a, __m128 b) 34267754Smsmith{ 343107325Siwasaki return __builtin_ia32_comigt(a, b); 34467754Smsmith} 34567754Smsmith 34667754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 34767754Smsmith_mm_comige_ss(__m128 a, __m128 b) 348107325Siwasaki{ 34967754Smsmith return __builtin_ia32_comige(a, b); 35067754Smsmith} 35167754Smsmith 35267754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 35367754Smsmith_mm_comineq_ss(__m128 a, __m128 b) 35467754Smsmith{ 35567754Smsmith return __builtin_ia32_comineq(a, b); 35667754Smsmith} 35767754Smsmith 35891116Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 35991116Smsmith_mm_ucomieq_ss(__m128 a, __m128 b) 36067754Smsmith{ 36167754Smsmith return __builtin_ia32_ucomieq(a, b); 36267754Smsmith} 36367754Smsmith 36467754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 36567754Smsmith_mm_ucomilt_ss(__m128 a, __m128 b) 36667754Smsmith{ 36791116Smsmith return __builtin_ia32_ucomilt(a, b); 36867754Smsmith} 36967754Smsmith 37067754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 37167754Smsmith_mm_ucomile_ss(__m128 a, __m128 b) 37267754Smsmith{ 37367754Smsmith return __builtin_ia32_ucomile(a, b); 374102550Siwasaki} 37567754Smsmith 37667754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 377102550Siwasaki_mm_ucomigt_ss(__m128 a, 
__m128 b) 37867754Smsmith{ 37967754Smsmith return __builtin_ia32_ucomigt(a, b); 38067754Smsmith} 38167754Smsmith 38267754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 38377424Smsmith_mm_ucomige_ss(__m128 a, __m128 b) 38467754Smsmith{ 385114237Snjl return __builtin_ia32_ucomige(a, b); 38667754Smsmith} 38767754Smsmith 38867754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 38967754Smsmith_mm_ucomineq_ss(__m128 a, __m128 b) 390107325Siwasaki{ 39167754Smsmith return __builtin_ia32_ucomineq(a, b); 39291116Smsmith} 39377424Smsmith 39467754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 39567754Smsmith_mm_cvtss_si32(__m128 a) 39667754Smsmith{ 39777424Smsmith return __builtin_ia32_cvtss2si(a); 39877424Smsmith} 39971867Smsmith 40071867Smsmith#ifdef __x86_64__ 40191116Smsmith 40271867Smsmithstatic inline long long __attribute__((__always_inline__, __nodebug__)) 40387031Smsmith_mm_cvtss_si64(__m128 a) 40471867Smsmith{ 40571867Smsmith return __builtin_ia32_cvtss2si64(a); 40671867Smsmith} 40767754Smsmith 40867754Smsmith#endif 40967754Smsmith 41067754Smsmithstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 41167754Smsmith_mm_cvtps_pi32(__m128 a) 41267754Smsmith{ 41367754Smsmith return (__m64)__builtin_ia32_cvtps2pi(a); 41487031Smsmith} 41567754Smsmith 41667754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__)) 41767754Smsmith_mm_cvttss_si32(__m128 a) 41867754Smsmith{ 41967754Smsmith return a[0]; 42067754Smsmith} 42167754Smsmith 42291116Smsmithstatic inline long long __attribute__((__always_inline__, __nodebug__)) 42391116Smsmith_mm_cvttss_si64(__m128 a) 42467754Smsmith{ 42567754Smsmith return a[0]; 426107325Siwasaki} 42767754Smsmith 42867754Smsmithstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 42967754Smsmith_mm_cvttps_pi32(__m128 a) 43067754Smsmith{ 43167754Smsmith return (__m64)__builtin_ia32_cvttps2pi(a); 43267754Smsmith} 43367754Smsmith 
43467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 43567754Smsmith_mm_cvtsi32_ss(__m128 a, int b) 43667754Smsmith{ 43767754Smsmith a[0] = b; 43867754Smsmith return a; 43967754Smsmith} 44091116Smsmith 44167754Smsmith#ifdef __x86_64__ 44282367Smsmith 44387031Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 44467754Smsmith_mm_cvtsi64_ss(__m128 a, long long b) 44567754Smsmith{ 44667754Smsmith a[0] = b; 44767754Smsmith return a; 44867754Smsmith} 44967754Smsmith 45067754Smsmith#endif 45167754Smsmith 45267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 45367754Smsmith_mm_cvtpi32_ps(__m128 a, __m64 b) 45467754Smsmith{ 45567754Smsmith return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 45667754Smsmith} 45767754Smsmith 45867754Smsmithstatic inline float __attribute__((__always_inline__, __nodebug__)) 45967754Smsmith_mm_cvtss_f32(__m128 a) 46067754Smsmith{ 46167754Smsmith return a[0]; 46267754Smsmith} 46367754Smsmith 464static inline __m128 __attribute__((__always_inline__, __nodebug__)) 465_mm_loadh_pi(__m128 a, __m64 const *p) 466{ 467 __m128 b; 468 b[0] = *(float*)p; 469 b[1] = *((float*)p+1); 470 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 471} 472 473static inline __m128 __attribute__((__always_inline__, __nodebug__)) 474_mm_loadl_pi(__m128 a, __m64 const *p) 475{ 476 __m128 b; 477 b[0] = *(float*)p; 478 b[1] = *((float*)p+1); 479 return __builtin_shufflevector(a, b, 4, 5, 2, 3); 480} 481 482static inline __m128 __attribute__((__always_inline__, __nodebug__)) 483_mm_load_ss(float *p) 484{ 485 return (__m128){ *p, 0, 0, 0 }; 486} 487 488static inline __m128 __attribute__((__always_inline__, __nodebug__)) 489_mm_load1_ps(float *p) 490{ 491 return (__m128){ *p, *p, *p, *p }; 492} 493 494#define _mm_load_ps1(p) _mm_load1_ps(p) 495 496static inline __m128 __attribute__((__always_inline__, __nodebug__)) 497_mm_load_ps(float *p) 498{ 499 return *(__m128*)p; 500} 501 502static inline 
__m128 __attribute__((__always_inline__, __nodebug__)) 503_mm_loadu_ps(float *p) 504{ 505 return __builtin_ia32_loadups(p); 506} 507 508static inline __m128 __attribute__((__always_inline__, __nodebug__)) 509_mm_loadr_ps(float *p) 510{ 511 __m128 a = _mm_load_ps(p); 512 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 513} 514 515static inline __m128 __attribute__((__always_inline__, __nodebug__)) 516_mm_set_ss(float w) 517{ 518 return (__m128){ w, 0, 0, 0 }; 519} 520 521static inline __m128 __attribute__((__always_inline__, __nodebug__)) 522_mm_set1_ps(float w) 523{ 524 return (__m128){ w, w, w, w }; 525} 526 527// Microsoft specific. 528static inline __m128 __attribute__((__always_inline__, __nodebug__)) 529_mm_set_ps1(float w) 530{ 531 return _mm_set1_ps(w); 532} 533 534static inline __m128 __attribute__((__always_inline__, __nodebug__)) 535_mm_set_ps(float z, float y, float x, float w) 536{ 537 return (__m128){ w, x, y, z }; 538} 539 540static inline __m128 __attribute__((__always_inline__, __nodebug__)) 541_mm_setr_ps(float z, float y, float x, float w) 542{ 543 return (__m128){ z, y, x, w }; 544} 545 546static inline __m128 __attribute__((__always_inline__)) 547_mm_setzero_ps(void) 548{ 549 return (__m128){ 0, 0, 0, 0 }; 550} 551 552static inline void __attribute__((__always_inline__)) 553_mm_storeh_pi(__m64 *p, __m128 a) 554{ 555 __builtin_ia32_storehps((__v2si *)p, a); 556} 557 558static inline void __attribute__((__always_inline__)) 559_mm_storel_pi(__m64 *p, __m128 a) 560{ 561 __builtin_ia32_storelps((__v2si *)p, a); 562} 563 564static inline void __attribute__((__always_inline__)) 565_mm_store_ss(float *p, __m128 a) 566{ 567 *p = a[0]; 568} 569 570static inline void __attribute__((__always_inline__, __nodebug__)) 571_mm_storeu_ps(float *p, __m128 a) 572{ 573 __builtin_ia32_storeups(p, a); 574} 575 576static inline void __attribute__((__always_inline__, __nodebug__)) 577_mm_store1_ps(float *p, __m128 a) 578{ 579 a = __builtin_shufflevector(a, a, 0, 0, 0, 
0); 580 _mm_storeu_ps(p, a); 581} 582 583static inline void __attribute__((__always_inline__, __nodebug__)) 584_mm_store_ps(float *p, __m128 a) 585{ 586 *(__m128 *)p = a; 587} 588 589static inline void __attribute__((__always_inline__, __nodebug__)) 590_mm_storer_ps(float *p, __m128 a) 591{ 592 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 593 _mm_store_ps(p, a); 594} 595 596#define _MM_HINT_T0 1 597#define _MM_HINT_T1 2 598#define _MM_HINT_T2 3 599#define _MM_HINT_NTA 0 600 601/* FIXME: We have to #define this because "sel" must be a constant integer, and 602 Sema doesn't do any form of constant propagation yet. */ 603 604#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) 605 606static inline void __attribute__((__always_inline__, __nodebug__)) 607_mm_stream_pi(__m64 *p, __m64 a) 608{ 609 __builtin_ia32_movntq(p, a); 610} 611 612static inline void __attribute__((__always_inline__, __nodebug__)) 613_mm_stream_ps(float *p, __m128 a) 614{ 615 __builtin_ia32_movntps(p, a); 616} 617 618static inline void __attribute__((__always_inline__, __nodebug__)) 619_mm_sfence(void) 620{ 621 __builtin_ia32_sfence(); 622} 623 624static inline int __attribute__((__always_inline__, __nodebug__)) 625_mm_extract_pi16(__m64 a, int n) 626{ 627 __v4hi b = (__v4hi)a; 628 return (unsigned short)b[n & 3]; 629} 630 631static inline __m64 __attribute__((__always_inline__, __nodebug__)) 632_mm_insert_pi16(__m64 a, int d, int n) 633{ 634 __v4hi b = (__v4hi)a; 635 b[n & 3] = d; 636 return (__m64)b; 637} 638 639static inline __m64 __attribute__((__always_inline__, __nodebug__)) 640_mm_max_pi16(__m64 a, __m64 b) 641{ 642 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 643} 644 645static inline __m64 __attribute__((__always_inline__, __nodebug__)) 646_mm_max_pu8(__m64 a, __m64 b) 647{ 648 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 649} 650 651static inline __m64 __attribute__((__always_inline__, __nodebug__)) 652_mm_min_pi16(__m64 a, __m64 b) 653{ 654 
return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 655} 656 657static inline __m64 __attribute__((__always_inline__, __nodebug__)) 658_mm_min_pu8(__m64 a, __m64 b) 659{ 660 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 661} 662 663static inline int __attribute__((__always_inline__, __nodebug__)) 664_mm_movemask_pi8(__m64 a) 665{ 666 return __builtin_ia32_pmovmskb((__v8qi)a); 667} 668 669static inline __m64 __attribute__((__always_inline__, __nodebug__)) 670_mm_mulhi_pu16(__m64 a, __m64 b) 671{ 672 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 673} 674 675#define _mm_shuffle_pi16(a, n) \ 676 ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \ 677 (n) & 0x3, ((n) & 0xc) >> 2, \ 678 ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6)) 679 680static inline void __attribute__((__always_inline__, __nodebug__)) 681_mm_maskmove_si64(__m64 d, __m64 n, char *p) 682{ 683 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 684} 685 686static inline __m64 __attribute__((__always_inline__, __nodebug__)) 687_mm_avg_pu8(__m64 a, __m64 b) 688{ 689 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 690} 691 692static inline __m64 __attribute__((__always_inline__, __nodebug__)) 693_mm_avg_pu16(__m64 a, __m64 b) 694{ 695 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 696} 697 698static inline __m64 __attribute__((__always_inline__, __nodebug__)) 699_mm_sad_pu8(__m64 a, __m64 b) 700{ 701 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 702} 703 704static inline unsigned int __attribute__((__always_inline__, __nodebug__)) 705_mm_getcsr(void) 706{ 707 return __builtin_ia32_stmxcsr(); 708} 709 710static inline void __attribute__((__always_inline__, __nodebug__)) 711_mm_setcsr(unsigned int i) 712{ 713 __builtin_ia32_ldmxcsr(i); 714} 715 716#define _mm_shuffle_ps(a, b, mask) \ 717 (__builtin_shufflevector(a, b, (mask) & 0x3, ((mask) & 0xc) >> 2, \ 718 (((mask) & 0x30) >> 4) + 4, \ 719 (((mask) & 0xc0) >> 6) + 4)) 720 721static inline 
__m128 __attribute__((__always_inline__, __nodebug__)) 722_mm_unpackhi_ps(__m128 a, __m128 b) 723{ 724 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 725} 726 727static inline __m128 __attribute__((__always_inline__, __nodebug__)) 728_mm_unpacklo_ps(__m128 a, __m128 b) 729{ 730 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 731} 732 733static inline __m128 __attribute__((__always_inline__, __nodebug__)) 734_mm_move_ss(__m128 a, __m128 b) 735{ 736 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 737} 738 739static inline __m128 __attribute__((__always_inline__, __nodebug__)) 740_mm_movehl_ps(__m128 a, __m128 b) 741{ 742 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 743} 744 745static inline __m128 __attribute__((__always_inline__, __nodebug__)) 746_mm_movelh_ps(__m128 a, __m128 b) 747{ 748 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 749} 750 751static inline __m128 __attribute__((__always_inline__, __nodebug__)) 752_mm_cvtpi16_ps(__m64 a) 753{ 754 __m64 b, c; 755 __m128 r; 756 757 b = _mm_setzero_si64(); 758 b = _mm_cmpgt_pi16(b, a); 759 c = _mm_unpackhi_pi16(a, b); 760 r = _mm_setzero_ps(); 761 r = _mm_cvtpi32_ps(r, c); 762 r = _mm_movelh_ps(r, r); 763 c = _mm_unpacklo_pi16(a, b); 764 r = _mm_cvtpi32_ps(r, c); 765 766 return r; 767} 768 769static inline __m128 __attribute__((__always_inline__, __nodebug__)) 770_mm_cvtpu16_ps(__m64 a) 771{ 772 __m64 b, c; 773 __m128 r; 774 775 b = _mm_setzero_si64(); 776 c = _mm_unpackhi_pi16(a, b); 777 r = _mm_setzero_ps(); 778 r = _mm_cvtpi32_ps(r, c); 779 r = _mm_movelh_ps(r, r); 780 c = _mm_unpacklo_pi16(a, b); 781 r = _mm_cvtpi32_ps(r, c); 782 783 return r; 784} 785 786static inline __m128 __attribute__((__always_inline__, __nodebug__)) 787_mm_cvtpi8_ps(__m64 a) 788{ 789 __m64 b; 790 791 b = _mm_setzero_si64(); 792 b = _mm_cmpgt_pi8(b, a); 793 b = _mm_unpacklo_pi8(a, b); 794 795 return _mm_cvtpi16_ps(b); 796} 797 798static inline __m128 __attribute__((__always_inline__, __nodebug__)) 799_mm_cvtpu8_ps(__m64 a) 
800{ 801 __m64 b; 802 803 b = _mm_setzero_si64(); 804 b = _mm_unpacklo_pi8(a, b); 805 806 return _mm_cvtpi16_ps(b); 807} 808 809static inline __m128 __attribute__((__always_inline__, __nodebug__)) 810_mm_cvtpi32x2_ps(__m64 a, __m64 b) 811{ 812 __m128 c; 813 814 c = _mm_setzero_ps(); 815 c = _mm_cvtpi32_ps(c, b); 816 c = _mm_movelh_ps(c, c); 817 818 return _mm_cvtpi32_ps(c, a); 819} 820 821static inline __m64 __attribute__((__always_inline__, __nodebug__)) 822_mm_cvtps_pi16(__m128 a) 823{ 824 __m64 b, c; 825 826 b = _mm_cvtps_pi32(a); 827 a = _mm_movehl_ps(a, a); 828 c = _mm_cvtps_pi32(a); 829 830 return _mm_packs_pi16(b, c); 831} 832 833static inline __m64 __attribute__((__always_inline__, __nodebug__)) 834_mm_cvtps_pi8(__m128 a) 835{ 836 __m64 b, c; 837 838 b = _mm_cvtps_pi16(a); 839 c = _mm_setzero_si64(); 840 841 return _mm_packs_pi16(b, c); 842} 843 844static inline int __attribute__((__always_inline__, __nodebug__)) 845_mm_movemask_ps(__m128 a) 846{ 847 return __builtin_ia32_movmskps(a); 848} 849 850#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 851 852#define _MM_EXCEPT_INVALID (0x0001) 853#define _MM_EXCEPT_DENORM (0x0002) 854#define _MM_EXCEPT_DIV_ZERO (0x0004) 855#define _MM_EXCEPT_OVERFLOW (0x0008) 856#define _MM_EXCEPT_UNDERFLOW (0x0010) 857#define _MM_EXCEPT_INEXACT (0x0020) 858#define _MM_EXCEPT_MASK (0x003f) 859 860#define _MM_MASK_INVALID (0x0080) 861#define _MM_MASK_DENORM (0x0100) 862#define _MM_MASK_DIV_ZERO (0x0200) 863#define _MM_MASK_OVERFLOW (0x0400) 864#define _MM_MASK_UNDERFLOW (0x0800) 865#define _MM_MASK_INEXACT (0x1000) 866#define _MM_MASK_MASK (0x1f80) 867 868#define _MM_ROUND_NEAREST (0x0000) 869#define _MM_ROUND_DOWN (0x2000) 870#define _MM_ROUND_UP (0x4000) 871#define _MM_ROUND_TOWARD_ZERO (0x6000) 872#define _MM_ROUND_MASK (0x6000) 873 874#define _MM_FLUSH_ZERO_MASK (0x8000) 875#define _MM_FLUSH_ZERO_ON (0x8000) 876#define _MM_FLUSH_ZERO_OFF (0x8000) 877 878#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() 
& _MM_MASK_MASK) 879#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 880#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 881#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 882 883#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 884#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 885#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 886#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 887 888#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 889do { \ 890 __m128 tmp3, tmp2, tmp1, tmp0; \ 891 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 892 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 893 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 894 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 895 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 896 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 897 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 898 (row3) = _mm_movelh_ps(tmp3, tmp1); \ 899} while (0) 900 901/* Ugly hack for backwards-compatibility (compatible with gcc) */ 902#ifdef __SSE2__ 903#include <emmintrin.h> 904#endif 905 906#endif /* __SSE__ */ 907 908#endif /* __XMMINTRIN_H */ 909