/* xmmintrin.h revision 193326 */
1225394Sjchandra/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2233563Sjchandra * 3233563Sjchandra * Permission is hereby granted, free of charge, to any person obtaining a copy 4225394Sjchandra * of this software and associated documentation files (the "Software"), to deal 5225394Sjchandra * in the Software without restriction, including without limitation the rights 6225394Sjchandra * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7225394Sjchandra * copies of the Software, and to permit persons to whom the Software is 8233563Sjchandra * furnished to do so, subject to the following conditions: 9225394Sjchandra * 10225394Sjchandra * The above copyright notice and this permission notice shall be included in 11225394Sjchandra * all copies or substantial portions of the Software. 12233563Sjchandra * 13233563Sjchandra * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14233563Sjchandra * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15233563Sjchandra * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16233563Sjchandra * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17233563Sjchandra * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18233563Sjchandra * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19233563Sjchandra * THE SOFTWARE. 
20233563Sjchandra * 21233563Sjchandra *===-----------------------------------------------------------------------=== 22233563Sjchandra */ 23233563Sjchandra 24233563Sjchandra#ifndef __XMMINTRIN_H 25233563Sjchandra#define __XMMINTRIN_H 26233563Sjchandra 27233563Sjchandra#ifndef __SSE__ 28233563Sjchandra#error "SSE instruction set not enabled" 29225394Sjchandra#else 30225394Sjchandra 31225394Sjchandra#include <mmintrin.h> 32225394Sjchandra 33225394Sjchandratypedef float __v4sf __attribute__((__vector_size__(16))); 34225394Sjchandratypedef float __m128 __attribute__((__vector_size__(16))); 35225394Sjchandra 36225394Sjchandra#include <mm_malloc.h> 37225394Sjchandra 38225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 39225394Sjchandra_mm_add_ss(__m128 a, __m128 b) 40225394Sjchandra{ 41233563Sjchandra return __builtin_ia32_addss(a, b); 42225394Sjchandra} 43225394Sjchandra 44225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 45225394Sjchandra_mm_add_ps(__m128 a, __m128 b) 46225394Sjchandra{ 47225394Sjchandra return a + b; 48225394Sjchandra} 49233563Sjchandra 50233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 51225394Sjchandra_mm_sub_ss(__m128 a, __m128 b) 52225394Sjchandra{ 53225394Sjchandra return __builtin_ia32_subss(a, b); 54225394Sjchandra} 55279345Sjchandra 56279345Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 57279345Sjchandra_mm_sub_ps(__m128 a, __m128 b) 58225394Sjchandra{ 59225394Sjchandra return a - b; 60225394Sjchandra} 61225394Sjchandra 62225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 63225394Sjchandra_mm_mul_ss(__m128 a, __m128 b) 64225394Sjchandra{ 65225394Sjchandra return __builtin_ia32_mulss(a, b); 66225394Sjchandra} 67225394Sjchandra 68233536Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 69233556Sjchandra_mm_mul_ps(__m128 a, __m128 b) 70225394Sjchandra{ 
71225394Sjchandra return a * b; 72225394Sjchandra} 73225394Sjchandra 74225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 75233563Sjchandra_mm_div_ss(__m128 a, __m128 b) 76225394Sjchandra{ 77233563Sjchandra return __builtin_ia32_divss(a, b); 78233563Sjchandra} 79233563Sjchandra 80233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 81233563Sjchandra_mm_div_ps(__m128 a, __m128 b) 82233563Sjchandra{ 83233563Sjchandra return a / b; 84233563Sjchandra} 85233563Sjchandra 86233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 87225394Sjchandra_mm_sqrt_ss(__m128 a) 88225394Sjchandra{ 89233563Sjchandra return __builtin_ia32_sqrtss(a); 90233563Sjchandra} 91233563Sjchandra 92233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 93233563Sjchandra_mm_sqrt_ps(__m128 a) 94233563Sjchandra{ 95233563Sjchandra return __builtin_ia32_sqrtps(a); 96233563Sjchandra} 97233563Sjchandra 98233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 99233563Sjchandra_mm_rcp_ss(__m128 a) 100233563Sjchandra{ 101233563Sjchandra return __builtin_ia32_rcpss(a); 102233563Sjchandra} 103233563Sjchandra 104233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 105233563Sjchandra_mm_rcp_ps(__m128 a) 106233563Sjchandra{ 107233563Sjchandra return __builtin_ia32_rcpps(a); 108233563Sjchandra} 109233563Sjchandra 110233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 111233563Sjchandra_mm_rsqrt_ss(__m128 a) 112233563Sjchandra{ 113233563Sjchandra return __builtin_ia32_rsqrtss(a); 114233563Sjchandra} 115233564Sjchandra 116233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 117233563Sjchandra_mm_rsqrt_ps(__m128 a) 118233564Sjchandra{ 119233564Sjchandra return __builtin_ia32_rsqrtps(a); 120233564Sjchandra} 121233564Sjchandra 122233564Sjchandrastatic inline __m128 
__attribute__((__always_inline__, __nodebug__)) 123233564Sjchandra_mm_min_ss(__m128 a, __m128 b) 124233564Sjchandra{ 125233564Sjchandra return __builtin_ia32_minss(a, b); 126233564Sjchandra} 127233564Sjchandra 128233564Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 129233564Sjchandra_mm_min_ps(__m128 a, __m128 b) 130233564Sjchandra{ 131233563Sjchandra return __builtin_ia32_minps(a, b); 132233563Sjchandra} 133233563Sjchandra 134233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 135233563Sjchandra_mm_max_ss(__m128 a, __m128 b) 136233563Sjchandra{ 137233563Sjchandra return __builtin_ia32_maxss(a, b); 138233563Sjchandra} 139233563Sjchandra 140233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 141233563Sjchandra_mm_max_ps(__m128 a, __m128 b) 142233563Sjchandra{ 143233563Sjchandra return __builtin_ia32_maxps(a, b); 144233563Sjchandra} 145233563Sjchandra 146233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 147233563Sjchandra_mm_and_ps(__m128 a, __m128 b) 148233563Sjchandra{ 149233563Sjchandra return __builtin_ia32_andps(a, b); 150233563Sjchandra} 151233563Sjchandra 152233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 153233563Sjchandra_mm_andnot_ps(__m128 a, __m128 b) 154233563Sjchandra{ 155233563Sjchandra return __builtin_ia32_andnps(a, b); 156233563Sjchandra} 157233563Sjchandra 158233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 159233563Sjchandra_mm_or_ps(__m128 a, __m128 b) 160233563Sjchandra{ 161233563Sjchandra return __builtin_ia32_orps(a, b); 162233563Sjchandra} 163233563Sjchandra 164233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 165233563Sjchandra_mm_xor_ps(__m128 a, __m128 b) 166233563Sjchandra{ 167233563Sjchandra return __builtin_ia32_xorps(a, b); 168233563Sjchandra} 169279306Sjchandra 170233563Sjchandrastatic inline __m128 
__attribute__((__always_inline__, __nodebug__)) 171233563Sjchandra_mm_cmpeq_ss(__m128 a, __m128 b) 172233563Sjchandra{ 173233563Sjchandra return (__m128)__builtin_ia32_cmpss(a, b, 0); 174233563Sjchandra} 175233563Sjchandra 176233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 177233563Sjchandra_mm_cmpeq_ps(__m128 a, __m128 b) 178233563Sjchandra{ 179233563Sjchandra return (__m128)__builtin_ia32_cmpps(a, b, 0); 180279306Sjchandra} 181233563Sjchandra 182279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 183279306Sjchandra_mm_cmplt_ss(__m128 a, __m128 b) 184279306Sjchandra{ 185279306Sjchandra return (__m128)__builtin_ia32_cmpss(a, b, 1); 186279306Sjchandra} 187279306Sjchandra 188279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 189279306Sjchandra_mm_cmplt_ps(__m128 a, __m128 b) 190279306Sjchandra{ 191279306Sjchandra return (__m128)__builtin_ia32_cmpps(a, b, 1); 192279306Sjchandra} 193279306Sjchandra 194279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 195279306Sjchandra_mm_cmple_ss(__m128 a, __m128 b) 196279306Sjchandra{ 197279306Sjchandra return (__m128)__builtin_ia32_cmpss(a, b, 2); 198279306Sjchandra} 199279306Sjchandra 200279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 201279306Sjchandra_mm_cmple_ps(__m128 a, __m128 b) 202279306Sjchandra{ 203279306Sjchandra return (__m128)__builtin_ia32_cmpps(a, b, 2); 204279306Sjchandra} 205279306Sjchandra 206279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 207233563Sjchandra_mm_cmpgt_ss(__m128 a, __m128 b) 208233563Sjchandra{ 209233563Sjchandra return (__m128)__builtin_ia32_cmpss(b, a, 1); 210233563Sjchandra} 211233563Sjchandra 212233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 213233563Sjchandra_mm_cmpgt_ps(__m128 a, __m128 b) 214233563Sjchandra{ 215233564Sjchandra return 
(__m128)__builtin_ia32_cmpps(b, a, 1); 216279306Sjchandra} 217279306Sjchandra 218279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 219279306Sjchandra_mm_cmpge_ss(__m128 a, __m128 b) 220279306Sjchandra{ 221233564Sjchandra return (__m128)__builtin_ia32_cmpss(b, a, 2); 222233563Sjchandra} 223233563Sjchandra 224233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 225233563Sjchandra_mm_cmpge_ps(__m128 a, __m128 b) 226233563Sjchandra{ 227233563Sjchandra return (__m128)__builtin_ia32_cmpps(b, a, 2); 228233563Sjchandra} 229233563Sjchandra 230233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 231233563Sjchandra_mm_cmpneq_ss(__m128 a, __m128 b) 232233563Sjchandra{ 233233563Sjchandra return (__m128)__builtin_ia32_cmpss(a, b, 4); 234233563Sjchandra} 235233563Sjchandra 236233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 237233563Sjchandra_mm_cmpneq_ps(__m128 a, __m128 b) 238233563Sjchandra{ 239233563Sjchandra return (__m128)__builtin_ia32_cmpps(a, b, 4); 240233563Sjchandra} 241233563Sjchandra 242233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 243233563Sjchandra_mm_cmpnlt_ss(__m128 a, __m128 b) 244233563Sjchandra{ 245233563Sjchandra return (__m128)__builtin_ia32_cmpss(a, b, 5); 246233563Sjchandra} 247233563Sjchandra 248233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 249233563Sjchandra_mm_cmpnlt_ps(__m128 a, __m128 b) 250233563Sjchandra{ 251233563Sjchandra return (__m128)__builtin_ia32_cmpps(a, b, 5); 252233563Sjchandra} 253233563Sjchandra 254233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 255233563Sjchandra_mm_cmpnle_ss(__m128 a, __m128 b) 256233563Sjchandra{ 257233563Sjchandra return (__m128)__builtin_ia32_cmpss(a, b, 6); 258233563Sjchandra} 259233563Sjchandra 260233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, 
__nodebug__)) 261233563Sjchandra_mm_cmpnle_ps(__m128 a, __m128 b) 262233563Sjchandra{ 263233563Sjchandra return (__m128)__builtin_ia32_cmpps(a, b, 6); 264233563Sjchandra} 265233563Sjchandra 266233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 267233563Sjchandra_mm_cmpngt_ss(__m128 a, __m128 b) 268233563Sjchandra{ 269233563Sjchandra return (__m128)__builtin_ia32_cmpss(b, a, 5); 270233563Sjchandra} 271233563Sjchandra 272233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 273233563Sjchandra_mm_cmpngt_ps(__m128 a, __m128 b) 274233563Sjchandra{ 275233563Sjchandra return (__m128)__builtin_ia32_cmpps(b, a, 5); 276233563Sjchandra} 277233563Sjchandra 278233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 279233563Sjchandra_mm_cmpnge_ss(__m128 a, __m128 b) 280233563Sjchandra{ 281233563Sjchandra return (__m128)__builtin_ia32_cmpss(b, a, 6); 282233563Sjchandra} 283233570Sjchandra 284233570Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 285233563Sjchandra_mm_cmpnge_ps(__m128 a, __m128 b) 286233563Sjchandra{ 287225394Sjchandra return (__m128)__builtin_ia32_cmpps(b, a, 6); 288225394Sjchandra} 289225394Sjchandra 290233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 291225394Sjchandra_mm_cmpord_ss(__m128 a, __m128 b) 292225394Sjchandra{ 293225394Sjchandra return (__m128)__builtin_ia32_cmpss(a, b, 7); 294225394Sjchandra} 295225394Sjchandra 296225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 297225394Sjchandra_mm_cmpord_ps(__m128 a, __m128 b) 298225394Sjchandra{ 299225394Sjchandra return (__m128)__builtin_ia32_cmpps(a, b, 7); 300225394Sjchandra} 301225394Sjchandra 302225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 303225394Sjchandra_mm_cmpunord_ss(__m128 a, __m128 b) 304225394Sjchandra{ 305233536Sjchandra return (__m128)__builtin_ia32_cmpss(a, b, 3); 
306225394Sjchandra} 307225394Sjchandra 308225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 309225394Sjchandra_mm_cmpunord_ps(__m128 a, __m128 b) 310225394Sjchandra{ 311225394Sjchandra return (__m128)__builtin_ia32_cmpps(a, b, 3); 312225394Sjchandra} 313233536Sjchandra 314225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 315225394Sjchandra_mm_comieq_ss(__m128 a, __m128 b) 316233536Sjchandra{ 317233536Sjchandra return __builtin_ia32_comieq(a, b); 318233536Sjchandra} 319233536Sjchandra 320225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 321225394Sjchandra_mm_comilt_ss(__m128 a, __m128 b) 322225394Sjchandra{ 323225394Sjchandra return __builtin_ia32_comilt(a, b); 324225394Sjchandra} 325233563Sjchandra 326225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 327225394Sjchandra_mm_comile_ss(__m128 a, __m128 b) 328225394Sjchandra{ 329225394Sjchandra return __builtin_ia32_comile(a, b); 330225394Sjchandra} 331225394Sjchandra 332225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 333279345Sjchandra_mm_comigt_ss(__m128 a, __m128 b) 334279345Sjchandra{ 335279345Sjchandra return __builtin_ia32_comigt(a, b); 336279345Sjchandra} 337279345Sjchandra 338225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 339225394Sjchandra_mm_comige_ss(__m128 a, __m128 b) 340225394Sjchandra{ 341225394Sjchandra return __builtin_ia32_comige(a, b); 342225394Sjchandra} 343225394Sjchandra 344225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 345225394Sjchandra_mm_comineq_ss(__m128 a, __m128 b) 346225394Sjchandra{ 347225394Sjchandra return __builtin_ia32_comineq(a, b); 348225394Sjchandra} 349225394Sjchandra 350225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 351225394Sjchandra_mm_ucomieq_ss(__m128 a, __m128 b) 352225394Sjchandra{ 353225394Sjchandra return 
__builtin_ia32_ucomieq(a, b); 354225394Sjchandra} 355225394Sjchandra 356225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 357225394Sjchandra_mm_ucomilt_ss(__m128 a, __m128 b) 358225394Sjchandra{ 359225394Sjchandra return __builtin_ia32_ucomilt(a, b); 360225394Sjchandra} 361225394Sjchandra 362225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 363225394Sjchandra_mm_ucomile_ss(__m128 a, __m128 b) 364225394Sjchandra{ 365225394Sjchandra return __builtin_ia32_ucomile(a, b); 366225394Sjchandra} 367225394Sjchandra 368225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 369225394Sjchandra_mm_ucomigt_ss(__m128 a, __m128 b) 370225394Sjchandra{ 371225394Sjchandra return __builtin_ia32_ucomigt(a, b); 372225394Sjchandra} 373225394Sjchandra 374225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 375225394Sjchandra_mm_ucomige_ss(__m128 a, __m128 b) 376225394Sjchandra{ 377225394Sjchandra return __builtin_ia32_ucomige(a, b); 378225394Sjchandra} 379225394Sjchandra 380225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 381225394Sjchandra_mm_ucomineq_ss(__m128 a, __m128 b) 382225394Sjchandra{ 383225394Sjchandra return __builtin_ia32_ucomineq(a, b); 384225394Sjchandra} 385225394Sjchandra 386225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 387225394Sjchandra_mm_cvtss_si32(__m128 a) 388225394Sjchandra{ 389233563Sjchandra return __builtin_ia32_cvtss2si(a); 390233563Sjchandra} 391233563Sjchandra 392225394Sjchandrastatic inline long long __attribute__((__always_inline__, __nodebug__)) 393233563Sjchandra_mm_cvtss_si64(__m128 a) 394233563Sjchandra{ 395225394Sjchandra return __builtin_ia32_cvtss2si64(a); 396233563Sjchandra} 397225394Sjchandra 398225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 399225394Sjchandra_mm_cvtps_pi32(__m128 a) 400225394Sjchandra{ 401225394Sjchandra return 
(__m64)__builtin_ia32_cvtps2pi(a); 402225394Sjchandra} 403225394Sjchandra 404225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 405225394Sjchandra_mm_cvttss_si32(__m128 a) 406225394Sjchandra{ 407225394Sjchandra return __builtin_ia32_cvttss2si(a); 408225394Sjchandra} 409225394Sjchandra 410225394Sjchandrastatic inline long long __attribute__((__always_inline__, __nodebug__)) 411225394Sjchandra_mm_cvttss_si64(__m128 a) 412225394Sjchandra{ 413225394Sjchandra return __builtin_ia32_cvttss2si64(a); 414225394Sjchandra} 415225394Sjchandra 416225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 417225394Sjchandra_mm_cvttps_pi32(__m128 a) 418225394Sjchandra{ 419225394Sjchandra return (__m64)__builtin_ia32_cvttps2pi(a); 420225394Sjchandra} 421225394Sjchandra 422225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 423225394Sjchandra_mm_cvtsi32_ss(__m128 a, int b) 424225394Sjchandra{ 425225394Sjchandra return __builtin_ia32_cvtsi2ss(a, b); 426225394Sjchandra} 427225394Sjchandra 428225394Sjchandra#ifdef __x86_64__ 429225394Sjchandra 430225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 431233563Sjchandra_mm_cvtsi64_ss(__m128 a, long long b) 432233563Sjchandra{ 433233563Sjchandra return __builtin_ia32_cvtsi642ss(a, b); 434233563Sjchandra} 435233563Sjchandra 436225394Sjchandra#endif 437225394Sjchandra 438225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 439233536Sjchandra_mm_cvtpi32_ps(__m128 a, __m64 b) 440245877Sjchandra{ 441245877Sjchandra return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 442245877Sjchandra} 443233536Sjchandra 444233536Sjchandrastatic inline float __attribute__((__always_inline__, __nodebug__)) 445233563Sjchandra_mm_cvtss_f32(__m128 a) 446233536Sjchandra{ 447245877Sjchandra return a[0]; 448233536Sjchandra} 449233536Sjchandra 450233536Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 
451233536Sjchandra_mm_loadh_pi(__m128 a, __m64 const *p) 452233536Sjchandra{ 453233536Sjchandra return __builtin_ia32_loadhps(a, (__v2si *)p); 454233536Sjchandra} 455233536Sjchandra 456233536Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 457233536Sjchandra_mm_loadl_pi(__m128 a, __m64 const *p) 458233536Sjchandra{ 459233536Sjchandra return __builtin_ia32_loadlps(a, (__v2si *)p); 460233536Sjchandra} 461233536Sjchandra 462238289Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 463233536Sjchandra_mm_load_ss(float *p) 464233536Sjchandra{ 465233536Sjchandra return (__m128){ *p, 0, 0, 0 }; 466233536Sjchandra} 467233536Sjchandra 468238289Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 469245877Sjchandra_mm_load1_ps(float *p) 470233536Sjchandra{ 471233536Sjchandra return (__m128){ *p, *p, *p, *p }; 472225394Sjchandra} 473225394Sjchandra 474225394Sjchandra#define _mm_load_ps1(p) _mm_load1_ps(p) 475233536Sjchandra 476225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 477257338Snwhitehorn_mm_load_ps(float *p) 478257338Snwhitehorn{ 479233536Sjchandra return *(__m128*)p; 480233536Sjchandra} 481233536Sjchandra 482233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 483233536Sjchandra_mm_loadu_ps(float *p) 484225394Sjchandra{ 485225394Sjchandra return __builtin_ia32_loadups(p); 486225394Sjchandra} 487225394Sjchandra 488225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 489225394Sjchandra_mm_loadr_ps(float *p) 490225394Sjchandra{ 491225394Sjchandra __m128 a = _mm_load_ps(p); 492225394Sjchandra return __builtin_shufflevector(a, a, 3, 2, 1, 0); 493225394Sjchandra} 494225394Sjchandra 495225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 496225394Sjchandra_mm_set_ss(float w) 497225394Sjchandra{ 498225394Sjchandra return (__m128){ w, 0, 0, 0 }; 499225394Sjchandra} 
500225394Sjchandra 501225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 502225394Sjchandra_mm_set1_ps(float w) 503225394Sjchandra{ 504225394Sjchandra return (__m128){ w, w, w, w }; 505225394Sjchandra} 506225394Sjchandra 507225394Sjchandra// Microsoft specific. 508225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 509225394Sjchandra_mm_set_ps1(float w) 510225394Sjchandra{ 511225394Sjchandra return _mm_set1_ps(w); 512225394Sjchandra} 513225394Sjchandra 514225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 515225394Sjchandra_mm_set_ps(float z, float y, float x, float w) 516225394Sjchandra{ 517225394Sjchandra return (__m128){ w, x, y, z }; 518225394Sjchandra} 519225394Sjchandra 520225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 521225394Sjchandra_mm_setr_ps(float z, float y, float x, float w) 522225394Sjchandra{ 523225394Sjchandra return (__m128){ z, y, x, w }; 524225394Sjchandra} 525225394Sjchandra 526225394Sjchandrastatic inline __m128 __attribute__((__always_inline__)) 527225394Sjchandra_mm_setzero_ps(void) 528225394Sjchandra{ 529225394Sjchandra return (__m128){ 0, 0, 0, 0 }; 530225394Sjchandra} 531225394Sjchandra 532225394Sjchandrastatic inline void __attribute__((__always_inline__)) 533225394Sjchandra_mm_storeh_pi(__m64 *p, __m128 a) 534225394Sjchandra{ 535225394Sjchandra __builtin_ia32_storehps((__v2si *)p, a); 536225394Sjchandra} 537225394Sjchandra 538225394Sjchandrastatic inline void __attribute__((__always_inline__)) 539225394Sjchandra_mm_storel_pi(__m64 *p, __m128 a) 540225394Sjchandra{ 541225394Sjchandra __builtin_ia32_storelps((__v2si *)p, a); 542225394Sjchandra} 543225394Sjchandra 544225394Sjchandrastatic inline void __attribute__((__always_inline__)) 545225394Sjchandra_mm_store_ss(float *p, __m128 a) 546225394Sjchandra{ 547225394Sjchandra *p = a[0]; 548279306Sjchandra} 549225394Sjchandra 550279306Sjchandrastatic inline 
void __attribute__((__always_inline__, __nodebug__)) 551225394Sjchandra_mm_storeu_ps(float *p, __m128 a) 552225394Sjchandra{ 553225394Sjchandra __builtin_ia32_storeups(p, a); 554225394Sjchandra} 555279306Sjchandra 556279306Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__)) 557279306Sjchandra_mm_store1_ps(float *p, __m128 a) 558279306Sjchandra{ 559225394Sjchandra a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 560225394Sjchandra _mm_storeu_ps(p, a); 561225394Sjchandra} 562279341Sjchandra 563225394Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__)) 564225394Sjchandra_mm_store_ps(float *p, __m128 a) 565225394Sjchandra{ 566225394Sjchandra *(__m128 *)p = a; 567225394Sjchandra} 568225394Sjchandra 569225394Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__)) 570227783Sjchandra_mm_storer_ps(float *p, __m128 a) 571225394Sjchandra{ 572225394Sjchandra a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 573225394Sjchandra _mm_store_ps(p, a); 574225394Sjchandra} 575225394Sjchandra 576225394Sjchandra#define _MM_HINT_T0 1 577225394Sjchandra#define _MM_HINT_T1 2 578225394Sjchandra#define _MM_HINT_T2 3 579225394Sjchandra#define _MM_HINT_NTA 0 580225394Sjchandra 581225394Sjchandra/* FIXME: We have to #define this because "sel" must be a constant integer, and 582225394Sjchandra Sema doesn't do any form of constant propagation yet. 
*/ 583225394Sjchandra 584225394Sjchandra#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) 585225394Sjchandra 586225394Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__)) 587225394Sjchandra_mm_stream_pi(__m64 *p, __m64 a) 588225394Sjchandra{ 589225394Sjchandra __builtin_ia32_movntq(p, a); 590225394Sjchandra} 591225394Sjchandra 592233563Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__)) 593225394Sjchandra_mm_stream_ps(float *p, __m128 a) 594225394Sjchandra{ 595225394Sjchandra __builtin_ia32_movntps(p, a); 596225394Sjchandra} 597225394Sjchandra 598225394Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__)) 599225394Sjchandra_mm_sfence(void) 600225394Sjchandra{ 601225394Sjchandra __builtin_ia32_sfence(); 602225394Sjchandra} 603225394Sjchandra 604225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 605225394Sjchandra_mm_extract_pi16(__m64 a, int n) 606225394Sjchandra{ 607225394Sjchandra /* FIXME: 608233563Sjchandra * This should force n to be an immediate. 609233563Sjchandra * This does not use the PEXTRW instruction. From looking at the LLVM source, the 610225394Sjchandra instruction doesn't seem to be hooked up. 611233563Sjchandra * The code could probably be made better :) 612225394Sjchandra */ 613225394Sjchandra __v4hi b = (__v4hi)a; 614225394Sjchandra return b[(n == 0) ? 0 : (n == 1 ? 1 : (n == 2 ? 2 : 3))]; 615225394Sjchandra} 616225394Sjchandra 617225394Sjchandra/* FIXME: Implement this. We could add a __builtin_insertelement function that's similar to 618225394Sjchandra the already existing __builtin_shufflevector. 
619227783Sjchandra*/ 620227783Sjchandra/* 621227783Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 622225394Sjchandra_mm_insert_pi16(__m64 a, int d, int n) 623225394Sjchandra{ 624225394Sjchandra return (__m64){ 0LL }; 625225394Sjchandra} 626225394Sjchandra*/ 627233536Sjchandra 628225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 629225394Sjchandra_mm_max_pi16(__m64 a, __m64 b) 630225394Sjchandra{ 631225394Sjchandra return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 632225394Sjchandra} 633225394Sjchandra 634225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 635225394Sjchandra_mm_max_pu8(__m64 a, __m64 b) 636233536Sjchandra{ 637225394Sjchandra return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 638225394Sjchandra} 639225394Sjchandra 640225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 641233536Sjchandra_mm_min_pi16(__m64 a, __m64 b) 642225394Sjchandra{ 643225394Sjchandra return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 644225394Sjchandra} 645233536Sjchandra 646233536Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 647225394Sjchandra_mm_min_pu8(__m64 a, __m64 b) 648225394Sjchandra{ 649225394Sjchandra return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 650225394Sjchandra} 651233536Sjchandra 652233536Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__)) 653279306Sjchandra_mm_movemask_pi8(__m64 a) 654279306Sjchandra{ 655225394Sjchandra return __builtin_ia32_pmovmskb((__v8qi)a); 656279341Sjchandra} 657279341Sjchandra 658279341Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 659279341Sjchandra_mm_mulhi_pu16(__m64 a, __m64 b) 660279341Sjchandra{ 661225394Sjchandra return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 662225394Sjchandra} 663225394Sjchandra 664225394Sjchandra#define _mm_shuffle_pi16(a, n) 
((__m64)__builtin_ia32_pshufw((__v4hi)a, n)) 665225394Sjchandra 666233563Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__)) 667225394Sjchandra_mm_maskmove_si64(__m64 d, __m64 n, char *p) 668225394Sjchandra{ 669225394Sjchandra __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 670225394Sjchandra} 671225394Sjchandra 672225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 673225394Sjchandra_mm_avg_pu8(__m64 a, __m64 b) 674225394Sjchandra{ 675225394Sjchandra return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 676225394Sjchandra} 677233563Sjchandra 678225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 679225394Sjchandra_mm_avg_pu16(__m64 a, __m64 b) 680225394Sjchandra{ 681225394Sjchandra return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 682233563Sjchandra} 683225394Sjchandra 684225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 685233563Sjchandra_mm_sad_pu8(__m64 a, __m64 b) 686233563Sjchandra{ 687233563Sjchandra return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 688233563Sjchandra} 689225394Sjchandra 690233563Sjchandrastatic inline unsigned int __attribute__((__always_inline__, __nodebug__)) 691233563Sjchandra_mm_getcsr(void) 692233563Sjchandra{ 693225394Sjchandra return __builtin_ia32_stmxcsr(); 694233563Sjchandra} 695233563Sjchandra 696233563Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__)) 697233563Sjchandra_mm_setcsr(unsigned int i) 698225394Sjchandra{ 699225394Sjchandra __builtin_ia32_ldmxcsr(i); 700225394Sjchandra} 701233563Sjchandra 702233563Sjchandra#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps(a, b, mask)) 703225394Sjchandra 704225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 705225394Sjchandra_mm_unpackhi_ps(__m128 a, __m128 b) 706233563Sjchandra{ 707233563Sjchandra return __builtin_shufflevector(a, b, 2, 6, 3, 7); 708225394Sjchandra} 
709225394Sjchandra 710225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 711225394Sjchandra_mm_unpacklo_ps(__m128 a, __m128 b) 712233563Sjchandra{ 713233563Sjchandra return __builtin_shufflevector(a, b, 0, 4, 1, 5); 714233563Sjchandra} 715225394Sjchandra 716225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 717225394Sjchandra_mm_move_ss(__m128 a, __m128 b) 718225394Sjchandra{ 719225394Sjchandra return __builtin_shufflevector(a, b, 4, 1, 2, 3); 720225394Sjchandra} 721225394Sjchandra 722225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 723225394Sjchandra_mm_movehl_ps(__m128 a, __m128 b) 724225394Sjchandra{ 725225394Sjchandra return __builtin_shufflevector(a, b, 6, 7, 2, 3); 726233563Sjchandra} 727225394Sjchandra 728225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 729225394Sjchandra_mm_movelh_ps(__m128 a, __m128 b) 730225394Sjchandra{ 731225394Sjchandra return __builtin_shufflevector(a, b, 0, 1, 4, 5); 732225394Sjchandra} 733225394Sjchandra 734233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 735225394Sjchandra_mm_cvtpi16_ps(__m64 a) 736225394Sjchandra{ 737225394Sjchandra __m64 b, c; 738225394Sjchandra __m128 r; 739225394Sjchandra 740225394Sjchandra b = _mm_setzero_si64(); 741225394Sjchandra b = _mm_cmpgt_pi16(b, a); 742233563Sjchandra c = _mm_unpackhi_pi16(a, b); 743225394Sjchandra r = _mm_setzero_ps(); 744225394Sjchandra r = _mm_cvtpi32_ps(r, c); 745225394Sjchandra r = _mm_movelh_ps(r, r); 746225394Sjchandra c = _mm_unpacklo_pi16(a, b); 747225394Sjchandra r = _mm_cvtpi32_ps(r, c); 748225394Sjchandra 749225394Sjchandra return r; 750233563Sjchandra} 751225394Sjchandra 752279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 753225394Sjchandra_mm_cvtpu16_ps(__m64 a) 754225394Sjchandra{ 755225394Sjchandra __m64 b, c; 756225394Sjchandra __m128 r; 757225394Sjchandra 
758225394Sjchandra b = _mm_setzero_si64(); 759225394Sjchandra c = _mm_unpackhi_pi16(a, b); 760227783Sjchandra r = _mm_setzero_ps(); 761227783Sjchandra r = _mm_cvtpi32_ps(r, c); 762227783Sjchandra r = _mm_movelh_ps(r, r); 763227783Sjchandra c = _mm_unpacklo_pi16(a, b); 764227783Sjchandra r = _mm_cvtpi32_ps(r, c); 765227783Sjchandra 766227783Sjchandra return r; 767227783Sjchandra} 768227783Sjchandra 769279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 770279306Sjchandra_mm_cvtpi8_ps(__m64 a) 771279306Sjchandra{ 772279306Sjchandra __m64 b; 773227783Sjchandra 774227783Sjchandra b = _mm_setzero_si64(); 775279306Sjchandra b = _mm_cmpgt_pi8(b, a); 776227783Sjchandra b = _mm_unpacklo_pi8(a, b); 777225394Sjchandra 778225394Sjchandra return _mm_cvtpi16_ps(b); 779225394Sjchandra} 780225394Sjchandra 781225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 782225394Sjchandra_mm_cvtpu8_ps(__m64 a) 783225394Sjchandra{ 784225394Sjchandra __m64 b; 785225394Sjchandra 786225394Sjchandra b = _mm_setzero_si64(); 787233563Sjchandra b = _mm_unpacklo_pi8(a, b); 788233563Sjchandra 789233563Sjchandra return _mm_cvtpi16_ps(b); 790233563Sjchandra} 791233563Sjchandra 792233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__)) 793225394Sjchandra_mm_cvtpi32x2_ps(__m64 a, __m64 b) 794225394Sjchandra{ 795225394Sjchandra __m128 c; 796225394Sjchandra 797225394Sjchandra c = _mm_setzero_ps(); 798233563Sjchandra c = _mm_cvtpi32_ps(c, b); 799225394Sjchandra c = _mm_movelh_ps(c, c); 800225394Sjchandra 801225394Sjchandra return _mm_cvtpi32_ps(c, a); 802225394Sjchandra} 803225394Sjchandra 804227843Smariusstatic inline __m64 __attribute__((__always_inline__, __nodebug__)) 805225394Sjchandra_mm_cvtps_pi16(__m128 a) 806225394Sjchandra{ 807225394Sjchandra __m64 b, c; 808225394Sjchandra 809225394Sjchandra b = _mm_cvtps_pi32(a); 810233563Sjchandra a = _mm_movehl_ps(a, a); 811225394Sjchandra c = _mm_cvtps_pi32(a); 
812225394Sjchandra 813279345Sjchandra return _mm_packs_pi16(b, c); 814279345Sjchandra} 815 816static inline __m64 __attribute__((__always_inline__, __nodebug__)) 817_mm_cvtps_pi8(__m128 a) 818{ 819 __m64 b, c; 820 821 b = _mm_cvtps_pi16(a); 822 c = _mm_setzero_si64(); 823 824 return _mm_packs_pi16(b, c); 825} 826 827static inline int __attribute__((__always_inline__, __nodebug__)) 828_mm_movemask_ps(__m128 a) 829{ 830 return __builtin_ia32_movmskps(a); 831} 832 833#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 834 835#define _MM_EXCEPT_INVALID (0x0001) 836#define _MM_EXCEPT_DENORM (0x0002) 837#define _MM_EXCEPT_DIV_ZERO (0x0004) 838#define _MM_EXCEPT_OVERFLOW (0x0008) 839#define _MM_EXCEPT_UNDERFLOW (0x0010) 840#define _MM_EXCEPT_INEXACT (0x0020) 841#define _MM_EXCEPT_MASK (0x003f) 842 843#define _MM_MASK_INVALID (0x0080) 844#define _MM_MASK_DENORM (0x0100) 845#define _MM_MASK_DIV_ZERO (0x0200) 846#define _MM_MASK_OVERFLOW (0x0400) 847#define _MM_MASK_UNDERFLOW (0x0800) 848#define _MM_MASK_INEXACT (0x1000) 849#define _MM_MASK_MASK (0x1f80) 850 851#define _MM_ROUND_NEAREST (0x0000) 852#define _MM_ROUND_DOWN (0x2000) 853#define _MM_ROUND_UP (0x4000) 854#define _MM_ROUND_TOWARD_ZERO (0x6000) 855#define _MM_ROUND_MASK (0x6000) 856 857#define _MM_FLUSH_ZERO_MASK (0x8000) 858#define _MM_FLUSH_ZERO_ON (0x8000) 859#define _MM_FLUSH_ZERO_OFF (0x8000) 860 861#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 862#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 863#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 864#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 865 866#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 867#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 868#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 869#define _MM_SET_ROUNDING_MODE(x) 
(_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 870 871#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 872do { \ 873 __m128 tmp3, tmp2, tmp1, tmp0; \ 874 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 875 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 876 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 877 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 878 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 879 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 880 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 881 (row3) = _mm_movelh_ps(tmp3, tmp1); \ 882} while (0) 883 884#include <emmintrin.h> 885 886#endif /* __SSE__ */ 887 888#endif /* __XMMINTRIN_H */ 889