/* Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#ifndef __SSE__
# error "SSE instruction set not enabled"
#else

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create a vector of zeros.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

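/* Illustrative sketch, added in this edition and not part of the original
   header: the _MM_SHUFFLE macro above packs four 2-bit source-lane
   indices, highest lane first, into the 8-bit immediate expected by
   SHUFPS and PSHUFW.  The disabled compile-time checks below are purely
   demonstrative.  */
#if 0
/* Identity permutation: every lane selects itself.  */
typedef int __assert_identity[_MM_SHUFFLE (3, 2, 1, 0) == 0xE4 ? 1 : -1];
/* Full reversal: lane 3 of the result selects source lane 0, and so on.  */
typedef int __assert_reverse[_MM_SHUFFLE (0, 1, 2, 3) == 0x1B ? 1 : -1];
#endif
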
/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform the respective operation on the four SPFP values in A and B.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

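/* Illustrative sketch, added here and not part of the original header
   (the helper name is hypothetical, and _mm_set_ps/_mm_set1_ps are only
   defined further below): the _ss forms touch only element 0 and pass
   elements 1-3 through from A, while the _ps forms operate on all four
   elements.  */
#if 0
static __inline void
__example_ss_vs_ps (void)
{
  __m128 __a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);  /* elements: 1 2 3 4 */
  __m128 __b = _mm_set1_ps (10.0f);
  __m128 __s = _mm_add_ss (__a, __b);	/* elements: 11 2 3 4 */
  __m128 __p = _mm_add_ps (__a, __b);	/* elements: 11 12 13 14 */
  (void) __s;
  (void) __p;
}
#endif
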
/* Perform logical bit-wise operations on 128-bit values.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

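/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper; _mm_set1_ps is defined further below):
   _mm_andnot_ps computes (~A) & B, so clearing each element's sign bit
   gives a four-wide absolute value.  */
#if 0
static __inline __m128
__example_fabs_ps (__m128 __a)
{
  __m128 __signmask = _mm_set1_ps (-0.0f);  /* 0x80000000 in each element.  */
  return _mm_andnot_ps (__signmask, __a);
}
#endif
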
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpltss ((__v4sf) __B,
                                                                (__v4sf) __A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpless ((__v4sf) __B,
                                                                (__v4sf) __A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnltss ((__v4sf) __B,
                                                                 (__v4sf) __A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnless ((__v4sf) __B,
                                                                 (__v4sf) __A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}

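/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper): each packed comparison yields an all-ones or
   all-zeros bit mask per element, so combining it with the logical
   operations above gives a four-wide branchless select; this particular
   combination reproduces an element-wise minimum.  */
#if 0
static __inline __m128
__example_select_min_ps (__m128 __a, __m128 __b)
{
  __m128 __mask = _mm_cmplt_ps (__a, __b);	/* ~0 where a < b.  */
  return _mm_or_ps (_mm_and_ps (__mask, __a),
                    _mm_andnot_ps (__mask, __b));
}
#endif
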
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

static __inline int __attribute__((__always_inline__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
static __inline int __attribute__((__always_inline__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

static __inline int __attribute__((__always_inline__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 64-bit integer according to the current
   rounding mode.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

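/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper; _mm_set_ss is defined further below): CVTSS2SI
   honours the MXCSR rounding mode, so under the default
   round-to-nearest-even mode _mm_cvtss_si32 maps 2.5f to 2 and 3.5f to 4,
   whereas the truncating forms below always round toward zero.  */
#if 0
static __inline int
__example_round_half_even (void)
{
  return _mm_cvtss_si32 (_mm_set_ss (2.5f));	/* 2, not 3.  */
}
#endif
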
/* Truncate the lower SPFP value to a 32-bit integer.  */
static __inline int __attribute__((__always_inline__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

static __inline int __attribute__((__always_inline__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 64-bit integer.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __r;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

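/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper; _mm_setr_pi16 comes from <mmintrin.h>): word I of
   the argument becomes element I of the result, with the sign preserved
   by the pcmpgtw trick above.  */
#if 0
static __inline __m128
__example_widen_signed_words (void)
{
  /* Yields { -3.0f, -2.0f, 1.0f, 4.0f } from element 0 upward.  */
  return _mm_cvtpi16_ps (_mm_setr_pi16 (-3, -2, 1, 4));
}
#endif
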
/* Convert the four unsigned 16-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __r;

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#if 0
static __inline __m128 __attribute__((__always_inline__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
#endif

/* Selects and interleaves the upper two SPFP values from A and B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

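/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper): combining _mm_shuffle_ps with the _MM_SHUFFLE
   selector broadcasts or permutes elements; both inputs may be the same
   vector.  */
#if 0
static __inline __m128
__example_broadcast_elem0 (__m128 __a)
{
  /* Every element of the result selects source element 0.  */
  return _mm_shuffle_ps (__a, __a, _MM_SHUFFLE (0, 0, 0, 0));
}
#endif
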
/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
static __inline void __attribute__((__always_inline__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
static __inline void __attribute__((__always_inline__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}

/* Return the contents of the control register.  */
static __inline unsigned int __attribute__((__always_inline__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
static __inline void __attribute__((__always_inline__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

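/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper): MXCSR updates are typically read-modify-write,
   saving the old word so it can be restored afterwards.  */
#if 0
static __inline unsigned int
__example_enable_flush_to_zero (void)
{
  unsigned int __saved = _mm_getcsr ();
  _mm_setcsr ((__saved & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_ON);
  return __saved;	/* Pass back to _mm_setcsr to restore.  */
}
#endif
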
/* Set exception bits in the control register.  */
static __inline void __attribute__((__always_inline__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

static __inline void __attribute__((__always_inline__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

static __inline void __attribute__((__always_inline__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

static __inline void __attribute__((__always_inline__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}

/* Create a vector with element 0 as F and the rest zero.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0, 0, 0 };
}

/* Create a vector with all four elements equal to F.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

static __inline __m128 __attribute__((__always_inline__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
static __inline __m128 __attribute__((__always_inline__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
static __inline __m128 __attribute__((__always_inline__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Stores the lower SPFP value.  */
static __inline void __attribute__((__always_inline__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

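/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper): _mm_load_ps requires a 16-byte-aligned address,
   which _mm_malloc (from <mm_malloc.h>, included above) can provide;
   _mm_loadu_ps accepts any address.  */
#if 0
static __inline __m128
__example_aligned_load (void)
{
  float *__p = (float *) _mm_malloc (4 * sizeof (float), 16);
  __m128 __v;
  __p[0] = 1.0f; __p[1] = 2.0f; __p[2] = 3.0f; __p[3] = 4.0f;
  __v = _mm_load_ps (__p);	/* Safe: _mm_malloc honoured the alignment.  */
  _mm_free (__p);
  return __v;
}
#endif
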
/* Store four SPFP values.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
static __inline void __attribute__((__always_inline__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

static __inline void __attribute__((__always_inline__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

static __inline int __attribute__((__always_inline__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N) __builtin_ia32_vec_ext_v4hi ((__v4hi)(A), (N))
#define _m_pextrw(A, N) _mm_extract_pi16((A), (N))
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(A), (D), (N)))
#define _m_pinsrw(A, D, N) _mm_insert_pi16((A), (D), (N))
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

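/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper; _mm_min_pi16 is defined just below): composing
   the element-wise maximum and minimum clamps four signed words to a
   range.  */
#if 0
static __inline __m64
__example_clamp_pi16 (__m64 __x, __m64 __lo, __m64 __hi)
{
  return _mm_min_pi16 (_mm_max_pi16 (__x, __lo), __hi);
}
#endif
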
/* Compute the element-wise maximum of unsigned 8-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

static __inline int __attribute__((__always_inline__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pshufw (__m64 __A, int __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#define _m_pshufw(A, N) _mm_shuffle_pi16 ((A), (N))
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
static __inline void __attribute__((__always_inline__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

static __inline void __attribute__((__always_inline__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

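/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper; _mm_set_pi8 comes from <mmintrin.h>): only the
   bytes whose mask byte has its high bit set are written, so the
   destination's other bytes are left untouched.  */
#if 0
static __inline void
__example_store_low_half (__m64 __a, char *__p)
{
  /* High bit set in the low four mask bytes only: bytes 0-3 of A are
     stored to P[0..3]; P[4..7] are not modified.  */
  __m64 __mask = _mm_set_pi8 (0, 0, 0, 0, -1, -1, -1, -1);
  _mm_maskmove_si64 (__a, __mask, __p);
}
#endif
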
/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#if 0
static __inline void __attribute__((__always_inline__))
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif

/* Stores the data in A to the address P without polluting the caches.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
static __inline void __attribute__((__always_inline__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
static __inline void __attribute__((__always_inline__))
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);			\
} while (0)

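/* Illustrative sketch, added here and not part of the original header
   (hypothetical helper): _MM_TRANSPOSE4_PS rewrites its four row
   variables in place, so afterwards row0 holds the first column of the
   original matrix, row1 the second, and so on.  */
#if 0
static __inline void
__example_transpose (__m128 *__m)	/* __m points to four rows.  */
{
  __m128 __r0 = __m[0], __r1 = __m[1], __r2 = __m[2], __r3 = __m[3];
  _MM_TRANSPOSE4_PS (__r0, __r1, __r2, __r3);
  __m[0] = __r0; __m[1] = __r1; __m[2] = __r2; __m[3] = __r3;
}
#endif
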
/* For backward source compatibility.  */
#include <emmintrin.h>

#endif /* __SSE__ */

#endif /* _XMMINTRIN_H_INCLUDED */