/* xmmintrin.h revision 90075 */
1/* Copyright (C) 2002 Free Software Foundation, Inc. 2 3 This file is part of GNU CC. 4 5 GNU CC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 GNU CC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with GNU CC; see the file COPYING. If not, write to 17 the Free Software Foundation, 59 Temple Place - Suite 330, 18 Boston, MA 02111-1307, USA. */ 19 20/* As a special exception, if you include this header file into source 21 files compiled by GCC, this header file does not by itself cause 22 the resulting executable to be covered by the GNU General Public 23 License. This exception does not however invalidate any other 24 reasons why the executable file might be covered by the GNU General 25 Public License. */ 26 27/* Implemented from the specification included in the Intel C++ Compiler 28 User Guide and Reference, version 5.0. */ 29 30#ifndef _XMMINTRIN_H_INCLUDED 31#define _XMMINTRIN_H_INCLUDED 32 33/* We need type definitions from the MMX header file. */ 34#include <mmintrin.h> 35 36/* The data type indended for user use. */ 37typedef int __m128 __attribute__ ((__mode__(__V4SF__))); 38 39/* Internal data types for implementing the instrinsics. */ 40typedef int __v4sf __attribute__ ((__mode__(__V4SF__))); 41typedef int __v4si __attribute__ ((__mode__(__V4SI__))); 42 43/* Create a selector for use with the SHUFPS instruction. */ 44#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ 45 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 46 47/* Constants for use with _mm_prefetch. 
*/ 48enum _mm_hint 49{ 50 _MM_HINT_T0 = 3, 51 _MM_HINT_T1 = 2, 52 _MM_HINT_T2 = 1, 53 _MM_HINT_NTA = 0 54}; 55 56/* Bits in the MXCSR. */ 57#define _MM_EXCEPT_MASK 0x003f 58#define _MM_EXCEPT_INVALID 0x0001 59#define _MM_EXCEPT_DENORM 0x0002 60#define _MM_EXCEPT_DIV_ZERO 0x0004 61#define _MM_EXCEPT_OVERFLOW 0x0008 62#define _MM_EXCEPT_UNDERFLOW 0x0010 63#define _MM_EXCEPT_INEXACT 0x0020 64 65#define _MM_MASK_MASK 0x1f80 66#define _MM_MASK_INVALID 0x0080 67#define _MM_MASK_DENORM 0x0100 68#define _MM_MASK_DIV_ZERO 0x0200 69#define _MM_MASK_OVERFLOW 0x0400 70#define _MM_MASK_UNDERFLOW 0x0800 71#define _MM_MASK_INEXACT 0x1000 72 73#define _MM_ROUND_MASK 0x6000 74#define _MM_ROUND_NEAREST 0x0000 75#define _MM_ROUND_DOWN 0x2000 76#define _MM_ROUND_UP 0x4000 77#define _MM_ROUND_TOWARD_ZERO 0x6000 78 79#define _MM_FLUSH_ZERO_MASK 0x8000 80#define _MM_FLUSH_ZERO_ON 0x8000 81#define _MM_FLUSH_ZERO_OFF 0x0000 82 83/* Perform the respective operation on the lower SPFP (single-precision 84 floating-point) values of A and B; the upper three SPFP values are 85 passed through from A. 
*/ 86 87static __inline __m128 88_mm_add_ss (__m128 __A, __m128 __B) 89{ 90 return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); 91} 92 93static __inline __m128 94_mm_sub_ss (__m128 __A, __m128 __B) 95{ 96 return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); 97} 98 99static __inline __m128 100_mm_mul_ss (__m128 __A, __m128 __B) 101{ 102 return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); 103} 104 105static __inline __m128 106_mm_div_ss (__m128 __A, __m128 __B) 107{ 108 return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); 109} 110 111static __inline __m128 112_mm_sqrt_ss (__m128 __A) 113{ 114 return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); 115} 116 117static __inline __m128 118_mm_rcp_ss (__m128 __A) 119{ 120 return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); 121} 122 123static __inline __m128 124_mm_rsqrt_ss (__m128 __A) 125{ 126 return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); 127} 128 129static __inline __m128 130_mm_min_ss (__m128 __A, __m128 __B) 131{ 132 return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); 133} 134 135static __inline __m128 136_mm_max_ss (__m128 __A, __m128 __B) 137{ 138 return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); 139} 140 141/* Perform the respective operation on the four SPFP values in A and B. 
*/ 142 143static __inline __m128 144_mm_add_ps (__m128 __A, __m128 __B) 145{ 146 return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); 147} 148 149static __inline __m128 150_mm_sub_ps (__m128 __A, __m128 __B) 151{ 152 return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); 153} 154 155static __inline __m128 156_mm_mul_ps (__m128 __A, __m128 __B) 157{ 158 return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); 159} 160 161static __inline __m128 162_mm_div_ps (__m128 __A, __m128 __B) 163{ 164 return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); 165} 166 167static __inline __m128 168_mm_sqrt_ps (__m128 __A) 169{ 170 return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); 171} 172 173static __inline __m128 174_mm_rcp_ps (__m128 __A) 175{ 176 return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); 177} 178 179static __inline __m128 180_mm_rsqrt_ps (__m128 __A) 181{ 182 return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); 183} 184 185static __inline __m128 186_mm_min_ps (__m128 __A, __m128 __B) 187{ 188 return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); 189} 190 191static __inline __m128 192_mm_max_ps (__m128 __A, __m128 __B) 193{ 194 return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); 195} 196 197/* Perform logical bit-wise operations on 128-bit values. */ 198 199static __inline __m128 200_mm_and_ps (__m128 __A, __m128 __B) 201{ 202 return __builtin_ia32_andps (__A, __B); 203} 204 205static __inline __m128 206_mm_andnot_ps (__m128 __A, __m128 __B) 207{ 208 return __builtin_ia32_andnps (__A, __B); 209} 210 211static __inline __m128 212_mm_or_ps (__m128 __A, __m128 __B) 213{ 214 return __builtin_ia32_orps (__A, __B); 215} 216 217static __inline __m128 218_mm_xor_ps (__m128 __A, __m128 __B) 219{ 220 return __builtin_ia32_xorps (__A, __B); 221} 222 223/* Perform a comparison on the lower SPFP values of A and B. If the 224 comparison is true, place a mask of all ones in the result, otherwise a 225 mask of zeros. 
The upper three SPFP values are passed through from A. */ 226 227static __inline __m128 228_mm_cmpeq_ss (__m128 __A, __m128 __B) 229{ 230 return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); 231} 232 233static __inline __m128 234_mm_cmplt_ss (__m128 __A, __m128 __B) 235{ 236 return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); 237} 238 239static __inline __m128 240_mm_cmple_ss (__m128 __A, __m128 __B) 241{ 242 return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); 243} 244 245static __inline __m128 246_mm_cmpgt_ss (__m128 __A, __m128 __B) 247{ 248 return (__m128) __builtin_ia32_cmpgtss ((__v4sf)__A, (__v4sf)__B); 249} 250 251static __inline __m128 252_mm_cmpge_ss (__m128 __A, __m128 __B) 253{ 254 return (__m128) __builtin_ia32_cmpgess ((__v4sf)__A, (__v4sf)__B); 255} 256 257static __inline __m128 258_mm_cmpneq_ss (__m128 __A, __m128 __B) 259{ 260 return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); 261} 262 263static __inline __m128 264_mm_cmpnlt_ss (__m128 __A, __m128 __B) 265{ 266 return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); 267} 268 269static __inline __m128 270_mm_cmpnle_ss (__m128 __A, __m128 __B) 271{ 272 return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); 273} 274 275static __inline __m128 276_mm_cmpngt_ss (__m128 __A, __m128 __B) 277{ 278 return (__m128) __builtin_ia32_cmpngtss ((__v4sf)__A, (__v4sf)__B); 279} 280 281static __inline __m128 282_mm_cmpnge_ss (__m128 __A, __m128 __B) 283{ 284 return (__m128) __builtin_ia32_cmpngess ((__v4sf)__A, (__v4sf)__B); 285} 286 287static __inline __m128 288_mm_cmpord_ss (__m128 __A, __m128 __B) 289{ 290 return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); 291} 292 293static __inline __m128 294_mm_cmpunord_ss (__m128 __A, __m128 __B) 295{ 296 return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); 297} 298 299/* Perform a comparison on the four SPFP values of A and B. 
For each 300 element, if the comparison is true, place a mask of all ones in the 301 result, otherwise a mask of zeros. */ 302 303static __inline __m128 304_mm_cmpeq_ps (__m128 __A, __m128 __B) 305{ 306 return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); 307} 308 309static __inline __m128 310_mm_cmplt_ps (__m128 __A, __m128 __B) 311{ 312 return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); 313} 314 315static __inline __m128 316_mm_cmple_ps (__m128 __A, __m128 __B) 317{ 318 return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); 319} 320 321static __inline __m128 322_mm_cmpgt_ps (__m128 __A, __m128 __B) 323{ 324 return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); 325} 326 327static __inline __m128 328_mm_cmpge_ps (__m128 __A, __m128 __B) 329{ 330 return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); 331} 332 333static __inline __m128 334_mm_cmpneq_ps (__m128 __A, __m128 __B) 335{ 336 return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); 337} 338 339static __inline __m128 340_mm_cmpnlt_ps (__m128 __A, __m128 __B) 341{ 342 return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); 343} 344 345static __inline __m128 346_mm_cmpnle_ps (__m128 __A, __m128 __B) 347{ 348 return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); 349} 350 351static __inline __m128 352_mm_cmpngt_ps (__m128 __A, __m128 __B) 353{ 354 return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); 355} 356 357static __inline __m128 358_mm_cmpnge_ps (__m128 __A, __m128 __B) 359{ 360 return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); 361} 362 363static __inline __m128 364_mm_cmpord_ps (__m128 __A, __m128 __B) 365{ 366 return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); 367} 368 369static __inline __m128 370_mm_cmpunord_ps (__m128 __A, __m128 __B) 371{ 372 return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); 373} 374 375/* Compare the lower SPFP 
values of A and B and return 1 if true 376 and 0 if false. */ 377 378static __inline int 379_mm_comieq_ss (__m128 __A, __m128 __B) 380{ 381 return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); 382} 383 384static __inline int 385_mm_comilt_ss (__m128 __A, __m128 __B) 386{ 387 return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); 388} 389 390static __inline int 391_mm_comile_ss (__m128 __A, __m128 __B) 392{ 393 return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); 394} 395 396static __inline int 397_mm_comigt_ss (__m128 __A, __m128 __B) 398{ 399 return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); 400} 401 402static __inline int 403_mm_comige_ss (__m128 __A, __m128 __B) 404{ 405 return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); 406} 407 408static __inline int 409_mm_comineq_ss (__m128 __A, __m128 __B) 410{ 411 return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); 412} 413 414static __inline int 415_mm_ucomieq_ss (__m128 __A, __m128 __B) 416{ 417 return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); 418} 419 420static __inline int 421_mm_ucomilt_ss (__m128 __A, __m128 __B) 422{ 423 return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); 424} 425 426static __inline int 427_mm_ucomile_ss (__m128 __A, __m128 __B) 428{ 429 return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); 430} 431 432static __inline int 433_mm_ucomigt_ss (__m128 __A, __m128 __B) 434{ 435 return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); 436} 437 438static __inline int 439_mm_ucomige_ss (__m128 __A, __m128 __B) 440{ 441 return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); 442} 443 444static __inline int 445_mm_ucomineq_ss (__m128 __A, __m128 __B) 446{ 447 return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); 448} 449 450/* Convert the lower SPFP value to a 32-bit integer according to the current 451 rounding mode. 
*/ 452static __inline int 453_mm_cvtss_si32 (__m128 __A) 454{ 455 return __builtin_ia32_cvtss2si ((__v4sf) __A); 456} 457 458/* Convert the two lower SPFP values to 32-bit integers according to the 459 current rounding mode. Return the integers in packed form. */ 460static __inline __m64 461_mm_cvtps_pi32 (__m128 __A) 462{ 463 return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); 464} 465 466/* Truncate the lower SPFP value to a 32-bit integer. */ 467static __inline int 468_mm_cvttss_si32 (__m128 __A) 469{ 470 return __builtin_ia32_cvttss2si ((__v4sf) __A); 471} 472 473/* Truncate the two lower SPFP values to 32-bit integers. Return the 474 integers in packed form. */ 475static __inline __m64 476_mm_cvttps_pi32 (__m128 __A) 477{ 478 return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); 479} 480 481/* Convert B to a SPFP value and insert it as element zero in A. */ 482static __inline __m128 483_mm_cvtsi32_ss (__m128 __A, int __B) 484{ 485 return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); 486} 487 488/* Convert the two 32-bit values in B to SPFP form and insert them 489 as the two lower elements in A. */ 490static __inline __m128 491_mm_cvtpi32_ps (__m128 __A, __m64 __B) 492{ 493 return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); 494} 495 496/* Convert the four signed 16-bit values in A to SPFP form. */ 497static __inline __m128 498_mm_cvtpi16_ps (__m64 __A) 499{ 500 __v4hi __sign; 501 __v2si __hisi, __losi; 502 __v4sf __r; 503 504 /* This comparison against zero gives us a mask that can be used to 505 fill in the missing sign bits in the unpack operations below, so 506 that we get signed values after unpacking. */ 507 __sign = (__v4hi) __builtin_ia32_mmx_zero (); 508 __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A); 509 510 /* Convert the four words to doublewords. 
*/ 511 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); 512 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); 513 514 /* Convert the doublewords to floating point two at a time. */ 515 __r = (__v4sf) __builtin_ia32_setzerops (); 516 __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 517 __r = __builtin_ia32_movlhps (__r, __r); 518 __r = __builtin_ia32_cvtpi2ps (__r, __losi); 519 520 return (__m128) __r; 521} 522 523/* Convert the four unsigned 16-bit values in A to SPFP form. */ 524static __inline __m128 525_mm_cvtpu16_ps (__m64 __A) 526{ 527 __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 528 __v2si __hisi, __losi; 529 __v4sf __r; 530 531 /* Convert the four words to doublewords. */ 532 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero); 533 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero); 534 535 /* Convert the doublewords to floating point two at a time. */ 536 __r = (__v4sf) __builtin_ia32_setzerops (); 537 __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 538 __r = __builtin_ia32_movlhps (__r, __r); 539 __r = __builtin_ia32_cvtpi2ps (__r, __losi); 540 541 return (__m128) __r; 542} 543 544/* Convert the low four signed 8-bit values in A to SPFP form. */ 545static __inline __m128 546_mm_cvtpi8_ps (__m64 __A) 547{ 548 __v8qi __sign; 549 550 /* This comparison against zero gives us a mask that can be used to 551 fill in the missing sign bits in the unpack operations below, so 552 that we get signed values after unpacking. */ 553 __sign = (__v8qi) __builtin_ia32_mmx_zero (); 554 __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A); 555 556 /* Convert the four low bytes to words. */ 557 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); 558 559 return _mm_cvtpi16_ps(__A); 560} 561 562/* Convert the low four unsigned 8-bit values in A to SPFP form. 
*/ 563static __inline __m128 564_mm_cvtpu8_ps(__m64 __A) 565{ 566 __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero (); 567 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero); 568 return _mm_cvtpu16_ps(__A); 569} 570 571/* Convert the four signed 32-bit values in A and B to SPFP form. */ 572static __inline __m128 573_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) 574{ 575 __v4sf __zero = (__v4sf) __builtin_ia32_setzerops (); 576 __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); 577 __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B); 578 return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); 579} 580 581/* Convert the four SPFP values in A to four signed 16-bit integers. */ 582static __inline __m64 583_mm_cvtps_pi16(__m128 __A) 584{ 585 __v4sf __hisf = (__v4sf)__A; 586 __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); 587 __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); 588 __v2si __losi = __builtin_ia32_cvtps2pi (__losf); 589 return (__m64) __builtin_ia32_packssdw (__losi, __hisi); 590} 591 592/* Convert the four SPFP values in A to four signed 8-bit integers. */ 593static __inline __m64 594_mm_cvtps_pi8(__m128 __A) 595{ 596 __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); 597 __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 598 return (__m64) __builtin_ia32_packsswb (__tmp, __zero); 599} 600 601/* Selects four specific SPFP values from A and B based on MASK. */ 602#if 0 603static __inline __m128 604_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask) 605{ 606 return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); 607} 608#else 609#define _mm_shuffle_ps(A, B, MASK) \ 610 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK))) 611#endif 612 613 614/* Selects and interleaves the upper two SPFP values from A and B. 
*/ 615static __inline __m128 616_mm_unpackhi_ps (__m128 __A, __m128 __B) 617{ 618 return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); 619} 620 621/* Selects and interleaves the lower two SPFP values from A and B. */ 622static __inline __m128 623_mm_unpacklo_ps (__m128 __A, __m128 __B) 624{ 625 return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); 626} 627 628/* Sets the upper two SPFP values with 64-bits of data loaded from P; 629 the lower two values are passed through from A. */ 630static __inline __m128 631_mm_loadh_pi (__m128 __A, __m64 *__P) 632{ 633 return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P); 634} 635 636/* Stores the upper two SPFP values of A into P. */ 637static __inline void 638_mm_storeh_pi (__m64 *__P, __m128 __A) 639{ 640 __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A); 641} 642 643/* Moves the upper two values of B into the lower two values of A. */ 644static __inline __m128 645_mm_movehl_ps (__m128 __A, __m128 __B) 646{ 647 return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); 648} 649 650/* Moves the lower two values of B into the upper two values of A. */ 651static __inline __m128 652_mm_movelh_ps (__m128 __A, __m128 __B) 653{ 654 return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); 655} 656 657/* Sets the lower two SPFP values with 64-bits of data loaded from P; 658 the upper two values are passed through from A. */ 659static __inline __m128 660_mm_loadl_pi (__m128 __A, __m64 *__P) 661{ 662 return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P); 663} 664 665/* Stores the lower two SPFP values of A into P. */ 666static __inline void 667_mm_storel_pi (__m64 *__P, __m128 __A) 668{ 669 __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A); 670} 671 672/* Creates a 4-bit mask from the most significant bits of the SPFP values. 
*/ 673static __inline int 674_mm_movemask_ps (__m128 __A) 675{ 676 return __builtin_ia32_movmskps ((__v4sf)__A); 677} 678 679/* Return the contents of the control register. */ 680static __inline unsigned int 681_mm_getcsr (void) 682{ 683 return __builtin_ia32_stmxcsr (); 684} 685 686/* Read exception bits from the control register. */ 687static __inline unsigned int 688_MM_GET_EXCEPTION_STATE (void) 689{ 690 return _mm_getcsr() & _MM_EXCEPT_MASK; 691} 692 693static __inline unsigned int 694_MM_GET_EXCEPTION_MASK (void) 695{ 696 return _mm_getcsr() & _MM_MASK_MASK; 697} 698 699static __inline unsigned int 700_MM_GET_ROUNDING_MODE (void) 701{ 702 return _mm_getcsr() & _MM_ROUND_MASK; 703} 704 705static __inline unsigned int 706_MM_GET_FLUSH_ZERO_MODE (void) 707{ 708 return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 709} 710 711/* Set the control register to I. */ 712static __inline void 713_mm_setcsr (unsigned int __I) 714{ 715 __builtin_ia32_ldmxcsr (__I); 716} 717 718/* Set exception bits in the control register. */ 719static __inline void 720_MM_SET_EXCEPTION_STATE(unsigned int __mask) 721{ 722 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); 723} 724 725static __inline void 726_MM_SET_EXCEPTION_MASK (unsigned int __mask) 727{ 728 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); 729} 730 731static __inline void 732_MM_SET_ROUNDING_MODE (unsigned int __mode) 733{ 734 _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); 735} 736 737static __inline void 738_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) 739{ 740 _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); 741} 742 743/* Create a vector with element 0 as *P and the rest zero. */ 744static __inline __m128 745_mm_load_ss (float *__P) 746{ 747 return (__m128) __builtin_ia32_loadss (__P); 748} 749 750/* Create a vector with all four elements equal to *P. 
*/ 751static __inline __m128 752_mm_load1_ps (float *__P) 753{ 754 __v4sf __tmp = __builtin_ia32_loadss (__P); 755 return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 756} 757 758static __inline __m128 759_mm_load_ps1 (float *__P) 760{ 761 return _mm_load1_ps (__P); 762} 763 764/* Load four SPFP values from P. The address must be 16-byte aligned. */ 765static __inline __m128 766_mm_load_ps (float *__P) 767{ 768 return (__m128) __builtin_ia32_loadaps (__P); 769} 770 771/* Load four SPFP values from P. The address need not be 16-byte aligned. */ 772static __inline __m128 773_mm_loadu_ps (float *__P) 774{ 775 return (__m128) __builtin_ia32_loadups (__P); 776} 777 778/* Load four SPFP values in reverse order. The address must be aligned. */ 779static __inline __m128 780_mm_loadr_ps (float *__P) 781{ 782 __v4sf __tmp = __builtin_ia32_loadaps (__P); 783 return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); 784} 785 786/* Create a vector with element 0 as F and the rest zero. */ 787static __inline __m128 788_mm_set_ss (float __F) 789{ 790 return (__m128) __builtin_ia32_loadss (&__F); 791} 792 793/* Create a vector with all four elements equal to F. */ 794static __inline __m128 795_mm_set1_ps (float __F) 796{ 797 __v4sf __tmp = __builtin_ia32_loadss (&__F); 798 return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 799} 800 801static __inline __m128 802_mm_set_ps1 (float __F) 803{ 804 return _mm_set1_ps (__F); 805} 806 807/* Create the vector [Z Y X W]. */ 808static __inline __m128 809_mm_set_ps (float __Z, float __Y, float __X, float __W) 810{ 811 union { 812 float __a[4]; 813 __m128 __v; 814 } __u; 815 816 __u.__a[0] = __W; 817 __u.__a[1] = __X; 818 __u.__a[2] = __Y; 819 __u.__a[3] = __Z; 820 821 return __u.__v; 822} 823 824/* Create the vector [W X Y Z]. 
*/ 825static __inline __m128 826_mm_setr_ps (float __Z, float __Y, float __X, float __W) 827{ 828 return _mm_set_ps (__W, __X, __Y, __Z); 829} 830 831/* Create a vector of zeros. */ 832static __inline __m128 833_mm_setzero_ps (void) 834{ 835 return (__m128) __builtin_ia32_setzerops (); 836} 837 838/* Stores the lower SPFP value. */ 839static __inline void 840_mm_store_ss (float *__P, __m128 __A) 841{ 842 __builtin_ia32_storess (__P, (__v4sf)__A); 843} 844 845/* Store the lower SPFP value across four words. */ 846static __inline void 847_mm_store1_ps (float *__P, __m128 __A) 848{ 849 __v4sf __va = (__v4sf)__A; 850 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); 851 __builtin_ia32_storeaps (__P, __tmp); 852} 853 854static __inline void 855_mm_store_ps1 (float *__P, __m128 __A) 856{ 857 _mm_store1_ps (__P, __A); 858} 859 860/* Store four SPFP values. The address must be 16-byte aligned. */ 861static __inline void 862_mm_store_ps (float *__P, __m128 __A) 863{ 864 __builtin_ia32_storeaps (__P, (__v4sf)__A); 865} 866 867/* Store four SPFP values. The address need not be 16-byte aligned. */ 868static __inline void 869_mm_storeu_ps (float *__P, __m128 __A) 870{ 871 __builtin_ia32_storeups (__P, (__v4sf)__A); 872} 873 874/* Store four SPFP values in reverse order. The addres must be aligned. */ 875static __inline void 876_mm_storer_ps (float *__P, __m128 __A) 877{ 878 __v4sf __va = (__v4sf)__A; 879 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); 880 __builtin_ia32_storeaps (__P, __tmp); 881} 882 883/* Sets the low SPFP value of A from the low value of B. */ 884static __inline __m128 885_mm_move_ss (__m128 __A, __m128 __B) 886{ 887 return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); 888} 889 890/* Extracts one of the four words of A. The selector N must be immediate. 
*/ 891#if 0 892static __inline int 893_mm_extract_pi16 (__m64 __A, int __N) 894{ 895 return __builtin_ia32_pextrw ((__v4hi)__A, __N); 896} 897#else 898#define _mm_extract_pi16(A, N) \ 899 __builtin_ia32_pextrw ((__v4hi)(A), (N)) 900#endif 901 902/* Inserts word D into one of four words of A. The selector N must be 903 immediate. */ 904#if 0 905static __inline __m64 906_mm_insert_pi16 (__m64 __A, int __D, int __N) 907{ 908 return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N); 909} 910#else 911#define _mm_insert_pi16(A, D, N) \ 912 ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N))) 913#endif 914 915/* Compute the element-wise maximum of signed 16-bit values. */ 916static __inline __m64 917_mm_max_pi16 (__m64 __A, __m64 __B) 918{ 919 return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); 920} 921 922/* Compute the element-wise maximum of unsigned 8-bit values. */ 923static __inline __m64 924_mm_max_pu8 (__m64 __A, __m64 __B) 925{ 926 return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); 927} 928 929/* Compute the element-wise minimum of signed 16-bit values. */ 930static __inline __m64 931_mm_min_pi16 (__m64 __A, __m64 __B) 932{ 933 return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); 934} 935 936/* Compute the element-wise minimum of unsigned 8-bit values. */ 937static __inline __m64 938_mm_min_pu8 (__m64 __A, __m64 __B) 939{ 940 return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); 941} 942 943/* Create an 8-bit mask of the signs of 8-bit values. */ 944static __inline int 945_mm_movemask_pi8 (__m64 __A) 946{ 947 return __builtin_ia32_pmovmskb ((__v8qi)__A); 948} 949 950/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 951 in B and produce the high 16 bits of the 32-bit results. */ 952static __inline __m64 953_mm_mulhi_pu16 (__m64 __A, __m64 __B) 954{ 955 return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); 956} 957 958/* Return a combination of the four 16-bit values in A. 
The selector 959 must be an immediate. */ 960#if 0 961static __inline __m64 962_mm_shuffle_pi16 (__m64 __A, int __N) 963{ 964 return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); 965} 966#else 967#define _mm_shuffle_pi16(A, N) \ 968 ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N))) 969#endif 970 971/* Conditionally store byte elements of A into P. The high bit of each 972 byte in the selector N determines whether the corresponding byte from 973 A is stored. */ 974static __inline void 975_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) 976{ 977 __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); 978} 979 980/* Compute the rounded averages of the unsigned 8-bit values in A and B. */ 981static __inline __m64 982_mm_avg_pu8 (__m64 __A, __m64 __B) 983{ 984 return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B); 985} 986 987/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ 988static __inline __m64 989_mm_avg_pu16 (__m64 __A, __m64 __B) 990{ 991 return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B); 992} 993 994/* Compute the sum of the absolute differences of the unsigned 8-bit 995 values in A and B. Return the value in the lower 16-bit word; the 996 upper words are cleared. */ 997static __inline __m64 998_mm_sad_pu8 (__m64 __A, __m64 __B) 999{ 1000 return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B); 1001} 1002 1003/* Loads one cache line from address P to a location "closer" to the 1004 processor. The selector I specifies the type of prefetch operation. */ 1005#if 0 1006static __inline void 1007_mm_prefetch (void *__P, enum _mm_hint __I) 1008{ 1009 __builtin_prefetch (__P, 0, __I); 1010} 1011#else 1012#define _mm_prefetch(P, I) \ 1013 __builtin_prefetch ((P), 0, (I)) 1014#endif 1015 1016/* Stores the data in A to the address P without polluting the caches. */ 1017static __inline void 1018_mm_stream_pi (__m64 *__P, __m64 __A) 1019{ 1020 __builtin_ia32_movntq (__P, __A); 1021} 1022 1023/* Likewise. 
The address must be 16-byte aligned. */ 1024static __inline void 1025_mm_stream_ps (float *__P, __m128 __A) 1026{ 1027 __builtin_ia32_movntps (__P, (__v4sf)__A); 1028} 1029 1030/* Guarantees that every preceeding store is globally visible before 1031 any subsequent store. */ 1032static __inline void 1033_mm_sfence (void) 1034{ 1035 __builtin_ia32_sfence (); 1036} 1037 1038/* The execution of the next instruction is delayed by an implementation 1039 specific amount of time. The instruction does not modify the 1040 architectural state. */ 1041static __inline void 1042_mm_pause (void) 1043{ 1044 __asm__ __volatile__ ("rep; nop" : : ); 1045} 1046 1047/* Transpose the 4x4 matrix composed of row[0-3]. */ 1048#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 1049do { \ 1050 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ 1051 __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44); \ 1052 __v4sf __t1 = __builtin_ia32_shufps (__r0, __r1, 0xEE); \ 1053 __v4sf __t2 = __builtin_ia32_shufps (__r2, __r3, 0x44); \ 1054 __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE); \ 1055 (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88); \ 1056 (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD); \ 1057 (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88); \ 1058 (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD); \ 1059} while (0) 1060 1061#endif /* _XMMINTRIN_H_INCLUDED */ 1062