/* xmmintrin.h revision 107590 */
1/* Copyright (C) 2002 Free Software Foundation, Inc. 2 3 This file is part of GNU CC. 4 5 GNU CC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 GNU CC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with GNU CC; see the file COPYING. If not, write to 17 the Free Software Foundation, 59 Temple Place - Suite 330, 18 Boston, MA 02111-1307, USA. */ 19 20/* As a special exception, if you include this header file into source 21 files compiled by GCC, this header file does not by itself cause 22 the resulting executable to be covered by the GNU General Public 23 License. This exception does not however invalidate any other 24 reasons why the executable file might be covered by the GNU General 25 Public License. */ 26 27/* Implemented from the specification included in the Intel C++ Compiler 28 User Guide and Reference, version 5.0. */ 29 30#ifndef _XMMINTRIN_H_INCLUDED 31#define _XMMINTRIN_H_INCLUDED 32 33/* We need type definitions from the MMX header file. */ 34#include <mmintrin.h> 35 36/* The data type indended for user use. */ 37typedef int __m128 __attribute__ ((__mode__(__V4SF__))); 38 39/* Internal data types for implementing the instrinsics. */ 40typedef int __v4sf __attribute__ ((__mode__(__V4SF__))); 41typedef int __v4si __attribute__ ((__mode__(__V4SI__))); 42 43/* Create a selector for use with the SHUFPS instruction. */ 44#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ 45 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 46 47/* Constants for use with _mm_prefetch. 
*/ 48enum _mm_hint 49{ 50 _MM_HINT_T0 = 3, 51 _MM_HINT_T1 = 2, 52 _MM_HINT_T2 = 1, 53 _MM_HINT_NTA = 0 54}; 55 56/* Bits in the MXCSR. */ 57#define _MM_EXCEPT_MASK 0x003f 58#define _MM_EXCEPT_INVALID 0x0001 59#define _MM_EXCEPT_DENORM 0x0002 60#define _MM_EXCEPT_DIV_ZERO 0x0004 61#define _MM_EXCEPT_OVERFLOW 0x0008 62#define _MM_EXCEPT_UNDERFLOW 0x0010 63#define _MM_EXCEPT_INEXACT 0x0020 64 65#define _MM_MASK_MASK 0x1f80 66#define _MM_MASK_INVALID 0x0080 67#define _MM_MASK_DENORM 0x0100 68#define _MM_MASK_DIV_ZERO 0x0200 69#define _MM_MASK_OVERFLOW 0x0400 70#define _MM_MASK_UNDERFLOW 0x0800 71#define _MM_MASK_INEXACT 0x1000 72 73#define _MM_ROUND_MASK 0x6000 74#define _MM_ROUND_NEAREST 0x0000 75#define _MM_ROUND_DOWN 0x2000 76#define _MM_ROUND_UP 0x4000 77#define _MM_ROUND_TOWARD_ZERO 0x6000 78 79#define _MM_FLUSH_ZERO_MASK 0x8000 80#define _MM_FLUSH_ZERO_ON 0x8000 81#define _MM_FLUSH_ZERO_OFF 0x0000 82 83/* Perform the respective operation on the lower SPFP (single-precision 84 floating-point) values of A and B; the upper three SPFP values are 85 passed through from A. 
*/ 86 87static __inline __m128 88_mm_add_ss (__m128 __A, __m128 __B) 89{ 90 return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); 91} 92 93static __inline __m128 94_mm_sub_ss (__m128 __A, __m128 __B) 95{ 96 return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); 97} 98 99static __inline __m128 100_mm_mul_ss (__m128 __A, __m128 __B) 101{ 102 return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); 103} 104 105static __inline __m128 106_mm_div_ss (__m128 __A, __m128 __B) 107{ 108 return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); 109} 110 111static __inline __m128 112_mm_sqrt_ss (__m128 __A) 113{ 114 return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); 115} 116 117static __inline __m128 118_mm_rcp_ss (__m128 __A) 119{ 120 return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); 121} 122 123static __inline __m128 124_mm_rsqrt_ss (__m128 __A) 125{ 126 return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); 127} 128 129static __inline __m128 130_mm_min_ss (__m128 __A, __m128 __B) 131{ 132 return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); 133} 134 135static __inline __m128 136_mm_max_ss (__m128 __A, __m128 __B) 137{ 138 return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); 139} 140 141/* Perform the respective operation on the four SPFP values in A and B. 
*/ 142 143static __inline __m128 144_mm_add_ps (__m128 __A, __m128 __B) 145{ 146 return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); 147} 148 149static __inline __m128 150_mm_sub_ps (__m128 __A, __m128 __B) 151{ 152 return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); 153} 154 155static __inline __m128 156_mm_mul_ps (__m128 __A, __m128 __B) 157{ 158 return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); 159} 160 161static __inline __m128 162_mm_div_ps (__m128 __A, __m128 __B) 163{ 164 return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); 165} 166 167static __inline __m128 168_mm_sqrt_ps (__m128 __A) 169{ 170 return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); 171} 172 173static __inline __m128 174_mm_rcp_ps (__m128 __A) 175{ 176 return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); 177} 178 179static __inline __m128 180_mm_rsqrt_ps (__m128 __A) 181{ 182 return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); 183} 184 185static __inline __m128 186_mm_min_ps (__m128 __A, __m128 __B) 187{ 188 return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); 189} 190 191static __inline __m128 192_mm_max_ps (__m128 __A, __m128 __B) 193{ 194 return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); 195} 196 197/* Perform logical bit-wise operations on 128-bit values. */ 198 199static __inline __m128 200_mm_and_ps (__m128 __A, __m128 __B) 201{ 202 return __builtin_ia32_andps (__A, __B); 203} 204 205static __inline __m128 206_mm_andnot_ps (__m128 __A, __m128 __B) 207{ 208 return __builtin_ia32_andnps (__A, __B); 209} 210 211static __inline __m128 212_mm_or_ps (__m128 __A, __m128 __B) 213{ 214 return __builtin_ia32_orps (__A, __B); 215} 216 217static __inline __m128 218_mm_xor_ps (__m128 __A, __m128 __B) 219{ 220 return __builtin_ia32_xorps (__A, __B); 221} 222 223/* Perform a comparison on the lower SPFP values of A and B. If the 224 comparison is true, place a mask of all ones in the result, otherwise a 225 mask of zeros. 
The upper three SPFP values are passed through from A. */ 226 227static __inline __m128 228_mm_cmpeq_ss (__m128 __A, __m128 __B) 229{ 230 return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); 231} 232 233static __inline __m128 234_mm_cmplt_ss (__m128 __A, __m128 __B) 235{ 236 return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); 237} 238 239static __inline __m128 240_mm_cmple_ss (__m128 __A, __m128 __B) 241{ 242 return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); 243} 244 245static __inline __m128 246_mm_cmpgt_ss (__m128 __A, __m128 __B) 247{ 248 return (__m128) __builtin_ia32_movss ((__v4sf) __A, 249 (__v4sf) 250 __builtin_ia32_cmpltss ((__v4sf) __B, 251 (__v4sf) 252 __A)); 253} 254 255static __inline __m128 256_mm_cmpge_ss (__m128 __A, __m128 __B) 257{ 258 return (__m128) __builtin_ia32_movss ((__v4sf) __A, 259 (__v4sf) 260 __builtin_ia32_cmpless ((__v4sf) __B, 261 (__v4sf) 262 __A)); 263} 264 265static __inline __m128 266_mm_cmpneq_ss (__m128 __A, __m128 __B) 267{ 268 return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); 269} 270 271static __inline __m128 272_mm_cmpnlt_ss (__m128 __A, __m128 __B) 273{ 274 return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); 275} 276 277static __inline __m128 278_mm_cmpnle_ss (__m128 __A, __m128 __B) 279{ 280 return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); 281} 282 283static __inline __m128 284_mm_cmpngt_ss (__m128 __A, __m128 __B) 285{ 286 return (__m128) __builtin_ia32_movss ((__v4sf) __A, 287 (__v4sf) 288 __builtin_ia32_cmpnltss ((__v4sf) __B, 289 (__v4sf) 290 __A)); 291} 292 293static __inline __m128 294_mm_cmpnge_ss (__m128 __A, __m128 __B) 295{ 296 return (__m128) __builtin_ia32_movss ((__v4sf) __A, 297 (__v4sf) 298 __builtin_ia32_cmpnless ((__v4sf) __B, 299 (__v4sf) 300 __A)); 301} 302 303static __inline __m128 304_mm_cmpord_ss (__m128 __A, __m128 __B) 305{ 306 return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); 307} 
308 309static __inline __m128 310_mm_cmpunord_ss (__m128 __A, __m128 __B) 311{ 312 return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); 313} 314 315/* Perform a comparison on the four SPFP values of A and B. For each 316 element, if the comparison is true, place a mask of all ones in the 317 result, otherwise a mask of zeros. */ 318 319static __inline __m128 320_mm_cmpeq_ps (__m128 __A, __m128 __B) 321{ 322 return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); 323} 324 325static __inline __m128 326_mm_cmplt_ps (__m128 __A, __m128 __B) 327{ 328 return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); 329} 330 331static __inline __m128 332_mm_cmple_ps (__m128 __A, __m128 __B) 333{ 334 return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); 335} 336 337static __inline __m128 338_mm_cmpgt_ps (__m128 __A, __m128 __B) 339{ 340 return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); 341} 342 343static __inline __m128 344_mm_cmpge_ps (__m128 __A, __m128 __B) 345{ 346 return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); 347} 348 349static __inline __m128 350_mm_cmpneq_ps (__m128 __A, __m128 __B) 351{ 352 return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); 353} 354 355static __inline __m128 356_mm_cmpnlt_ps (__m128 __A, __m128 __B) 357{ 358 return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); 359} 360 361static __inline __m128 362_mm_cmpnle_ps (__m128 __A, __m128 __B) 363{ 364 return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); 365} 366 367static __inline __m128 368_mm_cmpngt_ps (__m128 __A, __m128 __B) 369{ 370 return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); 371} 372 373static __inline __m128 374_mm_cmpnge_ps (__m128 __A, __m128 __B) 375{ 376 return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); 377} 378 379static __inline __m128 380_mm_cmpord_ps (__m128 __A, __m128 __B) 381{ 382 return (__m128) __builtin_ia32_cmpordps 
((__v4sf)__A, (__v4sf)__B); 383} 384 385static __inline __m128 386_mm_cmpunord_ps (__m128 __A, __m128 __B) 387{ 388 return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); 389} 390 391/* Compare the lower SPFP values of A and B and return 1 if true 392 and 0 if false. */ 393 394static __inline int 395_mm_comieq_ss (__m128 __A, __m128 __B) 396{ 397 return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); 398} 399 400static __inline int 401_mm_comilt_ss (__m128 __A, __m128 __B) 402{ 403 return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); 404} 405 406static __inline int 407_mm_comile_ss (__m128 __A, __m128 __B) 408{ 409 return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); 410} 411 412static __inline int 413_mm_comigt_ss (__m128 __A, __m128 __B) 414{ 415 return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); 416} 417 418static __inline int 419_mm_comige_ss (__m128 __A, __m128 __B) 420{ 421 return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); 422} 423 424static __inline int 425_mm_comineq_ss (__m128 __A, __m128 __B) 426{ 427 return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); 428} 429 430static __inline int 431_mm_ucomieq_ss (__m128 __A, __m128 __B) 432{ 433 return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); 434} 435 436static __inline int 437_mm_ucomilt_ss (__m128 __A, __m128 __B) 438{ 439 return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); 440} 441 442static __inline int 443_mm_ucomile_ss (__m128 __A, __m128 __B) 444{ 445 return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); 446} 447 448static __inline int 449_mm_ucomigt_ss (__m128 __A, __m128 __B) 450{ 451 return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); 452} 453 454static __inline int 455_mm_ucomige_ss (__m128 __A, __m128 __B) 456{ 457 return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); 458} 459 460static __inline int 461_mm_ucomineq_ss (__m128 __A, __m128 __B) 462{ 463 return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); 464} 465 466/* Convert 
the lower SPFP value to a 32-bit integer according to the current 467 rounding mode. */ 468static __inline int 469_mm_cvtss_si32 (__m128 __A) 470{ 471 return __builtin_ia32_cvtss2si ((__v4sf) __A); 472} 473 474/* Convert the two lower SPFP values to 32-bit integers according to the 475 current rounding mode. Return the integers in packed form. */ 476static __inline __m64 477_mm_cvtps_pi32 (__m128 __A) 478{ 479 return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); 480} 481 482/* Truncate the lower SPFP value to a 32-bit integer. */ 483static __inline int 484_mm_cvttss_si32 (__m128 __A) 485{ 486 return __builtin_ia32_cvttss2si ((__v4sf) __A); 487} 488 489/* Truncate the two lower SPFP values to 32-bit integers. Return the 490 integers in packed form. */ 491static __inline __m64 492_mm_cvttps_pi32 (__m128 __A) 493{ 494 return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); 495} 496 497/* Convert B to a SPFP value and insert it as element zero in A. */ 498static __inline __m128 499_mm_cvtsi32_ss (__m128 __A, int __B) 500{ 501 return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); 502} 503 504/* Convert the two 32-bit values in B to SPFP form and insert them 505 as the two lower elements in A. */ 506static __inline __m128 507_mm_cvtpi32_ps (__m128 __A, __m64 __B) 508{ 509 return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); 510} 511 512/* Convert the four signed 16-bit values in A to SPFP form. */ 513static __inline __m128 514_mm_cvtpi16_ps (__m64 __A) 515{ 516 __v4hi __sign; 517 __v2si __hisi, __losi; 518 __v4sf __r; 519 520 /* This comparison against zero gives us a mask that can be used to 521 fill in the missing sign bits in the unpack operations below, so 522 that we get signed values after unpacking. */ 523 __sign = (__v4hi) __builtin_ia32_mmx_zero (); 524 __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A); 525 526 /* Convert the four words to doublewords. 
*/ 527 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); 528 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); 529 530 /* Convert the doublewords to floating point two at a time. */ 531 __r = (__v4sf) __builtin_ia32_setzerops (); 532 __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 533 __r = __builtin_ia32_movlhps (__r, __r); 534 __r = __builtin_ia32_cvtpi2ps (__r, __losi); 535 536 return (__m128) __r; 537} 538 539/* Convert the four unsigned 16-bit values in A to SPFP form. */ 540static __inline __m128 541_mm_cvtpu16_ps (__m64 __A) 542{ 543 __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 544 __v2si __hisi, __losi; 545 __v4sf __r; 546 547 /* Convert the four words to doublewords. */ 548 __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero); 549 __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero); 550 551 /* Convert the doublewords to floating point two at a time. */ 552 __r = (__v4sf) __builtin_ia32_setzerops (); 553 __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 554 __r = __builtin_ia32_movlhps (__r, __r); 555 __r = __builtin_ia32_cvtpi2ps (__r, __losi); 556 557 return (__m128) __r; 558} 559 560/* Convert the low four signed 8-bit values in A to SPFP form. */ 561static __inline __m128 562_mm_cvtpi8_ps (__m64 __A) 563{ 564 __v8qi __sign; 565 566 /* This comparison against zero gives us a mask that can be used to 567 fill in the missing sign bits in the unpack operations below, so 568 that we get signed values after unpacking. */ 569 __sign = (__v8qi) __builtin_ia32_mmx_zero (); 570 __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A); 571 572 /* Convert the four low bytes to words. */ 573 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); 574 575 return _mm_cvtpi16_ps(__A); 576} 577 578/* Convert the low four unsigned 8-bit values in A to SPFP form. 
*/ 579static __inline __m128 580_mm_cvtpu8_ps(__m64 __A) 581{ 582 __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero (); 583 __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero); 584 return _mm_cvtpu16_ps(__A); 585} 586 587/* Convert the four signed 32-bit values in A and B to SPFP form. */ 588static __inline __m128 589_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) 590{ 591 __v4sf __zero = (__v4sf) __builtin_ia32_setzerops (); 592 __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); 593 __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B); 594 return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); 595} 596 597/* Convert the four SPFP values in A to four signed 16-bit integers. */ 598static __inline __m64 599_mm_cvtps_pi16(__m128 __A) 600{ 601 __v4sf __hisf = (__v4sf)__A; 602 __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); 603 __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); 604 __v2si __losi = __builtin_ia32_cvtps2pi (__losf); 605 return (__m64) __builtin_ia32_packssdw (__losi, __hisi); 606} 607 608/* Convert the four SPFP values in A to four signed 8-bit integers. */ 609static __inline __m64 610_mm_cvtps_pi8(__m128 __A) 611{ 612 __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); 613 __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 614 return (__m64) __builtin_ia32_packsswb (__tmp, __zero); 615} 616 617/* Selects four specific SPFP values from A and B based on MASK. */ 618#if 0 619static __inline __m128 620_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask) 621{ 622 return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); 623} 624#else 625#define _mm_shuffle_ps(A, B, MASK) \ 626 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK))) 627#endif 628 629 630/* Selects and interleaves the upper two SPFP values from A and B. 
*/ 631static __inline __m128 632_mm_unpackhi_ps (__m128 __A, __m128 __B) 633{ 634 return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); 635} 636 637/* Selects and interleaves the lower two SPFP values from A and B. */ 638static __inline __m128 639_mm_unpacklo_ps (__m128 __A, __m128 __B) 640{ 641 return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); 642} 643 644/* Sets the upper two SPFP values with 64-bits of data loaded from P; 645 the lower two values are passed through from A. */ 646static __inline __m128 647_mm_loadh_pi (__m128 __A, __m64 *__P) 648{ 649 return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P); 650} 651 652/* Stores the upper two SPFP values of A into P. */ 653static __inline void 654_mm_storeh_pi (__m64 *__P, __m128 __A) 655{ 656 __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A); 657} 658 659/* Moves the upper two values of B into the lower two values of A. */ 660static __inline __m128 661_mm_movehl_ps (__m128 __A, __m128 __B) 662{ 663 return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); 664} 665 666/* Moves the lower two values of B into the upper two values of A. */ 667static __inline __m128 668_mm_movelh_ps (__m128 __A, __m128 __B) 669{ 670 return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); 671} 672 673/* Sets the lower two SPFP values with 64-bits of data loaded from P; 674 the upper two values are passed through from A. */ 675static __inline __m128 676_mm_loadl_pi (__m128 __A, __m64 *__P) 677{ 678 return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P); 679} 680 681/* Stores the lower two SPFP values of A into P. */ 682static __inline void 683_mm_storel_pi (__m64 *__P, __m128 __A) 684{ 685 __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A); 686} 687 688/* Creates a 4-bit mask from the most significant bits of the SPFP values. 
*/ 689static __inline int 690_mm_movemask_ps (__m128 __A) 691{ 692 return __builtin_ia32_movmskps ((__v4sf)__A); 693} 694 695/* Return the contents of the control register. */ 696static __inline unsigned int 697_mm_getcsr (void) 698{ 699 return __builtin_ia32_stmxcsr (); 700} 701 702/* Read exception bits from the control register. */ 703static __inline unsigned int 704_MM_GET_EXCEPTION_STATE (void) 705{ 706 return _mm_getcsr() & _MM_EXCEPT_MASK; 707} 708 709static __inline unsigned int 710_MM_GET_EXCEPTION_MASK (void) 711{ 712 return _mm_getcsr() & _MM_MASK_MASK; 713} 714 715static __inline unsigned int 716_MM_GET_ROUNDING_MODE (void) 717{ 718 return _mm_getcsr() & _MM_ROUND_MASK; 719} 720 721static __inline unsigned int 722_MM_GET_FLUSH_ZERO_MODE (void) 723{ 724 return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 725} 726 727/* Set the control register to I. */ 728static __inline void 729_mm_setcsr (unsigned int __I) 730{ 731 __builtin_ia32_ldmxcsr (__I); 732} 733 734/* Set exception bits in the control register. */ 735static __inline void 736_MM_SET_EXCEPTION_STATE(unsigned int __mask) 737{ 738 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); 739} 740 741static __inline void 742_MM_SET_EXCEPTION_MASK (unsigned int __mask) 743{ 744 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); 745} 746 747static __inline void 748_MM_SET_ROUNDING_MODE (unsigned int __mode) 749{ 750 _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); 751} 752 753static __inline void 754_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) 755{ 756 _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); 757} 758 759/* Create a vector with element 0 as *P and the rest zero. */ 760static __inline __m128 761_mm_load_ss (float *__P) 762{ 763 return (__m128) __builtin_ia32_loadss (__P); 764} 765 766/* Create a vector with all four elements equal to *P. 
*/ 767static __inline __m128 768_mm_load1_ps (float *__P) 769{ 770 __v4sf __tmp = __builtin_ia32_loadss (__P); 771 return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 772} 773 774static __inline __m128 775_mm_load_ps1 (float *__P) 776{ 777 return _mm_load1_ps (__P); 778} 779 780/* Load four SPFP values from P. The address must be 16-byte aligned. */ 781static __inline __m128 782_mm_load_ps (float *__P) 783{ 784 return (__m128) __builtin_ia32_loadaps (__P); 785} 786 787/* Load four SPFP values from P. The address need not be 16-byte aligned. */ 788static __inline __m128 789_mm_loadu_ps (float *__P) 790{ 791 return (__m128) __builtin_ia32_loadups (__P); 792} 793 794/* Load four SPFP values in reverse order. The address must be aligned. */ 795static __inline __m128 796_mm_loadr_ps (float *__P) 797{ 798 __v4sf __tmp = __builtin_ia32_loadaps (__P); 799 return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); 800} 801 802/* Create a vector with element 0 as F and the rest zero. */ 803static __inline __m128 804_mm_set_ss (float __F) 805{ 806 return (__m128) __builtin_ia32_loadss (&__F); 807} 808 809/* Create a vector with all four elements equal to F. */ 810static __inline __m128 811_mm_set1_ps (float __F) 812{ 813 __v4sf __tmp = __builtin_ia32_loadss (&__F); 814 return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 815} 816 817static __inline __m128 818_mm_set_ps1 (float __F) 819{ 820 return _mm_set1_ps (__F); 821} 822 823/* Create the vector [Z Y X W]. */ 824static __inline __m128 825_mm_set_ps (float __Z, float __Y, float __X, float __W) 826{ 827 union { 828 float __a[4]; 829 __m128 __v; 830 } __u; 831 832 __u.__a[0] = __W; 833 __u.__a[1] = __X; 834 __u.__a[2] = __Y; 835 __u.__a[3] = __Z; 836 837 return __u.__v; 838} 839 840/* Create the vector [W X Y Z]. 
*/ 841static __inline __m128 842_mm_setr_ps (float __Z, float __Y, float __X, float __W) 843{ 844 return _mm_set_ps (__W, __X, __Y, __Z); 845} 846 847/* Create a vector of zeros. */ 848static __inline __m128 849_mm_setzero_ps (void) 850{ 851 return (__m128) __builtin_ia32_setzerops (); 852} 853 854/* Stores the lower SPFP value. */ 855static __inline void 856_mm_store_ss (float *__P, __m128 __A) 857{ 858 __builtin_ia32_storess (__P, (__v4sf)__A); 859} 860 861/* Store the lower SPFP value across four words. */ 862static __inline void 863_mm_store1_ps (float *__P, __m128 __A) 864{ 865 __v4sf __va = (__v4sf)__A; 866 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); 867 __builtin_ia32_storeaps (__P, __tmp); 868} 869 870static __inline void 871_mm_store_ps1 (float *__P, __m128 __A) 872{ 873 _mm_store1_ps (__P, __A); 874} 875 876/* Store four SPFP values. The address must be 16-byte aligned. */ 877static __inline void 878_mm_store_ps (float *__P, __m128 __A) 879{ 880 __builtin_ia32_storeaps (__P, (__v4sf)__A); 881} 882 883/* Store four SPFP values. The address need not be 16-byte aligned. */ 884static __inline void 885_mm_storeu_ps (float *__P, __m128 __A) 886{ 887 __builtin_ia32_storeups (__P, (__v4sf)__A); 888} 889 890/* Store four SPFP values in reverse order. The addres must be aligned. */ 891static __inline void 892_mm_storer_ps (float *__P, __m128 __A) 893{ 894 __v4sf __va = (__v4sf)__A; 895 __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); 896 __builtin_ia32_storeaps (__P, __tmp); 897} 898 899/* Sets the low SPFP value of A from the low value of B. */ 900static __inline __m128 901_mm_move_ss (__m128 __A, __m128 __B) 902{ 903 return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); 904} 905 906/* Extracts one of the four words of A. The selector N must be immediate. 
*/ 907#if 0 908static __inline int 909_mm_extract_pi16 (__m64 __A, int __N) 910{ 911 return __builtin_ia32_pextrw ((__v4hi)__A, __N); 912} 913#else 914#define _mm_extract_pi16(A, N) \ 915 __builtin_ia32_pextrw ((__v4hi)(A), (N)) 916#endif 917 918/* Inserts word D into one of four words of A. The selector N must be 919 immediate. */ 920#if 0 921static __inline __m64 922_mm_insert_pi16 (__m64 __A, int __D, int __N) 923{ 924 return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N); 925} 926#else 927#define _mm_insert_pi16(A, D, N) \ 928 ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N))) 929#endif 930 931/* Compute the element-wise maximum of signed 16-bit values. */ 932static __inline __m64 933_mm_max_pi16 (__m64 __A, __m64 __B) 934{ 935 return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); 936} 937 938/* Compute the element-wise maximum of unsigned 8-bit values. */ 939static __inline __m64 940_mm_max_pu8 (__m64 __A, __m64 __B) 941{ 942 return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); 943} 944 945/* Compute the element-wise minimum of signed 16-bit values. */ 946static __inline __m64 947_mm_min_pi16 (__m64 __A, __m64 __B) 948{ 949 return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); 950} 951 952/* Compute the element-wise minimum of unsigned 8-bit values. */ 953static __inline __m64 954_mm_min_pu8 (__m64 __A, __m64 __B) 955{ 956 return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); 957} 958 959/* Create an 8-bit mask of the signs of 8-bit values. */ 960static __inline int 961_mm_movemask_pi8 (__m64 __A) 962{ 963 return __builtin_ia32_pmovmskb ((__v8qi)__A); 964} 965 966/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 967 in B and produce the high 16 bits of the 32-bit results. */ 968static __inline __m64 969_mm_mulhi_pu16 (__m64 __A, __m64 __B) 970{ 971 return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); 972} 973 974/* Return a combination of the four 16-bit values in A. 
The selector 975 must be an immediate. */ 976#if 0 977static __inline __m64 978_mm_shuffle_pi16 (__m64 __A, int __N) 979{ 980 return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); 981} 982#else 983#define _mm_shuffle_pi16(A, N) \ 984 ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N))) 985#endif 986 987/* Conditionally store byte elements of A into P. The high bit of each 988 byte in the selector N determines whether the corresponding byte from 989 A is stored. */ 990static __inline void 991_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) 992{ 993 __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); 994} 995 996/* Compute the rounded averages of the unsigned 8-bit values in A and B. */ 997static __inline __m64 998_mm_avg_pu8 (__m64 __A, __m64 __B) 999{ 1000 return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B); 1001} 1002 1003/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ 1004static __inline __m64 1005_mm_avg_pu16 (__m64 __A, __m64 __B) 1006{ 1007 return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B); 1008} 1009 1010/* Compute the sum of the absolute differences of the unsigned 8-bit 1011 values in A and B. Return the value in the lower 16-bit word; the 1012 upper words are cleared. */ 1013static __inline __m64 1014_mm_sad_pu8 (__m64 __A, __m64 __B) 1015{ 1016 return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B); 1017} 1018 1019/* Loads one cache line from address P to a location "closer" to the 1020 processor. The selector I specifies the type of prefetch operation. */ 1021#if 0 1022static __inline void 1023_mm_prefetch (void *__P, enum _mm_hint __I) 1024{ 1025 __builtin_prefetch (__P, 0, __I); 1026} 1027#else 1028#define _mm_prefetch(P, I) \ 1029 __builtin_prefetch ((P), 0, (I)) 1030#endif 1031 1032/* Stores the data in A to the address P without polluting the caches. 
*/ 1033static __inline void 1034_mm_stream_pi (__m64 *__P, __m64 __A) 1035{ 1036 __builtin_ia32_movntq (__P, (long long)__A); 1037} 1038 1039/* Likewise. The address must be 16-byte aligned. */ 1040static __inline void 1041_mm_stream_ps (float *__P, __m128 __A) 1042{ 1043 __builtin_ia32_movntps (__P, (__v4sf)__A); 1044} 1045 1046/* Guarantees that every preceeding store is globally visible before 1047 any subsequent store. */ 1048static __inline void 1049_mm_sfence (void) 1050{ 1051 __builtin_ia32_sfence (); 1052} 1053 1054/* The execution of the next instruction is delayed by an implementation 1055 specific amount of time. The instruction does not modify the 1056 architectural state. */ 1057static __inline void 1058_mm_pause (void) 1059{ 1060 __asm__ __volatile__ ("rep; nop" : : ); 1061} 1062 1063/* Transpose the 4x4 matrix composed of row[0-3]. */ 1064#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 1065do { \ 1066 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ 1067 __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44); \ 1068 __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE); \ 1069 __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44); \ 1070 __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE); \ 1071 (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88); \ 1072 (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD); \ 1073 (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88); \ 1074 (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD); \ 1075} while (0) 1076 1077#endif /* _XMMINTRIN_H_INCLUDED */ 1078