/* mmintrin.h revision 107590 */
1/* Copyright (C) 2002 Free Software Foundation, Inc. 2 3 This file is part of GNU CC. 4 5 GNU CC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 GNU CC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with GNU CC; see the file COPYING. If not, write to 17 the Free Software Foundation, 59 Temple Place - Suite 330, 18 Boston, MA 02111-1307, USA. */ 19 20/* As a special exception, if you include this header file into source 21 files compiled by GCC, this header file does not by itself cause 22 the resulting executable to be covered by the GNU General Public 23 License. This exception does not however invalidate any other 24 reasons why the executable file might be covered by the GNU General 25 Public License. */ 26 27/* Implemented from the specification included in the Intel C++ Compiler 28 User Guide and Reference, version 5.0. */ 29 30#ifndef _MMINTRIN_H_INCLUDED 31#define _MMINTRIN_H_INCLUDED 32 33/* The data type intended for user use. */ 34typedef int __m64 __attribute__ ((__mode__ (__V2SI__))); 35 36/* Internal data types for implementing the intrinsics. */ 37typedef int __v2si __attribute__ ((__mode__ (__V2SI__))); 38typedef int __v4hi __attribute__ ((__mode__ (__V4HI__))); 39typedef int __v8qi __attribute__ ((__mode__ (__V8QI__))); 40 41/* Empty the multimedia state. */ 42static __inline void 43_mm_empty (void) 44{ 45 __builtin_ia32_emms (); 46} 47 48/* Convert I to a __m64 object. The integer is zero-extended to 64-bits. 
*/ 49static __inline __m64 50_mm_cvtsi32_si64 (int __i) 51{ 52 long long __tmp = (unsigned int)__i; 53 return (__m64) __tmp; 54} 55 56/* Convert the lower 32 bits of the __m64 object into an integer. */ 57static __inline int 58_mm_cvtsi64_si32 (__m64 __i) 59{ 60 long long __tmp = (long long)__i; 61 return __tmp; 62} 63 64/* Pack the four 16-bit values from M1 into the lower four 8-bit values of 65 the result, and the four 16-bit values from M2 into the upper four 8-bit 66 values of the result, all with signed saturation. */ 67static __inline __m64 68_mm_packs_pi16 (__m64 __m1, __m64 __m2) 69{ 70 return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2); 71} 72 73/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of 74 the result, and the two 32-bit values from M2 into the upper two 16-bit 75 values of the result, all with signed saturation. */ 76static __inline __m64 77_mm_packs_pi32 (__m64 __m1, __m64 __m2) 78{ 79 return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2); 80} 81 82/* Pack the four 16-bit values from M1 into the lower four 8-bit values of 83 the result, and the four 16-bit values from M2 into the upper four 8-bit 84 values of the result, all with unsigned saturation. */ 85static __inline __m64 86_mm_packs_pu16 (__m64 __m1, __m64 __m2) 87{ 88 return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2); 89} 90 91/* Interleave the four 8-bit values from the high half of M1 with the four 92 8-bit values from the high half of M2. */ 93static __inline __m64 94_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) 95{ 96 return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2); 97} 98 99/* Interleave the two 16-bit values from the high half of M1 with the two 100 16-bit values from the high half of M2. 
*/ 101static __inline __m64 102_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) 103{ 104 return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2); 105} 106 107/* Interleave the 32-bit value from the high half of M1 with the 32-bit 108 value from the high half of M2. */ 109static __inline __m64 110_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) 111{ 112 return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2); 113} 114 115/* Interleave the four 8-bit values from the low half of M1 with the four 116 8-bit values from the low half of M2. */ 117static __inline __m64 118_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) 119{ 120 return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2); 121} 122 123/* Interleave the two 16-bit values from the low half of M1 with the two 124 16-bit values from the low half of M2. */ 125static __inline __m64 126_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) 127{ 128 return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2); 129} 130 131/* Interleave the 32-bit value from the low half of M1 with the 32-bit 132 value from the low half of M2. */ 133static __inline __m64 134_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) 135{ 136 return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2); 137} 138 139/* Add the 8-bit values in M1 to the 8-bit values in M2. */ 140static __inline __m64 141_mm_add_pi8 (__m64 __m1, __m64 __m2) 142{ 143 return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2); 144} 145 146/* Add the 16-bit values in M1 to the 16-bit values in M2. */ 147static __inline __m64 148_mm_add_pi16 (__m64 __m1, __m64 __m2) 149{ 150 return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2); 151} 152 153/* Add the 32-bit values in M1 to the 32-bit values in M2. 
*/ 154static __inline __m64 155_mm_add_pi32 (__m64 __m1, __m64 __m2) 156{ 157 return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2); 158} 159 160/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed 161 saturated arithmetic. */ 162static __inline __m64 163_mm_adds_pi8 (__m64 __m1, __m64 __m2) 164{ 165 return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2); 166} 167 168/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed 169 saturated arithmetic. */ 170static __inline __m64 171_mm_adds_pi16 (__m64 __m1, __m64 __m2) 172{ 173 return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2); 174} 175 176/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned 177 saturated arithmetic. */ 178static __inline __m64 179_mm_adds_pu8 (__m64 __m1, __m64 __m2) 180{ 181 return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2); 182} 183 184/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned 185 saturated arithmetic. */ 186static __inline __m64 187_mm_adds_pu16 (__m64 __m1, __m64 __m2) 188{ 189 return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2); 190} 191 192/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */ 193static __inline __m64 194_mm_sub_pi8 (__m64 __m1, __m64 __m2) 195{ 196 return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2); 197} 198 199/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */ 200static __inline __m64 201_mm_sub_pi16 (__m64 __m1, __m64 __m2) 202{ 203 return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2); 204} 205 206/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */ 207static __inline __m64 208_mm_sub_pi32 (__m64 __m1, __m64 __m2) 209{ 210 return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2); 211} 212 213/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed 214 saturating arithmetic. 
*/ 215static __inline __m64 216_mm_subs_pi8 (__m64 __m1, __m64 __m2) 217{ 218 return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2); 219} 220 221/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using 222 signed saturating arithmetic. */ 223static __inline __m64 224_mm_subs_pi16 (__m64 __m1, __m64 __m2) 225{ 226 return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2); 227} 228 229/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using 230 unsigned saturating arithmetic. */ 231static __inline __m64 232_mm_subs_pu8 (__m64 __m1, __m64 __m2) 233{ 234 return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2); 235} 236 237/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using 238 unsigned saturating arithmetic. */ 239static __inline __m64 240_mm_subs_pu16 (__m64 __m1, __m64 __m2) 241{ 242 return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2); 243} 244 245/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing 246 four 32-bit intermediate results, which are then summed by pairs to 247 produce two 32-bit results. */ 248static __inline __m64 249_mm_madd_pi16 (__m64 __m1, __m64 __m2) 250{ 251 return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2); 252} 253 254/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in 255 M2 and produce the high 16 bits of the 32-bit results. */ 256static __inline __m64 257_mm_mulhi_pi16 (__m64 __m1, __m64 __m2) 258{ 259 return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2); 260} 261 262/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce 263 the low 16 bits of the results. */ 264static __inline __m64 265_mm_mullo_pi16 (__m64 __m1, __m64 __m2) 266{ 267 return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2); 268} 269 270/* Shift four 16-bit values in M left by COUNT. 
*/ 271static __inline __m64 272_mm_sll_pi16 (__m64 __m, __m64 __count) 273{ 274 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count); 275} 276 277static __inline __m64 278_mm_slli_pi16 (__m64 __m, int __count) 279{ 280 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count); 281} 282 283/* Shift two 32-bit values in M left by COUNT. */ 284static __inline __m64 285_mm_sll_pi32 (__m64 __m, __m64 __count) 286{ 287 return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count); 288} 289 290static __inline __m64 291_mm_slli_pi32 (__m64 __m, int __count) 292{ 293 return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count); 294} 295 296/* Shift the 64-bit value in M left by COUNT. */ 297static __inline __m64 298_mm_sll_si64 (__m64 __m, __m64 __count) 299{ 300 return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count); 301} 302 303static __inline __m64 304_mm_slli_si64 (__m64 __m, int __count) 305{ 306 return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count); 307} 308 309/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */ 310static __inline __m64 311_mm_sra_pi16 (__m64 __m, __m64 __count) 312{ 313 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count); 314} 315 316static __inline __m64 317_mm_srai_pi16 (__m64 __m, int __count) 318{ 319 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count); 320} 321 322/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */ 323static __inline __m64 324_mm_sra_pi32 (__m64 __m, __m64 __count) 325{ 326 return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count); 327} 328 329static __inline __m64 330_mm_srai_pi32 (__m64 __m, int __count) 331{ 332 return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count); 333} 334 335/* Shift four 16-bit values in M right by COUNT; shift in zeros. 
*/ 336static __inline __m64 337_mm_srl_pi16 (__m64 __m, __m64 __count) 338{ 339 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count); 340} 341 342static __inline __m64 343_mm_srli_pi16 (__m64 __m, int __count) 344{ 345 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count); 346} 347 348/* Shift two 32-bit values in M right by COUNT; shift in zeros. */ 349static __inline __m64 350_mm_srl_pi32 (__m64 __m, __m64 __count) 351{ 352 return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count); 353} 354 355static __inline __m64 356_mm_srli_pi32 (__m64 __m, int __count) 357{ 358 return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count); 359} 360 361/* Shift the 64-bit value in M left by COUNT; shift in zeros. */ 362static __inline __m64 363_mm_srl_si64 (__m64 __m, __m64 __count) 364{ 365 return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count); 366} 367 368static __inline __m64 369_mm_srli_si64 (__m64 __m, int __count) 370{ 371 return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count); 372} 373 374/* Bit-wise AND the 64-bit values in M1 and M2. */ 375static __inline __m64 376_mm_and_si64 (__m64 __m1, __m64 __m2) 377{ 378 return (__m64) __builtin_ia32_pand ((long long)__m1, (long long)__m2); 379} 380 381/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the 382 64-bit value in M2. */ 383static __inline __m64 384_mm_andnot_si64 (__m64 __m1, __m64 __m2) 385{ 386 return (__m64) __builtin_ia32_pandn ((long long)__m1, (long long)__m2); 387} 388 389/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ 390static __inline __m64 391_mm_or_si64 (__m64 __m1, __m64 __m2) 392{ 393 return (__m64)__builtin_ia32_por ((long long)__m1, (long long)__m2); 394} 395 396/* Bit-wise exclusive OR the 64-bit values in M1 and M2. 
*/ 397static __inline __m64 398_mm_xor_si64 (__m64 __m1, __m64 __m2) 399{ 400 return (__m64)__builtin_ia32_pxor ((long long)__m1, (long long)__m2); 401} 402 403/* Compare eight 8-bit values. The result of the comparison is 0xFF if the 404 test is true and zero if false. */ 405static __inline __m64 406_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) 407{ 408 return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2); 409} 410 411static __inline __m64 412_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) 413{ 414 return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2); 415} 416 417/* Compare four 16-bit values. The result of the comparison is 0xFFFF if 418 the test is true and zero if false. */ 419static __inline __m64 420_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) 421{ 422 return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2); 423} 424 425static __inline __m64 426_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) 427{ 428 return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2); 429} 430 431/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if 432 the test is true and zero if false. */ 433static __inline __m64 434_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) 435{ 436 return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2); 437} 438 439static __inline __m64 440_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) 441{ 442 return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2); 443} 444 445/* Creates a 64-bit zero. */ 446static __inline __m64 447_mm_setzero_si64 (void) 448{ 449 return (__m64)__builtin_ia32_mmx_zero (); 450} 451 452/* Creates a vector of two 32-bit values; I0 is least significant. */ 453static __inline __m64 454_mm_set_pi32 (int __i1, int __i0) 455{ 456 union { 457 __m64 __q; 458 struct { 459 unsigned int __i0; 460 unsigned int __i1; 461 } __s; 462 } __u; 463 464 __u.__s.__i0 = __i0; 465 __u.__s.__i1 = __i1; 466 467 return __u.__q; 468} 469 470/* Creates a vector of four 16-bit values; W0 is least significant. 
*/ 471static __inline __m64 472_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) 473{ 474 unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2; 475 unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0; 476 return _mm_set_pi32 (__i1, __i0); 477 478} 479 480/* Creates a vector of eight 8-bit values; B0 is least significant. */ 481static __inline __m64 482_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, 483 char __b3, char __b2, char __b1, char __b0) 484{ 485 unsigned int __i1, __i0; 486 487 __i1 = (unsigned char)__b7; 488 __i1 = __i1 << 8 | (unsigned char)__b6; 489 __i1 = __i1 << 8 | (unsigned char)__b5; 490 __i1 = __i1 << 8 | (unsigned char)__b4; 491 492 __i0 = (unsigned char)__b3; 493 __i0 = __i0 << 8 | (unsigned char)__b2; 494 __i0 = __i0 << 8 | (unsigned char)__b1; 495 __i0 = __i0 << 8 | (unsigned char)__b0; 496 497 return _mm_set_pi32 (__i1, __i0); 498} 499 500/* Similar, but with the arguments in reverse order. */ 501static __inline __m64 502_mm_setr_pi32 (int __i0, int __i1) 503{ 504 return _mm_set_pi32 (__i1, __i0); 505} 506 507static __inline __m64 508_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3) 509{ 510 return _mm_set_pi16 (__w3, __w2, __w1, __w0); 511} 512 513static __inline __m64 514_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, 515 char __b4, char __b5, char __b6, char __b7) 516{ 517 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 518} 519 520/* Creates a vector of two 32-bit values, both elements containing I. */ 521static __inline __m64 522_mm_set1_pi32 (int __i) 523{ 524 return _mm_set_pi32 (__i, __i); 525} 526 527/* Creates a vector of four 16-bit values, all elements containing W. */ 528static __inline __m64 529_mm_set1_pi16 (short __w) 530{ 531 unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w; 532 return _mm_set1_pi32 (__i); 533} 534 535/* Creates a vector of four 16-bit values, all elements containing B. 
*/ 536static __inline __m64 537_mm_set1_pi8 (char __b) 538{ 539 unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b; 540 unsigned int __i = __w << 16 | __w; 541 return _mm_set1_pi32 (__i); 542} 543 544#endif /* _MMINTRIN_H_INCLUDED */ 545