/* Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING. If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA. */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License. This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License. */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0. */

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

/* Stop with a diagnostic unless __MMX__ is defined; every declaration
   in this header lives in the #else branch below.  */
#ifndef __MMX__
# error "MMX instruction set not enabled"
#else
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  This is the public 8-byte
   type that all intrinsics in this header accept and return.  */
typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

/* Internal data types for implementing the intrinsics.  Each is an 8-byte
   vector with int, short or char elements respectively; the intrinsics
   cast their __m64 operands to one of these before calling a builtin.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));
typedef short __v4hi __attribute__ ((__vector_size__ (8)));
typedef char __v8qi __attribute__ ((__vector_size__ (8)));

/* Empty the multimedia state.
*/ 46static __inline void __attribute__((__always_inline__)) 47_mm_empty (void) 48{ 49 __builtin_ia32_emms (); 50} 51 52static __inline void __attribute__((__always_inline__)) 53_m_empty (void) 54{ 55 _mm_empty (); 56} 57 58/* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */ 59static __inline __m64 __attribute__((__always_inline__)) 60_mm_cvtsi32_si64 (int __i) 61{ 62 return (__m64) __builtin_ia32_vec_init_v2si (__i, 0); 63} 64 65static __inline __m64 __attribute__((__always_inline__)) 66_m_from_int (int __i) 67{ 68 return _mm_cvtsi32_si64 (__i); 69} 70 71#ifdef __x86_64__ 72/* Convert I to a __m64 object. */ 73static __inline __m64 __attribute__((__always_inline__)) 74_mm_cvtsi64x_si64 (long long __i) 75{ 76 return (__m64) __i; 77} 78 79/* Convert I to a __m64 object. */ 80static __inline __m64 __attribute__((__always_inline__)) 81_mm_set_pi64x (long long __i) 82{ 83 return (__m64) __i; 84} 85#endif 86 87/* Convert the lower 32 bits of the __m64 object into an integer. */ 88static __inline int __attribute__((__always_inline__)) 89_mm_cvtsi64_si32 (__m64 __i) 90{ 91 return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0); 92} 93 94static __inline int __attribute__((__always_inline__)) 95_m_to_int (__m64 __i) 96{ 97 return _mm_cvtsi64_si32 (__i); 98} 99 100#ifdef __x86_64__ 101/* Convert the lower 32 bits of the __m64 object into an integer. */ 102static __inline long long __attribute__((__always_inline__)) 103_mm_cvtsi64_si64x (__m64 __i) 104{ 105 return (long long)__i; 106} 107#endif 108 109/* Pack the four 16-bit values from M1 into the lower four 8-bit values of 110 the result, and the four 16-bit values from M2 into the upper four 8-bit 111 values of the result, all with signed saturation. 
*/ 112static __inline __m64 __attribute__((__always_inline__)) 113_mm_packs_pi16 (__m64 __m1, __m64 __m2) 114{ 115 return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2); 116} 117 118static __inline __m64 __attribute__((__always_inline__)) 119_m_packsswb (__m64 __m1, __m64 __m2) 120{ 121 return _mm_packs_pi16 (__m1, __m2); 122} 123 124/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of 125 the result, and the two 32-bit values from M2 into the upper two 16-bit 126 values of the result, all with signed saturation. */ 127static __inline __m64 __attribute__((__always_inline__)) 128_mm_packs_pi32 (__m64 __m1, __m64 __m2) 129{ 130 return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2); 131} 132 133static __inline __m64 __attribute__((__always_inline__)) 134_m_packssdw (__m64 __m1, __m64 __m2) 135{ 136 return _mm_packs_pi32 (__m1, __m2); 137} 138 139/* Pack the four 16-bit values from M1 into the lower four 8-bit values of 140 the result, and the four 16-bit values from M2 into the upper four 8-bit 141 values of the result, all with unsigned saturation. */ 142static __inline __m64 __attribute__((__always_inline__)) 143_mm_packs_pu16 (__m64 __m1, __m64 __m2) 144{ 145 return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2); 146} 147 148static __inline __m64 __attribute__((__always_inline__)) 149_m_packuswb (__m64 __m1, __m64 __m2) 150{ 151 return _mm_packs_pu16 (__m1, __m2); 152} 153 154/* Interleave the four 8-bit values from the high half of M1 with the four 155 8-bit values from the high half of M2. 
*/ 156static __inline __m64 __attribute__((__always_inline__)) 157_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) 158{ 159 return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2); 160} 161 162static __inline __m64 __attribute__((__always_inline__)) 163_m_punpckhbw (__m64 __m1, __m64 __m2) 164{ 165 return _mm_unpackhi_pi8 (__m1, __m2); 166} 167 168/* Interleave the two 16-bit values from the high half of M1 with the two 169 16-bit values from the high half of M2. */ 170static __inline __m64 __attribute__((__always_inline__)) 171_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) 172{ 173 return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2); 174} 175 176static __inline __m64 __attribute__((__always_inline__)) 177_m_punpckhwd (__m64 __m1, __m64 __m2) 178{ 179 return _mm_unpackhi_pi16 (__m1, __m2); 180} 181 182/* Interleave the 32-bit value from the high half of M1 with the 32-bit 183 value from the high half of M2. */ 184static __inline __m64 __attribute__((__always_inline__)) 185_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) 186{ 187 return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2); 188} 189 190static __inline __m64 __attribute__((__always_inline__)) 191_m_punpckhdq (__m64 __m1, __m64 __m2) 192{ 193 return _mm_unpackhi_pi32 (__m1, __m2); 194} 195 196/* Interleave the four 8-bit values from the low half of M1 with the four 197 8-bit values from the low half of M2. */ 198static __inline __m64 __attribute__((__always_inline__)) 199_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) 200{ 201 return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2); 202} 203 204static __inline __m64 __attribute__((__always_inline__)) 205_m_punpcklbw (__m64 __m1, __m64 __m2) 206{ 207 return _mm_unpacklo_pi8 (__m1, __m2); 208} 209 210/* Interleave the two 16-bit values from the low half of M1 with the two 211 16-bit values from the low half of M2. 
*/ 212static __inline __m64 __attribute__((__always_inline__)) 213_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) 214{ 215 return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2); 216} 217 218static __inline __m64 __attribute__((__always_inline__)) 219_m_punpcklwd (__m64 __m1, __m64 __m2) 220{ 221 return _mm_unpacklo_pi16 (__m1, __m2); 222} 223 224/* Interleave the 32-bit value from the low half of M1 with the 32-bit 225 value from the low half of M2. */ 226static __inline __m64 __attribute__((__always_inline__)) 227_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) 228{ 229 return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2); 230} 231 232static __inline __m64 __attribute__((__always_inline__)) 233_m_punpckldq (__m64 __m1, __m64 __m2) 234{ 235 return _mm_unpacklo_pi32 (__m1, __m2); 236} 237 238/* Add the 8-bit values in M1 to the 8-bit values in M2. */ 239static __inline __m64 __attribute__((__always_inline__)) 240_mm_add_pi8 (__m64 __m1, __m64 __m2) 241{ 242 return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2); 243} 244 245static __inline __m64 __attribute__((__always_inline__)) 246_m_paddb (__m64 __m1, __m64 __m2) 247{ 248 return _mm_add_pi8 (__m1, __m2); 249} 250 251/* Add the 16-bit values in M1 to the 16-bit values in M2. */ 252static __inline __m64 __attribute__((__always_inline__)) 253_mm_add_pi16 (__m64 __m1, __m64 __m2) 254{ 255 return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2); 256} 257 258static __inline __m64 __attribute__((__always_inline__)) 259_m_paddw (__m64 __m1, __m64 __m2) 260{ 261 return _mm_add_pi16 (__m1, __m2); 262} 263 264/* Add the 32-bit values in M1 to the 32-bit values in M2. 
*/ 265static __inline __m64 __attribute__((__always_inline__)) 266_mm_add_pi32 (__m64 __m1, __m64 __m2) 267{ 268 return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2); 269} 270 271static __inline __m64 __attribute__((__always_inline__)) 272_m_paddd (__m64 __m1, __m64 __m2) 273{ 274 return _mm_add_pi32 (__m1, __m2); 275} 276 277/* Add the 64-bit values in M1 to the 64-bit values in M2. */ 278static __inline __m64 __attribute__((__always_inline__)) 279_mm_add_si64 (__m64 __m1, __m64 __m2) 280{ 281 return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2); 282} 283 284/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed 285 saturated arithmetic. */ 286static __inline __m64 __attribute__((__always_inline__)) 287_mm_adds_pi8 (__m64 __m1, __m64 __m2) 288{ 289 return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2); 290} 291 292static __inline __m64 __attribute__((__always_inline__)) 293_m_paddsb (__m64 __m1, __m64 __m2) 294{ 295 return _mm_adds_pi8 (__m1, __m2); 296} 297 298/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed 299 saturated arithmetic. */ 300static __inline __m64 __attribute__((__always_inline__)) 301_mm_adds_pi16 (__m64 __m1, __m64 __m2) 302{ 303 return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2); 304} 305 306static __inline __m64 __attribute__((__always_inline__)) 307_m_paddsw (__m64 __m1, __m64 __m2) 308{ 309 return _mm_adds_pi16 (__m1, __m2); 310} 311 312/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned 313 saturated arithmetic. 
*/ 314static __inline __m64 __attribute__((__always_inline__)) 315_mm_adds_pu8 (__m64 __m1, __m64 __m2) 316{ 317 return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2); 318} 319 320static __inline __m64 __attribute__((__always_inline__)) 321_m_paddusb (__m64 __m1, __m64 __m2) 322{ 323 return _mm_adds_pu8 (__m1, __m2); 324} 325 326/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned 327 saturated arithmetic. */ 328static __inline __m64 __attribute__((__always_inline__)) 329_mm_adds_pu16 (__m64 __m1, __m64 __m2) 330{ 331 return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2); 332} 333 334static __inline __m64 __attribute__((__always_inline__)) 335_m_paddusw (__m64 __m1, __m64 __m2) 336{ 337 return _mm_adds_pu16 (__m1, __m2); 338} 339 340/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */ 341static __inline __m64 __attribute__((__always_inline__)) 342_mm_sub_pi8 (__m64 __m1, __m64 __m2) 343{ 344 return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2); 345} 346 347static __inline __m64 __attribute__((__always_inline__)) 348_m_psubb (__m64 __m1, __m64 __m2) 349{ 350 return _mm_sub_pi8 (__m1, __m2); 351} 352 353/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */ 354static __inline __m64 __attribute__((__always_inline__)) 355_mm_sub_pi16 (__m64 __m1, __m64 __m2) 356{ 357 return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2); 358} 359 360static __inline __m64 __attribute__((__always_inline__)) 361_m_psubw (__m64 __m1, __m64 __m2) 362{ 363 return _mm_sub_pi16 (__m1, __m2); 364} 365 366/* Subtract the 32-bit values in M2 from the 32-bit values in M1. 
*/ 367static __inline __m64 __attribute__((__always_inline__)) 368_mm_sub_pi32 (__m64 __m1, __m64 __m2) 369{ 370 return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2); 371} 372 373static __inline __m64 __attribute__((__always_inline__)) 374_m_psubd (__m64 __m1, __m64 __m2) 375{ 376 return _mm_sub_pi32 (__m1, __m2); 377} 378 379/* Add the 64-bit values in M1 to the 64-bit values in M2. */ 380static __inline __m64 __attribute__((__always_inline__)) 381_mm_sub_si64 (__m64 __m1, __m64 __m2) 382{ 383 return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2); 384} 385 386/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed 387 saturating arithmetic. */ 388static __inline __m64 __attribute__((__always_inline__)) 389_mm_subs_pi8 (__m64 __m1, __m64 __m2) 390{ 391 return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2); 392} 393 394static __inline __m64 __attribute__((__always_inline__)) 395_m_psubsb (__m64 __m1, __m64 __m2) 396{ 397 return _mm_subs_pi8 (__m1, __m2); 398} 399 400/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using 401 signed saturating arithmetic. */ 402static __inline __m64 __attribute__((__always_inline__)) 403_mm_subs_pi16 (__m64 __m1, __m64 __m2) 404{ 405 return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2); 406} 407 408static __inline __m64 __attribute__((__always_inline__)) 409_m_psubsw (__m64 __m1, __m64 __m2) 410{ 411 return _mm_subs_pi16 (__m1, __m2); 412} 413 414/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using 415 unsigned saturating arithmetic. 
*/ 416static __inline __m64 __attribute__((__always_inline__)) 417_mm_subs_pu8 (__m64 __m1, __m64 __m2) 418{ 419 return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2); 420} 421 422static __inline __m64 __attribute__((__always_inline__)) 423_m_psubusb (__m64 __m1, __m64 __m2) 424{ 425 return _mm_subs_pu8 (__m1, __m2); 426} 427 428/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using 429 unsigned saturating arithmetic. */ 430static __inline __m64 __attribute__((__always_inline__)) 431_mm_subs_pu16 (__m64 __m1, __m64 __m2) 432{ 433 return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2); 434} 435 436static __inline __m64 __attribute__((__always_inline__)) 437_m_psubusw (__m64 __m1, __m64 __m2) 438{ 439 return _mm_subs_pu16 (__m1, __m2); 440} 441 442/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing 443 four 32-bit intermediate results, which are then summed by pairs to 444 produce two 32-bit results. */ 445static __inline __m64 __attribute__((__always_inline__)) 446_mm_madd_pi16 (__m64 __m1, __m64 __m2) 447{ 448 return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2); 449} 450 451static __inline __m64 __attribute__((__always_inline__)) 452_m_pmaddwd (__m64 __m1, __m64 __m2) 453{ 454 return _mm_madd_pi16 (__m1, __m2); 455} 456 457/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in 458 M2 and produce the high 16 bits of the 32-bit results. */ 459static __inline __m64 __attribute__((__always_inline__)) 460_mm_mulhi_pi16 (__m64 __m1, __m64 __m2) 461{ 462 return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2); 463} 464 465static __inline __m64 __attribute__((__always_inline__)) 466_m_pmulhw (__m64 __m1, __m64 __m2) 467{ 468 return _mm_mulhi_pi16 (__m1, __m2); 469} 470 471/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce 472 the low 16 bits of the results. 
*/ 473static __inline __m64 __attribute__((__always_inline__)) 474_mm_mullo_pi16 (__m64 __m1, __m64 __m2) 475{ 476 return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2); 477} 478 479static __inline __m64 __attribute__((__always_inline__)) 480_m_pmullw (__m64 __m1, __m64 __m2) 481{ 482 return _mm_mullo_pi16 (__m1, __m2); 483} 484 485/* Shift four 16-bit values in M left by COUNT. */ 486static __inline __m64 __attribute__((__always_inline__)) 487_mm_sll_pi16 (__m64 __m, __m64 __count) 488{ 489 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count); 490} 491 492static __inline __m64 __attribute__((__always_inline__)) 493_m_psllw (__m64 __m, __m64 __count) 494{ 495 return _mm_sll_pi16 (__m, __count); 496} 497 498static __inline __m64 __attribute__((__always_inline__)) 499_mm_slli_pi16 (__m64 __m, int __count) 500{ 501 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count); 502} 503 504static __inline __m64 __attribute__((__always_inline__)) 505_m_psllwi (__m64 __m, int __count) 506{ 507 return _mm_slli_pi16 (__m, __count); 508} 509 510/* Shift two 32-bit values in M left by COUNT. */ 511static __inline __m64 __attribute__((__always_inline__)) 512_mm_sll_pi32 (__m64 __m, __m64 __count) 513{ 514 return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count); 515} 516 517static __inline __m64 __attribute__((__always_inline__)) 518_m_pslld (__m64 __m, __m64 __count) 519{ 520 return _mm_sll_pi32 (__m, __count); 521} 522 523static __inline __m64 __attribute__((__always_inline__)) 524_mm_slli_pi32 (__m64 __m, int __count) 525{ 526 return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count); 527} 528 529static __inline __m64 __attribute__((__always_inline__)) 530_m_pslldi (__m64 __m, int __count) 531{ 532 return _mm_slli_pi32 (__m, __count); 533} 534 535/* Shift the 64-bit value in M left by COUNT. 
*/ 536static __inline __m64 __attribute__((__always_inline__)) 537_mm_sll_si64 (__m64 __m, __m64 __count) 538{ 539 return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count); 540} 541 542static __inline __m64 __attribute__((__always_inline__)) 543_m_psllq (__m64 __m, __m64 __count) 544{ 545 return _mm_sll_si64 (__m, __count); 546} 547 548static __inline __m64 __attribute__((__always_inline__)) 549_mm_slli_si64 (__m64 __m, int __count) 550{ 551 return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count); 552} 553 554static __inline __m64 __attribute__((__always_inline__)) 555_m_psllqi (__m64 __m, int __count) 556{ 557 return _mm_slli_si64 (__m, __count); 558} 559 560/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */ 561static __inline __m64 __attribute__((__always_inline__)) 562_mm_sra_pi16 (__m64 __m, __m64 __count) 563{ 564 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count); 565} 566 567static __inline __m64 __attribute__((__always_inline__)) 568_m_psraw (__m64 __m, __m64 __count) 569{ 570 return _mm_sra_pi16 (__m, __count); 571} 572 573static __inline __m64 __attribute__((__always_inline__)) 574_mm_srai_pi16 (__m64 __m, int __count) 575{ 576 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count); 577} 578 579static __inline __m64 __attribute__((__always_inline__)) 580_m_psrawi (__m64 __m, int __count) 581{ 582 return _mm_srai_pi16 (__m, __count); 583} 584 585/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. 
*/ 586static __inline __m64 __attribute__((__always_inline__)) 587_mm_sra_pi32 (__m64 __m, __m64 __count) 588{ 589 return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count); 590} 591 592static __inline __m64 __attribute__((__always_inline__)) 593_m_psrad (__m64 __m, __m64 __count) 594{ 595 return _mm_sra_pi32 (__m, __count); 596} 597 598static __inline __m64 __attribute__((__always_inline__)) 599_mm_srai_pi32 (__m64 __m, int __count) 600{ 601 return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count); 602} 603 604static __inline __m64 __attribute__((__always_inline__)) 605_m_psradi (__m64 __m, int __count) 606{ 607 return _mm_srai_pi32 (__m, __count); 608} 609 610/* Shift four 16-bit values in M right by COUNT; shift in zeros. */ 611static __inline __m64 __attribute__((__always_inline__)) 612_mm_srl_pi16 (__m64 __m, __m64 __count) 613{ 614 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count); 615} 616 617static __inline __m64 __attribute__((__always_inline__)) 618_m_psrlw (__m64 __m, __m64 __count) 619{ 620 return _mm_srl_pi16 (__m, __count); 621} 622 623static __inline __m64 __attribute__((__always_inline__)) 624_mm_srli_pi16 (__m64 __m, int __count) 625{ 626 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count); 627} 628 629static __inline __m64 __attribute__((__always_inline__)) 630_m_psrlwi (__m64 __m, int __count) 631{ 632 return _mm_srli_pi16 (__m, __count); 633} 634 635/* Shift two 32-bit values in M right by COUNT; shift in zeros. 
*/ 636static __inline __m64 __attribute__((__always_inline__)) 637_mm_srl_pi32 (__m64 __m, __m64 __count) 638{ 639 return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count); 640} 641 642static __inline __m64 __attribute__((__always_inline__)) 643_m_psrld (__m64 __m, __m64 __count) 644{ 645 return _mm_srl_pi32 (__m, __count); 646} 647 648static __inline __m64 __attribute__((__always_inline__)) 649_mm_srli_pi32 (__m64 __m, int __count) 650{ 651 return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count); 652} 653 654static __inline __m64 __attribute__((__always_inline__)) 655_m_psrldi (__m64 __m, int __count) 656{ 657 return _mm_srli_pi32 (__m, __count); 658} 659 660/* Shift the 64-bit value in M left by COUNT; shift in zeros. */ 661static __inline __m64 __attribute__((__always_inline__)) 662_mm_srl_si64 (__m64 __m, __m64 __count) 663{ 664 return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count); 665} 666 667static __inline __m64 __attribute__((__always_inline__)) 668_m_psrlq (__m64 __m, __m64 __count) 669{ 670 return _mm_srl_si64 (__m, __count); 671} 672 673static __inline __m64 __attribute__((__always_inline__)) 674_mm_srli_si64 (__m64 __m, int __count) 675{ 676 return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count); 677} 678 679static __inline __m64 __attribute__((__always_inline__)) 680_m_psrlqi (__m64 __m, int __count) 681{ 682 return _mm_srli_si64 (__m, __count); 683} 684 685/* Bit-wise AND the 64-bit values in M1 and M2. */ 686static __inline __m64 __attribute__((__always_inline__)) 687_mm_and_si64 (__m64 __m1, __m64 __m2) 688{ 689 return __builtin_ia32_pand (__m1, __m2); 690} 691 692static __inline __m64 __attribute__((__always_inline__)) 693_m_pand (__m64 __m1, __m64 __m2) 694{ 695 return _mm_and_si64 (__m1, __m2); 696} 697 698/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the 699 64-bit value in M2. 
*/ 700static __inline __m64 __attribute__((__always_inline__)) 701_mm_andnot_si64 (__m64 __m1, __m64 __m2) 702{ 703 return __builtin_ia32_pandn (__m1, __m2); 704} 705 706static __inline __m64 __attribute__((__always_inline__)) 707_m_pandn (__m64 __m1, __m64 __m2) 708{ 709 return _mm_andnot_si64 (__m1, __m2); 710} 711 712/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ 713static __inline __m64 __attribute__((__always_inline__)) 714_mm_or_si64 (__m64 __m1, __m64 __m2) 715{ 716 return __builtin_ia32_por (__m1, __m2); 717} 718 719static __inline __m64 __attribute__((__always_inline__)) 720_m_por (__m64 __m1, __m64 __m2) 721{ 722 return _mm_or_si64 (__m1, __m2); 723} 724 725/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */ 726static __inline __m64 __attribute__((__always_inline__)) 727_mm_xor_si64 (__m64 __m1, __m64 __m2) 728{ 729 return __builtin_ia32_pxor (__m1, __m2); 730} 731 732static __inline __m64 __attribute__((__always_inline__)) 733_m_pxor (__m64 __m1, __m64 __m2) 734{ 735 return _mm_xor_si64 (__m1, __m2); 736} 737 738/* Compare eight 8-bit values. The result of the comparison is 0xFF if the 739 test is true and zero if false. */ 740static __inline __m64 __attribute__((__always_inline__)) 741_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) 742{ 743 return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2); 744} 745 746static __inline __m64 __attribute__((__always_inline__)) 747_m_pcmpeqb (__m64 __m1, __m64 __m2) 748{ 749 return _mm_cmpeq_pi8 (__m1, __m2); 750} 751 752static __inline __m64 __attribute__((__always_inline__)) 753_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) 754{ 755 return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2); 756} 757 758static __inline __m64 __attribute__((__always_inline__)) 759_m_pcmpgtb (__m64 __m1, __m64 __m2) 760{ 761 return _mm_cmpgt_pi8 (__m1, __m2); 762} 763 764/* Compare four 16-bit values. The result of the comparison is 0xFFFF if 765 the test is true and zero if false. 
*/ 766static __inline __m64 __attribute__((__always_inline__)) 767_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) 768{ 769 return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2); 770} 771 772static __inline __m64 __attribute__((__always_inline__)) 773_m_pcmpeqw (__m64 __m1, __m64 __m2) 774{ 775 return _mm_cmpeq_pi16 (__m1, __m2); 776} 777 778static __inline __m64 __attribute__((__always_inline__)) 779_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) 780{ 781 return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2); 782} 783 784static __inline __m64 __attribute__((__always_inline__)) 785_m_pcmpgtw (__m64 __m1, __m64 __m2) 786{ 787 return _mm_cmpgt_pi16 (__m1, __m2); 788} 789 790/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if 791 the test is true and zero if false. */ 792static __inline __m64 __attribute__((__always_inline__)) 793_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) 794{ 795 return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2); 796} 797 798static __inline __m64 __attribute__((__always_inline__)) 799_m_pcmpeqd (__m64 __m1, __m64 __m2) 800{ 801 return _mm_cmpeq_pi32 (__m1, __m2); 802} 803 804static __inline __m64 __attribute__((__always_inline__)) 805_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) 806{ 807 return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2); 808} 809 810static __inline __m64 __attribute__((__always_inline__)) 811_m_pcmpgtd (__m64 __m1, __m64 __m2) 812{ 813 return _mm_cmpgt_pi32 (__m1, __m2); 814} 815 816/* Creates a 64-bit zero. */ 817static __inline __m64 __attribute__((__always_inline__)) 818_mm_setzero_si64 (void) 819{ 820 return (__m64)0LL; 821} 822 823/* Creates a vector of two 32-bit values; I0 is least significant. */ 824static __inline __m64 __attribute__((__always_inline__)) 825_mm_set_pi32 (int __i1, int __i0) 826{ 827 return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1); 828} 829 830/* Creates a vector of four 16-bit values; W0 is least significant. 
*/ 831static __inline __m64 __attribute__((__always_inline__)) 832_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) 833{ 834 return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3); 835} 836 837/* Creates a vector of eight 8-bit values; B0 is least significant. */ 838static __inline __m64 __attribute__((__always_inline__)) 839_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, 840 char __b3, char __b2, char __b1, char __b0) 841{ 842 return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3, 843 __b4, __b5, __b6, __b7); 844} 845 846/* Similar, but with the arguments in reverse order. */ 847static __inline __m64 __attribute__((__always_inline__)) 848_mm_setr_pi32 (int __i0, int __i1) 849{ 850 return _mm_set_pi32 (__i1, __i0); 851} 852 853static __inline __m64 __attribute__((__always_inline__)) 854_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3) 855{ 856 return _mm_set_pi16 (__w3, __w2, __w1, __w0); 857} 858 859static __inline __m64 __attribute__((__always_inline__)) 860_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, 861 char __b4, char __b5, char __b6, char __b7) 862{ 863 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 864} 865 866/* Creates a vector of two 32-bit values, both elements containing I. */ 867static __inline __m64 __attribute__((__always_inline__)) 868_mm_set1_pi32 (int __i) 869{ 870 return _mm_set_pi32 (__i, __i); 871} 872 873/* Creates a vector of four 16-bit values, all elements containing W. */ 874static __inline __m64 __attribute__((__always_inline__)) 875_mm_set1_pi16 (short __w) 876{ 877 return _mm_set_pi16 (__w, __w, __w, __w); 878} 879 880/* Creates a vector of eight 8-bit values, all elements containing B. */ 881static __inline __m64 __attribute__((__always_inline__)) 882_mm_set1_pi8 (char __b) 883{ 884 return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b); 885} 886 887#endif /* __MMX__ */ 888#endif /* _MMINTRIN_H_INCLUDED */ 889