/* Copyright (C) 2002-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

/* If the instruction sets this header relies on are not already enabled,
   switch them on for the duration of the header.  __DISABLE_MMX__ records
   that options were pushed so that they are popped again at the end of
   the file.  */
#if defined __x86_64__ && !defined __SSE__ || !defined __MMX__
#pragma GCC push_options
#ifdef __MMX_WITH_SSE__
#pragma GCC target("sse2")
#elif defined __x86_64__
#pragma GCC target("sse,mmx")
#else
#pragma GCC target("mmx")
#endif
#define __DISABLE_MMX__
#endif /* __MMX__ */

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__));
typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__));

/* Unaligned version of the same type  */
typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1)));
typedef int __m32_u __attribute__ ((__vector_size__ (4),
				    __may_alias__, __aligned__ (1)));
typedef short __m16_u __attribute__ ((__vector_size__ (2),
				      __may_alias__, __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));
typedef short __v4hi __attribute__ ((__vector_size__ (8)));
typedef char __v8qi __attribute__ ((__vector_size__ (8)));
typedef long long __v1di __attribute__ ((__vector_size__ (8)));
typedef float __v2sf __attribute__ ((__vector_size__ (8)));

/* Naming convention used throughout this file: most _mm_* intrinsics are
   followed by an _m_* function named after the instruction mnemonic.  The
   _m_* function takes the same arguments and simply forwards to the _mm_*
   intrinsic defined immediately above it.  */

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  _mm_empty ();
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

#ifdef __x86_64__
/* Convert I to a __m64 object.  All four functions below reinterpret the
   64-bit integer as a __m64 with a plain cast.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
#endif

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

#ifdef __x86_64__
/* Convert the __m64 object to a 64bit integer.  All three functions below
   reinterpret the __m64 as a long long with a plain cast.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long)__i;
}
#endif

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Add the 64-bit value in M1 to the 64-bit value in M2.  This intrinsic
   is compiled with SSE2 enabled: when __SSE2__ is not already defined the
   target options are pushed around the definition and popped right after
   it.  */
#ifndef __SSE2__
#pragma GCC push_options
#ifdef __MMX_WITH_SSE__
#pragma GCC target("sse2")
#else
#pragma GCC target("sse2,mmx")
#endif
#define __DISABLE_SSE2__
#endif /* __SSE2__ */

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
}
#ifdef __DISABLE_SSE2__
#undef __DISABLE_SSE2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE2__ */

/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

/* Subtract the 64-bit value in M2 from the 64-bit value in M1.  As with
   _mm_add_si64, the definition is compiled with SSE2 enabled; the target
   options are pushed and popped around it when __SSE2__ is not defined.  */
#ifndef __SSE2__
#pragma GCC push_options
#ifdef __MMX_WITH_SSE__
#pragma GCC target("sse2")
#else
#pragma GCC target("sse2,mmx")
#endif
#define __DISABLE_SSE2__
#endif /* __SSE2__ */

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
}
#ifdef __DISABLE_SSE2__
#undef __DISABLE_SSE2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE2__ */

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift intrinsics come in two forms: _mm_sXX_* takes COUNT as a __m64,
   while _mm_sXXi_* takes COUNT as an int.  */

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pand (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pandn (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_por (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pxor (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Compare eight 8-bit values, for equality (cmpeq) or for M1 greater than
   M2 (cmpgt).  The result of the comparison is 0xFF if the test is true
   and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values, for equality (cmpeq) or for M1 greater than
   M2 (cmpgt).  The result of the comparison is 0xFFFF if the test is true
   and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values, for equality (cmpeq) or for M1 greater than
   M2 (cmpgt).  The result of the comparison is 0xFFFFFFFF if the test is
   true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64)0LL;
}

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
					       __b4, __b5, __b6, __b7);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  return _mm_set_pi32 (__i1, __i0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
  return _mm_set_pi16 (__w, __w, __w, __w);
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (char __b)
{
  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
}

/* Restore the target options pushed at the top of this header, if any.  */
#ifdef __DISABLE_MMX__
#undef __DISABLE_MMX__
#pragma GCC pop_options
#endif /* __DISABLE_MMX__ */

#endif /* _MMINTRIN_H_INCLUDED */