/* Copyright (C) 2002-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8),
			__may_alias__)) unsigned long long __m64;

/* Convenience union used by the scalar fallback paths below to view the
   64 bits of an __m64 at each element width MMX operates on.  */
typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  /* Cast through unsigned int so the 32-bit value is zero-extended,
     not sign-extended, into the 64-bit result.  */
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.
*/
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __vm1;
  __vector signed char __vresult;

  /* Combine both 64-bit operands into one 128-bit vector; the doubleword
     order is endian-dependent so M1 always supplies the low half.  */
  __vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
	{ __m1, __m2 };
#else
	{ __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
	{ __m1, __m2 };
#else
	{ __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
	{ __m1, __m2 };
#else
	{ __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (__vm1, __zero);
  /* Pack as unsigned, then force lanes that were negative on input to
     zero, since unsigned saturation maps negative inputs to zero.  */
  __r = vec_packs ((__vector unsigned short) __vm1, (__vector unsigned short) __vm1);
  __vector __bool char __packsel = vec_pack (__select, __select);
  __r = vec_sel (__r, (const __vector unsigned char) __zero, __packsel);
  return (__m64) ((__vector long long) __r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  /* vec_splats duplicates each 64-bit operand into both doublewords, so
     one merge yields the low-half interleave in doubleword 0 and the
     high-half interleave in doubleword 1; select the high half here.  */
  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  /* Same splat-and-merge trick as _mm_unpackhi_pi8; doubleword 0 of the
     merge holds the low-half interleave.  */
  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  /* Splat each operand into a 128-bit vector and add; only the low
     doubleword of the result is returned.  */
  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  /* Splat each operand into a 128-bit vector and subtract; only the low
     doubleword of the result is returned.  */
  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

/* Add the 64-bit values in M1 to the 64-bit values in M2; __m64 is a
   native 64-bit scalar on this target, so this is a plain add.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

/* Subtract the 64-bit value in M2 from the 64-bit value in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.
*/ 617extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 618_mm_sll_si64 (__m64 __m, __m64 __count) 619{ 620 return (__m << __count); 621} 622 623extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 624_m_psllq (__m64 __m, __m64 __count) 625{ 626 return _mm_sll_si64 (__m, __count); 627} 628 629extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 630_mm_slli_si64 (__m64 __m, const int __count) 631{ 632 return (__m << __count); 633} 634 635extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 636_m_psllqi (__m64 __m, const int __count) 637{ 638 return _mm_slli_si64 (__m, __count); 639} 640 641/* Shift the 64-bit value in M left by COUNT; shift in zeros. */ 642extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 643_mm_srl_si64 (__m64 __m, __m64 __count) 644{ 645 return (__m >> __count); 646} 647 648extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 649_m_psrlq (__m64 __m, __m64 __count) 650{ 651 return _mm_srl_si64 (__m, __count); 652} 653 654extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 655_mm_srli_si64 (__m64 __m, const int __count) 656{ 657 return (__m >> __count); 658} 659 660extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 661_m_psrlqi (__m64 __m, const int __count) 662{ 663 return _mm_srli_si64 (__m, __count); 664} 665 666/* Bit-wise AND the 64-bit values in M1 and M2. 
*/ 667extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 668_mm_and_si64 (__m64 __m1, __m64 __m2) 669{ 670 return (__m1 & __m2); 671} 672 673extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 674_m_pand (__m64 __m1, __m64 __m2) 675{ 676 return _mm_and_si64 (__m1, __m2); 677} 678 679/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the 680 64-bit value in M2. */ 681extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 682_mm_andnot_si64 (__m64 __m1, __m64 __m2) 683{ 684 return (~__m1 & __m2); 685} 686 687extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 688_m_pandn (__m64 __m1, __m64 __m2) 689{ 690 return _mm_andnot_si64 (__m1, __m2); 691} 692 693/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ 694extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 695_mm_or_si64 (__m64 __m1, __m64 __m2) 696{ 697 return (__m1 | __m2); 698} 699 700extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 701_m_por (__m64 __m1, __m64 __m2) 702{ 703 return _mm_or_si64 (__m1, __m2); 704} 705 706/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */ 707extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 708_mm_xor_si64 (__m64 __m1, __m64 __m2) 709{ 710 return (__m1 ^ __m2); 711} 712 713extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 714_m_pxor (__m64 __m1, __m64 __m2) 715{ 716 return _mm_xor_si64 (__m1, __m2); 717} 718 719/* Creates a 64-bit zero. */ 720extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 721_mm_setzero_si64 (void) 722{ 723 return (__m64) 0; 724} 725 726/* Compare eight 8-bit values. The result of the comparison is 0xFF if the 727 test is true and zero if false. 
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  /* The cmpb instruction produces 0xFF per byte where the two operands'
     bytes are equal and 0x00 otherwise — exactly the MMX result.  */
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (__res)
      : "r" (__m1),
	"r" (__m2)
      : );
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test (greater-than, signed) is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = (__vector signed char)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.
*/ 945extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 946_mm_adds_pi16 (__m64 __m1, __m64 __m2) 947{ 948 __vector signed short __a, __b, __c; 949 950 __a = (__vector signed short)vec_splats (__m1); 951 __b = (__vector signed short)vec_splats (__m2); 952 __c = vec_adds (__a, __b); 953 return (__m64) ((__vector long long) __c)[0]; 954} 955 956extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 957_m_paddsw (__m64 __m1, __m64 __m2) 958{ 959 return _mm_adds_pi16 (__m1, __m2); 960} 961/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned 962 saturated arithmetic. */ 963extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 964_mm_adds_pu8 (__m64 __m1, __m64 __m2) 965{ 966 __vector unsigned char __a, __b, __c; 967 968 __a = (__vector unsigned char)vec_splats (__m1); 969 __b = (__vector unsigned char)vec_splats (__m2); 970 __c = vec_adds (__a, __b); 971 return (__m64) ((__vector long long) __c)[0]; 972} 973 974extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 975_m_paddusb (__m64 __m1, __m64 __m2) 976{ 977 return _mm_adds_pu8 (__m1, __m2); 978} 979 980/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned 981 saturated arithmetic. 
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

/* MMX-era alias for _mm_adds_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

/* MMX-era alias for _mm_subs_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

/* MMX-era alias for _mm_subs_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

/* MMX-era alias for _mm_subs_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

/* MMX-era alias for _mm_subs_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  /* vmsumshm multiplies halfword pairs and accumulates into each word;
     a zero accumulator gives exactly the pmaddwd pairwise sums.  */
  __c = vec_vmsumshm (__a, __b, __zero);
  return (__m64) ((__vector long long) __c)[0];
}

/* MMX-era alias for _mm_madd_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  /* Permute map gathering the high 16 bits of each 32-bit product from
     the even (__w0) and odd (__w1) multiply results.  Only doubleword 0
     of the permuted vector is returned below, so the second half of the
     big-endian map merely repeats the first.  */
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
#endif
    };

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);

  __w0 = vec_vmulesh (__a, __b);
  __w1 = vec_vmulosh (__a, __b);
  __c = (__vector signed short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}

/* MMX-era alias for _mm_mulhi_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  /* GCC vector-extension multiply: elementwise, keeps the low 16 bits.  */
  __c = __a * __b;
  return (__m64) ((__vector long long) __c)[0];
}

/* MMX-era alias for _mm_mullo_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sl (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    /* Counts greater than 15 shift out all bits, matching x86.  */
    return (0);
}

/* MMX-era alias for _mm_sll_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 then invoke _mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

/* MMX-era alias for _mm_slli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  /* NOTE(review): no clamp here — a COUNT >= 32 is undefined behavior
     in C, whereas x86 pslld would yield zero; confirm callers only pass
     counts in 0..31.  */
  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

/* MMX-era alias for _mm_sll_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 then invoke _mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

/* MMX-era alias for _mm_slli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sra (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

/* MMX-era alias for _mm_sra_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 then invoke _mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

/* MMX-era alias for _mm_srai_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  /* NOTE(review): no clamp — COUNT >= 32 is undefined behavior in C;
     confirm callers only pass counts in 0..31.  */
  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

/* MMX-era alias for _mm_sra_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 then invoke _mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

/* MMX-era alias for _mm_srai_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector unsigned short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sr (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    /* Counts greater than 15 shift out all bits, matching x86.  */
    return (0);
}

/* MMX-era alias for _mm_srl_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 then invoke _mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

/* MMX-era alias for _mm_srli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  /* Cast to unsigned so zeros, not sign bits, are shifted in.  */
  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

/* MMX-era alias for _mm_srl_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 then invoke _mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

/* MMX-era alias for _mm_srli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Similar to _mm_set_pi16, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

/* Similar to _mm_set_pi8, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.
*/ 1424extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1425_mm_set1_pi32 (int __i) 1426{ 1427 __m64_union __res; 1428 1429 __res.as_int[0] = __i; 1430 __res.as_int[1] = __i; 1431 return (__res.as_m64); 1432} 1433 1434/* Creates a vector of four 16-bit values, all elements containing W. */ 1435extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1436_mm_set1_pi16 (short __w) 1437{ 1438#if _ARCH_PWR9 1439 __vector signed short w; 1440 1441 w = (__vector signed short)vec_splats (__w); 1442 return (__m64) ((__vector long long) w)[0]; 1443#else 1444 __m64_union __res; 1445 1446 __res.as_short[0] = __w; 1447 __res.as_short[1] = __w; 1448 __res.as_short[2] = __w; 1449 __res.as_short[3] = __w; 1450 return (__res.as_m64); 1451#endif 1452} 1453 1454/* Creates a vector of eight 8-bit values, all elements containing B. */ 1455extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1456_mm_set1_pi8 (signed char __b) 1457{ 1458#if _ARCH_PWR8 1459 __vector signed char __res; 1460 1461 __res = (__vector signed char)vec_splats (__b); 1462 return (__m64) ((__vector long long) __res)[0]; 1463#else 1464 __m64_union __res; 1465 1466 __res.as_char[0] = __b; 1467 __res.as_char[1] = __b; 1468 __res.as_char[2] = __b; 1469 __res.as_char[3] = __b; 1470 __res.as_char[4] = __b; 1471 __res.as_char[5] = __b; 1472 __res.as_char[6] = __b; 1473 __res.as_char[7] = __b; 1474 return (__res.as_m64); 1475#endif 1476} 1477#endif /* _MMINTRIN_H_INCLUDED */ 1478