/* mmintrin.h, revision 351280 */
/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in the MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register move
   instructions, which help make the implementation more efficient.

   It's the user's responsibility to determine whether the results of such
   a port are acceptable or whether further changes are needed. Please note
   that much code using Intel intrinsics CAN BE REWRITTEN in more portable
   and efficient standard C or GNU C extensions with 64-bit scalar
   operations, or in 128-bit SSE/Altivec operations, which is the
   recommended approach. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;

/* Empty the multimedia state. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC. */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}
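/* Illustrative usage sketch (added commentary, not part of the original
   header): on PowerPC, __m64 is a plain 64-bit scalar, so these conversions
   are ordinary casts and _mm_empty() is a no-op. The function below is a
   hypothetical example, not an intrinsic.

     int round_trip(int x) {
       __m64 v = _mm_cvtsi32_si64(x); // zero-extends x into the low 32 bits
       return _mm_cvtsi64_si32(v);    // truncates back to a 32-bit integer
     }
*/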
/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(vm1, __zero);
  r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
  __vector __bool char packsel = vec_pack(__select, __select);
  r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
  return (__m64)((__vector long long)r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */
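/* Worked example (added commentary, not part of the original header):
   packing with signed saturation clamps each 16-bit lane into [-128, 127].

     __m64 a = _mm_set_pi16(300, -200, 5, -5); // lanes, low to high: -5, 5, -200, 300
     __m64 r = _mm_packs_pi16(a, a);
     // result bytes, low to high: -5, 5, -128, 127, -5, 5, -128, 127

   _mm_packs_pu16 clamps into [0, 255] instead, so negative lanes become 0. */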
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}
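/* Note (added commentary, not part of the original header): the element-wise
   adds and subtracts above wrap per lane; no carry crosses lane boundaries.

     __m64 a = _mm_set1_pi8(100);
     __m64 r = _mm_add_pi8(a, a); // each byte: 200, i.e. -56 as signed char

   Compare the saturating forms further below (_mm_adds_pi8 and friends),
   which clamp instead of wrapping. */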
/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = (__vector signed char)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}
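/* Illustrative example (added commentary, not part of the original header):
   the comparisons return per-lane masks of all ones or all zeros, which
   compose with the bitwise ops above into branch-free selects. The
   variables a and b are hypothetical.

     __m64 mask = _mm_cmpgt_pi8(a, b);
     __m64 vmax = _mm_or_si64(_mm_and_si64(mask, a),
                              _mm_andnot_si64(mask, b)); // per-byte max(a, b)
*/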
#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_vmsumshm(a, b, zero);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
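/* Worked example (added commentary, not part of the original header):
   _mm_madd_pi16 computes a two-element dot product per 32-bit result lane.

     __m64 a = _mm_set_pi16(4, 3, 2, 1); // lanes, low to high: 1, 2, 3, 4
     __m64 b = _mm_set_pi16(8, 7, 6, 5); // lanes, low to high: 5, 6, 7, 8
     __m64 r = _mm_madd_pi16(a, b);
     // low 32 bits:  1*5 + 2*6 = 17
     // high 32 bits: 3*7 + 4*8 = 53
*/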
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);

  w0 = vec_vmulesh(a, b);
  w1 = vec_vmulosh(a, b);
  c = (__vector signed short)vec_perm(w0, w1, xform1);

  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = a * b;
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sl(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16. */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}
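/* Note (added commentary, not part of the original header): as with the x86
   originals, a shift count wider than the lane is not taken modulo the lane
   width; it zeroes the result (see the count checks above).

     __m64 v  = _mm_set1_pi16(0x0101);
     __m64 r1 = _mm_slli_pi16(v, 1);  // each lane: 0x0202
     __m64 r2 = _mm_slli_pi16(v, 16); // all lanes zero: the count exceeds 15
*/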
/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32. */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sra(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16. */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32. */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector unsigned short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sr(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16. */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32. */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}
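/* Note (added commentary, not part of the original header): the _mm_set
   intrinsics take their arguments most-significant element first, while
   the _mm_setr variants below take them least-significant first, so for
   example:

     _mm_set_pi32(1, 2) == _mm_setr_pi32(2, 1) // low word is 2 in both
*/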
/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */