emmintrin.h revision 351280
1/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10/* Implemented from the specification included in the Intel C++ Compiler 11 User Guide and Reference, version 9.0. */ 12 13#ifndef NO_WARN_X86_INTRINSICS 14/* This header file is to help porting code using Intel intrinsics 15 explicitly from x86_64 to powerpc64/powerpc64le. 16 17 Since X86 SSE2 intrinsics mainly handles __m128i and __m128d type, 18 PowerPC VMX/VSX ISA is a good match for vector float SIMD operations. 19 However scalar float operations in vector (XMM) registers require 20 the POWER8 VSX ISA (2.07) level. There are differences for data 21 format and placement of float scalars in the vector register, which 22 require extra steps to match SSE2 scalar float semantics on POWER. 23 24 It should be noted that there's much difference between X86_64's 25 MXSCR and PowerISA's FPSCR/VSCR registers. It's recommended to use 26 portable <fenv.h> instead of access MXSCR directly. 27 28 Most SSE2 scalar float intrinsic operations can be performed more 29 efficiently as C language float scalar operations or optimized to 30 use vector SIMD operations. We recommend this for new applications. 31*/ 32#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." 33#endif 34 35#ifndef EMMINTRIN_H_ 36#define EMMINTRIN_H_ 37 38#include <altivec.h> 39 40/* We need definitions from the SSE header files. */ 41#include <xmmintrin.h> 42 43/* SSE2 */ 44typedef __vector double __v2df; 45typedef __vector long long __v2di; 46typedef __vector unsigned long long __v2du; 47typedef __vector int __v4si; 48typedef __vector unsigned int __v4su; 49typedef __vector short __v8hi; 50typedef __vector unsigned short __v8hu; 51typedef __vector signed char __v16qi; 52typedef __vector unsigned char __v16qu; 53 54/* The Intel API is flexible enough that we must allow aliasing with other 55 vector types, and their scalar components. */ 56typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); 57typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); 58 59/* Unaligned version of the same types. */ 60typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); 61typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); 62 63/* Define two value permute mask. */ 64#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y)) 65 66/* Create a vector with element 0 as F and the rest zero. */ 67extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 68_mm_set_sd (double __F) 69{ 70 return __extension__ (__m128d){ __F, 0.0 }; 71} 72 73/* Create a vector with both elements equal to F. */ 74extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 75_mm_set1_pd (double __F) 76{ 77 return __extension__ (__m128d){ __F, __F }; 78} 79 80extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 81_mm_set_pd1 (double __F) 82{ 83 return _mm_set1_pd (__F); 84} 85 86/* Create a vector with the lower value X and upper value W. */ 87extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 88_mm_set_pd (double __W, double __X) 89{ 90 return __extension__ (__m128d){ __X, __W }; 91} 92 93/* Create a vector with the lower value W and upper value X. */ 94extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 95_mm_setr_pd (double __W, double __X) 96{ 97 return __extension__ (__m128d){ __W, __X }; 98} 99 100/* Create an undefined vector. */ 101extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 102_mm_undefined_pd (void) 103{ 104 __m128d __Y = __Y; 105 return __Y; 106} 107 108/* Create a vector of zeros. */ 109extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 110_mm_setzero_pd (void) 111{ 112 return (__m128d) vec_splats (0); 113} 114 115/* Sets the low DPFP value of A from the low value of B. */ 116extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 117_mm_move_sd (__m128d __A, __m128d __B) 118{ 119 __v2df result = (__v2df) __A; 120 result [0] = ((__v2df) __B)[0]; 121 return (__m128d) result; 122} 123 124/* Load two DPFP values from P. The address must be 16-byte aligned. */ 125extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 126_mm_load_pd (double const *__P) 127{ 128 return ((__m128d)vec_ld(0, (__v16qu*)__P)); 129} 130 131/* Load two DPFP values from P. The address need not be 16-byte aligned. */ 132extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 133_mm_loadu_pd (double const *__P) 134{ 135 return (vec_vsx_ld(0, __P)); 136} 137 138/* Create a vector with all two elements equal to *P. */ 139extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 140_mm_load1_pd (double const *__P) 141{ 142 return (vec_splats (*__P)); 143} 144 145/* Create a vector with element 0 as *P and the rest zero. */ 146extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 147_mm_load_sd (double const *__P) 148{ 149 return _mm_set_sd (*__P); 150} 151 152extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 153_mm_load_pd1 (double const *__P) 154{ 155 return _mm_load1_pd (__P); 156} 157 158/* Load two DPFP values in reverse order. The address must be aligned. */ 159extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 160_mm_loadr_pd (double const *__P) 161{ 162 __v2df __tmp = _mm_load_pd (__P); 163 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2); 164} 165 166/* Store two DPFP values. The address must be 16-byte aligned. */ 167extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 168_mm_store_pd (double *__P, __m128d __A) 169{ 170 vec_st((__v16qu)__A, 0, (__v16qu*)__P); 171} 172 173/* Store two DPFP values. The address need not be 16-byte aligned. */ 174extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 175_mm_storeu_pd (double *__P, __m128d __A) 176{ 177 *(__m128d_u *)__P = __A; 178} 179 180/* Stores the lower DPFP value. */ 181extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 182_mm_store_sd (double *__P, __m128d __A) 183{ 184 *__P = ((__v2df)__A)[0]; 185} 186 187extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 188_mm_cvtsd_f64 (__m128d __A) 189{ 190 return ((__v2df)__A)[0]; 191} 192 193extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 194_mm_storel_pd (double *__P, __m128d __A) 195{ 196 _mm_store_sd (__P, __A); 197} 198 199/* Stores the upper DPFP value. */ 200extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 201_mm_storeh_pd (double *__P, __m128d __A) 202{ 203 *__P = ((__v2df)__A)[1]; 204} 205/* Store the lower DPFP value across two words. 206 The address must be 16-byte aligned. */ 207extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 208_mm_store1_pd (double *__P, __m128d __A) 209{ 210 _mm_store_pd (__P, vec_splat (__A, 0)); 211} 212 213extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 214_mm_store_pd1 (double *__P, __m128d __A) 215{ 216 _mm_store1_pd (__P, __A); 217} 218 219/* Store two DPFP values in reverse order. The address must be aligned. */ 220extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 221_mm_storer_pd (double *__P, __m128d __A) 222{ 223 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2)); 224} 225 226/* Intel intrinsic. */ 227extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 228_mm_cvtsi128_si64 (__m128i __A) 229{ 230 return ((__v2di)__A)[0]; 231} 232 233/* Microsoft intrinsic. */ 234extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 235_mm_cvtsi128_si64x (__m128i __A) 236{ 237 return ((__v2di)__A)[0]; 238} 239 240extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 241_mm_add_pd (__m128d __A, __m128d __B) 242{ 243 return (__m128d) ((__v2df)__A + (__v2df)__B); 244} 245 246/* Add the lower double-precision (64-bit) floating-point element in 247 a and b, store the result in the lower element of dst, and copy 248 the upper element from a to the upper element of dst. */ 249extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 250_mm_add_sd (__m128d __A, __m128d __B) 251{ 252 __A[0] = __A[0] + __B[0]; 253 return (__A); 254} 255 256extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 257_mm_sub_pd (__m128d __A, __m128d __B) 258{ 259 return (__m128d) ((__v2df)__A - (__v2df)__B); 260} 261 262extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 263_mm_sub_sd (__m128d __A, __m128d __B) 264{ 265 __A[0] = __A[0] - __B[0]; 266 return (__A); 267} 268 269extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 270_mm_mul_pd (__m128d __A, __m128d __B) 271{ 272 return (__m128d) ((__v2df)__A * (__v2df)__B); 273} 274 275extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 276_mm_mul_sd (__m128d __A, __m128d __B) 277{ 278 __A[0] = __A[0] * __B[0]; 279 return (__A); 280} 281 282extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 283_mm_div_pd (__m128d __A, __m128d __B) 284{ 285 return (__m128d) ((__v2df)__A / (__v2df)__B); 286} 287 288extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 289_mm_div_sd (__m128d __A, __m128d __B) 290{ 291 __A[0] = __A[0] / __B[0]; 292 return (__A); 293} 294 295extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 296_mm_sqrt_pd (__m128d __A) 297{ 298 return (vec_sqrt (__A)); 299} 300 301/* Return pair {sqrt (B[0]), A[1]}. */ 302extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 303_mm_sqrt_sd (__m128d __A, __m128d __B) 304{ 305 __v2df c; 306 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); 307 return (__m128d) _mm_setr_pd (c[0], __A[1]); 308} 309 310extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 311_mm_min_pd (__m128d __A, __m128d __B) 312{ 313 return (vec_min (__A, __B)); 314} 315 316extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 317_mm_min_sd (__m128d __A, __m128d __B) 318{ 319 __v2df a, b, c; 320 a = vec_splats (__A[0]); 321 b = vec_splats (__B[0]); 322 c = vec_min (a, b); 323 return (__m128d) _mm_setr_pd (c[0], __A[1]); 324} 325 326extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 327_mm_max_pd (__m128d __A, __m128d __B) 328{ 329 return (vec_max (__A, __B)); 330} 331 332extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 333_mm_max_sd (__m128d __A, __m128d __B) 334{ 335 __v2df a, b, c; 336 a = vec_splats (__A[0]); 337 b = vec_splats (__B[0]); 338 c = vec_max (a, b); 339 return (__m128d) _mm_setr_pd (c[0], __A[1]); 340} 341 342extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 343_mm_cmpeq_pd (__m128d __A, __m128d __B) 344{ 345 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B)); 346} 347 348extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 349_mm_cmplt_pd (__m128d __A, __m128d __B) 350{ 351 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); 352} 353 354extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 355_mm_cmple_pd (__m128d __A, __m128d __B) 356{ 357 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 358} 359 360extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 361_mm_cmpgt_pd (__m128d __A, __m128d __B) 362{ 363 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 364} 365 366extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 367_mm_cmpge_pd (__m128d __A, __m128d __B) 368{ 369 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B)); 370} 371 372extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 373_mm_cmpneq_pd (__m128d __A, __m128d __B) 374{ 375 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); 376 return ((__m128d)vec_nor (temp, temp)); 377} 378 379extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 380_mm_cmpnlt_pd (__m128d __A, __m128d __B) 381{ 382 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B)); 383} 384 385extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 386_mm_cmpnle_pd (__m128d __A, __m128d __B) 387{ 388 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 389} 390 391extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 392_mm_cmpngt_pd (__m128d __A, __m128d __B) 393{ 394 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 395} 396 397extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 398_mm_cmpnge_pd (__m128d __A, __m128d __B) 399{ 400 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); 401} 402 403extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 404_mm_cmpord_pd (__m128d __A, __m128d __B) 405{ 406#if _ARCH_PWR8 407 __v2du c, d; 408 /* Compare against self will return false (0's) if NAN. */ 409 c = (__v2du)vec_cmpeq (__A, __A); 410 d = (__v2du)vec_cmpeq (__B, __B); 411#else 412 __v2du a, b; 413 __v2du c, d; 414 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000}; 415 a = (__v2du)vec_abs ((__v2df)__A); 416 b = (__v2du)vec_abs ((__v2df)__B); 417 c = (__v2du)vec_cmpgt (double_exp_mask, a); 418 d = (__v2du)vec_cmpgt (double_exp_mask, b); 419#endif 420 /* A != NAN and B != NAN. */ 421 return ((__m128d)vec_and(c, d)); 422} 423 424extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 425_mm_cmpunord_pd (__m128d __A, __m128d __B) 426{ 427#if _ARCH_PWR8 428 __v2du c, d; 429 /* Compare against self will return false (0's) if NAN. */ 430 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); 431 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); 432 /* A == NAN OR B == NAN converts too: 433 NOT(A != NAN) OR NOT(B != NAN). */ 434 c = vec_nor (c, c); 435 return ((__m128d)vec_orc(c, d)); 436#else 437 __v2du c, d; 438 /* Compare against self will return false (0's) if NAN. */ 439 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); 440 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); 441 /* Convert the true ('1's) is NAN. */ 442 c = vec_nor (c, c); 443 d = vec_nor (d, d); 444 return ((__m128d)vec_or(c, d)); 445#endif 446} 447 448extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 449_mm_cmpeq_sd(__m128d __A, __m128d __B) 450{ 451 __v2df a, b, c; 452 /* PowerISA VSX does not allow partial (for just lower double) 453 results. So to insure we don't generate spurious exceptions 454 (from the upper double values) we splat the lower double 455 before we do the operation. */ 456 a = vec_splats (__A[0]); 457 b = vec_splats (__B[0]); 458 c = (__v2df) vec_cmpeq(a, b); 459 /* Then we merge the lower double result with the original upper 460 double from __A. */ 461 return (__m128d) _mm_setr_pd (c[0], __A[1]); 462} 463 464extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 465_mm_cmplt_sd (__m128d __A, __m128d __B) 466{ 467 __v2df a, b, c; 468 a = vec_splats (__A[0]); 469 b = vec_splats (__B[0]); 470 c = (__v2df) vec_cmplt(a, b); 471 return (__m128d) _mm_setr_pd (c[0], __A[1]); 472} 473 474extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 475_mm_cmple_sd (__m128d __A, __m128d __B) 476{ 477 __v2df a, b, c; 478 a = vec_splats (__A[0]); 479 b = vec_splats (__B[0]); 480 c = (__v2df) vec_cmple(a, b); 481 return (__m128d) _mm_setr_pd (c[0], __A[1]); 482} 483 484extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 485_mm_cmpgt_sd (__m128d __A, __m128d __B) 486{ 487 __v2df a, b, c; 488 a = vec_splats (__A[0]); 489 b = vec_splats (__B[0]); 490 c = (__v2df) vec_cmpgt(a, b); 491 return (__m128d) _mm_setr_pd (c[0], __A[1]); 492} 493 494extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 495_mm_cmpge_sd (__m128d __A, __m128d __B) 496{ 497 __v2df a, b, c; 498 a = vec_splats (__A[0]); 499 b = vec_splats (__B[0]); 500 c = (__v2df) vec_cmpge(a, b); 501 return (__m128d) _mm_setr_pd (c[0], __A[1]); 502} 503 504extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 505_mm_cmpneq_sd (__m128d __A, __m128d __B) 506{ 507 __v2df a, b, c; 508 a = vec_splats (__A[0]); 509 b = vec_splats (__B[0]); 510 c = (__v2df) vec_cmpeq(a, b); 511 c = vec_nor (c, c); 512 return (__m128d) _mm_setr_pd (c[0], __A[1]); 513} 514 515extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 516_mm_cmpnlt_sd (__m128d __A, __m128d __B) 517{ 518 __v2df a, b, c; 519 a = vec_splats (__A[0]); 520 b = vec_splats (__B[0]); 521 /* Not less than is just greater than or equal. */ 522 c = (__v2df) vec_cmpge(a, b); 523 return (__m128d) _mm_setr_pd (c[0], __A[1]); 524} 525 526extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 527_mm_cmpnle_sd (__m128d __A, __m128d __B) 528{ 529 __v2df a, b, c; 530 a = vec_splats (__A[0]); 531 b = vec_splats (__B[0]); 532 /* Not less than or equal is just greater than. */ 533 c = (__v2df) vec_cmpge(a, b); 534 return (__m128d) _mm_setr_pd (c[0], __A[1]); 535} 536 537extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 538_mm_cmpngt_sd (__m128d __A, __m128d __B) 539{ 540 __v2df a, b, c; 541 a = vec_splats (__A[0]); 542 b = vec_splats (__B[0]); 543 /* Not greater than is just less than or equal. */ 544 c = (__v2df) vec_cmple(a, b); 545 return (__m128d) _mm_setr_pd (c[0], __A[1]); 546} 547 548extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 549_mm_cmpnge_sd (__m128d __A, __m128d __B) 550{ 551 __v2df a, b, c; 552 a = vec_splats (__A[0]); 553 b = vec_splats (__B[0]); 554 /* Not greater than or equal is just less than. */ 555 c = (__v2df) vec_cmplt(a, b); 556 return (__m128d) _mm_setr_pd (c[0], __A[1]); 557} 558 559extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 560_mm_cmpord_sd (__m128d __A, __m128d __B) 561{ 562 __v2df r; 563 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); 564 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]); 565} 566 567extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 568_mm_cmpunord_sd (__m128d __A, __m128d __B) 569{ 570 __v2df r; 571 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); 572 return (__m128d) _mm_setr_pd (r[0], __A[1]); 573} 574 575/* FIXME 576 The __mm_comi??_sd and __mm_ucomi??_sd implementations below are 577 exactly the same because GCC for PowerPC only generates unordered 578 compares (scalar and vector). 579 Technically __mm_comieq_sp et all should be using the ordered 580 compare and signal for QNaNs. The __mm_ucomieq_sd et all should 581 be OK. */ 582extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 583_mm_comieq_sd (__m128d __A, __m128d __B) 584{ 585 return (__A[0] == __B[0]); 586} 587 588extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 589_mm_comilt_sd (__m128d __A, __m128d __B) 590{ 591 return (__A[0] < __B[0]); 592} 593 594extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 595_mm_comile_sd (__m128d __A, __m128d __B) 596{ 597 return (__A[0] <= __B[0]); 598} 599 600extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 601_mm_comigt_sd (__m128d __A, __m128d __B) 602{ 603 return (__A[0] > __B[0]); 604} 605 606extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 607_mm_comige_sd (__m128d __A, __m128d __B) 608{ 609 return (__A[0] >= __B[0]); 610} 611 612extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 613_mm_comineq_sd (__m128d __A, __m128d __B) 614{ 615 return (__A[0] != __B[0]); 616} 617 618extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 619_mm_ucomieq_sd (__m128d __A, __m128d __B) 620{ 621 return (__A[0] == __B[0]); 622} 623 624extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 625_mm_ucomilt_sd (__m128d __A, __m128d __B) 626{ 627 return (__A[0] < __B[0]); 628} 629 630extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 631_mm_ucomile_sd (__m128d __A, __m128d __B) 632{ 633 return (__A[0] <= __B[0]); 634} 635 636extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 637_mm_ucomigt_sd (__m128d __A, __m128d __B) 638{ 639 return (__A[0] > __B[0]); 640} 641 642extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 643_mm_ucomige_sd (__m128d __A, __m128d __B) 644{ 645 return (__A[0] >= __B[0]); 646} 647 648extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 649_mm_ucomineq_sd (__m128d __A, __m128d __B) 650{ 651 return (__A[0] != __B[0]); 652} 653 654/* Create a vector of Qi, where i is the element number. */ 655extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 656_mm_set_epi64x (long long __q1, long long __q0) 657{ 658 return __extension__ (__m128i)(__v2di){ __q0, __q1 }; 659} 660 661extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 662_mm_set_epi64 (__m64 __q1, __m64 __q0) 663{ 664 return _mm_set_epi64x ((long long)__q1, (long long)__q0); 665} 666 667extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 668_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) 669{ 670 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; 671} 672 673extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 674_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, 675 short __q3, short __q2, short __q1, short __q0) 676{ 677 return __extension__ (__m128i)(__v8hi){ 678 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; 679} 680 681extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 682_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, 683 char __q11, char __q10, char __q09, char __q08, 684 char __q07, char __q06, char __q05, char __q04, 685 char __q03, char __q02, char __q01, char __q00) 686{ 687 return __extension__ (__m128i)(__v16qi){ 688 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, 689 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 690 }; 691} 692 693/* Set all of the elements of the vector to A. */ 694extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 695_mm_set1_epi64x (long long __A) 696{ 697 return _mm_set_epi64x (__A, __A); 698} 699 700extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 701_mm_set1_epi64 (__m64 __A) 702{ 703 return _mm_set_epi64 (__A, __A); 704} 705 706extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 707_mm_set1_epi32 (int __A) 708{ 709 return _mm_set_epi32 (__A, __A, __A, __A); 710} 711 712extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 713_mm_set1_epi16 (short __A) 714{ 715 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); 716} 717 718extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 719_mm_set1_epi8 (char __A) 720{ 721 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, 722 __A, __A, __A, __A, __A, __A, __A, __A); 723} 724 725/* Create a vector of Qi, where i is the element number. 726 The parameter order is reversed from the _mm_set_epi* functions. */ 727extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 728_mm_setr_epi64 (__m64 __q0, __m64 __q1) 729{ 730 return _mm_set_epi64 (__q1, __q0); 731} 732 733extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 734_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) 735{ 736 return _mm_set_epi32 (__q3, __q2, __q1, __q0); 737} 738 739extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 740_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, 741 short __q4, short __q5, short __q6, short __q7) 742{ 743 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); 744} 745 746extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 747_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, 748 char __q04, char __q05, char __q06, char __q07, 749 char __q08, char __q09, char __q10, char __q11, 750 char __q12, char __q13, char __q14, char __q15) 751{ 752 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, 753 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); 754} 755 756/* Create a vector with element 0 as *P and the rest zero. */ 757extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 758_mm_load_si128 (__m128i const *__P) 759{ 760 return *__P; 761} 762 763extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 764_mm_loadu_si128 (__m128i_u const *__P) 765{ 766 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); 767} 768 769extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 770_mm_loadl_epi64 (__m128i_u const *__P) 771{ 772 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P); 773} 774 775extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 776_mm_store_si128 (__m128i *__P, __m128i __B) 777{ 778 vec_st ((__v16qu) __B, 0, (__v16qu*)__P); 779} 780 781extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 782_mm_storeu_si128 (__m128i_u *__P, __m128i __B) 783{ 784 *__P = __B; 785} 786 787extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 788_mm_storel_epi64 (__m128i_u *__P, __m128i __B) 789{ 790 *(long long *)__P = ((__v2di)__B)[0]; 791} 792 793extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 794_mm_movepi64_pi64 (__m128i_u __B) 795{ 796 return (__m64) ((__v2di)__B)[0]; 797} 798 799extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 800_mm_movpi64_epi64 (__m64 __A) 801{ 802 return _mm_set_epi64 ((__m64)0LL, __A); 803} 804 805extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 806_mm_move_epi64 (__m128i __A) 807{ 808 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]); 809} 810 811/* Create an undefined vector. */ 812extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 813_mm_undefined_si128 (void) 814{ 815 __m128i __Y = __Y; 816 return __Y; 817} 818 819/* Create a vector of zeros. */ 820extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 821_mm_setzero_si128 (void) 822{ 823 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; 824} 825 826#ifdef _ARCH_PWR8 827extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 828_mm_cvtepi32_pd (__m128i __A) 829{ 830 __v2di val; 831 /* For LE need to generate Vector Unpack Low Signed Word. 832 Which is generated from unpackh. */ 833 val = (__v2di)vec_unpackh ((__v4si)__A); 834 835 return (__m128d)vec_ctf (val, 0); 836} 837#endif 838 839extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 840_mm_cvtepi32_ps (__m128i __A) 841{ 842 return ((__m128)vec_ctf((__v4si)__A, 0)); 843} 844 845extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 846_mm_cvtpd_epi32 (__m128d __A) 847{ 848 __v2df rounded = vec_rint (__A); 849 __v4si result, temp; 850 const __v4si vzero = 851 { 0, 0, 0, 0 }; 852 853 /* VSX Vector truncate Double-Precision to integer and Convert to 854 Signed Integer Word format with Saturate. */ 855 __asm__( 856 "xvcvdpsxws %x0,%x1" 857 : "=wa" (temp) 858 : "wa" (rounded) 859 : ); 860 861#ifdef _ARCH_PWR8 862 temp = vec_mergeo (temp, temp); 863 result = (__v4si) vec_vpkudum ((__vector long long) temp, 864 (__vector long long) vzero); 865#else 866 { 867 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 868 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 869 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 870 } 871#endif 872 return (__m128i) result; 873} 874 875extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 876_mm_cvtpd_pi32 (__m128d __A) 877{ 878 __m128i result = _mm_cvtpd_epi32(__A); 879 880 return (__m64) result[0]; 881} 882 883extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 884_mm_cvtpd_ps (__m128d __A) 885{ 886 __v4sf result; 887 __v4si temp; 888 const __v4si vzero = { 0, 0, 0, 0 }; 889 890 __asm__( 891 "xvcvdpsp %x0,%x1" 892 : "=wa" (temp) 893 : "wa" (__A) 894 : ); 895 896#ifdef _ARCH_PWR8 897 temp = vec_mergeo (temp, temp); 898 result = (__v4sf) vec_vpkudum ((__vector long long) temp, 899 (__vector long long) vzero); 900#else 901 { 902 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 903 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 904 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 905 } 906#endif 907 return ((__m128)result); 908} 909 910extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 911_mm_cvttpd_epi32 (__m128d __A) 912{ 913 __v4si result; 914 __v4si temp; 915 const __v4si vzero = { 0, 0, 0, 0 }; 916 917 /* VSX Vector truncate Double-Precision to integer and Convert to 918 Signed Integer Word format with Saturate. */ 919 __asm__( 920 "xvcvdpsxws %x0,%x1" 921 : "=wa" (temp) 922 : "wa" (__A) 923 : ); 924 925#ifdef _ARCH_PWR8 926 temp = vec_mergeo (temp, temp); 927 result = (__v4si) vec_vpkudum ((__vector long long) temp, 928 (__vector long long) vzero); 929#else 930 { 931 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 932 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 933 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 934 } 935#endif 936 937 return ((__m128i) result); 938} 939 940extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 941_mm_cvttpd_pi32 (__m128d __A) 942{ 943 __m128i result = _mm_cvttpd_epi32 (__A); 944 945 return (__m64) result[0]; 946} 947 948extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 949_mm_cvtsi128_si32 (__m128i __A) 950{ 951 return ((__v4si)__A)[0]; 952} 953 954#ifdef _ARCH_PWR8 955extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 956_mm_cvtpi32_pd (__m64 __A) 957{ 958 __v4si temp; 959 __v2di tmp2; 960 __v2df result; 961 962 temp = (__v4si)vec_splats (__A); 963 tmp2 = (__v2di)vec_unpackl (temp); 964 result = vec_ctf ((__vector signed long long) tmp2, 0); 965 return (__m128d)result; 966} 967#endif 968 969extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 970_mm_cvtps_epi32 (__m128 __A) 971{ 972 __v4sf rounded; 973 __v4si result; 974 975 rounded = vec_rint((__v4sf) __A); 976 result = vec_cts (rounded, 0); 977 return (__m128i) result; 978} 979 980extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 981_mm_cvttps_epi32 (__m128 __A) 982{ 983 __v4si result; 984 985 result = vec_cts ((__v4sf) __A, 0); 986 return (__m128i) result; 987} 988 989extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 990_mm_cvtps_pd (__m128 __A) 991{ 992 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */ 993#ifdef vec_doubleh 994 return (__m128d) vec_doubleh ((__v4sf)__A); 995#else 996 /* Otherwise the compiler is not current and so need to generate the 997 equivalent code. */ 998 __v4sf a = (__v4sf)__A; 999 __v4sf temp; 1000 __v2df result; 1001#ifdef __LITTLE_ENDIAN__ 1002 /* The input float values are in elements {[0], [1]} but the convert 1003 instruction needs them in elements {[1], [3]}, So we use two 1004 shift left double vector word immediates to get the elements 1005 lined up. */ 1006 temp = __builtin_vsx_xxsldwi (a, a, 3); 1007 temp = __builtin_vsx_xxsldwi (a, temp, 2); 1008#else 1009 /* The input float values are in elements {[0], [1]} but the convert 1010 instruction needs them in elements {[0], [2]}, So we use two 1011 shift left double vector word immediates to get the elements 1012 lined up. */ 1013 temp = vec_vmrghw (a, a); 1014#endif 1015 __asm__( 1016 " xvcvspdp %x0,%x1" 1017 : "=wa" (result) 1018 : "wa" (temp) 1019 : ); 1020 return (__m128d) result; 1021#endif 1022} 1023 1024extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1025_mm_cvtsd_si32 (__m128d __A) 1026{ 1027 __v2df rounded = vec_rint((__v2df) __A); 1028 int result = ((__v2df)rounded)[0]; 1029 1030 return result; 1031} 1032/* Intel intrinsic. */ 1033extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1034_mm_cvtsd_si64 (__m128d __A) 1035{ 1036 __v2df rounded = vec_rint ((__v2df) __A ); 1037 long long result = ((__v2df) rounded)[0]; 1038 1039 return result; 1040} 1041 1042/* Microsoft intrinsic. */ 1043extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1044_mm_cvtsd_si64x (__m128d __A) 1045{ 1046 return _mm_cvtsd_si64 ((__v2df)__A); 1047} 1048 1049extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1050_mm_cvttsd_si32 (__m128d __A) 1051{ 1052 int result = ((__v2df)__A)[0]; 1053 1054 return result; 1055} 1056 1057/* Intel intrinsic. */ 1058extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1059_mm_cvttsd_si64 (__m128d __A) 1060{ 1061 long long result = ((__v2df)__A)[0]; 1062 1063 return result; 1064} 1065 1066/* Microsoft intrinsic. */ 1067extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1068_mm_cvttsd_si64x (__m128d __A) 1069{ 1070 return _mm_cvttsd_si64 (__A); 1071} 1072 1073extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1074_mm_cvtsd_ss (__m128 __A, __m128d __B) 1075{ 1076 __v4sf result = (__v4sf)__A; 1077 1078#ifdef __LITTLE_ENDIAN__ 1079 __v4sf temp_s; 1080 /* Copy double element[0] to element [1] for conversion. */ 1081 __v2df temp_b = vec_splat((__v2df)__B, 0); 1082 1083 /* Pre-rotate __A left 3 (logically right 1) elements. */ 1084 result = __builtin_vsx_xxsldwi (result, result, 3); 1085 /* Convert double to single float scalar in a vector. */ 1086 __asm__( 1087 "xscvdpsp %x0,%x1" 1088 : "=wa" (temp_s) 1089 : "wa" (temp_b) 1090 : ); 1091 /* Shift the resulting scalar into vector element [0]. */ 1092 result = __builtin_vsx_xxsldwi (result, temp_s, 1); 1093#else 1094 result [0] = ((__v2df)__B)[0]; 1095#endif 1096 return (__m128) result; 1097} 1098 1099extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1100_mm_cvtsi32_sd (__m128d __A, int __B) 1101{ 1102 __v2df result = (__v2df)__A; 1103 double db = __B; 1104 result [0] = db; 1105 return (__m128d)result; 1106} 1107 1108/* Intel intrinsic. */ 1109extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1110_mm_cvtsi64_sd (__m128d __A, long long __B) 1111{ 1112 __v2df result = (__v2df)__A; 1113 double db = __B; 1114 result [0] = db; 1115 return (__m128d)result; 1116} 1117 1118/* Microsoft intrinsic. */ 1119extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1120_mm_cvtsi64x_sd (__m128d __A, long long __B) 1121{ 1122 return _mm_cvtsi64_sd (__A, __B); 1123} 1124 1125extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1126_mm_cvtss_sd (__m128d __A, __m128 __B) 1127{ 1128#ifdef __LITTLE_ENDIAN__ 1129 /* Use splat to move element [0] into position for the convert. */ 1130 __v4sf temp = vec_splat ((__v4sf)__B, 0); 1131 __v2df res; 1132 /* Convert single float scalar to double in a vector. */ 1133 __asm__( 1134 "xscvspdp %x0,%x1" 1135 : "=wa" (res) 1136 : "wa" (temp) 1137 : ); 1138 return (__m128d) vec_mergel (res, (__v2df)__A); 1139#else 1140 __v2df res = (__v2df)__A; 1141 res [0] = ((__v4sf)__B) [0]; 1142 return (__m128d) res; 1143#endif 1144} 1145 1146extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1147_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) 1148{ 1149 __vector double result; 1150 const int litmsk = __mask & 0x3; 1151 1152 if (litmsk == 0) 1153 result = vec_mergeh (__A, __B); 1154#if __GNUC__ < 6 1155 else if (litmsk == 1) 1156 result = vec_xxpermdi (__B, __A, 2); 1157 else if (litmsk == 2) 1158 result = vec_xxpermdi (__B, __A, 1); 1159#else 1160 else if (litmsk == 1) 1161 result = vec_xxpermdi (__A, __B, 2); 1162 else if (litmsk == 2) 1163 result = vec_xxpermdi (__A, __B, 1); 1164#endif 1165 else 1166 result = vec_mergel (__A, __B); 1167 1168 return result; 1169} 1170 1171extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1172_mm_unpackhi_pd (__m128d __A, __m128d __B) 1173{ 1174 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B); 1175} 1176 1177extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1178_mm_unpacklo_pd (__m128d __A, __m128d __B) 1179{ 1180 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B); 1181} 1182 1183extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1184_mm_loadh_pd (__m128d __A, double const *__B) 1185{ 1186 __v2df result = (__v2df)__A; 1187 result [1] = *__B; 1188 return (__m128d)result; 1189} 1190 1191extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1192_mm_loadl_pd (__m128d __A, double const *__B) 1193{ 1194 __v2df result = (__v2df)__A; 1195 result [0] = *__B; 1196 return (__m128d)result; 1197} 1198 1199#ifdef _ARCH_PWR8 1200/* Intrinsic functions that require PowerISA 2.07 minimum. */ 1201 1202/* Creates a 2-bit mask from the most significant bits of the DPFP values. */ 1203extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1204_mm_movemask_pd (__m128d __A) 1205{ 1206 __vector unsigned long long result; 1207 static const __vector unsigned int perm_mask = 1208 { 1209#ifdef __LITTLE_ENDIAN__ 1210 0x80800040, 0x80808080, 0x80808080, 0x80808080 1211#else 1212 0x80808080, 0x80808080, 0x80808080, 0x80804000 1213#endif 1214 }; 1215 1216 result = ((__vector unsigned long long) 1217 vec_vbpermq ((__vector unsigned char) __A, 1218 (__vector unsigned char) perm_mask)); 1219 1220#ifdef __LITTLE_ENDIAN__ 1221 return result[1]; 1222#else 1223 return result[0]; 1224#endif 1225} 1226#endif /* _ARCH_PWR8 */ 1227 1228extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1229_mm_packs_epi16 (__m128i __A, __m128i __B) 1230{ 1231 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B); 1232} 1233 1234extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1235_mm_packs_epi32 (__m128i __A, __m128i __B) 1236{ 1237 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B); 1238} 1239 1240extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1241_mm_packus_epi16 (__m128i __A, __m128i __B) 1242{ 1243 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B); 1244} 1245 1246extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1247_mm_unpackhi_epi8 (__m128i __A, __m128i __B) 1248{ 1249 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B); 1250} 1251 1252extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1253_mm_unpackhi_epi16 (__m128i __A, __m128i __B) 1254{ 1255 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B); 1256} 1257 1258extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1259_mm_unpackhi_epi32 (__m128i __A, __m128i __B) 1260{ 1261 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B); 1262} 1263 1264extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1265_mm_unpackhi_epi64 (__m128i __A, __m128i __B) 1266{ 1267 return (__m128i) vec_mergel ((__vector long long) __A, 1268 (__vector long long) __B); 1269} 1270 1271extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1272_mm_unpacklo_epi8 (__m128i __A, __m128i __B) 1273{ 1274 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B); 1275} 1276 1277extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1278_mm_unpacklo_epi16 (__m128i __A, __m128i __B) 1279{ 1280 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B); 1281} 1282 1283extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1284_mm_unpacklo_epi32 (__m128i __A, __m128i __B) 1285{ 1286 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B); 1287} 1288 1289extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1290_mm_unpacklo_epi64 (__m128i __A, __m128i __B) 1291{ 1292 return (__m128i) vec_mergeh ((__vector long long) __A, 1293 (__vector long long) __B); 1294} 1295 1296extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1297_mm_add_epi8 (__m128i __A, __m128i __B) 1298{ 1299 return (__m128i) ((__v16qu)__A + (__v16qu)__B); 1300} 1301 1302extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1303_mm_add_epi16 (__m128i __A, __m128i __B) 1304{ 1305 return (__m128i) ((__v8hu)__A + (__v8hu)__B); 1306} 1307 1308extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1309_mm_add_epi32 (__m128i __A, __m128i __B) 1310{ 1311 return (__m128i) ((__v4su)__A + (__v4su)__B); 1312} 1313 1314extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1315_mm_add_epi64 (__m128i __A, __m128i __B) 1316{ 1317 return (__m128i) ((__v2du)__A + (__v2du)__B); 1318} 1319 1320extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1321_mm_adds_epi8 (__m128i __A, __m128i __B) 1322{ 1323 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B); 1324} 1325 1326extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1327_mm_adds_epi16 (__m128i __A, __m128i __B) 1328{ 1329 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B); 1330} 1331 1332extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1333_mm_adds_epu8 (__m128i __A, __m128i __B) 1334{ 1335 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B); 1336} 1337 1338extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1339_mm_adds_epu16 (__m128i __A, __m128i __B) 1340{ 1341 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B); 1342} 1343 1344extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1345_mm_sub_epi8 (__m128i __A, __m128i __B) 1346{ 1347 return (__m128i) ((__v16qu)__A - (__v16qu)__B); 1348} 1349 1350extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1351_mm_sub_epi16 (__m128i __A, __m128i __B) 1352{ 1353 return (__m128i) ((__v8hu)__A - (__v8hu)__B); 1354} 1355 1356extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1357_mm_sub_epi32 (__m128i __A, __m128i __B) 1358{ 1359 return (__m128i) ((__v4su)__A - (__v4su)__B); 1360} 1361 1362extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1363_mm_sub_epi64 (__m128i __A, __m128i __B) 1364{ 1365 return (__m128i) ((__v2du)__A - (__v2du)__B); 1366} 1367 1368extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1369_mm_subs_epi8 (__m128i __A, __m128i __B) 1370{ 1371 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B); 1372} 1373 1374extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1375_mm_subs_epi16 (__m128i __A, __m128i __B) 1376{ 1377 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B); 1378} 1379 1380extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1381_mm_subs_epu8 (__m128i __A, __m128i __B) 1382{ 1383 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B); 1384} 1385 1386extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1387_mm_subs_epu16 (__m128i __A, __m128i __B) 1388{ 1389 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B); 1390} 1391 1392extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1393_mm_madd_epi16 (__m128i __A, __m128i __B) 1394{ 1395 __vector signed int zero = {0, 0, 0, 0}; 1396 1397 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero); 1398} 1399 1400extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1401_mm_mulhi_epi16 (__m128i __A, __m128i __B) 1402{ 1403 __vector signed int w0, w1; 1404 1405 __vector unsigned char xform1 = { 1406#ifdef __LITTLE_ENDIAN__ 1407 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 1408 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 1409#else 1410 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 1411 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D 1412#endif 1413 }; 1414 1415 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); 1416 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); 1417 return (__m128i) vec_perm (w0, w1, xform1); 1418} 1419 1420extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1421_mm_mullo_epi16 (__m128i __A, __m128i __B) 1422{ 1423 return (__m128i) ((__v8hi)__A * (__v8hi)__B); 1424} 1425 1426extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1427_mm_mul_su32 (__m64 __A, __m64 __B) 1428{ 1429 unsigned int a = __A; 1430 unsigned int b = __B; 1431 1432 return ((__m64)a * (__m64)b); 1433} 1434 1435extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1436_mm_mul_epu32 (__m128i __A, __m128i __B) 1437{ 1438#if __GNUC__ < 8 1439 __v2du result; 1440 1441#ifdef __LITTLE_ENDIAN__ 1442 /* VMX Vector Multiply Odd Unsigned Word. */ 1443 __asm__( 1444 "vmulouw %0,%1,%2" 1445 : "=v" (result) 1446 : "v" (__A), "v" (__B) 1447 : ); 1448#else 1449 /* VMX Vector Multiply Even Unsigned Word. */ 1450 __asm__( 1451 "vmuleuw %0,%1,%2" 1452 : "=v" (result) 1453 : "v" (__A), "v" (__B) 1454 : ); 1455#endif 1456 return (__m128i) result; 1457#else 1458 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); 1459#endif 1460} 1461 1462extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1463_mm_slli_epi16 (__m128i __A, int __B) 1464{ 1465 __v8hu lshift; 1466 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; 1467 1468 if (__B >= 0 && __B < 16) 1469 { 1470 if (__builtin_constant_p(__B)) 1471 lshift = (__v8hu) vec_splat_s16(__B); 1472 else 1473 lshift = vec_splats ((unsigned short) __B); 1474 1475 result = vec_sl ((__v8hi) __A, lshift); 1476 } 1477 1478 return (__m128i) result; 1479} 1480 1481extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1482_mm_slli_epi32 (__m128i __A, int __B) 1483{ 1484 __v4su lshift; 1485 __v4si result = { 0, 0, 0, 0 }; 1486 1487 if (__B >= 0 && __B < 32) 1488 { 1489 if (__builtin_constant_p(__B) && __B < 16) 1490 lshift = (__v4su) vec_splat_s32(__B); 1491 else 1492 lshift = vec_splats ((unsigned int) __B); 1493 1494 result = vec_sl ((__v4si) __A, lshift); 1495 } 1496 1497 return (__m128i) result; 1498} 1499 1500#ifdef _ARCH_PWR8 1501extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1502_mm_slli_epi64 (__m128i __A, int __B) 1503{ 1504 __v2du lshift; 1505 __v2di result = { 0, 0 }; 1506 1507 if (__B >= 0 && __B < 64) 1508 { 1509 if (__builtin_constant_p(__B) && __B < 16) 1510 lshift = (__v2du) vec_splat_s32(__B); 1511 else 1512 lshift = (__v2du) vec_splats ((unsigned int) __B); 1513 1514 result = vec_sl ((__v2di) __A, lshift); 1515 } 1516 1517 return (__m128i) result; 1518} 1519#endif 1520 1521extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1522_mm_srai_epi16 (__m128i __A, int __B) 1523{ 1524 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1525 __v8hi result; 1526 1527 if (__B < 16) 1528 { 1529 if (__builtin_constant_p(__B)) 1530 rshift = (__v8hu) vec_splat_s16(__B); 1531 else 1532 rshift = vec_splats ((unsigned short) __B); 1533 } 1534 result = vec_sra ((__v8hi) __A, rshift); 1535 1536 return (__m128i) result; 1537} 1538 1539extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1540_mm_srai_epi32 (__m128i __A, int __B) 1541{ 1542 __v4su rshift = { 31, 31, 31, 31 }; 1543 __v4si result; 1544 1545 if (__B < 32) 1546 { 1547 if (__builtin_constant_p(__B)) 1548 { 1549 if (__B < 16) 1550 rshift = (__v4su) vec_splat_s32(__B); 1551 else 1552 rshift = (__v4su) vec_splats((unsigned int)__B); 1553 } 1554 else 1555 rshift = vec_splats ((unsigned int) __B); 1556 } 1557 result = vec_sra ((__v4si) __A, rshift); 1558 1559 return (__m128i) result; 1560} 1561 1562extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1563_mm_bslli_si128 (__m128i __A, const int __N) 1564{ 1565 __v16qu result; 1566 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1567 1568 if (__N < 16) 1569 result = vec_sld ((__v16qu) __A, zeros, __N); 1570 else 1571 result = zeros; 1572 1573 return (__m128i) result; 1574} 1575 1576extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1577_mm_bsrli_si128 (__m128i __A, const int __N) 1578{ 1579 __v16qu result; 1580 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1581 1582 if (__N < 16) 1583#ifdef __LITTLE_ENDIAN__ 1584 if (__builtin_constant_p(__N)) 1585 /* Would like to use Vector Shift Left Double by Octet 1586 Immediate here to use the immediate form and avoid 1587 load of __N * 8 value into a separate VR. */ 1588 result = vec_sld (zeros, (__v16qu) __A, (16 - __N)); 1589 else 1590#endif 1591 { 1592 __v16qu shift = vec_splats((unsigned char)(__N*8)); 1593#ifdef __LITTLE_ENDIAN__ 1594 result = vec_sro ((__v16qu)__A, shift); 1595#else 1596 result = vec_slo ((__v16qu)__A, shift); 1597#endif 1598 } 1599 else 1600 result = zeros; 1601 1602 return (__m128i) result; 1603} 1604 1605extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1606_mm_srli_si128 (__m128i __A, const int __N) 1607{ 1608 return _mm_bsrli_si128 (__A, __N); 1609} 1610 1611extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1612_mm_slli_si128 (__m128i __A, const int _imm5) 1613{ 1614 __v16qu result; 1615 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1616 1617 if (_imm5 < 16) 1618#ifdef __LITTLE_ENDIAN__ 1619 result = vec_sld ((__v16qu) __A, zeros, _imm5); 1620#else 1621 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5)); 1622#endif 1623 else 1624 result = zeros; 1625 1626 return (__m128i) result; 1627} 1628 1629extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1630 1631_mm_srli_epi16 (__m128i __A, int __B) 1632{ 1633 __v8hu rshift; 1634 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; 1635 1636 if (__B < 16) 1637 { 1638 if (__builtin_constant_p(__B)) 1639 rshift = (__v8hu) vec_splat_s16(__B); 1640 else 1641 rshift = vec_splats ((unsigned short) __B); 1642 1643 result = vec_sr ((__v8hi) __A, rshift); 1644 } 1645 1646 return (__m128i) result; 1647} 1648 1649extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1650_mm_srli_epi32 (__m128i __A, int __B) 1651{ 1652 __v4su rshift; 1653 __v4si result = { 0, 0, 0, 0 }; 1654 1655 if (__B < 32) 1656 { 1657 if (__builtin_constant_p(__B)) 1658 { 1659 if (__B < 16) 1660 rshift = (__v4su) vec_splat_s32(__B); 1661 else 1662 rshift = (__v4su) vec_splats((unsigned int)__B); 1663 } 1664 else 1665 rshift = vec_splats ((unsigned int) __B); 1666 1667 result = vec_sr ((__v4si) __A, rshift); 1668 } 1669 1670 return (__m128i) result; 1671} 1672 1673#ifdef _ARCH_PWR8 1674extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1675_mm_srli_epi64 (__m128i __A, int __B) 1676{ 1677 __v2du rshift; 1678 __v2di result = { 0, 0 }; 1679 1680 if (__B < 64) 1681 { 1682 if (__builtin_constant_p(__B)) 1683 { 1684 if (__B < 16) 1685 rshift = (__v2du) vec_splat_s32(__B); 1686 else 1687 rshift = (__v2du) vec_splats((unsigned long long)__B); 1688 } 1689 else 1690 rshift = (__v2du) vec_splats ((unsigned int) __B); 1691 1692 result = vec_sr ((__v2di) __A, rshift); 1693 } 1694 1695 return (__m128i) result; 1696} 1697#endif 1698 1699extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1700_mm_sll_epi16 (__m128i __A, __m128i __B) 1701{ 1702 __v8hu lshift; 1703 __vector __bool short shmask; 1704 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1705 __v8hu result; 1706 1707#ifdef __LITTLE_ENDIAN__ 1708 lshift = vec_splat ((__v8hu) __B, 0); 1709#else 1710 lshift = vec_splat ((__v8hu) __B, 3); 1711#endif 1712 shmask = vec_cmple (lshift, shmax); 1713 result = vec_sl ((__v8hu) __A, lshift); 1714 result = vec_sel ((__v8hu) shmask, result, shmask); 1715 1716 return (__m128i) result; 1717} 1718 1719extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1720_mm_sll_epi32 (__m128i __A, __m128i __B) 1721{ 1722 __v4su lshift; 1723 __vector __bool int shmask; 1724 const __v4su shmax = { 32, 32, 32, 32 }; 1725 __v4su result; 1726#ifdef __LITTLE_ENDIAN__ 1727 lshift = vec_splat ((__v4su) __B, 0); 1728#else 1729 lshift = vec_splat ((__v4su) __B, 1); 1730#endif 1731 shmask = vec_cmplt (lshift, shmax); 1732 result = vec_sl ((__v4su) __A, lshift); 1733 result = vec_sel ((__v4su) shmask, result, shmask); 1734 1735 return (__m128i) result; 1736} 1737 1738#ifdef _ARCH_PWR8 1739extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1740_mm_sll_epi64 (__m128i __A, __m128i __B) 1741{ 1742 __v2du lshift; 1743 __vector __bool long long shmask; 1744 const __v2du shmax = { 64, 64 }; 1745 __v2du result; 1746 1747 lshift = vec_splat ((__v2du) __B, 0); 1748 shmask = vec_cmplt (lshift, shmax); 1749 result = vec_sl ((__v2du) __A, lshift); 1750 result = vec_sel ((__v2du) shmask, result, shmask); 1751 1752 return (__m128i) result; 1753} 1754#endif 1755 1756extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1757_mm_sra_epi16 (__m128i __A, __m128i __B) 1758{ 1759 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1760 __v8hu rshift; 1761 __v8hi result; 1762 1763#ifdef __LITTLE_ENDIAN__ 1764 rshift = vec_splat ((__v8hu)__B, 0); 1765#else 1766 rshift = vec_splat ((__v8hu)__B, 3); 1767#endif 1768 rshift = vec_min (rshift, rshmax); 1769 result = vec_sra ((__v8hi) __A, rshift); 1770 1771 return (__m128i) result; 1772} 1773 1774extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1775_mm_sra_epi32 (__m128i __A, __m128i __B) 1776{ 1777 const __v4su rshmax = { 31, 31, 31, 31 }; 1778 __v4su rshift; 1779 __v4si result; 1780 1781#ifdef __LITTLE_ENDIAN__ 1782 rshift = vec_splat ((__v4su)__B, 0); 1783#else 1784 rshift = vec_splat ((__v4su)__B, 1); 1785#endif 1786 rshift = vec_min (rshift, rshmax); 1787 result = vec_sra ((__v4si) __A, rshift); 1788 1789 return (__m128i) result; 1790} 1791 1792extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1793_mm_srl_epi16 (__m128i __A, __m128i __B) 1794{ 1795 __v8hu rshift; 1796 __vector __bool short shmask; 1797 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1798 __v8hu result; 1799 1800#ifdef __LITTLE_ENDIAN__ 1801 rshift = vec_splat ((__v8hu) __B, 0); 1802#else 1803 rshift = vec_splat ((__v8hu) __B, 3); 1804#endif 1805 shmask = vec_cmple (rshift, shmax); 1806 result = vec_sr ((__v8hu) __A, rshift); 1807 result = vec_sel ((__v8hu) shmask, result, shmask); 1808 1809 return (__m128i) result; 1810} 1811 1812extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1813_mm_srl_epi32 (__m128i __A, __m128i __B) 1814{ 1815 __v4su rshift; 1816 __vector __bool int shmask; 1817 const __v4su shmax = { 32, 32, 32, 32 }; 1818 __v4su result; 1819 1820#ifdef __LITTLE_ENDIAN__ 1821 rshift = vec_splat ((__v4su) __B, 0); 1822#else 1823 rshift = vec_splat ((__v4su) __B, 1); 1824#endif 1825 shmask = vec_cmplt (rshift, shmax); 1826 result = vec_sr ((__v4su) __A, rshift); 1827 result = vec_sel ((__v4su) shmask, result, shmask); 1828 1829 return (__m128i) result; 1830} 1831 1832#ifdef _ARCH_PWR8 1833extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1834_mm_srl_epi64 (__m128i __A, __m128i __B) 1835{ 1836 __v2du rshift; 1837 __vector __bool long long shmask; 1838 const __v2du shmax = { 64, 64 }; 1839 __v2du result; 1840 1841 rshift = vec_splat ((__v2du) __B, 0); 1842 shmask = vec_cmplt (rshift, shmax); 1843 result = vec_sr ((__v2du) __A, rshift); 1844 result = vec_sel ((__v2du) shmask, result, shmask); 1845 1846 return (__m128i) result; 1847} 1848#endif 1849 1850extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1851_mm_and_pd (__m128d __A, __m128d __B) 1852{ 1853 return (vec_and ((__v2df) __A, (__v2df) __B)); 1854} 1855 1856extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1857_mm_andnot_pd (__m128d __A, __m128d __B) 1858{ 1859 return (vec_andc ((__v2df) __B, (__v2df) __A)); 1860} 1861 1862extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1863_mm_or_pd (__m128d __A, __m128d __B) 1864{ 1865 return (vec_or ((__v2df) __A, (__v2df) __B)); 1866} 1867 1868extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1869_mm_xor_pd (__m128d __A, __m128d __B) 1870{ 1871 return (vec_xor ((__v2df) __A, (__v2df) __B)); 1872} 1873 1874extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1875_mm_and_si128 (__m128i __A, __m128i __B) 1876{ 1877 return (__m128i)vec_and ((__v2di) __A, (__v2di) __B); 1878} 1879 1880extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1881_mm_andnot_si128 (__m128i __A, __m128i __B) 1882{ 1883 return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A); 1884} 1885 1886extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1887_mm_or_si128 (__m128i __A, __m128i __B) 1888{ 1889 return (__m128i)vec_or ((__v2di) __A, (__v2di) __B); 1890} 1891 1892extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1893_mm_xor_si128 (__m128i __A, __m128i __B) 1894{ 1895 return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B); 1896} 1897 1898extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1899_mm_cmpeq_epi8 (__m128i __A, __m128i __B) 1900{ 1901 return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B); 1902} 1903 1904extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1905_mm_cmpeq_epi16 (__m128i __A, __m128i __B) 1906{ 1907 return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B); 1908} 1909 1910extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1911_mm_cmpeq_epi32 (__m128i __A, __m128i __B) 1912{ 1913 return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B); 1914} 1915 1916extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1917_mm_cmplt_epi8 (__m128i __A, __m128i __B) 1918{ 1919 return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B); 1920} 1921 1922extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1923_mm_cmplt_epi16 (__m128i __A, __m128i __B) 1924{ 1925 return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B); 1926} 1927 1928extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1929_mm_cmplt_epi32 (__m128i __A, __m128i __B) 1930{ 1931 return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B); 1932} 1933 1934extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1935_mm_cmpgt_epi8 (__m128i __A, __m128i __B) 1936{ 1937 return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B); 1938} 1939 1940extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1941_mm_cmpgt_epi16 (__m128i __A, __m128i __B) 1942{ 1943 return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B); 1944} 1945 1946extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1947_mm_cmpgt_epi32 (__m128i __A, __m128i __B) 1948{ 1949 return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B); 1950} 1951 1952extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1953_mm_extract_epi16 (__m128i const __A, int const __N) 1954{ 1955 return (unsigned short) ((__v8hi)__A)[__N & 7]; 1956} 1957 1958extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1959_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) 1960{ 1961 __v8hi result = (__v8hi)__A; 1962 1963 result [(__N & 7)] = __D; 1964 1965 return (__m128i) result; 1966} 1967 1968extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1969_mm_max_epi16 (__m128i __A, __m128i __B) 1970{ 1971 return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B); 1972} 1973 1974extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1975_mm_max_epu8 (__m128i __A, __m128i __B) 1976{ 1977 return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B); 1978} 1979 1980extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1981_mm_min_epi16 (__m128i __A, __m128i __B) 1982{ 1983 return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B); 1984} 1985 1986extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1987_mm_min_epu8 (__m128i __A, __m128i __B) 1988{ 1989 return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B); 1990} 1991 1992 1993#ifdef _ARCH_PWR8 1994/* Intrinsic functions that require PowerISA 2.07 minimum. */ 1995 1996/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ 1997extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1998_mm_movemask_epi8 (__m128i __A) 1999{ 2000 __vector unsigned long long result; 2001 static const __vector unsigned char perm_mask = 2002 { 2003 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, 2004 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 2005 }; 2006 2007 result = ((__vector unsigned long long) 2008 vec_vbpermq ((__vector unsigned char) __A, 2009 (__vector unsigned char) perm_mask)); 2010 2011#ifdef __LITTLE_ENDIAN__ 2012 return result[1]; 2013#else 2014 return result[0]; 2015#endif 2016} 2017#endif /* _ARCH_PWR8 */ 2018 2019extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2020_mm_mulhi_epu16 (__m128i __A, __m128i __B) 2021{ 2022 __v4su w0, w1; 2023 __v16qu xform1 = { 2024#ifdef __LITTLE_ENDIAN__ 2025 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 2026 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 2027#else 2028 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 2029 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D 2030#endif 2031 }; 2032 2033 w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B); 2034 w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B); 2035 return (__m128i) vec_perm (w0, w1, xform1); 2036} 2037 2038extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2039_mm_shufflehi_epi16 (__m128i __A, const int __mask) 2040{ 2041 unsigned long element_selector_98 = __mask & 0x03; 2042 unsigned long element_selector_BA = (__mask >> 2) & 0x03; 2043 unsigned long element_selector_DC = (__mask >> 4) & 0x03; 2044 unsigned long element_selector_FE = (__mask >> 6) & 0x03; 2045 static const unsigned short permute_selectors[4] = 2046 { 2047#ifdef __LITTLE_ENDIAN__ 2048 0x0908, 0x0B0A, 0x0D0C, 0x0F0E 2049#else 2050 0x0809, 0x0A0B, 0x0C0D, 0x0E0F 2051#endif 2052 }; 2053 __v2du pmask = 2054#ifdef __LITTLE_ENDIAN__ 2055 { 0x1716151413121110UL, 0UL}; 2056#else 2057 { 0x1011121314151617UL, 0UL}; 2058#endif 2059 __m64_union t; 2060 __v2du a, r; 2061 2062 t.as_short[0] = permute_selectors[element_selector_98]; 2063 t.as_short[1] = permute_selectors[element_selector_BA]; 2064 t.as_short[2] = permute_selectors[element_selector_DC]; 2065 t.as_short[3] = permute_selectors[element_selector_FE]; 2066 pmask[1] = t.as_m64; 2067 a = (__v2du)__A; 2068 r = vec_perm (a, a, (__vector unsigned char)pmask); 2069 return (__m128i) r; 2070} 2071 2072extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2073_mm_shufflelo_epi16 (__m128i __A, const int __mask) 2074{ 2075 unsigned long element_selector_10 = __mask & 0x03; 2076 unsigned long element_selector_32 = (__mask >> 2) & 0x03; 2077 unsigned long element_selector_54 = (__mask >> 4) & 0x03; 2078 unsigned long element_selector_76 = (__mask >> 6) & 0x03; 2079 static const unsigned short permute_selectors[4] = 2080 { 2081#ifdef __LITTLE_ENDIAN__ 2082 0x0100, 0x0302, 0x0504, 0x0706 2083#else 2084 0x0001, 0x0203, 0x0405, 0x0607 2085#endif 2086 }; 2087 __v2du pmask = 2088#ifdef __LITTLE_ENDIAN__ 2089 { 0UL, 0x1f1e1d1c1b1a1918UL}; 2090#else 2091 { 0UL, 0x18191a1b1c1d1e1fUL}; 2092#endif 2093 __m64_union t; 2094 __v2du a, r; 2095 t.as_short[0] = permute_selectors[element_selector_10]; 2096 t.as_short[1] = permute_selectors[element_selector_32]; 2097 t.as_short[2] = permute_selectors[element_selector_54]; 2098 t.as_short[3] = permute_selectors[element_selector_76]; 2099 pmask[0] = t.as_m64; 2100 a = (__v2du)__A; 2101 r = vec_perm (a, a, (__vector unsigned char)pmask); 2102 return (__m128i) r; 2103} 2104 2105extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2106_mm_shuffle_epi32 (__m128i __A, const int __mask) 2107{ 2108 unsigned long element_selector_10 = __mask & 0x03; 2109 unsigned long element_selector_32 = (__mask >> 2) & 0x03; 2110 unsigned long element_selector_54 = (__mask >> 4) & 0x03; 2111 unsigned long element_selector_76 = (__mask >> 6) & 0x03; 2112 static const unsigned int permute_selectors[4] = 2113 { 2114#ifdef __LITTLE_ENDIAN__ 2115 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 2116#else 2117 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F 2118#endif 2119 }; 2120 __v4su t; 2121 2122 t[0] = permute_selectors[element_selector_10]; 2123 t[1] = permute_selectors[element_selector_32]; 2124 t[2] = permute_selectors[element_selector_54] + 0x10101010; 2125 t[3] = permute_selectors[element_selector_76] + 0x10101010; 2126 return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t); 2127} 2128 2129extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2130_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) 2131{ 2132 __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; 2133 __v16qu mask, tmp; 2134 __m128i_u *p = (__m128i_u*)__C; 2135 2136 tmp = (__v16qu)_mm_loadu_si128(p); 2137 mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit); 2138 tmp = vec_sel (tmp, (__v16qu)__A, mask); 2139 _mm_storeu_si128 (p, (__m128i)tmp); 2140} 2141 2142extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2143_mm_avg_epu8 (__m128i __A, __m128i __B) 2144{ 2145 return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B); 2146} 2147 2148extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2149_mm_avg_epu16 (__m128i __A, __m128i __B) 2150{ 2151 return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B); 2152} 2153 2154 2155extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2156_mm_sad_epu8 (__m128i __A, __m128i __B) 2157{ 2158 __v16qu a, b; 2159 __v16qu vmin, vmax, vabsdiff; 2160 __v4si vsum; 2161 const __v4su zero = { 0, 0, 0, 0 }; 2162 __v4si result; 2163 2164 a = (__v16qu) __A; 2165 b = (__v16qu) __B; 2166 vmin = vec_min (a, b); 2167 vmax = vec_max (a, b); 2168 vabsdiff = vec_sub (vmax, vmin); 2169 /* Sum four groups of bytes into integers. */ 2170 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); 2171 /* Sum across four integers with two integer results. */ 2172 result = vec_sum2s (vsum, (__vector signed int) zero); 2173 /* Rotate the sums into the correct position. */ 2174#ifdef __LITTLE_ENDIAN__ 2175 result = vec_sld (result, result, 4); 2176#else 2177 result = vec_sld (result, result, 6); 2178#endif 2179 /* Rotate the sums into the correct position. */ 2180 return (__m128i) result; 2181} 2182 2183extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2184_mm_stream_si32 (int *__A, int __B) 2185{ 2186 /* Use the data cache block touch for store transient. */ 2187 __asm__ ( 2188 "dcbtstt 0,%0" 2189 : 2190 : "b" (__A) 2191 : "memory" 2192 ); 2193 *__A = __B; 2194} 2195 2196extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2197_mm_stream_si64 (long long int *__A, long long int __B) 2198{ 2199 /* Use the data cache block touch for store transient. */ 2200 __asm__ ( 2201 " dcbtstt 0,%0" 2202 : 2203 : "b" (__A) 2204 : "memory" 2205 ); 2206 *__A = __B; 2207} 2208 2209extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2210_mm_stream_si128 (__m128i *__A, __m128i __B) 2211{ 2212 /* Use the data cache block touch for store transient. */ 2213 __asm__ ( 2214 "dcbtstt 0,%0" 2215 : 2216 : "b" (__A) 2217 : "memory" 2218 ); 2219 *__A = __B; 2220} 2221 2222extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2223_mm_stream_pd (double *__A, __m128d __B) 2224{ 2225 /* Use the data cache block touch for store transient. */ 2226 __asm__ ( 2227 "dcbtstt 0,%0" 2228 : 2229 : "b" (__A) 2230 : "memory" 2231 ); 2232 *(__m128d*)__A = __B; 2233} 2234 2235extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2236_mm_clflush (void const *__A) 2237{ 2238 /* Use the data cache block flush. */ 2239 __asm__ ( 2240 "dcbf 0,%0" 2241 : 2242 : "b" (__A) 2243 : "memory" 2244 ); 2245} 2246 2247extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2248_mm_lfence (void) 2249{ 2250 /* Use light weight sync for load to load ordering. */ 2251 __atomic_thread_fence (__ATOMIC_RELEASE); 2252} 2253 2254extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2255_mm_mfence (void) 2256{ 2257 /* Use heavy weight sync for any to any ordering. */ 2258 __atomic_thread_fence (__ATOMIC_SEQ_CST); 2259} 2260 2261extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2262_mm_cvtsi32_si128 (int __A) 2263{ 2264 return _mm_set_epi32 (0, 0, 0, __A); 2265} 2266 2267extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2268_mm_cvtsi64_si128 (long long __A) 2269{ 2270 return __extension__ (__m128i)(__v2di){ __A, 0LL }; 2271} 2272 2273/* Microsoft intrinsic. */ 2274extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2275_mm_cvtsi64x_si128 (long long __A) 2276{ 2277 return __extension__ (__m128i)(__v2di){ __A, 0LL }; 2278} 2279 2280/* Casts between various SP, DP, INT vector types. Note that these do no 2281 conversion of values, they just change the type. */ 2282extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2283_mm_castpd_ps(__m128d __A) 2284{ 2285 return (__m128) __A; 2286} 2287 2288extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2289_mm_castpd_si128(__m128d __A) 2290{ 2291 return (__m128i) __A; 2292} 2293 2294extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2295_mm_castps_pd(__m128 __A) 2296{ 2297 return (__m128d) __A; 2298} 2299 2300extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2301_mm_castps_si128(__m128 __A) 2302{ 2303 return (__m128i) __A; 2304} 2305 2306extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2307_mm_castsi128_ps(__m128i __A) 2308{ 2309 return (__m128) __A; 2310} 2311 2312extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2313_mm_castsi128_pd(__m128i __A) 2314{ 2315 return (__m128d) __A; 2316} 2317 2318#endif /* EMMINTRIN_H_ */ 2319