1/* Copyright (C) 2003-2020 Free Software Foundation, Inc. 2 3 This file is part of GCC. 4 5 GCC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 GCC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 Under Section 7 of GPL version 3, you are granted additional 16 permissions described in the GCC Runtime Library Exception, version 17 3.1, as published by the Free Software Foundation. 18 19 You should have received a copy of the GNU General Public License and 20 a copy of the GCC Runtime Library Exception along with this program; 21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 22 <http://www.gnu.org/licenses/>. */ 23 24/* Implemented from the specification included in the Intel C++ Compiler 25 User Guide and Reference, version 9.0. */ 26 27#ifndef NO_WARN_X86_INTRINSICS 28/* This header is distributed to simplify porting x86_64 code that 29 makes explicit use of Intel intrinsics to powerpc64le. 30 It is the user's responsibility to determine if the results are 31 acceptable and make additional changes as necessary. 32 Note that much code that uses Intel intrinsics can be rewritten in 33 standard C or GNU C extensions, which are more portable and better 34 optimized across multiple targets. 35 36 In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics, 37 the PowerPC VMX/VSX ISA is a good match for vector double SIMD 38 operations. However scalar double operations in vector (XMM) 39 registers require the POWER8 VSX ISA (2.07) level. Also there are 40 important differences for data format and placement of double 41 scalars in the vector register. 

   For PowerISA Scalar double is in FPRs (left most 64-bits of the
   low 32 VSRs), while X86_64 SSE2 uses the right most 64-bits of
   the XMM. These differences require extra steps on POWER to match
   the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations. We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXSCR vs
   the PowerISA FPSCR / VSCR registers. We recommend applications
   replace direct access to the MXSCR with the more portable <fenv.h>
   Posix APIs. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files. */
#include <xmmintrin.h>

/* SSE2 */
/* Internal element-typed views of a 128-bit VMX/VSX register.  These
   mirror the __v* typedefs used by the x86 headers. */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  __aligned__ (1) lets the
   compiler emit unaligned loads/stores for these. */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Define two value permute mask.
 */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

/* Alias for _mm_set1_pd (Intel defines both spellings). */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  /* Self-initialization silences the uninitialized-variable warning;
     the contents are deliberately unspecified. */
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return (__m128d) vec_splats (0);
}

/* Sets the low DPFP value of A from the low value of B. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  __v2df __result = (__v2df) __A;
  __result [0] = ((__v2df) __B)[0];
  return (__m128d) __result;
}

/* Load two DPFP values from P.
   The address must be 16-byte aligned. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  /* vec_ld silently masks the low address bits, so trap misaligned
     pointers in debug builds rather than loading the wrong data. */
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128d)vec_ld(0, (__v16qu*)__P));
}

/* Load two DPFP values from P. The address need not be 16-byte aligned. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with all two elements equal to *P. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return (vec_splats (*__P));
}

/* Create a vector with element 0 as *P and the rest zero. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

/* Alias for _mm_load1_pd. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order. The address must be aligned. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __v2df __tmp = _mm_load_pd (__P);
  /* xxpermdi selector 2 swaps the two doublewords. */
  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
}

/* Store two DPFP values. The address must be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  /* vec_st silently masks the low address bits; catch misaligned
     pointers in debug builds. */
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}

/* Store two DPFP values. The address need not be 16-byte aligned.
 */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  /* The __m128d_u type carries __aligned__ (1), so this store may be
     emitted as an unaligned VSX store. */
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

/* Return the lower DPFP value as a scalar double. */
extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

/* Stores the lower DPFP value (alias for _mm_store_sd). */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}
/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_splat (__A, 0));
}

/* Alias for _mm_store1_pd. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order. The address must be aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  /* xxpermdi selector 2 swaps the two doublewords before the store. */
  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
}

/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.
 */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Add packed double-precision values. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] + __B[0];
  return (__A);
}

/* Subtract packed double-precision values. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

/* Subtract the lower doubles; upper element copied from A. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] - __B[0];
  return (__A);
}

/* Multiply packed double-precision values. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

/* Multiply the lower doubles; upper element copied from A. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] * __B[0];
  return (__A);
}

/* Divide packed double-precision values. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

/* Divide the lower doubles; upper element copied from A. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (vec_sqrt (__A));
}

/* Return pair {sqrt (B[0]), A[1]}. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __c;
  /* Splat the low double so the (unused) upper lane cannot raise a
     spurious FP exception during the vector sqrt. */
  __c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

/* Packed minimum of double-precision values. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (vec_min (__A, __B));
}

/* Return pair {min (A[0], B[0]), A[1]}. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  /* Splat the low doubles to avoid spurious exceptions from the
     upper lanes, then merge the low result with A's upper lane. */
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = vec_min (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

/* Packed maximum of double-precision values. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (vec_max (__A, __B));
}

/* Return pair {max (A[0], B[0]), A[1]}. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = vec_max (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

/* Packed compares: each lane becomes all-ones when the predicate
   holds, all-zeros otherwise. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A,
__m128d __B) 383{ 384 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 385} 386 387extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 388_mm_cmpgt_pd (__m128d __A, __m128d __B) 389{ 390 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 391} 392 393extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 394_mm_cmpge_pd (__m128d __A, __m128d __B) 395{ 396 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B)); 397} 398 399extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 400_mm_cmpneq_pd (__m128d __A, __m128d __B) 401{ 402 __v2df __temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); 403 return ((__m128d)vec_nor (__temp, __temp)); 404} 405 406extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 407_mm_cmpnlt_pd (__m128d __A, __m128d __B) 408{ 409 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B)); 410} 411 412extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 413_mm_cmpnle_pd (__m128d __A, __m128d __B) 414{ 415 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 416} 417 418extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 419_mm_cmpngt_pd (__m128d __A, __m128d __B) 420{ 421 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 422} 423 424extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 425_mm_cmpnge_pd (__m128d __A, __m128d __B) 426{ 427 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); 428} 429 430#if _ARCH_PWR8 431extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 432_mm_cmpord_pd (__m128d __A, __m128d __B) 433{ 434 __v2du c, d; 435 /* Compare against self will return false (0's) if NAN. */ 436 c = (__v2du)vec_cmpeq (__A, __A); 437 d = (__v2du)vec_cmpeq (__B, __B); 438 /* A != NAN and B != NAN. 
*/ 439 return ((__m128d)vec_and(c, d)); 440} 441#endif 442 443extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 444_mm_cmpunord_pd (__m128d __A, __m128d __B) 445{ 446#if _ARCH_PWR8 447 __v2du c, d; 448 /* Compare against self will return false (0's) if NAN. */ 449 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); 450 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); 451 /* A == NAN OR B == NAN converts too: 452 NOT(A != NAN) OR NOT(B != NAN). */ 453 c = vec_nor (c, c); 454 return ((__m128d)vec_orc(c, d)); 455#else 456 __v2du c, d; 457 /* Compare against self will return false (0's) if NAN. */ 458 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); 459 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); 460 /* Convert the true ('1's) is NAN. */ 461 c = vec_nor (c, c); 462 d = vec_nor (d, d); 463 return ((__m128d)vec_or(c, d)); 464#endif 465} 466 467extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 468_mm_cmpeq_sd(__m128d __A, __m128d __B) 469{ 470 __v2df a, b, c; 471 /* PowerISA VSX does not allow partial (for just lower double) 472 results. So to insure we don't generate spurious exceptions 473 (from the upper double values) we splat the lower double 474 before we do the operation. */ 475 a = vec_splats (__A[0]); 476 b = vec_splats (__B[0]); 477 c = (__v2df) vec_cmpeq(a, b); 478 /* Then we merge the lower double result with the original upper 479 double from __A. 
*/ 480 return (__m128d) _mm_setr_pd (c[0], __A[1]); 481} 482 483extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 484_mm_cmplt_sd (__m128d __A, __m128d __B) 485{ 486 __v2df a, b, c; 487 a = vec_splats (__A[0]); 488 b = vec_splats (__B[0]); 489 c = (__v2df) vec_cmplt(a, b); 490 return (__m128d) _mm_setr_pd (c[0], __A[1]); 491} 492 493extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 494_mm_cmple_sd (__m128d __A, __m128d __B) 495{ 496 __v2df a, b, c; 497 a = vec_splats (__A[0]); 498 b = vec_splats (__B[0]); 499 c = (__v2df) vec_cmple(a, b); 500 return (__m128d) _mm_setr_pd (c[0], __A[1]); 501} 502 503extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 504_mm_cmpgt_sd (__m128d __A, __m128d __B) 505{ 506 __v2df a, b, c; 507 a = vec_splats (__A[0]); 508 b = vec_splats (__B[0]); 509 c = (__v2df) vec_cmpgt(a, b); 510 return (__m128d) _mm_setr_pd (c[0], __A[1]); 511} 512 513extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 514_mm_cmpge_sd (__m128d __A, __m128d __B) 515{ 516 __v2df a, b, c; 517 a = vec_splats (__A[0]); 518 b = vec_splats (__B[0]); 519 c = (__v2df) vec_cmpge(a, b); 520 return (__m128d) _mm_setr_pd (c[0], __A[1]); 521} 522 523extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 524_mm_cmpneq_sd (__m128d __A, __m128d __B) 525{ 526 __v2df a, b, c; 527 a = vec_splats (__A[0]); 528 b = vec_splats (__B[0]); 529 c = (__v2df) vec_cmpeq(a, b); 530 c = vec_nor (c, c); 531 return (__m128d) _mm_setr_pd (c[0], __A[1]); 532} 533 534extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 535_mm_cmpnlt_sd (__m128d __A, __m128d __B) 536{ 537 __v2df a, b, c; 538 a = vec_splats (__A[0]); 539 b = vec_splats (__B[0]); 540 /* Not less than is just greater than or equal. 
*/ 541 c = (__v2df) vec_cmpge(a, b); 542 return (__m128d) _mm_setr_pd (c[0], __A[1]); 543} 544 545extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 546_mm_cmpnle_sd (__m128d __A, __m128d __B) 547{ 548 __v2df a, b, c; 549 a = vec_splats (__A[0]); 550 b = vec_splats (__B[0]); 551 /* Not less than or equal is just greater than. */ 552 c = (__v2df) vec_cmpge(a, b); 553 return (__m128d) _mm_setr_pd (c[0], __A[1]); 554} 555 556extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 557_mm_cmpngt_sd (__m128d __A, __m128d __B) 558{ 559 __v2df a, b, c; 560 a = vec_splats (__A[0]); 561 b = vec_splats (__B[0]); 562 /* Not greater than is just less than or equal. */ 563 c = (__v2df) vec_cmple(a, b); 564 return (__m128d) _mm_setr_pd (c[0], __A[1]); 565} 566 567extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 568_mm_cmpnge_sd (__m128d __A, __m128d __B) 569{ 570 __v2df a, b, c; 571 a = vec_splats (__A[0]); 572 b = vec_splats (__B[0]); 573 /* Not greater than or equal is just less than. */ 574 c = (__v2df) vec_cmplt(a, b); 575 return (__m128d) _mm_setr_pd (c[0], __A[1]); 576} 577 578#if _ARCH_PWR8 579extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 580_mm_cmpord_sd (__m128d __A, __m128d __B) 581{ 582 __v2df r; 583 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); 584 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]); 585} 586#endif 587 588extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 589_mm_cmpunord_sd (__m128d __A, __m128d __B) 590{ 591 __v2df r; 592 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); 593 return (__m128d) _mm_setr_pd (r[0], __A[1]); 594} 595 596/* FIXME 597 The __mm_comi??_sd and __mm_ucomi??_sd implementations below are 598 exactly the same because GCC for PowerPC only generates unordered 599 compares (scalar and vector). 
   Technically __mm_comieq_sp et all should be using the ordered
   compare and signal for QNaNs. The __mm_ucomieq_sd et all should
   be OK. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1, __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.
 */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.
 */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
		short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Create a vector with element 0 as *P and the rest zero.
 */
/* NOTE(review): the comment above looks misplaced — _mm_load_si128
   loads a full (aligned) 128-bit vector; confirm against upstream. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

/* Load the low 64 bits from P; the upper 64 bits are zero. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  /* vec_st silently masks the low address bits; catch misaligned
     pointers in debug builds. */
  assert(((unsigned long )__P & 0xfUL) == 0UL);
  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  *__P = __B;
}

/* Store the low 64 bits of B. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i_u __B)
{
  return (__m64) ((__v2di)__B)[0];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

/* Copy the low 64 bits of A; zero the upper 64 bits. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.
 */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
  /* Self-initialization silences the uninitialized-variable warning;
     the contents are deliberately unspecified. */
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

#ifdef _ARCH_PWR8
/* Convert the two low signed 32-bit integers to double precision. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  __v2di __val;
  /* For LE need to generate Vector Unpack Low Signed Word.
     Which is generated from unpackh. */
  __val = (__v2di)vec_unpackh ((__v4si)__A);

  return (__m128d)vec_ctf (__val, 0);
}
#endif

/* Convert four signed 32-bit integers to single precision. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

/* Convert two doubles to two signed 32-bit ints (current rounding
   mode) in the low half of the result; high half is zero. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  __v2df __rounded = vec_rint (__A);
  __v4si __result, __temp;
  const __v4si __vzero =
    { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate. */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__rounded)
      : );

#ifdef _ARCH_PWR8
  /* The converted words land in the even (BE) / odd (LE) lanes;
     gather them and pack against zero. */
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
				   (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
			      0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return (__m128i) __result;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64) __result[0];
}

/* Convert two doubles to two floats in the low half; high half zero. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  __asm__(
      "xvcvdpsp %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4sf) vec_vpkudum ((__vector long long) __temp,
				   (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
			      0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4sf) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

/* Convert two doubles to two signed 32-bit ints with truncation. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed
 Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
  /* Gather the two converted words and pack into the low half,
     zeroing the high half.  */
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
				   (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
			      0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif

  return ((__m128i) __result);
}

/* Truncate the two doubles in __A to two signed 32-bit integers,
   returned as an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  __m128i __result = _mm_cvttpd_epi32 (__A);

  return (__m64) __result[0];
}

/* Return the lowest 32-bit integer element of __A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
/* Convert the two signed 32-bit integers in __A to two doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  __v4si __temp;
  __v2di __tmp2;
  __v2df __result;

  __temp = (__v4si)vec_splats (__A);
  __tmp2 = (__v2di)vec_unpackl (__temp);
  __result = vec_ctf ((__vector signed long long) __tmp2, 0);
  return (__m128d)__result;
}
#endif

/* Convert the four floats in __A to four signed 32-bit integers,
   rounded per the current rounding mode.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf) __A);
  __result = vec_cts (__rounded, 0);
  return (__m128i) __result;
}

/* Convert the four floats in __A to four signed 32-bit integers using
   truncation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  __v4si __result;

  __result = vec_cts ((__v4sf) __A, 0);
  return (__m128i) __result;
}

/* Convert the two low floats in __A to two doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  /* Check if vec_doubleh is defined by <altivec.h>.  If so use that.  */
#ifdef vec_doubleh
  return (__m128d) vec_doubleh ((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and so need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, So we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi (__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi (__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, So we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = vec_vmrghw (__a, __a);
#endif
  /* VSX Vector Convert Single-Precision to Double-Precision.  */
  __asm__(
      " xvcvspdp %x0,%x1"
      : "=wa" (__result)
      : "wa" (__temp)
      : );
  return (__m128d) __result;
#endif
}

/* Convert the low double of __A to a signed 32-bit integer, rounded
   per the current rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  __v2df __rounded = vec_rint((__v2df) __A);
  int __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Intel intrinsic.  Convert the low double of __A to a signed 64-bit
   integer, rounded per the current rounding mode.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  __v2df __rounded = vec_rint ((__v2df) __A );
  long long __result = ((__v2df) __rounded)[0];

  return __result;
}

/* Microsoft intrinsic.
 */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return _mm_cvtsd_si64 ((__v2df)__A);
}

/* Convert the low double of __A to a signed 32-bit integer using
   truncation (the C double-to-int cast truncates toward zero).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  int __result = ((__v2df)__A)[0];

  return __result;
}

/* Intel intrinsic.  Convert the low double of __A to a signed 64-bit
   integer using truncation.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  long long __result = ((__v2df)__A)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return _mm_cvttsd_si64 (__A);
}

/* Convert the low double of __B to a float and insert it as element
   [0] of __A; the upper three floats of __A are preserved.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  __result = __builtin_vsx_xxsldwi (__result, __result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__(
      "xscvdpsp %x0,%x1"
      : "=wa" (__temp_s)
      : "wa" (__temp_b)
      : );
  /* Shift the resulting scalar into vector element [0].
 */
  __result = __builtin_vsx_xxsldwi (__result, __temp_s, 1);
#else
  /* BE: plain element store; the scalar convert is implicit in the
     double-to-float assignment.  */
  __result [0] = ((__v2df)__B)[0];
#endif
  return (__m128) __result;
}

/* Insert __B (converted to double) as element [0] of __A; element [1]
   is preserved.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result [0] = __db;
  return (__m128d)__result;
}

/* Intel intrinsic.  Insert __B (converted to double) as element [0]
   of __A; element [1] is preserved.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result [0] = __db;
  return (__m128d)__result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return _mm_cvtsi64_sd (__A, __B);
}

/* Convert float element [0] of __B to double and insert it as element
   [0] of __A; element [1] of __A is preserved.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert.  */
  __v4sf __temp = vec_splat ((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector.
 */
  __asm__(
      "xscvspdp %x0,%x1"
      : "=wa" (__res)
      : "wa" (__temp)
      : );
  /* Merge the converted scalar with the preserved high double of __A.  */
  return (__m128d) vec_mergel (__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res [0] = ((__v4sf)__B) [0];
  return (__m128d) __res;
#endif
}

/* Select one double from each of __A/__B per the low 2 bits of __mask:
   bit 0 picks result[0] from __A, bit 1 picks result[1] from __B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  __vector double __result;
  const int __litmsk = __mask & 0x3;

  if (__litmsk == 0)
    __result = vec_mergeh (__A, __B);
#if __GNUC__ < 6
  /* Older GCC used the opposite operand order for vec_xxpermdi.  */
  else if (__litmsk == 1)
    __result = vec_xxpermdi (__B, __A, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi (__B, __A, 1);
#else
  else if (__litmsk == 1)
    __result = vec_xxpermdi (__A, __B, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi (__A, __B, 1);
#endif
  else
    __result = vec_mergel (__A, __B);

  return __result;
}

/* Interleave the high doubles of __A and __B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
}

/* Interleave the low doubles of __A and __B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
}

/* Load *__B into the high double of __A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  __v2df __result = (__v2df)__A;
  __result [1] = *__B;
  return (__m128d)__result;
}

/* Load *__B into the low double of __A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  __v2df __result = (__v2df)__A;
  __result [0] = *__B;
  return (__m128d)__result;
}

#ifdef _ARCH_PWR8
/* Intrinsic
 functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the two DPFP
   values (bit 0 from element [0], bit 1 from element [1]).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  __vector unsigned long long __result;
  /* vec_vbpermq gathers the two sign bits; the permute control below
     selects bit positions 64 (LE) / 0,64 (BE) so the two sign bits land
     adjacent in the result doubleword.  */
  static const __vector unsigned int __perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
    };

  __result = ((__vector unsigned long long)
	      vec_vbpermq ((__vector unsigned char) __A,
			   (__vector unsigned char) __perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
}
#endif /* _ARCH_PWR8 */

/* Pack the 16-bit elements of __A and __B into 8-bit elements with
   signed saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
}

/* Pack the 32-bit elements of __A and __B into 16-bit elements with
   signed saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
}

/* Pack the signed 16-bit elements of __A and __B into 8-bit elements
   with unsigned saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
}

/* Interleave the high 8 bytes of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
}

/* Interleave the high four 16-bit elements of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
}

/* Interleave the high two 32-bit elements of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
}

/* Interleave the high 64-bit elements of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__vector long long) __A,
			       (__vector long long) __B);
}

/* Interleave the low 8 bytes of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
}

/* Interleave the low four 16-bit elements of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
}

/* Interleave the low two 32-bit elements of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
}

/* Interleave the low 64-bit elements of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__vector long long) __A,
			       (__vector long long) __B);
}

/* Add the 8-bit elements of __A and __B (modular, wraps on overflow;
   unsigned arithmetic is used because signed overflow would be UB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}

/* Add the 16-bit elements of __A and __B (modular).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}

/* Add the 32-bit elements of __A and __B (modular).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Add the 64-bit elements of __A and __B (modular).  */
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A + (__v2du)__B);
}

/* Add the signed 8-bit elements of __A and __B with saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
}

/* Add the signed 16-bit elements of __A and __B with saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
}

/* Add the unsigned 8-bit elements of __A and __B with saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
}

/* Add the unsigned 16-bit elements of __A and __B with saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
}

/* Subtract the 8-bit elements of __B from __A (modular).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}

/* Subtract the 16-bit elements of __B from __A (modular).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}

/* Subtract the 32-bit elements of __B from __A (modular).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A - (__v4su)__B);
}

/* Subtract the 64-bit elements of __B from __A (modular).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A - (__v2du)__B);
}

/* Subtract the signed 8-bit elements of __B from __A with saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
}

/* Subtract the signed 16-bit elements of __B from __A with saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
}

/* Subtract the unsigned 8-bit elements of __B from __A with saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
}

/* Subtract the unsigned 16-bit elements of __B from __A with saturation.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
}

/* Multiply the signed 16-bit element pairs of __A and __B and add the
   adjacent 32-bit products (multiply-sum into a zero accumulator).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int __zero = {0, 0, 0, 0};

  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, __zero);
}

/* Multiply the signed 16-bit elements of __A and __B and keep the high
   16 bits of each 32-bit product.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int __w0, __w1;

  /* Permute control that gathers the high halves of the even (__w0)
     and odd (__w1) 32-bit products back into element order.  */
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  __w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
  __w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
  return (__m128i) vec_perm (__w0, __w1, __xform1);
}

/* Multiply the 16-bit elements of __A and __B, keeping the low 16 bits
   of each product.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
 __artificial__))
/* Multiply the low unsigned 32-bit halves of __A and __B, returning
   the full 64-bit product.  */
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  unsigned int __a = __A;
  unsigned int __b = __B;

  return ((__m64)__a * (__m64)__b);
}

/* Multiply the even unsigned 32-bit elements of __A and __B, giving
   two unsigned 64-bit products.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
#if __GNUC__ < 8 || !defined (_ARCH_PWR8)
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__(
      "vmulouw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__(
      "vmuleuw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#endif
  return (__m128i) __result;
#else
  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}

/* Shift the 16-bit elements of __A left by __B bits; counts outside
   0..15 yield zero (SSE semantics, unlike the PowerPC modulo shift).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  __v8hu __lshift;
  __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B >= 0 && __B < 16)
    {
      if (__builtin_constant_p(__B))
	/* Constant count fits the 5-bit splat-immediate form.  */
	__lshift = (__v8hu) vec_splat_s16(__B);
      else
	__lshift = vec_splats ((unsigned short) __B);

      __result = vec_sl ((__v8hi) __A, __lshift);
    }

  return (__m128i) __result;
}

/* Shift the 32-bit elements of __A left by __B bits; counts outside
   0..31 yield zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  __v4su __lshift;
  __v4si __result = { 0, 0, 0, 0 };

  if (__B >= 0 && __B < 32)
    {
      if (__builtin_constant_p(__B) && __B < 16)
	__lshift = (__v4su) vec_splat_s32(__B);
      else
	__lshift = vec_splats ((unsigned int) __B);

      __result = vec_sl ((__v4si) __A, __lshift);
    }

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
 __artificial__))
/* Shift the 64-bit elements of __A left by __B bits; counts outside
   0..63 yield zero.  Only the low 7 bits of the splatted count are
   used by vec_sl, so the 32-bit splat is sufficient.  */
_mm_slli_epi64 (__m128i __A, int __B)
{
  __v2du __lshift;
  __v2di __result = { 0, 0 };

  if (__B >= 0 && __B < 64)
    {
      if (__builtin_constant_p(__B) && __B < 16)
	/* Constant count fits the 5-bit splat-immediate form.  */
	__lshift = (__v2du) vec_splat_s32(__B);
      else
	__lshift = (__v2du) vec_splats ((unsigned int) __B);

      __result = vec_sl ((__v2di) __A, __lshift);
    }

  return (__m128i) __result;
}
#endif

/* Arithmetic right shift of the 16-bit elements of __A by __B bits;
   counts >= 16 replicate the sign bit (shift by 15), per SSE.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  __v8hu __rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hi __result;

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
	__rshift = (__v8hu) vec_splat_s16(__B);
      else
	__rshift = vec_splats ((unsigned short) __B);
    }
  __result = vec_sra ((__v8hi) __A, __rshift);

  return (__m128i) __result;
}

/* Arithmetic right shift of the 32-bit elements of __A by __B bits;
   counts >= 32 replicate the sign bit (shift by 31), per SSE.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  __v4su __rshift = { 31, 31, 31, 31 };
  __v4si __result;

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
	{
	  if (__B < 16)
	    __rshift = (__v4su) vec_splat_s32(__B);
	  else
	    __rshift = (__v4su) vec_splats((unsigned int)__B);
	}
      else
	__rshift = vec_splats ((unsigned int) __B);
    }
  __result = vec_sra ((__v4si) __A, __rshift);

  return (__m128i) __result;
}

/* Shift __A left by __N bytes; __N >= 16 yields zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
    __result = vec_sld ((__v16qu) __A, __zeros, __N);
  else
    __result = __zeros;

  return (__m128i) __result;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Shift __A right by __N bytes; __N >= 16 yields zero.  */
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
	 Immediate here to use the immediate form and avoid
	 load of __N * 8 value into a separate VR.  */
      __result = vec_sld (__zeros, (__v16qu) __A, (16 - __N));
    else
#endif
      {
	/* vec_sro/vec_slo take the byte count in bits 121:124, i.e.
	   __N * 8 splatted into every byte.  */
	__v16qu __shift = vec_splats((unsigned char)(__N*8));
#ifdef __LITTLE_ENDIAN__
	__result = vec_sro ((__v16qu)__A, __shift);
#else
	__result = vec_slo ((__v16qu)__A, __shift);
#endif
      }
  else
    __result = __zeros;

  return (__m128i) __result;
}

/* SSE2 name for a right byte shift; same as _mm_bsrli_si128.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return _mm_bsrli_si128 (__A, __N);
}

/* Shift __A left by _imm5 bytes; _imm5 >= 16 yields zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int _imm5)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld ((__v16qu) __A, __zeros, _imm5);
#else
    __result = vec_sld (__zeros, (__v16qu) __A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i) __result;
}

/* Logical right shift of the 16-bit elements of __A by __B bits;
   counts >= 16 yield zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))

_mm_srli_epi16 (__m128i __A, int __B)
{
  __v8hu __rshift;
  __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
	__rshift = (__v8hu) vec_splat_s16(__B);
      else
	__rshift = vec_splats ((unsigned short) __B);
      __result = vec_sr ((__v8hi) __A, __rshift);
    }

  return (__m128i) __result;
}

/* Logical right shift of the 32-bit elements of __A by __B bits;
   counts >= 32 yield zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  __v4su __rshift;
  __v4si __result = { 0, 0, 0, 0 };

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
	{
	  if (__B < 16)
	    /* Constant count fits the 5-bit splat-immediate form.  */
	    __rshift = (__v4su) vec_splat_s32(__B);
	  else
	    __rshift = (__v4su) vec_splats((unsigned int)__B);
	}
      else
	__rshift = vec_splats ((unsigned int) __B);

      __result = vec_sr ((__v4si) __A, __rshift);
    }

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
/* Logical right shift of the 64-bit elements of __A by __B bits;
   counts >= 64 yield zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  __v2du __rshift;
  __v2di __result = { 0, 0 };

  if (__B < 64)
    {
      if (__builtin_constant_p(__B))
	{
	  if (__B < 16)
	    __rshift = (__v2du) vec_splat_s32(__B);
	  else
	    __rshift = (__v2du) vec_splats((unsigned long long)__B);
	}
      else
	__rshift = (__v2du) vec_splats ((unsigned int) __B);

      __result = vec_sr ((__v2di) __A, __rshift);
    }

  return (__m128i) __result;
}
#endif

/* Shift the 16-bit elements of __A left by the count in the low 64
   bits of __B; counts > 15 yield zero (SSE semantics).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __result;

  /* Splat the low halfword of __B's low doubleword to all elements.  */
#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat ((__v8hu) __B, 0);
#else
  __lshift = vec_splat ((__v8hu) __B, 3);
#endif
  /* Mask selects the shifted value where the count is valid, else 0
     (__shmask doubles as the zero source in the select).  */
  __shmask = vec_cmple (__lshift, __shmax);
  __result = vec_sl ((__v8hu) __A, __lshift);
  __result = vec_sel ((__v8hu) __shmask, __result, __shmask);

  return (__m128i) __result;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Shift the 32-bit elements of __A left by the count in the low 64
   bits of __B; counts > 31 yield zero (SSE semantics).  */
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  __v4su __lshift;
  __vector __bool int __shmask;
  const __v4su __shmax = { 32, 32, 32, 32 };
  __v4su __result;
  /* Splat the low word of __B's low doubleword to all elements.  */
#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat ((__v4su) __B, 0);
#else
  __lshift = vec_splat ((__v4su) __B, 1);
#endif
  /* Valid counts are < 32; invalid counts select zero via the mask.  */
  __shmask = vec_cmplt (__lshift, __shmax);
  __result = vec_sl ((__v4su) __A, __lshift);
  __result = vec_sel ((__v4su) __shmask, __result, __shmask);

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
/* Shift the 64-bit elements of __A left by the count in the low 64
   bits of __B; counts > 63 yield zero (SSE semantics).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du __lshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = { 64, 64 };
  __v2du __result;

  __lshift = vec_splat ((__v2du) __B, 0);
  __shmask = vec_cmplt (__lshift, __shmax);
  __result = vec_sl ((__v2du) __A, __lshift);
  __result = vec_sel ((__v2du) __shmask, __result, __shmask);

  return (__m128i) __result;
}
#endif

/* Arithmetic right shift of the 16-bit elements of __A by the count
   in the low 64 bits of __B; counts > 15 are clamped to 15.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu __rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __rshift;
  __v8hi __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v8hu)__B, 0);
#else
  __rshift = vec_splat ((__v8hu)__B, 3);
#endif
  /* Clamping to 15 matches SSE: large counts replicate the sign bit.  */
  __rshift = vec_min (__rshift, __rshmax);
  __result = vec_sra ((__v8hi) __A, __rshift);

  return (__m128i) __result;
}

/* Arithmetic right shift of the 32-bit elements of __A by the count
   in the low 64 bits of __B; counts > 31 are clamped to 31.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su __rshmax = { 31, 31, 31, 31 };
  __v4su __rshift;
  __v4si __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat
 ((__v4su)__B, 0);
#else
  __rshift = vec_splat ((__v4su)__B, 1);
#endif
  /* Clamping to 31 matches SSE: large counts replicate the sign bit.  */
  __rshift = vec_min (__rshift, __rshmax);
  __result = vec_sra ((__v4si) __A, __rshift);

  return (__m128i) __result;
}

/* Logical right shift of the 16-bit elements of __A by the count in
   the low 64 bits of __B; counts > 15 yield zero (SSE semantics).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __result;

  /* Splat the low halfword of __B's low doubleword to all elements.  */
#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v8hu) __B, 0);
#else
  __rshift = vec_splat ((__v8hu) __B, 3);
#endif
  /* Mask selects the shifted value where the count is valid, else 0.  */
  __shmask = vec_cmple (__rshift, __shmax);
  __result = vec_sr ((__v8hu) __A, __rshift);
  __result = vec_sel ((__v8hu) __shmask, __result, __shmask);

  return (__m128i) __result;
}

/* Logical right shift of the 32-bit elements of __A by the count in
   the low 64 bits of __B; counts > 31 yield zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = { 32, 32, 32, 32 };
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v4su) __B, 0);
#else
  __rshift = vec_splat ((__v4su) __B, 1);
#endif
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v4su) __A, __rshift);
  __result = vec_sel ((__v4su) __shmask, __result, __shmask);

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
/* Logical right shift of the 64-bit elements of __A by the count in
   the low 64 bits of __B; counts > 63 yield zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = { 64, 64 };
  __v2du __result;

  __rshift = vec_splat ((__v2du) __B, 0);
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v2du) __A, __rshift);
  __result = vec_sel ((__v2du) __shmask, __result,
 __shmask);

  return (__m128i) __result;
}
#endif

/* Bitwise AND of the doubles in __A and __B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

/* Bitwise AND-NOT: __B AND (NOT __A).  Note the x86 operand order —
   the FIRST operand is complemented.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

/* Bitwise OR of the doubles in __A and __B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

/* Bitwise XOR of the doubles in __A and __B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

/* Bitwise AND of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

/* Bitwise AND-NOT: __B AND (NOT __A).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

/* Bitwise OR of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

/* Bitwise XOR of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}

/* Compare the 8-bit elements of __A and __B for equality; each result
   element is all-ones when equal, all-zeros otherwise.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern
 __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Compare the 16-bit elements of __A and __B for equality.  */
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

/* Compare the 32-bit elements of __A and __B for equality.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

/* Signed compare: 8-bit elements of __A less-than __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

/* Signed compare: 16-bit elements of __A less-than __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

/* Signed compare: 32-bit elements of __A less-than __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

/* Signed compare: 8-bit elements of __A greater-than __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

/* Signed compare: 16-bit elements of __A greater-than __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

/* Signed compare: 32-bit elements of __A greater-than __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

/* Extract 16-bit element __N (mod 8) of __A, zero-extended to int.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1993_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) 1994{ 1995 __v8hi __result = (__v8hi)__A; 1996 1997 __result [(__N & 7)] = __D; 1998 1999 return (__m128i) __result; 2000} 2001 2002extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2003_mm_max_epi16 (__m128i __A, __m128i __B) 2004{ 2005 return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B); 2006} 2007 2008extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2009_mm_max_epu8 (__m128i __A, __m128i __B) 2010{ 2011 return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B); 2012} 2013 2014extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2015_mm_min_epi16 (__m128i __A, __m128i __B) 2016{ 2017 return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B); 2018} 2019 2020extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2021_mm_min_epu8 (__m128i __A, __m128i __B) 2022{ 2023 return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B); 2024} 2025 2026 2027#ifdef _ARCH_PWR8 2028/* Intrinsic functions that require PowerISA 2.07 minimum. */ 2029 2030/* Creates a 4-bit mask from the most significant bits of the SPFP values. 
*/
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long __result;
  /* One bit index per result bit: each selector addresses the most
     significant bit of one byte of __A, ordered so vbpermq gathers
     them into the x86 mask bit order.  NOTE(review): indices assume
     vbpermq's big-endian bit numbering — confirm against the ISA
     before changing.  */
  static const __vector unsigned char __perm_mask =
    {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  __result = ((__vector unsigned long long)
	      vec_vbpermq ((__vector unsigned char) __A,
			   (__vector unsigned char) __perm_mask));

  /* The gathered 16-bit mask lands in one 64-bit half of the vector;
     which element index that is depends on endianness.  */
#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
}
#endif /* _ARCH_PWR8 */

/* High 16 bits of each unsigned 16x16 -> 32-bit product.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su __w0, __w1;
  /* Byte selectors that interleave the high halfword of each even
     product (__w0) with that of each odd product (__w1), restoring
     the original element order; the tables differ by endianness.  */
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  /* Full 32-bit products of the even and odd halfword pairs.  */
  __w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (__w0, __w1, __xform1);
}

/* Shuffle the four high halfwords of __A per the 8-bit immediate
   __mask (two selector bits per result element); the low 64 bits
   pass through unchanged.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  /* Two-bit source selectors for result halfwords 4..7.  */
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  /* Byte-pair permute indices addressing one of the four high
     halfwords of __A (endian-specific byte order).  */
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
	      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  /* Start with an identity permute for the low doubleword; the high
     doubleword is filled in from the selectors below.  */
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL,  0UL};
#else
      {
0x1011121314151617UL,  0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  /* Assemble the permute control for the high doubleword from the
     four two-bit selectors.  */
  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

/* Shuffle the four low halfwords of __A per the 8-bit immediate
   __mask (two selector bits per result element); the high 64 bits
   pass through unchanged.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  /* Two-bit source selectors for result halfwords 0..3.  */
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  /* Byte-pair permute indices addressing one of the four low
     halfwords of __A (endian-specific byte order).  */
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	      0x0100, 0x0302, 0x0504, 0x0706
#else
	      0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  /* Start with an identity permute for the high doubleword; the low
     doubleword is filled in from the selectors below.  */
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
                 { 0UL,  0x1f1e1d1c1b1a1918UL};
#else
                 { 0UL,  0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

/* Shuffle all four 32-bit words of __A per the 8-bit immediate
   __mask (two selector bits per result element).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  /* Two-bit source selectors for result words 0..3.  */
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2)
& 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  /* Four-byte permute indices addressing one 32-bit word of __A
     (endian-specific byte order).  */
  static const unsigned int __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __v4su __t;

  /* Elements 2 and 3 get +0x10101010 so their indices address the
     second vec_perm operand — the same vector __A, so the selected
     data is unchanged.  */
  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)__t);
}

/* Conditionally store bytes of __A to the (possibly unaligned)
   address __C: bytes of __B with the most significant bit set select
   __A, other bytes keep the existing memory contents.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  /* Unsigned compare against 0x7f tests each byte's MSB.  */
  __v2du __hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u*)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel (__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128 (__p, (__m128i)__tmp);
}

/* Element-wise rounded average of unsigned bytes (vec_avg).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

/* Element-wise rounded average of unsigned halfwords (vec_avg).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}


/* Sum of absolute byte differences: two 16-bit sums (one per 8-byte
   half) returned in the low bits of each 64-bit result element.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __a, __b;
  __v16qu __vmin, __vmax, __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = { 0, 0, 0, 0 };
  __v4si __result;
  __a = (__v16qu) __A;
  __b = (__v16qu) __B;
  /* Absolute difference of each byte pair, computed as max - min.  */
  __vmin = vec_min (__a, __b);
  __vmax = vec_max (__a, __b);
  __vabsdiff = vec_sub (__vmax, __vmin);
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero);
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s (__vsum, (__vector signed int) __zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  __result = vec_sld (__result, __result, 4);
#else
  __result = vec_sld (__result, __result, 6);
#endif
  /* The two partial sums are now where SSE2 expects them: in the low
     bits of each 64-bit half of the result.  */
  return (__m128i) __result;
}

/* Non-temporal store of a 32-bit int: hint that the target cache
   block is transient for stores (dcbtstt), then store normally.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

/* Non-temporal store of a 64-bit int (same dcbtstt hint).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    " dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

/* Non-temporal store of a 128-bit vector (same dcbtstt hint).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

/* Non-temporal store of two doubles (same dcbtstt hint).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.
*/
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *(__m128d*)__A = __B;
}

/* Flush the cache block containing *__A to memory (dcbf).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.
     NOTE(review): an __ATOMIC_RELEASE fence presumably emits lwsync
     on Power — confirm this is strong enough for the uses of x86
     LFENCE being ported.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

/* Move __A into the low 32 bits of the result; upper bits are zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

/* Move __A into the low 64 bits of the result; upper bits are zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  Same as _mm_cvtsi64_si128.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.
*/ 2316extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2317_mm_castpd_ps(__m128d __A) 2318{ 2319 return (__m128) __A; 2320} 2321 2322extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2323_mm_castpd_si128(__m128d __A) 2324{ 2325 return (__m128i) __A; 2326} 2327 2328extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2329_mm_castps_pd(__m128 __A) 2330{ 2331 return (__m128d) __A; 2332} 2333 2334extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2335_mm_castps_si128(__m128 __A) 2336{ 2337 return (__m128i) __A; 2338} 2339 2340extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2341_mm_castsi128_ps(__m128i __A) 2342{ 2343 return (__m128) __A; 2344} 2345 2346extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2347_mm_castsi128_pd(__m128i __A) 2348{ 2349 return (__m128d) __A; 2350} 2351 2352#endif /* EMMINTRIN_H_ */ 2353 2354