/* Copyright (C) 2002-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are also important
   differences in data format and placement of float scalars in the
   vector register.  For PowerISA, scalar floats in FPRs (the leftmost
   64 bits of the low 32 VSRs) are kept in double format, while X86_64
   SSE uses the rightmost 32 bits of the XMM register.  These
   differences require extra steps on POWER to match the SSE scalar
   float semantics.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* Define a four-element permute mask, e.g. _MM_SHUFFLE (3, 2, 1, 0) == 0xE4.  */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))

#include <altivec.h>

/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards.  This should eventually be done inside altivec.h itself,
   but only after testing a full distro build.  */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
				 (defined(__STDC_VERSION__) && \
				  __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif

#include <assert.h>

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>
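/* Illustrative note (not part of the original header): as recommended
   above, code that manipulates the x86 MXCSR directly (e.g. via
   _mm_getcsr / _mm_setcsr) is better expressed with the portable
   <fenv.h> interfaces.  A minimal sketch, assuming C99 <fenv.h> is
   available:

     #include <fenv.h>

     void
     example_round_toward_zero (void)
     {
       fenv_t __env;
       feholdexcept (&__env);        // save the environment, clear exceptions
       fesetround (FE_TOWARDZERO);   // instead of writing the MXCSR RC bits
       // ... float computations ...
       fesetenv (&__env);            // restore the saved environment
     }
 */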
/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
				       __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128)vec_ld(0, (__v4sf*)__P));
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp;
  __m128 __result;
  static const __vector unsigned char __permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = vec_ld (0, (__v4sf *) __P);
  __result = (__m128) vec_perm (__tmp, __tmp, __permute_vector);
  return __result;
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
}
/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __tmp;
  static const __vector unsigned char __permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = (__m128) vec_perm (__A, __A, __permute_vector);

  _mm_store_ps (__P, __tmp);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = vec_splat((__v4sf)__A, 0);
  _mm_store_ps (__P, __va);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, __mask));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = __a + __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = __a - __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = __a * __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = __a / __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 __a, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat (__A, 0);
  __c = vec_sqrt (__a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
}
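/* Illustrative note (not part of the original header): the *_ss operations
   above only combine element 0 and pass elements 1-3 through from A, e.g.

     __m128 __x = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);  // elements {1,2,3,4}
     __m128 __y = _mm_set_ss (10.0f);                   // elements {10,0,0,0}
     __m128 __z = _mm_add_ss (__x, __y);                // elements {11,2,3,4}

   The splat-before-operate sequence exists only so the unused upper lanes
   cannot raise spurious floating-point exceptions.  */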
/* Perform the respective operation on the four SPFP values in A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (vec_sqrt ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (vec_re ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (vec_rsqrte (__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  __m128 __a, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat (__A, 0);
  __c = _mm_rcp_ps (__a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  __m128 __a, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat (__A, 0);
  __c = vec_rsqrte (__a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  __v4sf __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf)__A, 0);
  __b = vec_splat ((__v4sf)__B, 0);
  __c = vec_min (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  __v4sf __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = vec_max (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  __vector __bool int __m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
  return vec_sel (__B, __A, __m);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  __vector __bool int __m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
  return vec_sel (__B, __A, __m);
}

/* Perform logical bit-wise operations on 128-bit values.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
//  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
}
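/* Illustrative note (not part of the original header): _mm_andnot_ps
   complements its FIRST argument, i.e. it computes (~A) & B bitwise:

     __m128 __sign = _mm_set1_ps (-0.0f);          // only the sign bits set
     __m128 __abs  = _mm_andnot_ps (__sign, __x);  // clears the sign bits,
                                                   // giving |x| element-wise
 */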
/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpeq ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  __v4sf __temp = (__v4sf) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
  return ((__m128)vec_nor (__temp, __temp));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a);
  __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b);
  return ((__m128) vec_and (__c, __d));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask);
  __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask);
  return ((__m128) vec_or (__c, __d));
}
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpeq (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmplt (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmple (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpgt (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpge (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpeq (__a, __b);
  __c = vec_nor (__c, __c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpge (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpgt (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmple (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmplt (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };

  __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a);
  __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b);
  __c = vec_and (__c, __d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };

  __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask);
  __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask);
  __c = vec_or (__c, __d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask));
}
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * _mm_comi??_ss because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK, as is.
 */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  int __res;
#ifdef _ARCH_PWR8
  double __dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctiw  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
	"=r" (__res),
	"=f" (__dtmp)
      : );
#else
  __res = __builtin_rint(__A[0]);
#endif
  return __res;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  long long __res;
#if defined (_ARCH_PWR8) && defined (__powerpc64__)
  double __dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctid  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
	"=r" (__res),
	"=f" (__dtmp)
      : );
#else
  __res = __builtin_llrint(__A[0]);
#endif
  return __res;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* Current PowerPC ignores the hint parameters.  */
  __builtin_prefetch (__P);
}

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf __temp, __rounded;
  __vector unsigned long long __result;

  /* Splat two lower SPFP values to both halves.  */
  __temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  __rounded = vec_rint (__temp);
  __result = (__vector unsigned long long) vec_cts (__rounded, 0);

  return (__m64) ((__vector long long) __result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element.  */
  float __temp = __A[0];
  /* Truncate to 32-bit integer and return.  */
  return __temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element.  */
  float __temp = __A[0];
  /* Truncate to 64-bit integer and return.  */
  return __temp;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element.  */
  float __temp = __A[0];
  /* Truncate to 64-bit integer and return.  */
  return __temp;
}
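/* Illustrative note (not part of the original header): _mm_cvtss_si32
   honors the current rounding mode while _mm_cvttss_si32 always truncates
   toward zero, e.g. under the default round-to-nearest-even mode:

     int __a = _mm_cvtss_si32 (_mm_set_ss (1.5f));   // 2
     int __b = _mm_cvttss_si32 (_mm_set_ss (1.5f));  // 1
 */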
/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  __v4sf __temp;
  __vector unsigned long long __result;

  /* Splat two lower SPFP values to both halves.  */
  __temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  __result = (__vector unsigned long long) vec_cts (__temp, 0);

  return (__m64) ((__vector long long) __result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  float __temp = __B;
  __A[0] = __temp;

  return __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  float __temp = __B;
  __A[0] = __temp;

  return __A;
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return _mm_cvtsi64_ss (__A, __B);
}

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  __vector signed int __vm1;
  __vector float __vf1;

  __vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
  __vf1 = (__vector float) vec_ctf (__vm1, 0);

  return ((__m128) (__vector unsigned long long)
	  { ((__vector unsigned long long)__vf1) [0],
	    ((__vector unsigned long long)__A) [1]});
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __vector signed short __vs8;
  __vector signed int __vi4;
  __vector float __vf1;

  __vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
  __vi4 = vec_vupklsh (__vs8);
  __vf1 = (__vector float) vec_ctf (__vi4, 0);

  return (__m128) __vf1;
}
/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  const __vector unsigned short __zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned short __vs8;
  __vector unsigned int __vi4;
  __vector float __vf1;

  __vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
  __vi4 = (__vector unsigned int) vec_mergel
#ifdef __LITTLE_ENDIAN__
                                 (__vs8, __zero);
#else
                                 (__zero, __vs8);
#endif
  __vf1 = (__vector float) vec_ctf (__vi4, 0);

  return (__m128) __vf1;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __vector signed char __vc16;
  __vector signed short __vs8;
  __vector signed int __vi4;
  __vector float __vf1;

  __vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
  __vs8 = vec_vupkhsb (__vc16);
  __vi4 = vec_vupkhsh (__vs8);
  __vf1 = (__vector float) vec_ctf (__vi4, 0);

  return (__m128) __vf1;
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps (__m64 __A)
{
  const __vector unsigned char __zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned char __vc16;
  __vector unsigned short __vs8;
  __vector unsigned int __vi4;
  __vector float __vf1;

  __vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
  __vs8 = (__vector unsigned short) vec_mergel (__vc16, __zero);
  __vi4 = (__vector unsigned int) vec_mergeh (__vs8,
					      (__vector unsigned short) __zero);
#else
  __vs8 = (__vector unsigned short) vec_mergel (__zero, __vc16);
  __vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) __zero,
					      __vs8);
#endif
  __vf1 = (__vector float) vec_ctf (__vi4, 0);

  return (__m128) __vf1;
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
{
  __vector signed int __vi4;
  __vector float __vf4;

  __vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
  __vf4 = (__vector float) vec_ctf (__vi4, 0);
  return (__m128) __vf4;
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16 (__m128 __A)
{
  __v4sf __rounded;
  __vector signed int __temp;
  __vector unsigned long long __result;

  __rounded = vec_rint(__A);
  __temp = vec_cts (__rounded, 0);
  __result = (__vector unsigned long long) vec_pack (__temp, __temp);

  return (__m64) ((__vector long long) __result)[0];
}
/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8 (__m128 __A)
{
  __v4sf __rounded;
  __vector signed int __tmp_i;
  static const __vector signed int __zero = {0, 0, 0, 0};
  __vector signed short __tmp_s;
  __vector signed char __res_v;

  __rounded = vec_rint(__A);
  __tmp_i = vec_cts (__rounded, 0);
  __tmp_s = vec_pack (__tmp_i, __zero);
  __res_v = vec_pack (__tmp_s, __tmp_s);
  return (__m64) ((__vector long long) __res_v)[0];
}

/* Selects four specific SPFP values from A and B based on MASK.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __vector unsigned int __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)__t);
}
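/* Illustrative note (not part of the original header): the selector packs
   two element indices taken from A (low bits) and two taken from B (high
   bits), e.g.

     // result = { A[2], A[3], B[0], B[1] }
     __m128 __r = _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (1, 0, 3, 2));
 */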
/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a [1] = __p [1];

  return (__m128)__a;
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[1];
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergel ((__vector unsigned long long)__B,
			      (__vector unsigned long long)__A);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
			      (__vector unsigned long long)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a [0] = __p [0];

  return (__m128)__a;
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[0];
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
    };

  __result = ((__vector unsigned long long)
	      vec_vbpermq ((__vector unsigned char) __A,
			   (__vector unsigned char) __perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
}
#endif /* _ARCH_PWR8 */

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  unsigned int __shiftr = __N & 3;
#ifdef __BIG_ENDIAN__
  __shiftr = 3 - __shiftr;
#endif

  return ((__A >> (__shiftr * 16)) & 0xffff);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  const int __shiftl = (__N & 3) * 16;
  const __m64 __shiftD = (const __m64) __D << __shiftl;
  const __m64 __mask = 0xffffUL << __shiftl;
  __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);

  return __result;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __r;
  __vector __bool short __c;

  __a = (__vector signed short)vec_splats (__A);
  __b = (__vector signed short)vec_splats (__B);
  __c = (__vector __bool short)vec_cmpgt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  __res.as_short[0] =
      (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0];
  __res.as_short[1] =
      (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1];
  __res.as_short[2] =
      (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2];
  __res.as_short[3] =
      (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;

  __a = (__vector unsigned char)vec_splats (__A);
  __b = (__vector unsigned char)vec_splats (__B);
  __c = (__vector __bool char)vec_cmpgt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;
  long __i;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
	((unsigned char) __m1.as_char[__i] > (unsigned char) __m2.as_char[__i]) ?
	    __m1.as_char[__i] : __m2.as_char[__i];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}
/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __r;
  __vector __bool short __c;

  __a = (__vector signed short)vec_splats (__A);
  __b = (__vector signed short)vec_splats (__B);
  __c = (__vector __bool short)vec_cmplt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  __res.as_short[0] =
      (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0];
  __res.as_short[1] =
      (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1];
  __res.as_short[2] =
      (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2];
  __res.as_short[3] =
      (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;

  __a = (__vector unsigned char)vec_splats (__A);
  __b = (__vector unsigned char)vec_splats (__B);
  __c = (__vector __bool char)vec_cmplt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;
  long __i;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
	((unsigned char) __m1.as_char[__i] < (unsigned char) __m2.as_char[__i]) ?
	    __m1.as_char[__i] : __m2.as_char[__i];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}
/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
#ifdef __powerpc64__
  unsigned long long __p =
#ifdef __LITTLE_ENDIAN__
    0x0008101820283038UL; // permute control for sign bits
#else
    0x3830282018100800UL; // permute control for sign bits
#endif
  return __builtin_bpermd (__p, __A);
#else
#ifdef __LITTLE_ENDIAN__
  unsigned int __mask = 0x20283038UL;
  unsigned int __r1 = __builtin_bpermd (__mask, __A) & 0xf;
  unsigned int __r2 = __builtin_bpermd (__mask, __A >> 32) & 0xf;
#else
  unsigned int __mask = 0x38302820UL;
  unsigned int __r1 = __builtin_bpermd (__mask, __A >> 32) & 0xf;
  unsigned int __r2 = __builtin_bpermd (__mask, __A) & 0xf;
#endif
  return (__r2 << 4) | __r1;
#endif
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short __a, __b;
  __vector unsigned short __c;
  __vector unsigned int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  __a = (__vector unsigned short)vec_splats (__A);
  __b = (__vector unsigned short)vec_splats (__B);

  __w0 = vec_vmuleuh (__a, __b);
  __w1 = vec_vmulouh (__a, __b);
  __c = (__vector unsigned short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
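/* Editorial usage sketch for _mm_movemask_pi8 above, which gathers the
   most significant (sign) bit of each byte into bits 0-7 of the result.
   It is not part of the original header and is guarded out of
   compilation; the helper name is hypothetical.  */
#if 0
static int
__example_movemask_pi8 (void)
{
  /* Bytes with value 0x80 have their sign bit set and bytes with 0x7f
     do not, so the result is 0x55 (byte 0 maps to result bit 0).  */
  __m64 __a = _mm_set_pi8 (0x7f, 0x80, 0x7f, 0x80, 0x7f, 0x80, 0x7f, 0x80);
  return _mm_movemask_pi8 (__a);
}
#endif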
/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  unsigned long __element_selector_10 = __N & 0x03;
  unsigned long __element_selector_32 = (__N >> 2) & 0x03;
  unsigned long __element_selector_54 = (__N >> 4) & 0x03;
  unsigned long __element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __m64_union __t;
  __vector unsigned long long __a, __p, __r;

#ifdef __LITTLE_ENDIAN__
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
#else
  __t.as_short[3] = __permute_selectors[__element_selector_10];
  __t.as_short[2] = __permute_selectors[__element_selector_32];
  __t.as_short[1] = __permute_selectors[__element_selector_54];
  __t.as_short[0] = __permute_selectors[__element_selector_76];
#endif
  __p = vec_splats (__t.as_m64);
  __a = vec_splats (__A);
  __r = vec_perm (__a, __a, (__vector unsigned char)__p);
  return (__m64) ((__vector long long) __r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __m64 __hibit = 0x8080808080808080UL;
  __m64 __mask, __tmp;
  __m64 *__p = (__m64*)__P;

  __tmp = *__p;
  __mask = _mm_cmpeq_pi8 ((__N & __hibit), __hibit);
  __tmp = (__tmp & (~__mask)) | (__A & __mask);
  *__p = __tmp;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__A);
  __b = (__vector unsigned char)vec_splats (__B);
  __c = vec_avg (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}
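/* Editorial usage sketch for _mm_shuffle_pi16 above.  The selector packs
   four 2-bit source indices, one per destination element, in the same
   layout produced by _MM_SHUFFLE.  It is not part of the original header
   and is guarded out of compilation; the helper name is hypothetical.  */
#if 0
static __m64
__example_shuffle_pi16 (__m64 __x)
{
  /* Selector 0x1B == _MM_SHUFFLE (0, 1, 2, 3): destination element 0
     takes source element 3, element 1 takes element 2, and so on,
     reversing the four 16-bit values.  */
  return _mm_shuffle_pi16 (__x, 0x1B);
}
#endif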
/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__A);
  __b = (__vector unsigned short)vec_splats (__B);
  __c = vec_avg (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char __a, __b;
  __vector unsigned char __vmin, __vmax, __vabsdiff;
  __vector signed int __vsum;
  const __vector unsigned int __zero = { 0, 0, 0, 0 };
  __m64_union __result = {0};

  __a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
  __b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
  __vmin = vec_min (__a, __b);
  __vmax = vec_max (__a, __b);
  __vabsdiff = vec_sub (__vmax, __vmin);
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero);
  /* Sum across four integers with integer result.  */
  __vsum = vec_sums (__vsum, (__vector signed int) __zero);
  /* The sum is in the right most 32-bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  __result.as_short[0] = __vsum[3];
  return __result.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "  dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "  dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  _mm_store_ps (__P, __A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a light weight sync.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
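/* Editorial usage sketch for _mm_sad_pu8 above: the eight absolute byte
   differences are summed and the total is returned in the low 16-bit
   word of the result.  It is not part of the original header and is
   guarded out of compilation; the helper name is hypothetical.  */
#if 0
static __m64
__example_sad_pu8 (void)
{
  __m64 __a = _mm_set1_pi8 (10);
  __m64 __b = _mm_set1_pi8 (3);
  /* Eight differences of 7 each, so the low 16-bit word holds 56.  */
  return _mm_sad_pu8 (__a, __b);
}
#endif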
/* The execution of the next instruction is delayed by an implementation-
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  /* There is no exact match with this construct, but the following is
     close to the desired effect.  */
#if _ARCH_PWR8
  /* On power8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PRI setting.  Since we don't
     know what PRI this thread is running at, we: 1) save the current
     PRI from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, and 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution.  */
  unsigned long __PPR;

  __asm__ volatile (
    "  mfppr %0;"
    "  or 31,31,31;"
    "  isync;"
    "  lwsync;"
    "  isync;"
    "  mtppr %0;"
    : "=r" (__PPR)
    :
    : "memory"
    );
#else
  /* For older processors, where we may not even have Program Priority
     controls, we can only depend on Heavy Weight Sync.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                       \
do {                                                                    \
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);    \
  __v4sf __t0 = vec_vmrghw (__r0, __r1);                                \
  __v4sf __t1 = vec_vmrghw (__r2, __r3);                                \
  __v4sf __t2 = vec_vmrglw (__r0, __r1);                                \
  __v4sf __t3 = vec_vmrglw (__r2, __r3);                                \
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,                \
                               (__vector long long)__t1);               \
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,                \
                               (__vector long long)__t1);               \
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,                \
                               (__vector long long)__t3);               \
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,                \
                               (__vector long long)__t3);               \
} while (0)

/* For backward source compatibility.  */
//# include <emmintrin.h>

#endif /* _XMMINTRIN_H_INCLUDED */
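/* Editorial usage sketch for the _MM_TRANSPOSE4_PS macro above, appended
   after the header body.  It is not part of the original header and is
   guarded out of compilation; the helper name is hypothetical.  */
#if 0
static void
__example_transpose (float __m[4][4])
{
  __m128 __r0 = _mm_loadu_ps (__m[0]);
  __m128 __r1 = _mm_loadu_ps (__m[1]);
  __m128 __r2 = _mm_loadu_ps (__m[2]);
  __m128 __r3 = _mm_loadu_ps (__m[3]);
  /* After the macro, __r0..__r3 hold the columns of the original rows.  */
  _MM_TRANSPOSE4_PS (__r0, __r1, __r2, __r3);
  _mm_storeu_ps (__m[0], __r0);
  _mm_storeu_ps (__m[1], __r1);
  _mm_storeu_ps (__m[2], __r2);
  _mm_storeu_ps (__m[3], __r3);
}
#endif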