1353942Sdim/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------=== 2353942Sdim * 3353942Sdim * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353942Sdim * See https://llvm.org/LICENSE.txt for license information. 5353942Sdim * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6353942Sdim * 7353942Sdim *===-----------------------------------------------------------------------=== 8353942Sdim */ 9353942Sdim 10353942Sdim/* Implemented from the specification included in the Intel C++ Compiler 11353942Sdim User Guide and Reference, version 9.0. */ 12353942Sdim 13353942Sdim#ifndef NO_WARN_X86_INTRINSICS 14353942Sdim/* This header is distributed to simplify porting x86_64 code that 15353942Sdim makes explicit use of Intel intrinsics to powerpc64le. 16353942Sdim It is the user's responsibility to determine if the results are 17353942Sdim acceptable and make additional changes as necessary. 18353942Sdim Note that much code that uses Intel intrinsics can be rewritten in 19353942Sdim standard C or GNU C extensions, which are more portable and better 20353942Sdim optimized across multiple targets. 21353942Sdim 22353942Sdim In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA 23353942Sdim is a good match for most SIMD operations. However the Horizontal 24353942Sdim add/sub requires the data pairs be permuted into a separate 25353942Sdim registers with vertical even/odd alignment for the operation. 26353942Sdim And the addsub operation requires the sign of only the even numbered 27353942Sdim elements be flipped (xored with -0.0). 28353942Sdim For larger blocks of code using these intrinsic implementations, 29353942Sdim the compiler be should be able to schedule instructions to avoid 30353942Sdim additional latency. 31353942Sdim 32353942Sdim In the specific case of the monitor and mwait instructions there are 33353942Sdim no direct equivalent in the PowerISA at this time. So those 34353942Sdim intrinsics are not implemented. */ 35353942Sdim#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning." 36353942Sdim#endif 37353942Sdim 38353942Sdim#ifndef PMMINTRIN_H_ 39353942Sdim#define PMMINTRIN_H_ 40353942Sdim 41353942Sdim#if defined(__linux__) && defined(__ppc64__) 42353942Sdim 43353942Sdim/* We need definitions from the SSE2 and SSE header files*/ 44353942Sdim#include <emmintrin.h> 45353942Sdim 46353942Sdimextern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 47353942Sdim_mm_addsub_ps (__m128 __X, __m128 __Y) 48353942Sdim{ 49353942Sdim const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0}; 50353942Sdim __v4sf even_neg_Y = vec_xor(__Y, even_n0); 51353942Sdim return (__m128) vec_add (__X, even_neg_Y); 52353942Sdim} 53353942Sdim 54353942Sdimextern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 55353942Sdim_mm_addsub_pd (__m128d __X, __m128d __Y) 56353942Sdim{ 57353942Sdim const __v2df even_n0 = {-0.0, 0.0}; 58353942Sdim __v2df even_neg_Y = vec_xor(__Y, even_n0); 59353942Sdim return (__m128d) vec_add (__X, even_neg_Y); 60353942Sdim} 61353942Sdim 62353942Sdimextern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 63353942Sdim_mm_hadd_ps (__m128 __X, __m128 __Y) 64353942Sdim{ 65353942Sdim __vector unsigned char xform2 = { 66353942Sdim 0x00, 0x01, 0x02, 0x03, 67353942Sdim 0x08, 0x09, 0x0A, 0x0B, 68353942Sdim 0x10, 0x11, 0x12, 0x13, 69353942Sdim 0x18, 0x19, 0x1A, 0x1B 70353942Sdim }; 71353942Sdim __vector unsigned char xform1 = { 72353942Sdim 0x04, 0x05, 0x06, 0x07, 73353942Sdim 0x0C, 0x0D, 0x0E, 0x0F, 74353942Sdim 0x14, 0x15, 0x16, 0x17, 75353942Sdim 0x1C, 0x1D, 0x1E, 0x1F 76353942Sdim }; 77353942Sdim return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), 78353942Sdim vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); 79353942Sdim} 80353942Sdim 81353942Sdimextern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 82353942Sdim_mm_hsub_ps (__m128 __X, __m128 __Y) 83353942Sdim{ 84353942Sdim __vector unsigned char xform2 = { 85353942Sdim 0x00, 0x01, 0x02, 0x03, 86353942Sdim 0x08, 0x09, 0x0A, 0x0B, 87353942Sdim 0x10, 0x11, 0x12, 0x13, 88353942Sdim 0x18, 0x19, 0x1A, 0x1B 89353942Sdim }; 90353942Sdim __vector unsigned char xform1 = { 91353942Sdim 0x04, 0x05, 0x06, 0x07, 92353942Sdim 0x0C, 0x0D, 0x0E, 0x0F, 93353942Sdim 0x14, 0x15, 0x16, 0x17, 94353942Sdim 0x1C, 0x1D, 0x1E, 0x1F 95353942Sdim }; 96353942Sdim return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), 97353942Sdim vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); 98353942Sdim} 99353942Sdim 100353942Sdimextern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 101353942Sdim_mm_hadd_pd (__m128d __X, __m128d __Y) 102353942Sdim{ 103353942Sdim return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y), 104353942Sdim vec_mergel ((__v2df) __X, (__v2df)__Y)); 105353942Sdim} 106353942Sdim 107353942Sdimextern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 108353942Sdim_mm_hsub_pd (__m128d __X, __m128d __Y) 109353942Sdim{ 110353942Sdim return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y), 111353942Sdim vec_mergel ((__v2df) __X, (__v2df)__Y)); 112353942Sdim} 113353942Sdim 114353942Sdimextern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 115353942Sdim_mm_movehdup_ps (__m128 __X) 116353942Sdim{ 117353942Sdim return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X); 118353942Sdim} 119353942Sdim 120353942Sdimextern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 121353942Sdim_mm_moveldup_ps (__m128 __X) 122353942Sdim{ 123353942Sdim return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X); 124353942Sdim} 125353942Sdim 126353942Sdimextern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 127353942Sdim_mm_loaddup_pd (double const *__P) 128353942Sdim{ 129353942Sdim return (__m128d) vec_splats (*__P); 130353942Sdim} 131353942Sdim 132353942Sdimextern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 133353942Sdim_mm_movedup_pd (__m128d __X) 134353942Sdim{ 135353942Sdim return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0)); 136353942Sdim} 137353942Sdim 138353942Sdimextern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 139353942Sdim_mm_lddqu_si128 (__m128i const *__P) 140353942Sdim{ 141353942Sdim return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); 142353942Sdim} 143353942Sdim 144353942Sdim/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */ 145353942Sdim 146353942Sdim#else 147353942Sdim#include_next <pmmintrin.h> 148353942Sdim#endif /* defined(__linux__) && defined(__ppc64__) */ 149353942Sdim 150353942Sdim#endif /* PMMINTRIN_H_ */ 151