/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
   is a good match for most SIMD operations.  However, the horizontal
   add/sub operations require the data pairs to be permuted into separate
   registers with vertical even/odd alignment before the operation.
   And the addsub operations require that the sign of only the
   even-numbered elements be flipped (XORed with -0.0); see the worked
   example below.
   For larger blocks of code using these intrinsic implementations,
   the compiler should be able to schedule instructions to avoid
   additional latency.

   In the specific case of the monitor and mwait instructions there is
   no direct equivalent in the PowerISA at this time.  So those
   intrinsics are not implemented.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif
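
/* Worked example of the even-element sign flip and the horizontal
   permute described above.  Element 0 is the low element throughout:

     _mm_addsub_ps (X, Y) -> { X[0]-Y[0], X[1]+Y[1], X[2]-Y[2], X[3]+Y[3] }
     _mm_hadd_ps (X, Y)   -> { X[0]+X[1], X[2]+X[3], Y[0]+Y[1], Y[2]+Y[3] }

   The first is obtained by XORing Y with { -0.0, 0.0, -0.0, 0.0 } and
   adding; the second by permuting the even and odd elements of X and Y
   into two separate vectors and adding those.  */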

#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

/* We need definitions from the SSE2 and SSE header files.  */
#include <emmintrin.h>

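/* Alternately subtract and add single-precision elements:
   { __X[0]-__Y[0], __X[1]+__Y[1], __X[2]-__Y[2], __X[3]+__Y[3] }.  */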
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps (__m128 __X, __m128 __Y)
{
  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
  __v4sf even_neg_Y = vec_xor(__Y, even_n0);
  return (__m128) vec_add (__X, even_neg_Y);
}

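/* Alternately subtract and add double-precision elements:
   { __X[0]-__Y[0], __X[1]+__Y[1] }.  */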
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd (__m128d __X, __m128d __Y)
{
  const __v2df even_n0 = {-0.0, 0.0};
  __v2df even_neg_Y = vec_xor(__Y, even_n0);
  return (__m128d) vec_add (__X, even_neg_Y);
}

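/* Horizontal add: { __X[0]+__X[1], __X[2]+__X[3],
   __Y[0]+__Y[1], __Y[2]+__Y[3] }.  */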
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
      0x00, 0x01, 0x02, 0x03,
      0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13,
      0x18, 0x19, 0x1A, 0x1B
    };
  __vector unsigned char xform1 = {
      0x04, 0x05, 0x06, 0x07,
      0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17,
      0x1C, 0x1D, 0x1E, 0x1F
    };
  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

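/* Horizontal subtract: { __X[0]-__X[1], __X[2]-__X[3],
   __Y[0]-__Y[1], __Y[2]-__Y[3] }.  */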
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
      0x00, 0x01, 0x02, 0x03,
      0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13,
      0x18, 0x19, 0x1A, 0x1B
    };
  __vector unsigned char xform1 = {
      0x04, 0x05, 0x06, 0x07,
      0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17,
      0x1C, 0x1D, 0x1E, 0x1F
    };
  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

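/* Horizontal add: { __X[0]+__X[1], __Y[0]+__Y[1] }.  */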
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
                            vec_mergel ((__v2df) __X, (__v2df)__Y));
}

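/* Horizontal subtract: { __X[0]-__X[1], __Y[0]-__Y[1] }.  */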
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
                            vec_mergel ((__v2df) __X, (__v2df)__Y));
}

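/* Duplicate the odd-indexed elements: { __X[1], __X[1], __X[3], __X[3] }.  */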
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
  return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
}

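/* Duplicate the even-indexed elements: { __X[0], __X[0], __X[2], __X[2] }.  */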
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
  return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
}

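/* Load a double and duplicate it into both elements: { *__P, *__P }.  */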
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
{
  return (__m128d) vec_splats (*__P);
}

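/* Duplicate the low element: { __X[0], __X[0] }.  */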
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd (__m128d __X)
{
  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
}

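/* Unaligned 128-bit integer load (the x86 lddqu form); an ordinary VSX
   unaligned load covers this on PowerPC.  */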
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128 (__m128i const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_mwait.  */

#else
#include_next <pmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* PMMINTRIN_H_ */