/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

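/* Vector absolute value (SSSE3 PABSB/PABSW/PABSD).  These map directly to
   the VMX vec_abs operation.  The __m64 variants below splat the 64-bit
   operand into both doublewords of a 128-bit vector and return one
   doubleword of the result (both halves are identical).  */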
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

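/* Byte-wise align (SSSE3 PALIGNR): concatenate __A:__B into a 32-byte value,
   shift it right by __count bytes, and return the low 16 bytes.  A constant
   __count below 16 can use a single vec_sld (on byte-reversed inputs for
   little endian); otherwise the result is assembled from vec_slo/vec_sro
   octet shifts, and counts of 32 or more yield zero.  */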
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu __zero = { 0 };
          return (__m128i) __zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

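/* 64-bit PALIGNR: the two operands are packed into a single 128-bit vector
   holding the concatenation __A:__B, which is shifted right by __count bytes
   (vec_sro on little endian, vec_slo on big endian); counts of 16 or more
   return zero.  */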
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

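/* Horizontal add (SSSE3 PHADDW/PHADDD): vec_perm gathers the even-indexed
   and odd-indexed elements of the __A:__B concatenation, which are then
   added elementwise.  The __m64 variants repeat the permute pattern in both
   halves of the vector, so either doubleword of the sum holds the result.  */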
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

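/* Saturating horizontal add (SSSE3 PHADDSW): vec_sum4s adds each pair of
   adjacent halfwords into a 32-bit sum, and vec_packs packs the sums back
   to 16 bits with signed saturation.  */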
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

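/* Horizontal subtract (SSSE3 PHSUBW/PHSUBD/PHSUBSW): as for the horizontal
   adds, the even-indexed and odd-indexed elements are gathered with
   vec_perm; the odd elements are then subtracted from the even ones, with
   signed saturation for the PHSUBSW forms.  */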
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

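/* Byte shuffle (SSSE3 PSHUFB): each byte of __B selects a byte of __A by
   its low four bits via vec_perm; bytes whose control value has the sign
   bit set are cleared to zero with vec_sel.  */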
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

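/* Conditional sign (SSSE3 PSIGNB/PSIGNW/PSIGND): build a vector of -1, 0,
   or +1 from the sign of each element of __B and multiply __A by it, so
   elements are negated where __B is negative, zeroed where __B is zero,
   and passed through where __B is positive.  */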
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

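/* Multiply and add (SSSE3 PMADDUBSW): the bytes of __A are zero extended
   (treated as unsigned) and the bytes of __B are sign extended to 16 bits,
   corresponding elements are multiplied, and adjacent products are added
   with signed saturation.  */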
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

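/* Rounded high multiply (SSSE3 PMULHRSW): the 16-bit elements are widened
   to 32 bits and multiplied, then each product is shifted right by 14,
   incremented, and shifted right once more before being packed back to
   16 bits, i.e. ((__A * __B >> 14) + 1) >> 1.  */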
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* TMMINTRIN_H_ */