/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

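/* The 128-bit absolute-value operations map directly onto vec_abs for
   the corresponding element width.  */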
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi16(__m128i __A) {
  return (__m128i)vec_abs((__v8hi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi32(__m128i __A) {
  return (__m128i)vec_abs((__v4si)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi8(__m128i __A) {
  return (__m128i)vec_abs((__v16qi)__A);
}

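/* The __m64 variants follow a pattern used throughout this header:
   splat the 64-bit operand into both doublewords of a 128-bit vector,
   apply the vector operation, and return element 0 of the result.  */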
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi16(__m64 __A) {
  __v8hi __B = (__v8hi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi32(__m64 __A) {
  __v4si __B = (__v4si)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi8(__m64 __A) {
  __v16qi __B = (__v16qi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

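/* _mm_alignr_epi8 concatenates __A:__B and extracts the 16 bytes that
   start __count bytes into __B.  A compile-time __count below 16 maps
   to a single vec_sld (bracketed by vec_reve on little endian to
   compensate for element order); otherwise the two operands are
   shifted by whole bytes with vec_slo/vec_sro and OR-ed together, with
   counts of 32 or more producing zero.  */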
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
  if (__builtin_constant_p(__count) && __count < 16) {
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
    __B = (__m128i)vec_reve((__v16qu)__B);
#endif
    __A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
#endif
    return __A;
  }

  if (__count == 0)
    return __B;

  if (__count >= 16) {
    if (__count >= 32) {
      const __v16qu __zero = {0};
      return (__m128i)__zero;
    } else {
      const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
      return (__m128i)vec_sro((__v16qu)__A, __shift);
#else
      return (__m128i)vec_slo((__v16qu)__A, __shift);
#endif
    }
  } else {
    const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
    const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
#else
    __A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
#endif
    return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
  }
}

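/* The __m64 form packs __B and __A into one 128-bit vector, shifts it
   by __count bytes (vec_sro on little endian, vec_slo on big endian),
   and returns element 0; counts of 16 or more produce zero.  */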
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
  if (__count < 16) {
    __v2du __C = {__B, __A};
#ifdef __LITTLE_ENDIAN__
    const __v4su __shift = {__count << 3, 0, 0, 0};
    __C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
#else
    const __v4su __shift = {0, 0, 0, __count << 3};
    __C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
#endif
    return (__m64)__C[0];
  } else {
    const __m64 __zero = {0};
    return __zero;
  }
}

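/* The horizontal adds gather the even-numbered and odd-numbered
   elements of the operands with vec_perm and add the two permuted
   vectors, so each result element is the sum of an adjacent pair.  */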
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi32(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

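/* The saturating horizontal add of halfwords uses vec_sum4s to form
   the adjacent-pair sums as 32-bit values and vec_packs to narrow them
   back to 16 bits with signed saturation.  */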
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_epi16(__m128i __A, __m128i __B) {
  __v4si __C = {0}, __D = {0};
  __C = vec_sum4s((__v8hi)__A, __C);
  __D = vec_sum4s((__v8hi)__B, __D);
  __C = (__v4si)vec_packs(__C, __D);
  return (__m128i)__C;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_pi16(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v4si __D = vec_sum4s(__C, __zero);
  __C = vec_packs(__D, __D);
  return (__m64)((__v2du)__C)[1];
}

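/* The horizontal subtractions use the same even/odd permutes as the
   horizontal adds, followed by vec_sub (vec_subs for the saturating
   forms).  */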
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi32(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __C = (__v4si)(__v2du){__A, __B};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_subs(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __P);
  __v8hi __E = vec_perm(__C, __C, __Q);
  __C = vec_subs(__D, __E);
  return (__m64)((__v2du)__C)[1];
}

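/* _mm_shuffle_epi8 uses each byte of __B as a permute index into __A;
   control bytes with the sign bit set must yield zero, which vec_perm
   alone does not provide, so those lanes are cleared afterwards with a
   vec_cmplt/vec_sel pass.  */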
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
  __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
  return (__m128i)vec_sel(__C, __zero, __select);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
  __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
  __C = vec_sel(__C, __zero, __select);
  return (__m64)((__v2du)(__C))[0];
}

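/* The sign operations build a per-element multiplier that is -1 where
   __B is negative, 0 where __B is zero and +1 where __B is positive,
   then multiply __A by it.  They are only provided when _ARCH_PWR8 is
   defined.  */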
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
  __v16qi __selectpos =
      (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
  __v16qi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi16(__m128i __A, __m128i __B) {
  const __v8hi __zero = {0};
  __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
  __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
  __v8hi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi32(__m128i __A, __m128i __B) {
  const __v4si __zero = {0};
  __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
  __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
  __v4si __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi16(__m64 __A, __m64 __B) {
  const __v8hi __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi32(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

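/* _mm_maddubs_epi16 multiplies unsigned bytes of __A by signed bytes
   of __B and adds adjacent products with signed saturation.  __A is
   unpacked with sign extension and masked with 0x00ff to recover the
   unsigned byte values, __B is sign-extended, the 16-bit products are
   formed with vec_mul, and the even/odd permutes plus vec_adds produce
   the saturated pairwise sums.  */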
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_epi16(__m128i __A, __m128i __B) {
  __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
  __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
  __v8hi __E = vec_unpackh((__v16qi)__B);
  __v8hi __F = vec_unpackl((__v16qi)__B);
  __C = vec_mul(__C, __E);
  __D = vec_mul(__D, __F);
  const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __E = vec_perm(__C, __D, __odds);
  __F = vec_perm(__C, __D, __evens);
  return (__m128i)vec_adds(__E, __F);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __C = vec_unpackl((__v16qi)__C);
  const __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __C = vec_and(__C, __unsigned);
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __D = vec_unpackl((__v16qi)__D);
  __D = vec_mul(__C, __D);
  const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __C = vec_perm(__D, __D, __odds);
  __D = vec_perm(__D, __D, __evens);
  __C = vec_adds(__C, __D);
  return (__m64)((__v2du)(__C))[0];
}

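/* _mm_mulhrs_epi16 computes (((__A * __B) >> 14) + 1) >> 1 per 16-bit
   element: the inputs are sign-extended to 32 bits, multiplied,
   shifted right by 14, rounded by adding 1 and shifting right once
   more, and the results packed back to 16 bits.  */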
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
  __v4si __C = vec_unpackh((__v8hi)__A);
  __v4si __D = vec_unpackh((__v8hi)__B);
  __C = vec_mul(__C, __D);
  __D = vec_unpackl((__v8hi)__A);
  __v4si __E = vec_unpackl((__v8hi)__B);
  __D = vec_mul(__D, __E);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  __D = vec_sr(__D, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __D = vec_add(__D, __ones);
  __D = vec_sr(__D, (__v4su)__ones);
  return (__m128i)vec_pack(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __C = vec_unpackh((__v8hi)__C);
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __D = vec_unpackh((__v8hi)__D);
  __C = vec_mul(__C, __D);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __v8hi __E = vec_pack(__C, __D);
  return (__m64)((__v2du)(__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* TMMINTRIN_H_ */