/* Copyright (C) 2003-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

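/* SSSE3 absolute value (pabsb/pabsw/pabsd and their MMX forms): each
   element of the result is the absolute value of the corresponding
   signed element of __A, implemented with vec_abs.  The __m64 variants
   splat the 64-bit operand into a 128-bit vector, operate on that, and
   return element 0 of the result.  */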
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

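/* SSSE3 palignr: treat __A:__B as a 32-byte value and extract the 16
   bytes starting __count bytes up from the least significant end.  The
   endian-conditional code below accounts for the difference between
   Intel's little-endian byte numbering and the big-endian semantics of
   vec_sld/vec_slo/vec_sro; counts of 16 or more shift in zeros, and
   counts of 32 or more yield zero.  */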
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
	{
	  const __v16qu __zero = { 0 };
	  return (__m128i) __zero;
	}
      else
	{
	  const __v16qu __shift =
	    vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
	}
    }
  else
    {
      const __v16qu __shiftA =
	vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

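/* SSSE3 phaddw/phaddd: horizontal pairwise addition.  The permute
   vectors __P and __Q gather, respectively, the even-numbered and
   odd-numbered elements of the concatenated inputs, so that each pair
   of adjacent elements can be added with a single vec_add.  */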
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

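/* SSSE3 phaddsw: horizontal pairwise addition of signed halfwords with
   saturation.  vec_sum4s produces the pairwise 32-bit sums and
   vec_packs packs them back to halfwords with signed saturation.  */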
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

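/* SSSE3 phsubw/phsubd: horizontal pairwise subtraction.  Each result
   element is an even-numbered input element minus the odd-numbered
   element that follows it.  */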
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

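/* SSSE3 phsubsw: horizontal pairwise subtraction of signed halfwords
   with saturation (vec_subs).  */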
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

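/* SSSE3 pshufb: byte shuffle.  Result byte i is __A[__B[i] & 15], or
   zero when the top bit of control byte __B[i] is set; the vec_cmplt /
   vec_sel pair implements that zeroing on top of vec_perm.  */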
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

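/* SSSE3 psignb/psignw/psignd: negate, keep, or zero each element of __A
   according to whether the corresponding element of __B is negative,
   positive, or zero.  __conv collects -1, +1, or 0 per element and a
   single vec_mul applies it.  */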
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

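/* SSSE3 pmaddubsw: multiply unsigned bytes of __A by the corresponding
   signed bytes of __B, then add adjacent 16-bit products with signed
   saturation.  The unpacks sign-extend bytes to halfwords; masking
   __A's halves with 0x00ff restores the unsigned byte values.  */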
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

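/* SSSE3 pmulhrsw: multiply signed halfwords, then scale and round each
   32-bit product as (((a * b) >> 14) + 1) >> 1, i.e. the rounded high
   16 bits of a 1.15 fixed-point product.  */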
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

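/* Illustrative usage sketch (not part of the upstream header): assuming
   a powerpc64le target compiled with -DNO_WARN_X86_INTRINSICS, ported
   x86 code can call these wrappers unchanged.  For example, reversing
   the bytes of a 128-bit vector with the pshufb wrapper:

     #include <tmmintrin.h>

     static inline __m128i
     reverse_bytes (__m128i __x)
     {
       const __m128i __idx = _mm_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7,
					   8, 9, 10, 11, 12, 13, 14, 15);
       return _mm_shuffle_epi8 (__x, __idx);
     }

   reverse_bytes is a hypothetical helper; _mm_set_epi8 comes from
   <emmintrin.h>, which is pulled in via <pmmintrin.h> above.  */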
#endif