/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets; see the illustrative comment
   following this block.  */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
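
/* As an illustration of the note above (an editorial sketch, not part of the
   original interface): an Intel-intrinsic expression such as

     __m128i __sum = _mm_add_epi32(__a, __b);

   can often be rewritten with the GNU C vector extension, which both GCC and
   Clang support on PowerPC and many other targets:

     typedef int __v4si_t __attribute__((__vector_size__(16)));
     __m128i __sum = (__m128i)((__v4si_t)__a + (__v4si_t)__b);

   The __v4si_t typedef name is illustrative only.  */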

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
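
/* Typical usage of the rounding controls with the functions below (an
   editorial sketch; __x stands for a hypothetical __m128d value):

     __m128d __down = _mm_round_pd(__x, _MM_FROUND_TO_NEG_INF);
     __m128d __quiet = _mm_round_pd(__x, _MM_FROUND_TO_NEAREST_INT |
                                             _MM_FROUND_NO_EXC);

   _MM_FROUND_NO_EXC suppresses floating-point exceptions for the operation;
   composite macros such as _MM_FROUND_FLOOR and _MM_FROUND_CEIL combine a
   rounding direction with the exception behavior.  */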

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}
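
/* A worked example of the behavior above (editorial note): with the default
   round-to-nearest-even mode,

     __m128d __v = _mm_set_pd(-2.5, 1.5);
     __m128d __n = _mm_round_pd(__v, _MM_FROUND_TO_NEAREST_INT); // {2.0, -2.0}
     __m128d __f = _mm_round_pd(__v, _MM_FROUND_TO_NEG_INF);     // {1.0, -3.0}
     __m128d __t = _mm_round_pd(__v, _MM_FROUND_TO_ZERO);        // {1.0, -2.0}

   _mm_set_pd stores its first argument in the high element, so __v is
   {1.5, -2.5} in memory order; the comments show memory order.  */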

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
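
/* Example of the insert/extract pattern above (editorial note): the element
   index is taken modulo the element count, so

     __m128i __v = _mm_set1_epi32(7);
     __v = _mm_insert_epi32(__v, 42, 2);   // element 2 becomes 42
     int __e = _mm_extract_epi32(__v, 2);  // __e == 42

   _mm_extract_ps returns the raw 32-bit pattern of the selected float as an
   int, matching the Intel definition of that intrinsic.  */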

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif
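
/* Editorial note on _mm_blend_epi16 above: bit i of __imm8 selects 16-bit
   element i from __B when set and from __A when clear.  vec_gb spreads each
   bit of the splatted immediate into a 0x00/0xff byte, and the
   sign-extending unpack widens those bytes into the all-zeros/all-ones
   halfword mask that vec_sel needs.  For example, __imm8 == 0x0F takes the
   low four halfwords from __B and the high four from __A.  */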

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}
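
/* Editorial note on _mm_blendv_epi8 above: only the most significant bit of
   each mask byte matters.  The pre-POWER10 path uses vec_sra, an arithmetic
   shift, so shifting each mask byte right by 7 yields 0x00 or 0xff for
   vec_sel; vec_blendv on POWER10 consumes the sign bits directly.  For
   example, a mask byte of 0x80 selects the corresponding byte of __B, while
   0x7f selects the byte of __A.  */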

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}
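
/* Editorial note on _mm_blend_ps above: the table holds one vec_perm control
   vector per possible 4-bit immediate.  vec_perm indexes a 32-byte pool in
   which bytes 0-15 come from __A and bytes 16-31 come from __B, so entry
   __imm8 copies 32-bit element i from __B exactly when bit i of __imm8 is
   set.  Entry 0x5, for instance, takes elements 0 and 2 from __B and
   elements 1 and 3 from __A.  The immediate is expected to be in the range
   0-15, as the Intel intrinsic requires.  */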

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
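
/* Editorial note on the test intrinsics above: _mm_testz_si128 returns 1 when
   (__A & __B) is all zeros, and _mm_testc_si128 returns 1 when (~__A & __B)
   is all zeros, mirroring the ZF and CF results of the x86 PTEST instruction.
   For example, _mm_test_all_ones(__V) compares __V against an all-ones vector
   built with _mm_cmpeq_epi32((__V), (__V)), so it returns 1 only when every
   bit of __V is set.  */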

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif
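
/* Editorial note on the two multiplies above: _mm_mullo_epi32 keeps the low
   32 bits of each of the four lane-wise products (signed and unsigned
   multiplication agree on those low bits), while _mm_mul_epi32 sign-extends
   the 32-bit elements at indices 0 and 2 and returns their full 64-bit
   products in the two doubleword lanes, as the Intel intrinsic specifies.  */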

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}
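
/* Editorial note on the conversions above: the _mm_cvtepi* variants widen the
   low elements with sign extension via vec_unpackh, while the _mm_cvtepu*
   variants widen with zero extension by interleaving the source with a zero
   vector using vec_mergeh.  The operand order of the merge flips between
   little- and big-endian so that the zero bytes always land in the
   most-significant half of each widened element.  */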

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
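
/* Worked example for _mm_minpos_epu16 (editorial note): for an input whose
   halfwords are {9, 3, 7, 3, 8, 6, 5, 4}, the result holds the minimum 3 in
   bits [15:0] and the index 1 of its first occurrence in bits [18:16]; all
   remaining bits are zero.  */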

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */