emmintrin.h revision 351280
1/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11   User Guide and Reference, version 9.0.  */
12
13#ifndef NO_WARN_X86_INTRINSICS
14/* This header file is to help porting code using Intel intrinsics
15   explicitly from x86_64 to powerpc64/powerpc64le.
16
17   Since X86 SSE2 intrinsics mainly handles __m128i and __m128d type,
18   PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
19   However scalar float operations in vector (XMM) registers require
20   the POWER8 VSX ISA (2.07) level. There are differences for data
21   format and placement of float scalars in the vector register, which
22   require extra steps to match SSE2 scalar float semantics on POWER.
23
24   It should be noted that there's much difference between X86_64's
25   MXSCR and PowerISA's FPSCR/VSCR registers. It's recommended to use
26   portable <fenv.h> instead of access MXSCR directly.
27
28   Most SSE2 scalar float intrinsic operations can be performed more
29   efficiently as C language float scalar operations or optimized to
30   use vector SIMD operations. We recommend this for new applications.
31*/
32#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
33#endif
34
35#ifndef EMMINTRIN_H_
36#define EMMINTRIN_H_
37
38#include <altivec.h>
39
40/* We need definitions from the SSE header files.  */
41#include <xmmintrin.h>
42
43/* SSE2 */
44typedef __vector double __v2df;
45typedef __vector long long __v2di;
46typedef __vector unsigned long long __v2du;
47typedef __vector int __v4si;
48typedef __vector unsigned int __v4su;
49typedef __vector short __v8hi;
50typedef __vector unsigned short __v8hu;
51typedef __vector signed char __v16qi;
52typedef __vector unsigned char __v16qu;
53
54/* The Intel API is flexible enough that we must allow aliasing with other
55   vector types, and their scalar components.  */
56typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
57typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
58
59/* Unaligned version of the same types.  */
60typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
61typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
62
63/* Define two value permute mask.  */
64#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
65
66/* Create a vector with element 0 as F and the rest zero.  */
67extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
68_mm_set_sd (double __F)
69{
70  return __extension__ (__m128d){ __F, 0.0 };
71}
72
73/* Create a vector with both elements equal to F.  */
74extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75_mm_set1_pd (double __F)
76{
77  return __extension__ (__m128d){ __F, __F };
78}
79
80extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
81_mm_set_pd1 (double __F)
82{
83  return _mm_set1_pd (__F);
84}
85
86/* Create a vector with the lower value X and upper value W.  */
87extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
88_mm_set_pd (double __W, double __X)
89{
90  return __extension__ (__m128d){ __X, __W };
91}
92
93/* Create a vector with the lower value W and upper value X.  */
94extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95_mm_setr_pd (double __W, double __X)
96{
97  return __extension__ (__m128d){ __W, __X };
98}
99
100/* Create an undefined vector.  */
101extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
102_mm_undefined_pd (void)
103{
104  __m128d __Y = __Y;
105  return __Y;
106}
107
108/* Create a vector of zeros.  */
109extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110_mm_setzero_pd (void)
111{
112  return (__m128d) vec_splats (0);
113}
114
115/* Sets the low DPFP value of A from the low value of B.  */
116extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117_mm_move_sd (__m128d __A, __m128d __B)
118{
119  __v2df result = (__v2df) __A;
120  result [0] = ((__v2df) __B)[0];
121  return (__m128d) result;
122}
123
124/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
125extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126_mm_load_pd (double const *__P)
127{
128  return ((__m128d)vec_ld(0, (__v16qu*)__P));
129}
130
131/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
132extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133_mm_loadu_pd (double const *__P)
134{
135  return (vec_vsx_ld(0, __P));
136}
137
138/* Create a vector with all two elements equal to *P.  */
139extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140_mm_load1_pd (double const *__P)
141{
142  return (vec_splats (*__P));
143}
144
145/* Create a vector with element 0 as *P and the rest zero.  */
146extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147_mm_load_sd (double const *__P)
148{
149  return _mm_set_sd (*__P);
150}
151
152extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153_mm_load_pd1 (double const *__P)
154{
155  return _mm_load1_pd (__P);
156}
157
158/* Load two DPFP values in reverse order.  The address must be aligned.  */
159extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160_mm_loadr_pd (double const *__P)
161{
162  __v2df __tmp = _mm_load_pd (__P);
163  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
164}
165
166/* Store two DPFP values.  The address must be 16-byte aligned.  */
167extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168_mm_store_pd (double *__P, __m128d __A)
169{
170  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
171}
172
173/* Store two DPFP values.  The address need not be 16-byte aligned.  */
174extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175_mm_storeu_pd (double *__P, __m128d __A)
176{
177  *(__m128d_u *)__P = __A;
178}
179
180/* Stores the lower DPFP value.  */
181extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182_mm_store_sd (double *__P, __m128d __A)
183{
184  *__P = ((__v2df)__A)[0];
185}
186
187extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188_mm_cvtsd_f64 (__m128d __A)
189{
190  return ((__v2df)__A)[0];
191}
192
193extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194_mm_storel_pd (double *__P, __m128d __A)
195{
196  _mm_store_sd (__P, __A);
197}
198
199/* Stores the upper DPFP value.  */
200extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201_mm_storeh_pd (double *__P, __m128d __A)
202{
203  *__P = ((__v2df)__A)[1];
204}
205/* Store the lower DPFP value across two words.
206   The address must be 16-byte aligned.  */
207extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208_mm_store1_pd (double *__P, __m128d __A)
209{
210  _mm_store_pd (__P, vec_splat (__A, 0));
211}
212
213extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214_mm_store_pd1 (double *__P, __m128d __A)
215{
216  _mm_store1_pd (__P, __A);
217}
218
219/* Store two DPFP values in reverse order.  The address must be aligned.  */
220extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221_mm_storer_pd (double *__P, __m128d __A)
222{
223  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
224}
225
226/* Intel intrinsic.  */
227extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228_mm_cvtsi128_si64 (__m128i __A)
229{
230  return ((__v2di)__A)[0];
231}
232
233/* Microsoft intrinsic.  */
234extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235_mm_cvtsi128_si64x (__m128i __A)
236{
237  return ((__v2di)__A)[0];
238}
239
240extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241_mm_add_pd (__m128d __A, __m128d __B)
242{
243  return (__m128d) ((__v2df)__A + (__v2df)__B);
244}
245
246/* Add the lower double-precision (64-bit) floating-point element in
247   a and b, store the result in the lower element of dst, and copy
248   the upper element from a to the upper element of dst. */
249extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
250_mm_add_sd (__m128d __A, __m128d __B)
251{
252  __A[0] = __A[0] + __B[0];
253  return (__A);
254}
255
256extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257_mm_sub_pd (__m128d __A, __m128d __B)
258{
259  return (__m128d) ((__v2df)__A - (__v2df)__B);
260}
261
262extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263_mm_sub_sd (__m128d __A, __m128d __B)
264{
265  __A[0] = __A[0] - __B[0];
266  return (__A);
267}
268
269extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270_mm_mul_pd (__m128d __A, __m128d __B)
271{
272  return (__m128d) ((__v2df)__A * (__v2df)__B);
273}
274
275extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
276_mm_mul_sd (__m128d __A, __m128d __B)
277{
278  __A[0] = __A[0] * __B[0];
279  return (__A);
280}
281
282extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283_mm_div_pd (__m128d __A, __m128d __B)
284{
285  return (__m128d) ((__v2df)__A / (__v2df)__B);
286}
287
288extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289_mm_div_sd (__m128d __A, __m128d __B)
290{
291  __A[0] = __A[0] / __B[0];
292  return (__A);
293}
294
295extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
296_mm_sqrt_pd (__m128d __A)
297{
298  return (vec_sqrt (__A));
299}
300
301/* Return pair {sqrt (B[0]), A[1]}.  */
302extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303_mm_sqrt_sd (__m128d __A, __m128d __B)
304{
305  __v2df c;
306  c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
307  return (__m128d) _mm_setr_pd (c[0], __A[1]);
308}
309
310extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311_mm_min_pd (__m128d __A, __m128d __B)
312{
313  return (vec_min (__A, __B));
314}
315
316extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
317_mm_min_sd (__m128d __A, __m128d __B)
318{
319  __v2df a, b, c;
320  a = vec_splats (__A[0]);
321  b = vec_splats (__B[0]);
322  c = vec_min (a, b);
323  return (__m128d) _mm_setr_pd (c[0], __A[1]);
324}
325
326extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327_mm_max_pd (__m128d __A, __m128d __B)
328{
329  return (vec_max (__A, __B));
330}
331
332extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333_mm_max_sd (__m128d __A, __m128d __B)
334{
335  __v2df a, b, c;
336  a = vec_splats (__A[0]);
337  b = vec_splats (__B[0]);
338  c = vec_max (a, b);
339  return (__m128d) _mm_setr_pd (c[0], __A[1]);
340}
341
342extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343_mm_cmpeq_pd (__m128d __A, __m128d __B)
344{
345  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
346}
347
348extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349_mm_cmplt_pd (__m128d __A, __m128d __B)
350{
351  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
352}
353
354extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355_mm_cmple_pd (__m128d __A, __m128d __B)
356{
357  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
358}
359
360extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361_mm_cmpgt_pd (__m128d __A, __m128d __B)
362{
363  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
364}
365
366extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367_mm_cmpge_pd (__m128d __A, __m128d __B)
368{
369  return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
370}
371
372extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373_mm_cmpneq_pd (__m128d __A, __m128d __B)
374{
375  __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
376  return ((__m128d)vec_nor (temp, temp));
377}
378
379extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380_mm_cmpnlt_pd (__m128d __A, __m128d __B)
381{
382  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
383}
384
385extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386_mm_cmpnle_pd (__m128d __A, __m128d __B)
387{
388  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
389}
390
391extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392_mm_cmpngt_pd (__m128d __A, __m128d __B)
393{
394  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
395}
396
397extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
398_mm_cmpnge_pd (__m128d __A, __m128d __B)
399{
400  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
401}
402
403extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404_mm_cmpord_pd (__m128d __A, __m128d __B)
405{
406#if _ARCH_PWR8
407  __v2du c, d;
408  /* Compare against self will return false (0's) if NAN.  */
409  c = (__v2du)vec_cmpeq (__A, __A);
410  d = (__v2du)vec_cmpeq (__B, __B);
411#else
412  __v2du a, b;
413  __v2du c, d;
414  const __v2du double_exp_mask  = {0x7ff0000000000000, 0x7ff0000000000000};
415  a = (__v2du)vec_abs ((__v2df)__A);
416  b = (__v2du)vec_abs ((__v2df)__B);
417  c = (__v2du)vec_cmpgt (double_exp_mask, a);
418  d = (__v2du)vec_cmpgt (double_exp_mask, b);
419#endif
420  /* A != NAN and B != NAN.  */
421  return ((__m128d)vec_and(c, d));
422}
423
424extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
425_mm_cmpunord_pd (__m128d __A, __m128d __B)
426{
427#if _ARCH_PWR8
428  __v2du c, d;
429  /* Compare against self will return false (0's) if NAN.  */
430  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
431  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
432  /* A == NAN OR B == NAN converts too:
433     NOT(A != NAN) OR NOT(B != NAN).  */
434  c = vec_nor (c, c);
435  return ((__m128d)vec_orc(c, d));
436#else
437  __v2du c, d;
438  /* Compare against self will return false (0's) if NAN.  */
439  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
440  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
441  /* Convert the true ('1's) is NAN.  */
442  c = vec_nor (c, c);
443  d = vec_nor (d, d);
444  return ((__m128d)vec_or(c, d));
445#endif
446}
447
448extern __inline  __m128d  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449_mm_cmpeq_sd(__m128d  __A, __m128d  __B)
450{
451  __v2df a, b, c;
452  /* PowerISA VSX does not allow partial (for just lower double)
453     results. So to insure we don't generate spurious exceptions
454     (from the upper double values) we splat the lower double
455     before we do the operation. */
456  a = vec_splats (__A[0]);
457  b = vec_splats (__B[0]);
458  c = (__v2df) vec_cmpeq(a, b);
459  /* Then we merge the lower double result with the original upper
460     double from __A.  */
461  return (__m128d) _mm_setr_pd (c[0], __A[1]);
462}
463
464extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465_mm_cmplt_sd (__m128d __A, __m128d __B)
466{
467  __v2df a, b, c;
468  a = vec_splats (__A[0]);
469  b = vec_splats (__B[0]);
470  c = (__v2df) vec_cmplt(a, b);
471  return (__m128d) _mm_setr_pd (c[0], __A[1]);
472}
473
474extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
475_mm_cmple_sd (__m128d __A, __m128d __B)
476{
477  __v2df a, b, c;
478  a = vec_splats (__A[0]);
479  b = vec_splats (__B[0]);
480  c = (__v2df) vec_cmple(a, b);
481  return (__m128d) _mm_setr_pd (c[0], __A[1]);
482}
483
484extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
485_mm_cmpgt_sd (__m128d __A, __m128d __B)
486{
487  __v2df a, b, c;
488  a = vec_splats (__A[0]);
489  b = vec_splats (__B[0]);
490  c = (__v2df) vec_cmpgt(a, b);
491  return (__m128d) _mm_setr_pd (c[0], __A[1]);
492}
493
494extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495_mm_cmpge_sd (__m128d __A, __m128d __B)
496{
497  __v2df a, b, c;
498  a = vec_splats (__A[0]);
499  b = vec_splats (__B[0]);
500  c = (__v2df) vec_cmpge(a, b);
501  return (__m128d) _mm_setr_pd (c[0], __A[1]);
502}
503
504extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
505_mm_cmpneq_sd (__m128d __A, __m128d __B)
506{
507  __v2df a, b, c;
508  a = vec_splats (__A[0]);
509  b = vec_splats (__B[0]);
510  c = (__v2df) vec_cmpeq(a, b);
511  c = vec_nor (c, c);
512  return (__m128d) _mm_setr_pd (c[0], __A[1]);
513}
514
515extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
516_mm_cmpnlt_sd (__m128d __A, __m128d __B)
517{
518  __v2df a, b, c;
519  a = vec_splats (__A[0]);
520  b = vec_splats (__B[0]);
521  /* Not less than is just greater than or equal.  */
522  c = (__v2df) vec_cmpge(a, b);
523  return (__m128d) _mm_setr_pd (c[0], __A[1]);
524}
525
526extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527_mm_cmpnle_sd (__m128d __A, __m128d __B)
528{
529  __v2df a, b, c;
530  a = vec_splats (__A[0]);
531  b = vec_splats (__B[0]);
532  /* Not less than or equal is just greater than.  */
533  c = (__v2df) vec_cmpge(a, b);
534  return (__m128d) _mm_setr_pd (c[0], __A[1]);
535}
536
537extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538_mm_cmpngt_sd (__m128d __A, __m128d __B)
539{
540  __v2df a, b, c;
541  a = vec_splats (__A[0]);
542  b = vec_splats (__B[0]);
543  /* Not greater than is just less than or equal.  */
544  c = (__v2df) vec_cmple(a, b);
545  return (__m128d) _mm_setr_pd (c[0], __A[1]);
546}
547
548extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549_mm_cmpnge_sd (__m128d __A, __m128d __B)
550{
551  __v2df a, b, c;
552  a = vec_splats (__A[0]);
553  b = vec_splats (__B[0]);
554  /* Not greater than or equal is just less than.  */
555  c = (__v2df) vec_cmplt(a, b);
556  return (__m128d) _mm_setr_pd (c[0], __A[1]);
557}
558
559extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
560_mm_cmpord_sd (__m128d __A, __m128d __B)
561{
562  __v2df r;
563  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
564  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
565}
566
567extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
568_mm_cmpunord_sd (__m128d __A, __m128d __B)
569{
570  __v2df r;
571  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
572  return (__m128d) _mm_setr_pd (r[0], __A[1]);
573}
574
575/* FIXME
576   The __mm_comi??_sd and __mm_ucomi??_sd implementations below are
577   exactly the same because GCC for PowerPC only generates unordered
578   compares (scalar and vector).
579   Technically __mm_comieq_sp et all should be using the ordered
580   compare and signal for QNaNs.  The __mm_ucomieq_sd et all should
581   be OK.   */
582extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
583_mm_comieq_sd (__m128d __A, __m128d __B)
584{
585  return (__A[0] == __B[0]);
586}
587
588extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589_mm_comilt_sd (__m128d __A, __m128d __B)
590{
591  return (__A[0] < __B[0]);
592}
593
594extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
595_mm_comile_sd (__m128d __A, __m128d __B)
596{
597  return (__A[0] <= __B[0]);
598}
599
600extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
601_mm_comigt_sd (__m128d __A, __m128d __B)
602{
603  return (__A[0] > __B[0]);
604}
605
606extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607_mm_comige_sd (__m128d __A, __m128d __B)
608{
609  return (__A[0] >= __B[0]);
610}
611
612extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613_mm_comineq_sd (__m128d __A, __m128d __B)
614{
615  return (__A[0] != __B[0]);
616}
617
618extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
619_mm_ucomieq_sd (__m128d __A, __m128d __B)
620{
621	return (__A[0] == __B[0]);
622}
623
624extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625_mm_ucomilt_sd (__m128d __A, __m128d __B)
626{
627	return (__A[0] < __B[0]);
628}
629
630extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631_mm_ucomile_sd (__m128d __A, __m128d __B)
632{
633	return (__A[0] <= __B[0]);
634}
635
636extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637_mm_ucomigt_sd (__m128d __A, __m128d __B)
638{
639	return (__A[0] > __B[0]);
640}
641
642extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643_mm_ucomige_sd (__m128d __A, __m128d __B)
644{
645	return (__A[0] >= __B[0]);
646}
647
648extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649_mm_ucomineq_sd (__m128d __A, __m128d __B)
650{
651  return (__A[0] != __B[0]);
652}
653
654/* Create a vector of Qi, where i is the element number.  */
655extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
656_mm_set_epi64x (long long __q1, long long __q0)
657{
658  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
659}
660
661extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662_mm_set_epi64 (__m64 __q1,  __m64 __q0)
663{
664  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
665}
666
667extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
669{
670  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
671}
672
673extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
674_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
675	       short __q3, short __q2, short __q1, short __q0)
676{
677  return __extension__ (__m128i)(__v8hi){
678    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
679}
680
681extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
683	      char __q11, char __q10, char __q09, char __q08,
684	      char __q07, char __q06, char __q05, char __q04,
685	      char __q03, char __q02, char __q01, char __q00)
686{
687  return __extension__ (__m128i)(__v16qi){
688    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
689    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
690  };
691}
692
693/* Set all of the elements of the vector to A.  */
694extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695_mm_set1_epi64x (long long __A)
696{
697  return _mm_set_epi64x (__A, __A);
698}
699
700extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701_mm_set1_epi64 (__m64 __A)
702{
703  return _mm_set_epi64 (__A, __A);
704}
705
706extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
707_mm_set1_epi32 (int __A)
708{
709  return _mm_set_epi32 (__A, __A, __A, __A);
710}
711
712extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
713_mm_set1_epi16 (short __A)
714{
715  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
716}
717
718extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719_mm_set1_epi8 (char __A)
720{
721  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
722		       __A, __A, __A, __A, __A, __A, __A, __A);
723}
724
725/* Create a vector of Qi, where i is the element number.
726   The parameter order is reversed from the _mm_set_epi* functions.  */
727extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728_mm_setr_epi64 (__m64 __q0, __m64 __q1)
729{
730  return _mm_set_epi64 (__q1, __q0);
731}
732
733extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
735{
736  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
737}
738
739extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
741	        short __q4, short __q5, short __q6, short __q7)
742{
743  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
744}
745
746extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
747_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
748	       char __q04, char __q05, char __q06, char __q07,
749	       char __q08, char __q09, char __q10, char __q11,
750	       char __q12, char __q13, char __q14, char __q15)
751{
752  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
753		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
754}
755
756/* Create a vector with element 0 as *P and the rest zero.  */
757extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758_mm_load_si128 (__m128i const *__P)
759{
760  return *__P;
761}
762
763extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764_mm_loadu_si128 (__m128i_u const *__P)
765{
766  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
767}
768
769extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
770_mm_loadl_epi64 (__m128i_u const *__P)
771{
772  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
773}
774
775extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
776_mm_store_si128 (__m128i *__P, __m128i __B)
777{
778  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
779}
780
781extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
782_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
783{
784  *__P = __B;
785}
786
787extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
788_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
789{
790  *(long long *)__P = ((__v2di)__B)[0];
791}
792
793extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794_mm_movepi64_pi64 (__m128i_u __B)
795{
796  return (__m64) ((__v2di)__B)[0];
797}
798
799extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800_mm_movpi64_epi64 (__m64 __A)
801{
802  return _mm_set_epi64 ((__m64)0LL, __A);
803}
804
805extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806_mm_move_epi64 (__m128i __A)
807{
808  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
809}
810
811/* Create an undefined vector.  */
812extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813_mm_undefined_si128 (void)
814{
815  __m128i __Y = __Y;
816  return __Y;
817}
818
819/* Create a vector of zeros.  */
820extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821_mm_setzero_si128 (void)
822{
823  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
824}
825
826#ifdef _ARCH_PWR8
827extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
828_mm_cvtepi32_pd (__m128i __A)
829{
830  __v2di val;
831  /* For LE need to generate Vector Unpack Low Signed Word.
832     Which is generated from unpackh.  */
833  val = (__v2di)vec_unpackh ((__v4si)__A);
834
835  return (__m128d)vec_ctf (val, 0);
836}
837#endif
838
839extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
840_mm_cvtepi32_ps (__m128i __A)
841{
842  return ((__m128)vec_ctf((__v4si)__A, 0));
843}
844
845extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846_mm_cvtpd_epi32 (__m128d __A)
847{
848  __v2df rounded = vec_rint (__A);
849  __v4si result, temp;
850  const __v4si vzero =
851    { 0, 0, 0, 0 };
852
853  /* VSX Vector truncate Double-Precision to integer and Convert to
854   Signed Integer Word format with Saturate.  */
855  __asm__(
856      "xvcvdpsxws %x0,%x1"
857      : "=wa" (temp)
858      : "wa" (rounded)
859      : );
860
861#ifdef _ARCH_PWR8
862  temp = vec_mergeo (temp, temp);
863  result = (__v4si) vec_vpkudum ((__vector long long) temp,
864				 (__vector long long) vzero);
865#else
866  {
867    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
868	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
869    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
870  }
871#endif
872  return (__m128i) result;
873}
874
875extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
876_mm_cvtpd_pi32 (__m128d __A)
877{
878  __m128i result = _mm_cvtpd_epi32(__A);
879
880  return (__m64) result[0];
881}
882
883extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
884_mm_cvtpd_ps (__m128d __A)
885{
886  __v4sf result;
887  __v4si temp;
888  const __v4si vzero = { 0, 0, 0, 0 };
889
890  __asm__(
891      "xvcvdpsp %x0,%x1"
892      : "=wa" (temp)
893      : "wa" (__A)
894      : );
895
896#ifdef _ARCH_PWR8
897  temp = vec_mergeo (temp, temp);
898  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
899				 (__vector long long) vzero);
900#else
901  {
902    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
903	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
904    result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
905  }
906#endif
907  return ((__m128)result);
908}
909
910extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
911_mm_cvttpd_epi32 (__m128d __A)
912{
913  __v4si result;
914  __v4si temp;
915  const __v4si vzero = { 0, 0, 0, 0 };
916
917  /* VSX Vector truncate Double-Precision to integer and Convert to
918   Signed Integer Word format with Saturate.  */
919  __asm__(
920      "xvcvdpsxws %x0,%x1"
921      : "=wa" (temp)
922      : "wa" (__A)
923      : );
924
925#ifdef _ARCH_PWR8
926  temp = vec_mergeo (temp, temp);
927  result = (__v4si) vec_vpkudum ((__vector long long) temp,
928				 (__vector long long) vzero);
929#else
930  {
931    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
932	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
933    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
934  }
935#endif
936
937  return ((__m128i) result);
938}
939
940extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
941_mm_cvttpd_pi32 (__m128d __A)
942{
943  __m128i result = _mm_cvttpd_epi32 (__A);
944
945  return (__m64) result[0];
946}
947
948extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
949_mm_cvtsi128_si32 (__m128i __A)
950{
951  return ((__v4si)__A)[0];
952}
953
954#ifdef _ARCH_PWR8
955extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
956_mm_cvtpi32_pd (__m64 __A)
957{
958  __v4si temp;
959  __v2di tmp2;
960  __v2df result;
961
962  temp = (__v4si)vec_splats (__A);
963  tmp2 = (__v2di)vec_unpackl (temp);
964  result = vec_ctf ((__vector signed long long) tmp2, 0);
965  return (__m128d)result;
966}
967#endif
968
969extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
970_mm_cvtps_epi32 (__m128 __A)
971{
972  __v4sf rounded;
973  __v4si result;
974
975  rounded = vec_rint((__v4sf) __A);
976  result = vec_cts (rounded, 0);
977  return (__m128i) result;
978}
979
980extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
981_mm_cvttps_epi32 (__m128 __A)
982{
983  __v4si result;
984
985  result = vec_cts ((__v4sf) __A, 0);
986  return (__m128i) result;
987}
988
989extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990_mm_cvtps_pd (__m128 __A)
991{
992  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
993#ifdef vec_doubleh
994  return (__m128d) vec_doubleh ((__v4sf)__A);
995#else
996  /* Otherwise the compiler is not current and so need to generate the
997     equivalent code.  */
998  __v4sf a = (__v4sf)__A;
999  __v4sf temp;
1000  __v2df result;
1001#ifdef __LITTLE_ENDIAN__
1002  /* The input float values are in elements {[0], [1]} but the convert
1003     instruction needs them in elements {[1], [3]}, So we use two
1004     shift left double vector word immediates to get the elements
1005     lined up.  */
1006  temp = __builtin_vsx_xxsldwi (a, a, 3);
1007  temp = __builtin_vsx_xxsldwi (a, temp, 2);
1008#else
1009  /* The input float values are in elements {[0], [1]} but the convert
1010     instruction needs them in elements {[0], [2]}, So we use two
1011     shift left double vector word immediates to get the elements
1012     lined up.  */
1013  temp = vec_vmrghw (a, a);
1014#endif
1015  __asm__(
1016      " xvcvspdp %x0,%x1"
1017      : "=wa" (result)
1018      : "wa" (temp)
1019      : );
1020  return (__m128d) result;
1021#endif
1022}
1023
1024extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025_mm_cvtsd_si32 (__m128d __A)
1026{
1027  __v2df rounded = vec_rint((__v2df) __A);
1028  int result = ((__v2df)rounded)[0];
1029
1030  return result;
1031}
1032/* Intel intrinsic.  */
1033extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034_mm_cvtsd_si64 (__m128d __A)
1035{
1036  __v2df rounded = vec_rint ((__v2df) __A );
1037  long long result = ((__v2df) rounded)[0];
1038
1039  return result;
1040}
1041
1042/* Microsoft intrinsic.  */
1043extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1044_mm_cvtsd_si64x (__m128d __A)
1045{
1046  return _mm_cvtsd_si64 ((__v2df)__A);
1047}
1048
1049extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050_mm_cvttsd_si32 (__m128d __A)
1051{
1052  int result = ((__v2df)__A)[0];
1053
1054  return result;
1055}
1056
1057/* Intel intrinsic.  */
1058extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059_mm_cvttsd_si64 (__m128d __A)
1060{
1061  long long result = ((__v2df)__A)[0];
1062
1063  return result;
1064}
1065
1066/* Microsoft intrinsic.  */
1067extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068_mm_cvttsd_si64x (__m128d __A)
1069{
1070  return _mm_cvttsd_si64 (__A);
1071}
1072
1073extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074_mm_cvtsd_ss (__m128 __A, __m128d __B)
1075{
1076  __v4sf result = (__v4sf)__A;
1077
1078#ifdef __LITTLE_ENDIAN__
1079  __v4sf temp_s;
1080  /* Copy double element[0] to element [1] for conversion.  */
1081  __v2df temp_b = vec_splat((__v2df)__B, 0);
1082
1083  /* Pre-rotate __A left 3 (logically right 1) elements.  */
1084  result = __builtin_vsx_xxsldwi (result, result, 3);
1085  /* Convert double to single float scalar in a vector.  */
1086  __asm__(
1087      "xscvdpsp %x0,%x1"
1088      : "=wa" (temp_s)
1089      : "wa" (temp_b)
1090      : );
1091  /* Shift the resulting scalar into vector element [0].  */
1092  result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1093#else
1094  result [0] = ((__v2df)__B)[0];
1095#endif
1096  return (__m128) result;
1097}
1098
1099extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1100_mm_cvtsi32_sd (__m128d __A, int __B)
1101{
1102  __v2df result = (__v2df)__A;
1103  double db = __B;
1104  result [0] = db;
1105  return (__m128d)result;
1106}
1107
1108/* Intel intrinsic.  */
1109extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1110_mm_cvtsi64_sd (__m128d __A, long long __B)
1111{
1112  __v2df result = (__v2df)__A;
1113  double db = __B;
1114  result [0] = db;
1115  return (__m128d)result;
1116}
1117
1118/* Microsoft intrinsic.  */
1119extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120_mm_cvtsi64x_sd (__m128d __A, long long __B)
1121{
1122  return _mm_cvtsi64_sd (__A, __B);
1123}
1124
1125extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126_mm_cvtss_sd (__m128d __A, __m128 __B)
1127{
1128#ifdef __LITTLE_ENDIAN__
1129  /* Use splat to move element [0] into position for the convert. */
1130  __v4sf temp = vec_splat ((__v4sf)__B, 0);
1131  __v2df res;
1132  /* Convert single float scalar to double in a vector.  */
1133  __asm__(
1134      "xscvspdp %x0,%x1"
1135      : "=wa" (res)
1136      : "wa" (temp)
1137      : );
1138  return (__m128d) vec_mergel (res, (__v2df)__A);
1139#else
1140  __v2df res = (__v2df)__A;
1141  res [0] = ((__v4sf)__B) [0];
1142  return (__m128d) res;
1143#endif
1144}
1145
1146extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1148{
1149  __vector double result;
1150  const int litmsk = __mask & 0x3;
1151
1152  if (litmsk == 0)
1153    result = vec_mergeh (__A, __B);
1154#if __GNUC__ < 6
1155  else if (litmsk == 1)
1156    result = vec_xxpermdi (__B, __A, 2);
1157  else if (litmsk == 2)
1158    result = vec_xxpermdi (__B, __A, 1);
1159#else
1160  else if (litmsk == 1)
1161    result = vec_xxpermdi (__A, __B, 2);
1162  else if (litmsk == 2)
1163    result = vec_xxpermdi (__A, __B, 1);
1164#endif
1165  else
1166    result = vec_mergel (__A, __B);
1167
1168  return result;
1169}
1170
1171extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1172_mm_unpackhi_pd (__m128d __A, __m128d __B)
1173{
1174  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1175}
1176
1177extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1178_mm_unpacklo_pd (__m128d __A, __m128d __B)
1179{
1180  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1181}
1182
1183extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184_mm_loadh_pd (__m128d __A, double const *__B)
1185{
1186  __v2df result = (__v2df)__A;
1187  result [1] = *__B;
1188  return (__m128d)result;
1189}
1190
1191extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1192_mm_loadl_pd (__m128d __A, double const *__B)
1193{
1194  __v2df result = (__v2df)__A;
1195  result [0] = *__B;
1196  return (__m128d)result;
1197}
1198
1199#ifdef _ARCH_PWR8
1200/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1201
1202/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
1203extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1204_mm_movemask_pd (__m128d  __A)
1205{
1206  __vector unsigned long long result;
1207  static const __vector unsigned int perm_mask =
1208    {
1209#ifdef __LITTLE_ENDIAN__
1210	0x80800040, 0x80808080, 0x80808080, 0x80808080
1211#else
1212      0x80808080, 0x80808080, 0x80808080, 0x80804000
1213#endif
1214    };
1215
1216  result = ((__vector unsigned long long)
1217	    vec_vbpermq ((__vector unsigned char) __A,
1218			 (__vector unsigned char) perm_mask));
1219
1220#ifdef __LITTLE_ENDIAN__
1221  return result[1];
1222#else
1223  return result[0];
1224#endif
1225}
1226#endif /* _ARCH_PWR8 */
1227
1228extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229_mm_packs_epi16 (__m128i __A, __m128i __B)
1230{
1231  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1232}
1233
1234extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1235_mm_packs_epi32 (__m128i __A, __m128i __B)
1236{
1237  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1238}
1239
1240extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1241_mm_packus_epi16 (__m128i __A, __m128i __B)
1242{
1243  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1244}
1245
1246extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1247_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1248{
1249  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1250}
1251
1252extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1254{
1255  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1256}
1257
1258extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1259_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1260{
1261  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1262}
1263
1264extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1265_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1266{
1267  return (__m128i) vec_mergel ((__vector long long) __A,
1268			       (__vector long long) __B);
1269}
1270
1271extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1273{
1274  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1275}
1276
1277extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1278_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1279{
1280  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1281}
1282
1283extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1284_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1285{
1286  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1287}
1288
1289extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1291{
1292  return (__m128i) vec_mergeh ((__vector long long) __A,
1293			       (__vector long long) __B);
1294}
1295
1296extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297_mm_add_epi8 (__m128i __A, __m128i __B)
1298{
1299  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1300}
1301
1302extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1303_mm_add_epi16 (__m128i __A, __m128i __B)
1304{
1305  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1306}
1307
1308extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309_mm_add_epi32 (__m128i __A, __m128i __B)
1310{
1311  return (__m128i) ((__v4su)__A + (__v4su)__B);
1312}
1313
1314extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315_mm_add_epi64 (__m128i __A, __m128i __B)
1316{
1317  return (__m128i) ((__v2du)__A + (__v2du)__B);
1318}
1319
1320extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1321_mm_adds_epi8 (__m128i __A, __m128i __B)
1322{
1323  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1324}
1325
1326extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1327_mm_adds_epi16 (__m128i __A, __m128i __B)
1328{
1329  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1330}
1331
1332extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1333_mm_adds_epu8 (__m128i __A, __m128i __B)
1334{
1335  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1336}
1337
1338extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339_mm_adds_epu16 (__m128i __A, __m128i __B)
1340{
1341  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1342}
1343
1344extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1345_mm_sub_epi8 (__m128i __A, __m128i __B)
1346{
1347  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1348}
1349
1350extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1351_mm_sub_epi16 (__m128i __A, __m128i __B)
1352{
1353  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1354}
1355
1356extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1357_mm_sub_epi32 (__m128i __A, __m128i __B)
1358{
1359  return (__m128i) ((__v4su)__A - (__v4su)__B);
1360}
1361
1362extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1363_mm_sub_epi64 (__m128i __A, __m128i __B)
1364{
1365  return (__m128i) ((__v2du)__A - (__v2du)__B);
1366}
1367
1368extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369_mm_subs_epi8 (__m128i __A, __m128i __B)
1370{
1371  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1372}
1373
1374extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1375_mm_subs_epi16 (__m128i __A, __m128i __B)
1376{
1377  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1378}
1379
1380extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1381_mm_subs_epu8 (__m128i __A, __m128i __B)
1382{
1383  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1384}
1385
1386extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387_mm_subs_epu16 (__m128i __A, __m128i __B)
1388{
1389  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1390}
1391
1392extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1393_mm_madd_epi16 (__m128i __A, __m128i __B)
1394{
1395  __vector signed int zero = {0, 0, 0, 0};
1396
1397  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1398}
1399
1400extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1401_mm_mulhi_epi16 (__m128i __A, __m128i __B)
1402{
1403  __vector signed int w0, w1;
1404
1405  __vector unsigned char xform1 = {
1406#ifdef __LITTLE_ENDIAN__
1407      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
1408      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
1409#else
1410      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
1411      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
1412#endif
1413    };
1414
1415  w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1416  w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1417  return (__m128i) vec_perm (w0, w1, xform1);
1418}
1419
1420extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421_mm_mullo_epi16 (__m128i __A, __m128i __B)
1422{
1423    return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1424}
1425
1426extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427_mm_mul_su32 (__m64 __A, __m64 __B)
1428{
1429  unsigned int a = __A;
1430  unsigned int b = __B;
1431
1432  return ((__m64)a * (__m64)b);
1433}
1434
1435extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1436_mm_mul_epu32 (__m128i __A, __m128i __B)
1437{
1438#if __GNUC__ < 8
1439  __v2du result;
1440
1441#ifdef __LITTLE_ENDIAN__
1442  /* VMX Vector Multiply Odd Unsigned Word.  */
1443  __asm__(
1444      "vmulouw %0,%1,%2"
1445      : "=v" (result)
1446      : "v" (__A), "v" (__B)
1447      : );
1448#else
1449  /* VMX Vector Multiply Even Unsigned Word.  */
1450  __asm__(
1451      "vmuleuw %0,%1,%2"
1452      : "=v" (result)
1453      : "v" (__A), "v" (__B)
1454      : );
1455#endif
1456  return (__m128i) result;
1457#else
1458  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1459#endif
1460}
1461
1462extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1463_mm_slli_epi16 (__m128i __A, int __B)
1464{
1465  __v8hu lshift;
1466  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1467
1468  if (__B >= 0 && __B < 16)
1469    {
1470      if (__builtin_constant_p(__B))
1471	lshift = (__v8hu) vec_splat_s16(__B);
1472      else
1473	lshift = vec_splats ((unsigned short) __B);
1474
1475      result = vec_sl ((__v8hi) __A, lshift);
1476    }
1477
1478  return (__m128i) result;
1479}
1480
1481extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1482_mm_slli_epi32 (__m128i __A, int __B)
1483{
1484  __v4su lshift;
1485  __v4si result = { 0, 0, 0, 0 };
1486
1487  if (__B >= 0 && __B < 32)
1488    {
1489      if (__builtin_constant_p(__B) && __B < 16)
1490	lshift = (__v4su) vec_splat_s32(__B);
1491      else
1492	lshift = vec_splats ((unsigned int) __B);
1493
1494      result = vec_sl ((__v4si) __A, lshift);
1495    }
1496
1497  return (__m128i) result;
1498}
1499
1500#ifdef _ARCH_PWR8
1501extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1502_mm_slli_epi64 (__m128i __A, int __B)
1503{
1504  __v2du lshift;
1505  __v2di result = { 0, 0 };
1506
1507  if (__B >= 0 && __B < 64)
1508    {
1509      if (__builtin_constant_p(__B) && __B < 16)
1510	lshift = (__v2du) vec_splat_s32(__B);
1511      else
1512	lshift = (__v2du) vec_splats ((unsigned int) __B);
1513
1514      result = vec_sl ((__v2di) __A, lshift);
1515    }
1516
1517  return (__m128i) result;
1518}
1519#endif
1520
1521extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1522_mm_srai_epi16 (__m128i __A, int __B)
1523{
1524  __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1525  __v8hi result;
1526
1527  if (__B < 16)
1528    {
1529      if (__builtin_constant_p(__B))
1530	rshift = (__v8hu) vec_splat_s16(__B);
1531      else
1532	rshift = vec_splats ((unsigned short) __B);
1533    }
1534  result = vec_sra ((__v8hi) __A, rshift);
1535
1536  return (__m128i) result;
1537}
1538
1539extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1540_mm_srai_epi32 (__m128i __A, int __B)
1541{
1542  __v4su rshift = { 31, 31, 31, 31 };
1543  __v4si result;
1544
1545  if (__B < 32)
1546    {
1547      if (__builtin_constant_p(__B))
1548	{
1549	  if (__B < 16)
1550	      rshift = (__v4su) vec_splat_s32(__B);
1551	    else
1552	      rshift = (__v4su) vec_splats((unsigned int)__B);
1553	}
1554      else
1555	rshift = vec_splats ((unsigned int) __B);
1556    }
1557  result = vec_sra ((__v4si) __A, rshift);
1558
1559  return (__m128i) result;
1560}
1561
1562extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1563_mm_bslli_si128 (__m128i __A, const int __N)
1564{
1565  __v16qu result;
1566  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1567
1568  if (__N < 16)
1569    result = vec_sld ((__v16qu) __A, zeros, __N);
1570  else
1571    result = zeros;
1572
1573  return (__m128i) result;
1574}
1575
1576extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577_mm_bsrli_si128 (__m128i __A, const int __N)
1578{
1579  __v16qu result;
1580  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1581
1582  if (__N < 16)
1583#ifdef __LITTLE_ENDIAN__
1584    if (__builtin_constant_p(__N))
1585      /* Would like to use Vector Shift Left Double by Octet
1586	 Immediate here to use the immediate form and avoid
1587	 load of __N * 8 value into a separate VR.  */
1588      result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1589    else
1590#endif
1591      {
1592	__v16qu shift = vec_splats((unsigned char)(__N*8));
1593#ifdef __LITTLE_ENDIAN__
1594	result = vec_sro ((__v16qu)__A, shift);
1595#else
1596	result = vec_slo ((__v16qu)__A, shift);
1597#endif
1598      }
1599  else
1600    result = zeros;
1601
1602  return (__m128i) result;
1603}
1604
1605extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1606_mm_srli_si128 (__m128i __A, const int __N)
1607{
1608  return _mm_bsrli_si128 (__A, __N);
1609}
1610
1611extern __inline  __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1612_mm_slli_si128 (__m128i __A, const int _imm5)
1613{
1614  __v16qu result;
1615  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1616
1617  if (_imm5 < 16)
1618#ifdef __LITTLE_ENDIAN__
1619    result = vec_sld ((__v16qu) __A, zeros, _imm5);
1620#else
1621    result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1622#endif
1623  else
1624    result = zeros;
1625
1626  return (__m128i) result;
1627}
1628
1629extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1630
1631_mm_srli_epi16 (__m128i  __A, int __B)
1632{
1633  __v8hu rshift;
1634  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1635
1636  if (__B < 16)
1637    {
1638      if (__builtin_constant_p(__B))
1639	rshift = (__v8hu) vec_splat_s16(__B);
1640      else
1641	rshift = vec_splats ((unsigned short) __B);
1642
1643      result = vec_sr ((__v8hi) __A, rshift);
1644    }
1645
1646  return (__m128i) result;
1647}
1648
1649extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1650_mm_srli_epi32 (__m128i __A, int __B)
1651{
1652  __v4su rshift;
1653  __v4si result = { 0, 0, 0, 0 };
1654
1655  if (__B < 32)
1656    {
1657      if (__builtin_constant_p(__B))
1658	{
1659	  if (__B < 16)
1660	      rshift = (__v4su) vec_splat_s32(__B);
1661	    else
1662	      rshift = (__v4su) vec_splats((unsigned int)__B);
1663	}
1664      else
1665	rshift = vec_splats ((unsigned int) __B);
1666
1667      result = vec_sr ((__v4si) __A, rshift);
1668    }
1669
1670  return (__m128i) result;
1671}
1672
1673#ifdef _ARCH_PWR8
1674extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1675_mm_srli_epi64 (__m128i __A, int __B)
1676{
1677  __v2du rshift;
1678  __v2di result = { 0, 0 };
1679
1680  if (__B < 64)
1681    {
1682      if (__builtin_constant_p(__B))
1683	{
1684	  if (__B < 16)
1685	      rshift = (__v2du) vec_splat_s32(__B);
1686	    else
1687	      rshift = (__v2du) vec_splats((unsigned long long)__B);
1688	}
1689      else
1690	rshift = (__v2du) vec_splats ((unsigned int) __B);
1691
1692      result = vec_sr ((__v2di) __A, rshift);
1693    }
1694
1695  return (__m128i) result;
1696}
1697#endif
1698
1699extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1700_mm_sll_epi16 (__m128i __A, __m128i __B)
1701{
1702  __v8hu lshift;
1703  __vector __bool short shmask;
1704  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1705  __v8hu result;
1706
1707#ifdef __LITTLE_ENDIAN__
1708  lshift = vec_splat ((__v8hu) __B, 0);
1709#else
1710  lshift = vec_splat ((__v8hu) __B, 3);
1711#endif
1712  shmask = vec_cmple (lshift, shmax);
1713  result = vec_sl ((__v8hu) __A, lshift);
1714  result = vec_sel ((__v8hu) shmask, result, shmask);
1715
1716  return (__m128i) result;
1717}
1718
1719extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720_mm_sll_epi32 (__m128i __A, __m128i __B)
1721{
1722  __v4su lshift;
1723  __vector __bool int shmask;
1724  const __v4su shmax = { 32, 32, 32, 32 };
1725  __v4su result;
1726#ifdef __LITTLE_ENDIAN__
1727  lshift = vec_splat ((__v4su) __B, 0);
1728#else
1729  lshift = vec_splat ((__v4su) __B, 1);
1730#endif
1731  shmask = vec_cmplt (lshift, shmax);
1732  result = vec_sl ((__v4su) __A, lshift);
1733  result = vec_sel ((__v4su) shmask, result, shmask);
1734
1735  return (__m128i) result;
1736}
1737
1738#ifdef _ARCH_PWR8
1739extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1740_mm_sll_epi64 (__m128i __A, __m128i __B)
1741{
1742  __v2du lshift;
1743  __vector __bool long long shmask;
1744  const __v2du shmax = { 64, 64 };
1745  __v2du result;
1746
1747  lshift = vec_splat ((__v2du) __B, 0);
1748  shmask = vec_cmplt (lshift, shmax);
1749  result = vec_sl ((__v2du) __A, lshift);
1750  result = vec_sel ((__v2du) shmask, result, shmask);
1751
1752  return (__m128i) result;
1753}
1754#endif
1755
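/* Arithmetic shift right of each 16-bit element of __A by the count in
   __B.  Clamping the count to 15 with vec_min matches the Intel behavior
   of filling out-of-range shifts with the sign bit.  */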
1756extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757_mm_sra_epi16 (__m128i __A, __m128i __B)
1758{
1759  const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1760  __v8hu rshift;
1761  __v8hi result;
1762
1763#ifdef __LITTLE_ENDIAN__
1764  rshift = vec_splat ((__v8hu)__B, 0);
1765#else
1766  rshift = vec_splat ((__v8hu)__B, 3);
1767#endif
1768  rshift = vec_min (rshift, rshmax);
1769  result = vec_sra ((__v8hi) __A, rshift);
1770
1771  return (__m128i) result;
1772}
1773
1774extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1775_mm_sra_epi32 (__m128i __A, __m128i __B)
1776{
1777  const __v4su rshmax = { 31, 31, 31, 31 };
1778  __v4su rshift;
1779  __v4si result;
1780
1781#ifdef __LITTLE_ENDIAN__
1782  rshift = vec_splat ((__v4su)__B, 0);
1783#else
1784  rshift = vec_splat ((__v4su)__B, 1);
1785#endif
1786  rshift = vec_min (rshift, rshmax);
1787  result = vec_sra ((__v4si) __A, rshift);
1788
1789  return (__m128i) result;
1790}
1791
1792extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1793_mm_srl_epi16 (__m128i __A, __m128i __B)
1794{
1795  __v8hu rshift;
1796  __vector __bool short shmask;
1797  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1798  __v8hu result;
1799
1800#ifdef __LITTLE_ENDIAN__
1801  rshift = vec_splat ((__v8hu) __B, 0);
1802#else
1803  rshift = vec_splat ((__v8hu) __B, 3);
1804#endif
1805  shmask = vec_cmple (rshift, shmax);
1806  result = vec_sr ((__v8hu) __A, rshift);
1807  result = vec_sel ((__v8hu) shmask, result, shmask);
1808
1809  return (__m128i) result;
1810}
1811
1812extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1813_mm_srl_epi32 (__m128i __A, __m128i __B)
1814{
1815  __v4su rshift;
1816  __vector __bool int shmask;
1817  const __v4su shmax = { 32, 32, 32, 32 };
1818  __v4su result;
1819
1820#ifdef __LITTLE_ENDIAN__
1821  rshift = vec_splat ((__v4su) __B, 0);
1822#else
1823  rshift = vec_splat ((__v4su) __B, 1);
1824#endif
1825  shmask = vec_cmplt (rshift, shmax);
1826  result = vec_sr ((__v4su) __A, rshift);
1827  result = vec_sel ((__v4su) shmask, result, shmask);
1828
1829  return (__m128i) result;
1830}
1831
1832#ifdef _ARCH_PWR8
1833extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1834_mm_srl_epi64 (__m128i __A, __m128i __B)
1835{
1836  __v2du rshift;
1837  __vector __bool long long shmask;
1838  const __v2du shmax = { 64, 64 };
1839  __v2du result;
1840
1841  rshift = vec_splat ((__v2du) __B, 0);
1842  shmask = vec_cmplt (rshift, shmax);
1843  result = vec_sr ((__v2du) __A, rshift);
1844  result = vec_sel ((__v2du) shmask, result, shmask);
1845
1846  return (__m128i) result;
1847}
1848#endif
1849
1850extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1851_mm_and_pd (__m128d __A, __m128d __B)
1852{
1853  return (vec_and ((__v2df) __A, (__v2df) __B));
1854}
1855
1856extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1857_mm_andnot_pd (__m128d __A, __m128d __B)
1858{
1859  return (vec_andc ((__v2df) __B, (__v2df) __A));
1860}
1861
1862extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1863_mm_or_pd (__m128d __A, __m128d __B)
1864{
1865  return (vec_or ((__v2df) __A, (__v2df) __B));
1866}
1867
1868extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1869_mm_xor_pd (__m128d __A, __m128d __B)
1870{
1871  return (vec_xor ((__v2df) __A, (__v2df) __B));
1872}
1873
1874extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1875_mm_and_si128 (__m128i __A, __m128i __B)
1876{
1877  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
1878}
1879
1880extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1881_mm_andnot_si128 (__m128i __A, __m128i __B)
1882{
1883  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
1884}
1885
1886extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1887_mm_or_si128 (__m128i __A, __m128i __B)
1888{
1889  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
1890}
1891
1892extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1893_mm_xor_si128 (__m128i __A, __m128i __B)
1894{
1895  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
1896}
1897
1898extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1899_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1900{
1901  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
1902}
1903
1904extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1905_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1906{
1907  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
1908}
1909
1910extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1911_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1912{
1913  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
1914}
1915
1916extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1917_mm_cmplt_epi8 (__m128i __A, __m128i __B)
1918{
1919  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
1920}
1921
1922extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1923_mm_cmplt_epi16 (__m128i __A, __m128i __B)
1924{
1925  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
1926}
1927
1928extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1929_mm_cmplt_epi32 (__m128i __A, __m128i __B)
1930{
1931  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
1932}
1933
1934extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1935_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1936{
1937  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
1938}
1939
1940extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1941_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1942{
1943  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
1944}
1945
1946extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1947_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1948{
1949  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
1950}
1951
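/* Extract the 16-bit element of __A selected by __N (modulo 8) and zero
   extend it, e.g. _mm_extract_epi16 (_mm_set1_epi16 (-1), 0) returns
   0xFFFF.  _mm_insert_epi16 below replaces the selected element with the
   low 16 bits of __D instead.  */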
1952extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1953_mm_extract_epi16 (__m128i const __A, int const __N)
1954{
1955  return (unsigned short) ((__v8hi)__A)[__N & 7];
1956}
1957
1958extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1959_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
1960{
1961  __v8hi result = (__v8hi)__A;
1962
1963  result [(__N & 7)] = __D;
1964
1965  return (__m128i) result;
1966}
1967
1968extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1969_mm_max_epi16 (__m128i __A, __m128i __B)
1970{
1971  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
1972}
1973
1974extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1975_mm_max_epu8 (__m128i __A, __m128i __B)
1976{
1977  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
1978}
1979
1980extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1981_mm_min_epi16 (__m128i __A, __m128i __B)
1982{
1983  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
1984}
1985
1986extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1987_mm_min_epu8 (__m128i __A, __m128i __B)
1988{
1989  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
1990}
1991
1992
1993#ifdef _ARCH_PWR8
1994/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1995
1996/* Creates a 16-bit mask from the most significant bit of each byte in A.  */
1997extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1998_mm_movemask_epi8 (__m128i __A)
1999{
2000  __vector unsigned long long result;
2001  static const __vector unsigned char perm_mask =
2002    {
2003	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
2004	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
2005    };
2006
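  /* vec_vbpermq gathers one bit of __A per selector byte (bit numbering
     is big-endian across the 128-bit value), collecting the sixteen byte
     sign bits into a single halfword; the endian-dependent element access
     below extracts that halfword.  */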
2007  result = ((__vector unsigned long long)
2008	    vec_vbpermq ((__vector unsigned char) __A,
2009			 (__vector unsigned char) perm_mask));
2010
2011#ifdef __LITTLE_ENDIAN__
2012  return result[1];
2013#else
2014  return result[0];
2015#endif
2016}
2017#endif /* _ARCH_PWR8 */
2018
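/* Multiply the unsigned 16-bit elements of __A and __B and keep the high
   16 bits of each 32-bit product.  vec_vmuleuh and vec_vmulouh form the
   full products of the even and odd elements; the vec_perm mask gathers
   the high halves back into element order.  */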
2019extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2020_mm_mulhi_epu16 (__m128i __A, __m128i __B)
2021{
2022  __v4su w0, w1;
2023  __v16qu xform1 = {
2024#ifdef __LITTLE_ENDIAN__
2025      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
2026      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
2027#else
2028      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
2029      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
2030#endif
2031    };
2032
2033  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
2034  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
2035  return (__m128i) vec_perm (w0, w1, xform1);
2036}
2037
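/* Rearrange the upper four 16-bit elements of __A according to the four
   2-bit fields of __mask while the lower four pass through unchanged.
   permute_selectors holds the byte indices of each candidate halfword;
   the variable part of the vec_perm control is built at run time.  */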
2038extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2039_mm_shufflehi_epi16 (__m128i __A, const int __mask)
2040{
2041  unsigned long element_selector_98 = __mask & 0x03;
2042  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
2043  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
2044  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
2045  static const unsigned short permute_selectors[4] =
2046    {
2047#ifdef __LITTLE_ENDIAN__
2048	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2049#else
2050	      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2051#endif
2052    };
2053  __v2du pmask =
2054#ifdef __LITTLE_ENDIAN__
2055      { 0x1716151413121110UL,  0UL};
2056#else
2057      { 0x1011121314151617UL,  0UL};
2058#endif
2059  __m64_union t;
2060  __v2du a, r;
2061
2062  t.as_short[0] = permute_selectors[element_selector_98];
2063  t.as_short[1] = permute_selectors[element_selector_BA];
2064  t.as_short[2] = permute_selectors[element_selector_DC];
2065  t.as_short[3] = permute_selectors[element_selector_FE];
2066  pmask[1] = t.as_m64;
2067  a = (__v2du)__A;
2068  r = vec_perm (a, a, (__vector unsigned char)pmask);
2069  return (__m128i) r;
2070}
2071
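/* As _mm_shufflehi_epi16 above, but rearranging the lower four 16-bit
   elements while the upper four pass through unchanged.  */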
2072extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2073_mm_shufflelo_epi16 (__m128i __A, const int __mask)
2074{
2075  unsigned long element_selector_10 = __mask & 0x03;
2076  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2077  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2078  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2079  static const unsigned short permute_selectors[4] =
2080    {
2081#ifdef __LITTLE_ENDIAN__
2082	      0x0100, 0x0302, 0x0504, 0x0706
2083#else
2084	      0x0001, 0x0203, 0x0405, 0x0607
2085#endif
2086    };
2087  __v2du pmask =
2088#ifdef __LITTLE_ENDIAN__
2089                 { 0UL,  0x1f1e1d1c1b1a1918UL};
2090#else
2091                 { 0UL,  0x18191a1b1c1d1e1fUL};
2092#endif
2093  __m64_union t;
2094  __v2du a, r;
2095  t.as_short[0] = permute_selectors[element_selector_10];
2096  t.as_short[1] = permute_selectors[element_selector_32];
2097  t.as_short[2] = permute_selectors[element_selector_54];
2098  t.as_short[3] = permute_selectors[element_selector_76];
2099  pmask[0] = t.as_m64;
2100  a = (__v2du)__A;
2101  r = vec_perm (a, a, (__vector unsigned char)pmask);
2102  return (__m128i) r;
2103}
2104
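/* Select four 32-bit elements of __A according to the four 2-bit fields
   of __mask, e.g. _mm_shuffle_epi32 (__A, _MM_SHUFFLE (0, 1, 2, 3))
   reverses the element order.  */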
2105extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2106_mm_shuffle_epi32 (__m128i __A, const int __mask)
2107{
2108  unsigned long element_selector_10 = __mask & 0x03;
2109  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2110  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2111  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2112  static const unsigned int permute_selectors[4] =
2113    {
2114#ifdef __LITTLE_ENDIAN__
2115	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2116#else
2117	0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2118#endif
2119    };
2120  __v4su t;
2121
2122  t[0] = permute_selectors[element_selector_10];
2123  t[1] = permute_selectors[element_selector_32];
2124  t[2] = permute_selectors[element_selector_54] + 0x10101010;
2125  t[3] = permute_selectors[element_selector_76] + 0x10101010;
2126  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
2127}
2128
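/* Store the bytes of __A whose corresponding byte in __B has its most
   significant bit set to the unaligned address __C, leaving the other
   bytes of the destination unchanged.  This is implemented as a
   read-modify-write of the full 16 bytes rather than a true masked
   store.  */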
2129extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2130_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
2131{
2132  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2133  __v16qu mask, tmp;
2134  __m128i_u *p = (__m128i_u*)__C;
2135
2136  tmp = (__v16qu)_mm_loadu_si128(p);
2137  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
2138  tmp = vec_sel (tmp, (__v16qu)__A, mask);
2139  _mm_storeu_si128 (p, (__m128i)tmp);
2140}
2141
2142extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2143_mm_avg_epu8 (__m128i __A, __m128i __B)
2144{
2145  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
2146}
2147
2148extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2149_mm_avg_epu16 (__m128i __A, __m128i __B)
2150{
2151  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
2152}
2153
2154
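/* Compute the sums of absolute differences of the unsigned bytes of __A
   and __B: each 8-byte half contributes one 16-bit sum, returned in the
   low 16 bits of the corresponding 64-bit element.  */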
2155extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2156_mm_sad_epu8 (__m128i __A, __m128i __B)
2157{
2158  __v16qu a, b;
2159  __v16qu vmin, vmax, vabsdiff;
2160  __v4si vsum;
2161  const __v4su zero = { 0, 0, 0, 0 };
2162  __v4si result;
2163
2164  a = (__v16qu) __A;
2165  b = (__v16qu) __B;
2166  vmin = vec_min (a, b);
2167  vmax = vec_max (a, b);
2168  vabsdiff = vec_sub (vmax, vmin);
2169  /* Sum four groups of bytes into integers.  */
2170  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
2171  /* Sum across four integers with two integer results.  */
2172  result = vec_sum2s (vsum, (__vector signed int) zero);
2173  /* Rotate the sums into the correct position.  */
2174#ifdef __LITTLE_ENDIAN__
2175  result = vec_sld (result, result, 4);
2176#else
2177  result = vec_sld (result, result, 6);
2178#endif
2180  return (__m128i) result;
2181}
2182
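/* Non-temporal stores.  PowerISA has no direct equivalent of the x86
   streaming stores, so these use dcbtstt (data cache block touch for
   store, transient) to hint that the line will not be reused, followed
   by an ordinary store.  */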
2183extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2184_mm_stream_si32 (int *__A, int __B)
2185{
2186  /* Use the data cache block touch for store transient.  */
2187  __asm__ (
2188    "dcbtstt 0,%0"
2189    :
2190    : "b" (__A)
2191    : "memory"
2192  );
2193  *__A = __B;
2194}
2195
2196extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2197_mm_stream_si64 (long long int *__A, long long int __B)
2198{
2199  /* Use the data cache block touch for store transient.  */
2200  __asm__ (
2201    "dcbtstt 0,%0"
2202    :
2203    : "b" (__A)
2204    : "memory"
2205  );
2206  *__A = __B;
2207}
2208
2209extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2210_mm_stream_si128 (__m128i *__A, __m128i __B)
2211{
2212  /* Use the data cache block touch for store transient.  */
2213  __asm__ (
2214    "dcbtstt 0,%0"
2215    :
2216    : "b" (__A)
2217    : "memory"
2218  );
2219  *__A = __B;
2220}
2221
2222extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2223_mm_stream_pd (double *__A, __m128d __B)
2224{
2225  /* Use the data cache block touch for store transient.  */
2226  __asm__ (
2227    "dcbtstt 0,%0"
2228    :
2229    : "b" (__A)
2230    : "memory"
2231  );
2232  *(__m128d*)__A = __B;
2233}
2234
2235extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2236_mm_clflush (void const *__A)
2237{
2238  /* Use the data cache block flush.  */
2239  __asm__ (
2240    "dcbf 0,%0"
2241    :
2242    : "b" (__A)
2243    : "memory"
2244  );
2245}
2246
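/* The x86 fences are mapped to the portable atomic thread fences, which
   on POWER typically expand to lwsync (release) and hwsync (sequentially
   consistent) respectively.  */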
2247extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2248_mm_lfence (void)
2249{
2250  /* Use light weight sync for load to load ordering.  */
2251  __atomic_thread_fence (__ATOMIC_RELEASE);
2252}
2253
2254extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2255_mm_mfence (void)
2256{
2257  /* Use heavy weight sync for any to any ordering.  */
2258  __atomic_thread_fence (__ATOMIC_SEQ_CST);
2259}
2260
2261extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2262_mm_cvtsi32_si128 (int __A)
2263{
2264  return _mm_set_epi32 (0, 0, 0, __A);
2265}
2266
2267extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2268_mm_cvtsi64_si128 (long long __A)
2269{
2270  return __extension__ (__m128i)(__v2di){ __A, 0LL };
2271}
2272
2273/* Microsoft intrinsic.  */
2274extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2275_mm_cvtsi64x_si128 (long long __A)
2276{
2277  return __extension__ (__m128i)(__v2di){ __A, 0LL };
2278}
2279
2280/* Casts between various SP, DP, INT vector types.  Note that these do no
2281   conversion of values, they just change the type.  */
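/* For example, _mm_castpd_si128 allows the mask produced by a
   double-precision compare to be used directly with the integer logical
   intrinsics such as _mm_and_si128, without moving any data.  */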
2282extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2283_mm_castpd_ps(__m128d __A)
2284{
2285  return (__m128) __A;
2286}
2287
2288extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2289_mm_castpd_si128(__m128d __A)
2290{
2291  return (__m128i) __A;
2292}
2293
2294extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2295_mm_castps_pd(__m128 __A)
2296{
2297  return (__m128d) __A;
2298}
2299
2300extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2301_mm_castps_si128(__m128 __A)
2302{
2303  return (__m128i) __A;
2304}
2305
2306extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2307_mm_castsi128_ps(__m128i __A)
2308{
2309  return (__m128) __A;
2310}
2311
2312extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2313_mm_castsi128_pd(__m128i __A)
2314{
2315  return (__m128d) __A;
2316}
2317
2318#endif /* EMMINTRIN_H_ */
2319