1/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11   User Guide and Reference, version 9.0.  */
12
13#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for vector
   floating-point SIMD operations.  However, scalar floating-point
   operations in vector (XMM) registers require the POWER8 VSX ISA
   (2.07) level.  There are differences in the data format and
   placement of floating-point scalars in the vector register, which
   require extra steps to match SSE2 scalar floating-point semantics
   on POWER.

   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers.  It is
   recommended to use the portable <fenv.h> interfaces instead of
   accessing the MXCSR directly.

   Most SSE2 scalar floating-point intrinsic operations can be
   performed more efficiently as C language scalar operations or
   optimized to use vector SIMD operations.  We recommend this for
   new applications (see the illustrative sketch below).
*/
32#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
33#endif
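
/* Illustrative sketch only (not part of this header's API): for new
   code, prefer plain C scalar arithmetic over SSE2 scalar intrinsics.
   Assuming hypothetical caller variables x and y, an SSE2 sequence
   such as

     __m128d a = _mm_set_sd (x);
     __m128d b = _mm_set_sd (y);
     double  r = _mm_cvtsd_f64 (_mm_add_sd (a, b));

   is usually better written as

     double r = x + y;

   which the compiler can optimize directly for VSX.  Likewise, prefer
   the portable <fenv.h> functions (e.g. fesetround) over code that
   pokes the x86 MXCSR.  */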
34
35#ifndef EMMINTRIN_H_
36#define EMMINTRIN_H_
37
38#if defined(__linux__) && defined(__ppc64__)
39
40#include <altivec.h>
41
42/* We need definitions from the SSE header files.  */
43#include <xmmintrin.h>
44
45/* SSE2 */
46typedef __vector double __v2df;
47typedef __vector long long __v2di;
48typedef __vector unsigned long long __v2du;
49typedef __vector int __v4si;
50typedef __vector unsigned int __v4su;
51typedef __vector short __v8hi;
52typedef __vector unsigned short __v8hu;
53typedef __vector signed char __v16qi;
54typedef __vector unsigned char __v16qu;
55
56/* The Intel API is flexible enough that we must allow aliasing with other
57   vector types, and their scalar components.  */
58typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
59typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
60
61/* Unaligned version of the same types.  */
62typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
63typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
64
/* Define a two-value permute mask.  */
66#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
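
/* Illustrative note: _MM_SHUFFLE2 (1, 0) expands to ((1 << 1) | 0),
   i.e. 2.  Used with _mm_shuffle_pd (defined below), that mask selects
   element [0] of the first operand for the low result element and
   element [1] of the second operand for the high result element.  */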
67
68/* Create a vector with element 0 as F and the rest zero.  */
69extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70_mm_set_sd (double __F)
71{
72  return __extension__ (__m128d){ __F, 0.0 };
73}
74
75/* Create a vector with both elements equal to F.  */
76extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
77_mm_set1_pd (double __F)
78{
79  return __extension__ (__m128d){ __F, __F };
80}
81
82extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83_mm_set_pd1 (double __F)
84{
85  return _mm_set1_pd (__F);
86}
87
88/* Create a vector with the lower value X and upper value W.  */
89extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90_mm_set_pd (double __W, double __X)
91{
92  return __extension__ (__m128d){ __X, __W };
93}
94
95/* Create a vector with the lower value W and upper value X.  */
96extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97_mm_setr_pd (double __W, double __X)
98{
99  return __extension__ (__m128d){ __W, __X };
100}
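
/* Illustrative note (hypothetical names lo/hi): _mm_set_pd (hi, lo)
   and _mm_setr_pd (lo, hi) both produce the vector { lo, hi }; that
   is, element [0] holds the low (right-most in Intel notation)
   double.  */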
101
102/* Create an undefined vector.  */
103extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
104_mm_undefined_pd (void)
105{
106  __m128d __Y = __Y;
107  return __Y;
108}
109
110/* Create a vector of zeros.  */
111extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
112_mm_setzero_pd (void)
113{
114  return (__m128d) vec_splats (0);
115}
116
117/* Sets the low DPFP value of A from the low value of B.  */
118extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119_mm_move_sd (__m128d __A, __m128d __B)
120{
121  __v2df result = (__v2df) __A;
122  result [0] = ((__v2df) __B)[0];
123  return (__m128d) result;
124}
125
126/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
127extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128_mm_load_pd (double const *__P)
129{
130  return ((__m128d)vec_ld(0, (__v16qu*)__P));
131}
132
133/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
134extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135_mm_loadu_pd (double const *__P)
136{
137  return (vec_vsx_ld(0, __P));
138}
139
/* Create a vector with both elements equal to *P.  */
141extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142_mm_load1_pd (double const *__P)
143{
144  return (vec_splats (*__P));
145}
146
147/* Create a vector with element 0 as *P and the rest zero.  */
148extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
149_mm_load_sd (double const *__P)
150{
151  return _mm_set_sd (*__P);
152}
153
154extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155_mm_load_pd1 (double const *__P)
156{
157  return _mm_load1_pd (__P);
158}
159
160/* Load two DPFP values in reverse order.  The address must be aligned.  */
161extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162_mm_loadr_pd (double const *__P)
163{
164  __v2df __tmp = _mm_load_pd (__P);
165  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
166}
167
168/* Store two DPFP values.  The address must be 16-byte aligned.  */
169extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
170_mm_store_pd (double *__P, __m128d __A)
171{
172  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
173}
174
175/* Store two DPFP values.  The address need not be 16-byte aligned.  */
176extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177_mm_storeu_pd (double *__P, __m128d __A)
178{
179  *(__m128d_u *)__P = __A;
180}
181
182/* Stores the lower DPFP value.  */
183extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184_mm_store_sd (double *__P, __m128d __A)
185{
186  *__P = ((__v2df)__A)[0];
187}
188
189extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190_mm_cvtsd_f64 (__m128d __A)
191{
192  return ((__v2df)__A)[0];
193}
194
195extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196_mm_storel_pd (double *__P, __m128d __A)
197{
198  _mm_store_sd (__P, __A);
199}
200
201/* Stores the upper DPFP value.  */
202extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
203_mm_storeh_pd (double *__P, __m128d __A)
204{
205  *__P = ((__v2df)__A)[1];
206}

/* Store the lower DPFP value across two words.
208   The address must be 16-byte aligned.  */
209extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
210_mm_store1_pd (double *__P, __m128d __A)
211{
212  _mm_store_pd (__P, vec_splat (__A, 0));
213}
214
215extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216_mm_store_pd1 (double *__P, __m128d __A)
217{
218  _mm_store1_pd (__P, __A);
219}
220
221/* Store two DPFP values in reverse order.  The address must be aligned.  */
222extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
223_mm_storer_pd (double *__P, __m128d __A)
224{
225  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
226}
227
228/* Intel intrinsic.  */
229extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
230_mm_cvtsi128_si64 (__m128i __A)
231{
232  return ((__v2di)__A)[0];
233}
234
235/* Microsoft intrinsic.  */
236extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
237_mm_cvtsi128_si64x (__m128i __A)
238{
239  return ((__v2di)__A)[0];
240}
241
242extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243_mm_add_pd (__m128d __A, __m128d __B)
244{
245  return (__m128d) ((__v2df)__A + (__v2df)__B);
246}
247
248/* Add the lower double-precision (64-bit) floating-point element in
249   a and b, store the result in the lower element of dst, and copy
250   the upper element from a to the upper element of dst. */
251extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
252_mm_add_sd (__m128d __A, __m128d __B)
253{
254  __A[0] = __A[0] + __B[0];
255  return (__A);
256}
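
/* Illustrative example with made-up values: given
     __m128d a = _mm_set_pd (10.0, 1.0);   (a is { 1.0, 10.0 })
     __m128d b = _mm_set_pd (20.0, 2.0);   (b is { 2.0, 20.0 })
   _mm_add_sd (a, b) yields { 3.0, 10.0 }: only the low elements are
   added; the high element is copied from a.  */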
257
258extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
259_mm_sub_pd (__m128d __A, __m128d __B)
260{
261  return (__m128d) ((__v2df)__A - (__v2df)__B);
262}
263
264extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265_mm_sub_sd (__m128d __A, __m128d __B)
266{
267  __A[0] = __A[0] - __B[0];
268  return (__A);
269}
270
271extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
272_mm_mul_pd (__m128d __A, __m128d __B)
273{
274  return (__m128d) ((__v2df)__A * (__v2df)__B);
275}
276
277extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278_mm_mul_sd (__m128d __A, __m128d __B)
279{
280  __A[0] = __A[0] * __B[0];
281  return (__A);
282}
283
284extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
285_mm_div_pd (__m128d __A, __m128d __B)
286{
287  return (__m128d) ((__v2df)__A / (__v2df)__B);
288}
289
290extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291_mm_div_sd (__m128d __A, __m128d __B)
292{
293  __A[0] = __A[0] / __B[0];
294  return (__A);
295}
296
297extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
298_mm_sqrt_pd (__m128d __A)
299{
300  return (vec_sqrt (__A));
301}
302
303/* Return pair {sqrt (B[0]), A[1]}.  */
304extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
305_mm_sqrt_sd (__m128d __A, __m128d __B)
306{
307  __v2df c;
308  c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
309  return (__m128d) _mm_setr_pd (c[0], __A[1]);
310}
311
312extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
313_mm_min_pd (__m128d __A, __m128d __B)
314{
315  return (vec_min (__A, __B));
316}
317
318extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
319_mm_min_sd (__m128d __A, __m128d __B)
320{
321  __v2df a, b, c;
322  a = vec_splats (__A[0]);
323  b = vec_splats (__B[0]);
324  c = vec_min (a, b);
325  return (__m128d) _mm_setr_pd (c[0], __A[1]);
326}
327
328extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329_mm_max_pd (__m128d __A, __m128d __B)
330{
331  return (vec_max (__A, __B));
332}
333
334extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335_mm_max_sd (__m128d __A, __m128d __B)
336{
337  __v2df a, b, c;
338  a = vec_splats (__A[0]);
339  b = vec_splats (__B[0]);
340  c = vec_max (a, b);
341  return (__m128d) _mm_setr_pd (c[0], __A[1]);
342}
343
344extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345_mm_cmpeq_pd (__m128d __A, __m128d __B)
346{
347  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
348}
349
350extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351_mm_cmplt_pd (__m128d __A, __m128d __B)
352{
353  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
354}
355
356extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357_mm_cmple_pd (__m128d __A, __m128d __B)
358{
359  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
360}
361
362extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363_mm_cmpgt_pd (__m128d __A, __m128d __B)
364{
365  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
366}
367
368extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369_mm_cmpge_pd (__m128d __A, __m128d __B)
370{
371  return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
372}
373
374extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
375_mm_cmpneq_pd (__m128d __A, __m128d __B)
376{
377  __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
378  return ((__m128d)vec_nor (temp, temp));
379}
380
381extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
382_mm_cmpnlt_pd (__m128d __A, __m128d __B)
383{
384  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
385}
386
387extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388_mm_cmpnle_pd (__m128d __A, __m128d __B)
389{
390  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
391}
392
393extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394_mm_cmpngt_pd (__m128d __A, __m128d __B)
395{
396  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
397}
398
399extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400_mm_cmpnge_pd (__m128d __A, __m128d __B)
401{
402  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
403}
404
405extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406_mm_cmpord_pd (__m128d __A, __m128d __B)
407{
408#if _ARCH_PWR8
409  __v2du c, d;
  /* Comparing a value with itself returns false (0's) if it is a NAN.  */
411  c = (__v2du)vec_cmpeq (__A, __A);
412  d = (__v2du)vec_cmpeq (__B, __B);
413#else
414  __v2du a, b;
415  __v2du c, d;
416  const __v2du double_exp_mask  = {0x7ff0000000000000, 0x7ff0000000000000};
417  a = (__v2du)vec_abs ((__v2df)__A);
418  b = (__v2du)vec_abs ((__v2df)__B);
419  c = (__v2du)vec_cmpgt (double_exp_mask, a);
420  d = (__v2du)vec_cmpgt (double_exp_mask, b);
421#endif
422  /* A != NAN and B != NAN.  */
423  return ((__m128d)vec_and(c, d));
424}
425
426extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
427_mm_cmpunord_pd (__m128d __A, __m128d __B)
428{
429#if _ARCH_PWR8
430  __v2du c, d;
  /* Comparing a value with itself returns false (0's) if it is a NAN.  */
432  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
433  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT (A != NAN) OR NOT (B != NAN).  */
436  c = vec_nor (c, c);
437  return ((__m128d)vec_orc(c, d));
438#else
439  __v2du c, d;
  /* Comparing a value with itself returns false (0's) if it is a NAN.  */
441  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
442  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Convert so that true ('1's) means the element is a NAN.  */
444  c = vec_nor (c, c);
445  d = vec_nor (d, d);
446  return ((__m128d)vec_or(c, d));
447#endif
448}
449
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
452{
453  __v2df a, b, c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So, to ensure we don't generate spurious exceptions
     (from the upper double values), we splat the lower double
     before we do the operation.  */
458  a = vec_splats (__A[0]);
459  b = vec_splats (__B[0]);
460  c = (__v2df) vec_cmpeq(a, b);
461  /* Then we merge the lower double result with the original upper
462     double from __A.  */
463  return (__m128d) _mm_setr_pd (c[0], __A[1]);
464}
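
/* Illustrative note: as with the x86 compare intrinsics, the scalar
   compares here return an all-ones bit mask in element [0] when the
   predicate holds and all zeros otherwise, while element [1] is
   copied unchanged from __A.  For example, with made-up values,
   _mm_cmpeq_sd applied to { 2.0, 7.0 } and { 2.0, 9.0 } produces
   { <all-ones bit pattern>, 7.0 }.  */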
465
466extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
467_mm_cmplt_sd (__m128d __A, __m128d __B)
468{
469  __v2df a, b, c;
470  a = vec_splats (__A[0]);
471  b = vec_splats (__B[0]);
472  c = (__v2df) vec_cmplt(a, b);
473  return (__m128d) _mm_setr_pd (c[0], __A[1]);
474}
475
476extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
477_mm_cmple_sd (__m128d __A, __m128d __B)
478{
479  __v2df a, b, c;
480  a = vec_splats (__A[0]);
481  b = vec_splats (__B[0]);
482  c = (__v2df) vec_cmple(a, b);
483  return (__m128d) _mm_setr_pd (c[0], __A[1]);
484}
485
486extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
487_mm_cmpgt_sd (__m128d __A, __m128d __B)
488{
489  __v2df a, b, c;
490  a = vec_splats (__A[0]);
491  b = vec_splats (__B[0]);
492  c = (__v2df) vec_cmpgt(a, b);
493  return (__m128d) _mm_setr_pd (c[0], __A[1]);
494}
495
496extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
497_mm_cmpge_sd (__m128d __A, __m128d __B)
498{
499  __v2df a, b, c;
500  a = vec_splats (__A[0]);
501  b = vec_splats (__B[0]);
502  c = (__v2df) vec_cmpge(a, b);
503  return (__m128d) _mm_setr_pd (c[0], __A[1]);
504}
505
506extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507_mm_cmpneq_sd (__m128d __A, __m128d __B)
508{
509  __v2df a, b, c;
510  a = vec_splats (__A[0]);
511  b = vec_splats (__B[0]);
512  c = (__v2df) vec_cmpeq(a, b);
513  c = vec_nor (c, c);
514  return (__m128d) _mm_setr_pd (c[0], __A[1]);
515}
516
517extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
518_mm_cmpnlt_sd (__m128d __A, __m128d __B)
519{
520  __v2df a, b, c;
521  a = vec_splats (__A[0]);
522  b = vec_splats (__B[0]);
523  /* Not less than is just greater than or equal.  */
524  c = (__v2df) vec_cmpge(a, b);
525  return (__m128d) _mm_setr_pd (c[0], __A[1]);
526}
527
528extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529_mm_cmpnle_sd (__m128d __A, __m128d __B)
530{
531  __v2df a, b, c;
532  a = vec_splats (__A[0]);
533  b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  c = (__v2df) vec_cmpgt(a, b);
536  return (__m128d) _mm_setr_pd (c[0], __A[1]);
537}
538
539extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
540_mm_cmpngt_sd (__m128d __A, __m128d __B)
541{
542  __v2df a, b, c;
543  a = vec_splats (__A[0]);
544  b = vec_splats (__B[0]);
545  /* Not greater than is just less than or equal.  */
546  c = (__v2df) vec_cmple(a, b);
547  return (__m128d) _mm_setr_pd (c[0], __A[1]);
548}
549
550extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551_mm_cmpnge_sd (__m128d __A, __m128d __B)
552{
553  __v2df a, b, c;
554  a = vec_splats (__A[0]);
555  b = vec_splats (__B[0]);
556  /* Not greater than or equal is just less than.  */
557  c = (__v2df) vec_cmplt(a, b);
558  return (__m128d) _mm_setr_pd (c[0], __A[1]);
559}
560
561extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
562_mm_cmpord_sd (__m128d __A, __m128d __B)
563{
564  __v2df r;
565  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
566  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
567}
568
569extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
570_mm_cmpunord_sd (__m128d __A, __m128d __B)
571{
572  __v2df r;
573  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
574  return (__m128d) _mm_setr_pd (r[0], __A[1]);
575}
576
/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
584extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585_mm_comieq_sd (__m128d __A, __m128d __B)
586{
587  return (__A[0] == __B[0]);
588}
589
590extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591_mm_comilt_sd (__m128d __A, __m128d __B)
592{
593  return (__A[0] < __B[0]);
594}
595
596extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
597_mm_comile_sd (__m128d __A, __m128d __B)
598{
599  return (__A[0] <= __B[0]);
600}
601
602extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
603_mm_comigt_sd (__m128d __A, __m128d __B)
604{
605  return (__A[0] > __B[0]);
606}
607
608extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
609_mm_comige_sd (__m128d __A, __m128d __B)
610{
611  return (__A[0] >= __B[0]);
612}
613
614extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
615_mm_comineq_sd (__m128d __A, __m128d __B)
616{
617  return (__A[0] != __B[0]);
618}
619
620extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
621_mm_ucomieq_sd (__m128d __A, __m128d __B)
622{
  return (__A[0] == __B[0]);
624}
625
626extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
627_mm_ucomilt_sd (__m128d __A, __m128d __B)
628{
  return (__A[0] < __B[0]);
630}
631
632extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
633_mm_ucomile_sd (__m128d __A, __m128d __B)
634{
  return (__A[0] <= __B[0]);
636}
637
638extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
639_mm_ucomigt_sd (__m128d __A, __m128d __B)
640{
  return (__A[0] > __B[0]);
642}
643
644extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
645_mm_ucomige_sd (__m128d __A, __m128d __B)
646{
  return (__A[0] >= __B[0]);
648}
649
650extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
651_mm_ucomineq_sd (__m128d __A, __m128d __B)
652{
653  return (__A[0] != __B[0]);
654}
655
656/* Create a vector of Qi, where i is the element number.  */
657extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
658_mm_set_epi64x (long long __q1, long long __q0)
659{
660  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
661}
662
663extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664_mm_set_epi64 (__m64 __q1,  __m64 __q0)
665{
666  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
667}
668
669extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
670_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
671{
672  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
673}
674
675extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
676_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
677	       short __q3, short __q2, short __q1, short __q0)
678{
679  return __extension__ (__m128i)(__v8hi){
680    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
681}
682
683extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
684_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
685	      char __q11, char __q10, char __q09, char __q08,
686	      char __q07, char __q06, char __q05, char __q04,
687	      char __q03, char __q02, char __q01, char __q00)
688{
689  return __extension__ (__m128i)(__v16qi){
690    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
691    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
692  };
693}
694
695/* Set all of the elements of the vector to A.  */
696extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697_mm_set1_epi64x (long long __A)
698{
699  return _mm_set_epi64x (__A, __A);
700}
701
702extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703_mm_set1_epi64 (__m64 __A)
704{
705  return _mm_set_epi64 (__A, __A);
706}
707
708extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709_mm_set1_epi32 (int __A)
710{
711  return _mm_set_epi32 (__A, __A, __A, __A);
712}
713
714extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
715_mm_set1_epi16 (short __A)
716{
717  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
718}
719
720extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
721_mm_set1_epi8 (char __A)
722{
723  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
724		       __A, __A, __A, __A, __A, __A, __A, __A);
725}
726
727/* Create a vector of Qi, where i is the element number.
728   The parameter order is reversed from the _mm_set_epi* functions.  */
729extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
730_mm_setr_epi64 (__m64 __q0, __m64 __q1)
731{
732  return _mm_set_epi64 (__q1, __q0);
733}
734
735extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
737{
738  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
739}
740
741extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
742_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
743	        short __q4, short __q5, short __q6, short __q7)
744{
745  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
746}
747
748extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
749_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
750	       char __q04, char __q05, char __q06, char __q07,
751	       char __q08, char __q09, char __q10, char __q11,
752	       char __q12, char __q13, char __q14, char __q15)
753{
754  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
755		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
756}
757
/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
759extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
760_mm_load_si128 (__m128i const *__P)
761{
762  return *__P;
763}
764
765extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
766_mm_loadu_si128 (__m128i_u const *__P)
767{
768  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
769}
770
771extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772_mm_loadl_epi64 (__m128i_u const *__P)
773{
774  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
775}
776
777extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778_mm_store_si128 (__m128i *__P, __m128i __B)
779{
780  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
781}
782
783extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
784_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
785{
786  *__P = __B;
787}
788
789extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
790_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
791{
792  *(long long *)__P = ((__v2di)__B)[0];
793}
794
795extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796_mm_movepi64_pi64 (__m128i_u __B)
797{
798  return (__m64) ((__v2di)__B)[0];
799}
800
801extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
802_mm_movpi64_epi64 (__m64 __A)
803{
804  return _mm_set_epi64 ((__m64)0LL, __A);
805}
806
807extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
808_mm_move_epi64 (__m128i __A)
809{
810  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
811}
812
813/* Create an undefined vector.  */
814extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815_mm_undefined_si128 (void)
816{
817  __m128i __Y = __Y;
818  return __Y;
819}
820
821/* Create a vector of zeros.  */
822extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
823_mm_setzero_si128 (void)
824{
825  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
826}
827
828#ifdef _ARCH_PWR8
829extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
830_mm_cvtepi32_pd (__m128i __A)
831{
832  __v2di val;
  /* For LE we need to generate Vector Unpack Low Signed Word,
     which vec_unpackh generates.  */
835  val = (__v2di)vec_unpackh ((__v4si)__A);
836
837  return (__m128d)vec_ctf (val, 0);
838}
839#endif
840
841extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842_mm_cvtepi32_ps (__m128i __A)
843{
844  return ((__m128)vec_ctf((__v4si)__A, 0));
845}
846
847extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
848_mm_cvtpd_epi32 (__m128d __A)
849{
850  __v2df rounded = vec_rint (__A);
851  __v4si result, temp;
852  const __v4si vzero =
853    { 0, 0, 0, 0 };
854
855  /* VSX Vector truncate Double-Precision to integer and Convert to
856   Signed Integer Word format with Saturate.  */
857  __asm__(
858      "xvcvdpsxws %x0,%x1"
859      : "=wa" (temp)
860      : "wa" (rounded)
861      : );
862
863#ifdef _ARCH_PWR8
864  temp = vec_mergeo (temp, temp);
865  result = (__v4si) vec_vpkudum ((__vector long long) temp,
866				 (__vector long long) vzero);
867#else
868  {
869    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
870	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
871    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
872  }
873#endif
874  return (__m128i) result;
875}
876
877extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
878_mm_cvtpd_pi32 (__m128d __A)
879{
880  __m128i result = _mm_cvtpd_epi32(__A);
881
882  return (__m64) result[0];
883}
884
885extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
886_mm_cvtpd_ps (__m128d __A)
887{
888  __v4sf result;
889  __v4si temp;
890  const __v4si vzero = { 0, 0, 0, 0 };
891
892  __asm__(
893      "xvcvdpsp %x0,%x1"
894      : "=wa" (temp)
895      : "wa" (__A)
896      : );
897
898#ifdef _ARCH_PWR8
899  temp = vec_mergeo (temp, temp);
900  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
901				 (__vector long long) vzero);
902#else
903  {
904    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
905	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
906    result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
907  }
908#endif
909  return ((__m128)result);
910}
911
912extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
913_mm_cvttpd_epi32 (__m128d __A)
914{
915  __v4si result;
916  __v4si temp;
917  const __v4si vzero = { 0, 0, 0, 0 };
918
919  /* VSX Vector truncate Double-Precision to integer and Convert to
920   Signed Integer Word format with Saturate.  */
921  __asm__(
922      "xvcvdpsxws %x0,%x1"
923      : "=wa" (temp)
924      : "wa" (__A)
925      : );
926
927#ifdef _ARCH_PWR8
928  temp = vec_mergeo (temp, temp);
929  result = (__v4si) vec_vpkudum ((__vector long long) temp,
930				 (__vector long long) vzero);
931#else
932  {
933    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
934	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
935    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
936  }
937#endif
938
939  return ((__m128i) result);
940}
941
942extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
943_mm_cvttpd_pi32 (__m128d __A)
944{
945  __m128i result = _mm_cvttpd_epi32 (__A);
946
947  return (__m64) result[0];
948}
949
950extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
951_mm_cvtsi128_si32 (__m128i __A)
952{
953  return ((__v4si)__A)[0];
954}
955
956#ifdef _ARCH_PWR8
957extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958_mm_cvtpi32_pd (__m64 __A)
959{
960  __v4si temp;
961  __v2di tmp2;
962  __v2df result;
963
964  temp = (__v4si)vec_splats (__A);
965  tmp2 = (__v2di)vec_unpackl (temp);
966  result = vec_ctf ((__vector signed long long) tmp2, 0);
967  return (__m128d)result;
968}
969#endif
970
971extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
972_mm_cvtps_epi32 (__m128 __A)
973{
974  __v4sf rounded;
975  __v4si result;
976
977  rounded = vec_rint((__v4sf) __A);
978  result = vec_cts (rounded, 0);
979  return (__m128i) result;
980}
981
982extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983_mm_cvttps_epi32 (__m128 __A)
984{
985  __v4si result;
986
987  result = vec_cts ((__v4sf) __A, 0);
988  return (__m128i) result;
989}
990
991extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992_mm_cvtps_pd (__m128 __A)
993{
994  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
995#ifdef vec_doubleh
996  return (__m128d) vec_doubleh ((__v4sf)__A);
997#else
  /* Otherwise the compiler is not current, so we need to generate the
     equivalent code.  */
1000  __v4sf a = (__v4sf)__A;
1001  __v4sf temp;
1002  __v2df result;
1003#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
1008  temp = __builtin_vsx_xxsldwi (a, a, 3);
1009  temp = __builtin_vsx_xxsldwi (a, temp, 2);
1010#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we merge the
     vector with itself (vec_vmrghw) to get the elements lined up.  */
1015  temp = vec_vmrghw (a, a);
1016#endif
1017  __asm__(
1018      " xvcvspdp %x0,%x1"
1019      : "=wa" (result)
1020      : "wa" (temp)
1021      : );
1022  return (__m128d) result;
1023#endif
1024}
1025
1026extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1027_mm_cvtsd_si32 (__m128d __A)
1028{
1029  __v2df rounded = vec_rint((__v2df) __A);
1030  int result = ((__v2df)rounded)[0];
1031
1032  return result;
1033}

/* Intel intrinsic.  */
1035extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036_mm_cvtsd_si64 (__m128d __A)
1037{
1038  __v2df rounded = vec_rint ((__v2df) __A );
1039  long long result = ((__v2df) rounded)[0];
1040
1041  return result;
1042}
1043
1044/* Microsoft intrinsic.  */
1045extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1046_mm_cvtsd_si64x (__m128d __A)
1047{
1048  return _mm_cvtsd_si64 ((__v2df)__A);
1049}
1050
1051extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1052_mm_cvttsd_si32 (__m128d __A)
1053{
1054  int result = ((__v2df)__A)[0];
1055
1056  return result;
1057}
1058
1059/* Intel intrinsic.  */
1060extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1061_mm_cvttsd_si64 (__m128d __A)
1062{
1063  long long result = ((__v2df)__A)[0];
1064
1065  return result;
1066}
1067
1068/* Microsoft intrinsic.  */
1069extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1070_mm_cvttsd_si64x (__m128d __A)
1071{
1072  return _mm_cvttsd_si64 (__A);
1073}
1074
1075extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1076_mm_cvtsd_ss (__m128 __A, __m128d __B)
1077{
1078  __v4sf result = (__v4sf)__A;
1079
1080#ifdef __LITTLE_ENDIAN__
1081  __v4sf temp_s;
1082  /* Copy double element[0] to element [1] for conversion.  */
1083  __v2df temp_b = vec_splat((__v2df)__B, 0);
1084
1085  /* Pre-rotate __A left 3 (logically right 1) elements.  */
1086  result = __builtin_vsx_xxsldwi (result, result, 3);
1087  /* Convert double to single float scalar in a vector.  */
1088  __asm__(
1089      "xscvdpsp %x0,%x1"
1090      : "=wa" (temp_s)
1091      : "wa" (temp_b)
1092      : );
1093  /* Shift the resulting scalar into vector element [0].  */
1094  result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1095#else
1096  result [0] = ((__v2df)__B)[0];
1097#endif
1098  return (__m128) result;
1099}
1100
1101extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102_mm_cvtsi32_sd (__m128d __A, int __B)
1103{
1104  __v2df result = (__v2df)__A;
1105  double db = __B;
1106  result [0] = db;
1107  return (__m128d)result;
1108}
1109
1110/* Intel intrinsic.  */
1111extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1112_mm_cvtsi64_sd (__m128d __A, long long __B)
1113{
1114  __v2df result = (__v2df)__A;
1115  double db = __B;
1116  result [0] = db;
1117  return (__m128d)result;
1118}
1119
1120/* Microsoft intrinsic.  */
1121extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1122_mm_cvtsi64x_sd (__m128d __A, long long __B)
1123{
1124  return _mm_cvtsi64_sd (__A, __B);
1125}
1126
1127extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1128_mm_cvtss_sd (__m128d __A, __m128 __B)
1129{
1130#ifdef __LITTLE_ENDIAN__
1131  /* Use splat to move element [0] into position for the convert. */
1132  __v4sf temp = vec_splat ((__v4sf)__B, 0);
1133  __v2df res;
1134  /* Convert single float scalar to double in a vector.  */
1135  __asm__(
1136      "xscvspdp %x0,%x1"
1137      : "=wa" (res)
1138      : "wa" (temp)
1139      : );
1140  return (__m128d) vec_mergel (res, (__v2df)__A);
1141#else
1142  __v2df res = (__v2df)__A;
1143  res [0] = ((__v4sf)__B) [0];
1144  return (__m128d) res;
1145#endif
1146}
1147
1148extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1149_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1150{
1151  __vector double result;
1152  const int litmsk = __mask & 0x3;
1153
1154  if (litmsk == 0)
1155    result = vec_mergeh (__A, __B);
1156#if __GNUC__ < 6
1157  else if (litmsk == 1)
1158    result = vec_xxpermdi (__B, __A, 2);
1159  else if (litmsk == 2)
1160    result = vec_xxpermdi (__B, __A, 1);
1161#else
1162  else if (litmsk == 1)
1163    result = vec_xxpermdi (__A, __B, 2);
1164  else if (litmsk == 2)
1165    result = vec_xxpermdi (__A, __B, 1);
1166#endif
1167  else
1168    result = vec_mergel (__A, __B);
1169
1170  return result;
1171}
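
/* Illustrative mapping of the two-bit mask above (x86 semantics):
     mask 0 -> { __A[0], __B[0] }
     mask 1 -> { __A[1], __B[0] }
     mask 2 -> { __A[0], __B[1] }
     mask 3 -> { __A[1], __B[1] }
   For example, _mm_shuffle_pd (__A, __B, _MM_SHUFFLE2 (1, 0)) selects
   { __A[0], __B[1] }.  */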
1172
1173extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1174_mm_unpackhi_pd (__m128d __A, __m128d __B)
1175{
1176  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1177}
1178
1179extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1180_mm_unpacklo_pd (__m128d __A, __m128d __B)
1181{
1182  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1183}
1184
1185extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1186_mm_loadh_pd (__m128d __A, double const *__B)
1187{
1188  __v2df result = (__v2df)__A;
1189  result [1] = *__B;
1190  return (__m128d)result;
1191}
1192
1193extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1194_mm_loadl_pd (__m128d __A, double const *__B)
1195{
1196  __v2df result = (__v2df)__A;
1197  result [0] = *__B;
1198  return (__m128d)result;
1199}
1200
1201#ifdef _ARCH_PWR8
1202/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1203
1204/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
1205extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206_mm_movemask_pd (__m128d  __A)
1207{
1208  __vector unsigned long long result;
1209  static const __vector unsigned int perm_mask =
1210    {
1211#ifdef __LITTLE_ENDIAN__
1212	0x80800040, 0x80808080, 0x80808080, 0x80808080
1213#else
1214      0x80808080, 0x80808080, 0x80808080, 0x80804000
1215#endif
1216    };
1217
1218  result = ((__vector unsigned long long)
1219	    vec_vbpermq ((__vector unsigned char) __A,
1220			 (__vector unsigned char) perm_mask));
1221
1222#ifdef __LITTLE_ENDIAN__
1223  return result[1];
1224#else
1225  return result[0];
1226#endif
1227}
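
/* Illustrative example with made-up values:
   _mm_movemask_pd (_mm_set_pd (1.0, -2.0)) returns 0x1, because only
   the low element (-2.0) has its sign bit set; bit 0 of the result
   corresponds to element [0] and bit 1 to element [1].  */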
1228#endif /* _ARCH_PWR8 */
1229
1230extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1231_mm_packs_epi16 (__m128i __A, __m128i __B)
1232{
1233  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1234}
1235
1236extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237_mm_packs_epi32 (__m128i __A, __m128i __B)
1238{
1239  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1240}
1241
1242extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1243_mm_packus_epi16 (__m128i __A, __m128i __B)
1244{
1245  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1246}
1247
1248extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1249_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1250{
1251  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1252}
1253
1254extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1256{
1257  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1258}
1259
1260extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1261_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1262{
1263  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1264}
1265
1266extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1267_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1268{
1269  return (__m128i) vec_mergel ((__vector long long) __A,
1270			       (__vector long long) __B);
1271}
1272
1273extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1275{
1276  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1277}
1278
1279extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1281{
1282  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1283}
1284
1285extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1287{
1288  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1289}
1290
1291extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1293{
1294  return (__m128i) vec_mergeh ((__vector long long) __A,
1295			       (__vector long long) __B);
1296}
1297
1298extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299_mm_add_epi8 (__m128i __A, __m128i __B)
1300{
1301  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1302}
1303
1304extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305_mm_add_epi16 (__m128i __A, __m128i __B)
1306{
1307  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1308}
1309
1310extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311_mm_add_epi32 (__m128i __A, __m128i __B)
1312{
1313  return (__m128i) ((__v4su)__A + (__v4su)__B);
1314}
1315
1316extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317_mm_add_epi64 (__m128i __A, __m128i __B)
1318{
1319  return (__m128i) ((__v2du)__A + (__v2du)__B);
1320}
1321
1322extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323_mm_adds_epi8 (__m128i __A, __m128i __B)
1324{
1325  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1326}
1327
1328extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329_mm_adds_epi16 (__m128i __A, __m128i __B)
1330{
1331  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1332}
1333
1334extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335_mm_adds_epu8 (__m128i __A, __m128i __B)
1336{
1337  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1338}
1339
1340extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341_mm_adds_epu16 (__m128i __A, __m128i __B)
1342{
1343  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1344}
1345
1346extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347_mm_sub_epi8 (__m128i __A, __m128i __B)
1348{
1349  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1350}
1351
1352extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353_mm_sub_epi16 (__m128i __A, __m128i __B)
1354{
1355  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1356}
1357
1358extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359_mm_sub_epi32 (__m128i __A, __m128i __B)
1360{
1361  return (__m128i) ((__v4su)__A - (__v4su)__B);
1362}
1363
1364extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365_mm_sub_epi64 (__m128i __A, __m128i __B)
1366{
1367  return (__m128i) ((__v2du)__A - (__v2du)__B);
1368}
1369
1370extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371_mm_subs_epi8 (__m128i __A, __m128i __B)
1372{
1373  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1374}
1375
1376extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377_mm_subs_epi16 (__m128i __A, __m128i __B)
1378{
1379  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1380}
1381
1382extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383_mm_subs_epu8 (__m128i __A, __m128i __B)
1384{
1385  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1386}
1387
1388extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1389_mm_subs_epu16 (__m128i __A, __m128i __B)
1390{
1391  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1392}
1393
1394extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1395_mm_madd_epi16 (__m128i __A, __m128i __B)
1396{
1397  __vector signed int zero = {0, 0, 0, 0};
1398
1399  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1400}
1401
1402extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403_mm_mulhi_epi16 (__m128i __A, __m128i __B)
1404{
1405  __vector signed int w0, w1;
1406
1407  __vector unsigned char xform1 = {
1408#ifdef __LITTLE_ENDIAN__
1409      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
1410      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
1411#else
1412      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
1413      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
1414#endif
1415    };
1416
1417  w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1418  w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1419  return (__m128i) vec_perm (w0, w1, xform1);
1420}
1421
1422extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1423_mm_mullo_epi16 (__m128i __A, __m128i __B)
1424{
1425    return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1426}
1427
1428extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1429_mm_mul_su32 (__m64 __A, __m64 __B)
1430{
1431  unsigned int a = __A;
1432  unsigned int b = __B;
1433
1434  return ((__m64)a * (__m64)b);
1435}
1436
1437extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1438_mm_mul_epu32 (__m128i __A, __m128i __B)
1439{
1440#if __GNUC__ < 8
1441  __v2du result;
1442
1443#ifdef __LITTLE_ENDIAN__
1444  /* VMX Vector Multiply Odd Unsigned Word.  */
1445  __asm__(
1446      "vmulouw %0,%1,%2"
1447      : "=v" (result)
1448      : "v" (__A), "v" (__B)
1449      : );
1450#else
1451  /* VMX Vector Multiply Even Unsigned Word.  */
1452  __asm__(
1453      "vmuleuw %0,%1,%2"
1454      : "=v" (result)
1455      : "v" (__A), "v" (__B)
1456      : );
1457#endif
1458  return (__m128i) result;
1459#else
1460  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1461#endif
1462}
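
/* Illustrative example with made-up values: given
     __m128i a = _mm_setr_epi32 (1, 9, 3, 9);
     __m128i b = _mm_setr_epi32 (5, 9, 7, 9);
   _mm_mul_epu32 (a, b) yields the two 64-bit products { 5, 21 }; only
   the low 32-bit element of each 64-bit lane participates.  */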
1463
1464extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1465_mm_slli_epi16 (__m128i __A, int __B)
1466{
1467  __v8hu lshift;
1468  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1469
1470  if (__B >= 0 && __B < 16)
1471    {
1472      if (__builtin_constant_p(__B))
1473	lshift = (__v8hu) vec_splat_s16(__B);
1474      else
1475	lshift = vec_splats ((unsigned short) __B);
1476
1477      result = vec_sl ((__v8hi) __A, lshift);
1478    }
1479
1480  return (__m128i) result;
1481}
1482
1483extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1484_mm_slli_epi32 (__m128i __A, int __B)
1485{
1486  __v4su lshift;
1487  __v4si result = { 0, 0, 0, 0 };
1488
1489  if (__B >= 0 && __B < 32)
1490    {
1491      if (__builtin_constant_p(__B) && __B < 16)
1492	lshift = (__v4su) vec_splat_s32(__B);
1493      else
1494	lshift = vec_splats ((unsigned int) __B);
1495
1496      result = vec_sl ((__v4si) __A, lshift);
1497    }
1498
1499  return (__m128i) result;
1500}
1501
1502#ifdef _ARCH_PWR8
1503extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1504_mm_slli_epi64 (__m128i __A, int __B)
1505{
1506  __v2du lshift;
1507  __v2di result = { 0, 0 };
1508
1509  if (__B >= 0 && __B < 64)
1510    {
1511      if (__builtin_constant_p(__B) && __B < 16)
1512	lshift = (__v2du) vec_splat_s32(__B);
1513      else
1514	lshift = (__v2du) vec_splats ((unsigned int) __B);
1515
1516      result = vec_sl ((__v2di) __A, lshift);
1517    }
1518
1519  return (__m128i) result;
1520}
1521#endif
1522
1523extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1524_mm_srai_epi16 (__m128i __A, int __B)
1525{
1526  __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1527  __v8hi result;
1528
1529  if (__B < 16)
1530    {
1531      if (__builtin_constant_p(__B))
1532	rshift = (__v8hu) vec_splat_s16(__B);
1533      else
1534	rshift = vec_splats ((unsigned short) __B);
1535    }
1536  result = vec_sra ((__v8hi) __A, rshift);
1537
1538  return (__m128i) result;
1539}
1540
1541extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1542_mm_srai_epi32 (__m128i __A, int __B)
1543{
1544  __v4su rshift = { 31, 31, 31, 31 };
1545  __v4si result;
1546
1547  if (__B < 32)
1548    {
1549      if (__builtin_constant_p(__B))
1550	{
1551	  if (__B < 16)
1552	      rshift = (__v4su) vec_splat_s32(__B);
1553	    else
1554	      rshift = (__v4su) vec_splats((unsigned int)__B);
1555	}
1556      else
1557	rshift = vec_splats ((unsigned int) __B);
1558    }
1559  result = vec_sra ((__v4si) __A, rshift);
1560
1561  return (__m128i) result;
1562}
1563
1564extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1565_mm_bslli_si128 (__m128i __A, const int __N)
1566{
1567  __v16qu result;
1568  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1569
1570  if (__N < 16)
1571    result = vec_sld ((__v16qu) __A, zeros, __N);
1572  else
1573    result = zeros;
1574
1575  return (__m128i) result;
1576}
1577
1578extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1579_mm_bsrli_si128 (__m128i __A, const int __N)
1580{
1581  __v16qu result;
1582  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1583
1584  if (__N < 16)
1585#ifdef __LITTLE_ENDIAN__
1586    if (__builtin_constant_p(__N))
1587      /* Would like to use Vector Shift Left Double by Octet
1588	 Immediate here to use the immediate form and avoid
1589	 load of __N * 8 value into a separate VR.  */
1590      result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1591    else
1592#endif
1593      {
1594	__v16qu shift = vec_splats((unsigned char)(__N*8));
1595#ifdef __LITTLE_ENDIAN__
1596	result = vec_sro ((__v16qu)__A, shift);
1597#else
1598	result = vec_slo ((__v16qu)__A, shift);
1599#endif
1600      }
1601  else
1602    result = zeros;
1603
1604  return (__m128i) result;
1605}
1606
1607extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1608_mm_srli_si128 (__m128i __A, const int __N)
1609{
1610  return _mm_bsrli_si128 (__A, __N);
1611}
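
/* Illustrative note: these are whole-vector byte shifts.  With a
   made-up value x = _mm_setr_epi32 (1, 2, 3, 4),
   _mm_bslli_si128 (x, 4) gives { 0, 1, 2, 3 } and
   _mm_bsrli_si128 (x, 4) gives { 2, 3, 4, 0 } when viewed as 32-bit
   elements; shift counts of 16 or more return all zeros.  */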
1612
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1614_mm_slli_si128 (__m128i __A, const int _imm5)
1615{
1616  __v16qu result;
1617  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1618
1619  if (_imm5 < 16)
1620#ifdef __LITTLE_ENDIAN__
1621    result = vec_sld ((__v16qu) __A, zeros, _imm5);
1622#else
1623    result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1624#endif
1625  else
1626    result = zeros;
1627
1628  return (__m128i) result;
1629}
1630
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
1634{
1635  __v8hu rshift;
1636  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1637
1638  if (__B < 16)
1639    {
1640      if (__builtin_constant_p(__B))
1641	rshift = (__v8hu) vec_splat_s16(__B);
1642      else
1643	rshift = vec_splats ((unsigned short) __B);
1644
1645      result = vec_sr ((__v8hi) __A, rshift);
1646    }
1647
1648  return (__m128i) result;
1649}
1650
1651extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1652_mm_srli_epi32 (__m128i __A, int __B)
1653{
1654  __v4su rshift;
1655  __v4si result = { 0, 0, 0, 0 };
1656
1657  if (__B < 32)
1658    {
1659      if (__builtin_constant_p(__B))
1660	{
1661	  if (__B < 16)
1662	      rshift = (__v4su) vec_splat_s32(__B);
1663	    else
1664	      rshift = (__v4su) vec_splats((unsigned int)__B);
1665	}
1666      else
1667	rshift = vec_splats ((unsigned int) __B);
1668
1669      result = vec_sr ((__v4si) __A, rshift);
1670    }
1671
1672  return (__m128i) result;
1673}
1674
1675#ifdef _ARCH_PWR8
1676extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1677_mm_srli_epi64 (__m128i __A, int __B)
1678{
1679  __v2du rshift;
1680  __v2di result = { 0, 0 };
1681
1682  if (__B < 64)
1683    {
1684      if (__builtin_constant_p(__B))
1685	{
1686	  if (__B < 16)
1687	      rshift = (__v2du) vec_splat_s32(__B);
1688	    else
1689	      rshift = (__v2du) vec_splats((unsigned long long)__B);
1690	}
1691      else
1692	rshift = (__v2du) vec_splats ((unsigned int) __B);
1693
1694      result = vec_sr ((__v2di) __A, rshift);
1695    }
1696
1697  return (__m128i) result;
1698}
1699#endif
1700
1701extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1702_mm_sll_epi16 (__m128i __A, __m128i __B)
1703{
1704  __v8hu lshift;
1705  __vector __bool short shmask;
1706  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1707  __v8hu result;
1708
1709#ifdef __LITTLE_ENDIAN__
1710  lshift = vec_splat ((__v8hu) __B, 0);
1711#else
1712  lshift = vec_splat ((__v8hu) __B, 3);
1713#endif
1714  shmask = vec_cmple (lshift, shmax);
1715  result = vec_sl ((__v8hu) __A, lshift);
1716  result = vec_sel ((__v8hu) shmask, result, shmask);
1717
1718  return (__m128i) result;
1719}
1720
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  __v4su lshift;
  __vector __bool int shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;

#ifdef __LITTLE_ENDIAN__
  lshift = vec_splat ((__v4su) __B, 0);
#else
  lshift = vec_splat ((__v4su) __B, 1);
#endif
  shmask = vec_cmplt (lshift, shmax);
  result = vec_sl ((__v4su) __A, lshift);
  result = vec_sel ((__v4su) shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du lshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  lshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (lshift, shmax);
  result = vec_sl ((__v2du) __A, lshift);
  result = (__v2du) vec_sel ((__v2df) shmask, (__v2df) result, shmask);

  return (__m128i) result;
}
#endif

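/* The arithmetic right shifts replicate the sign bit rather than
   zeroing.  SSE2 defines over-large counts as "shift in sign bits",
   so the count is clamped with vec_min to the maximum lane shift (15
   or 31) instead of being masked to zero.  */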
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu rshift;
  __v8hi result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu) __B, 0);
#else
  rshift = vec_splat ((__v8hu) __B, 3);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_sra ((__v8hi) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su rshmax = { 31, 31, 31, 31 };
  __v4su rshift;
  __v4si result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su) __B, 0);
#else
  rshift = vec_splat ((__v4su) __B, 1);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_sra ((__v4si) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu rshift;
  __vector __bool short shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu) __B, 0);
#else
  rshift = vec_splat ((__v8hu) __B, 3);
#endif
  shmask = vec_cmple (rshift, shmax);
  result = vec_sr ((__v8hu) __A, rshift);
  result = vec_sel ((__v8hu) shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su rshift;
  __vector __bool int shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su) __B, 0);
#else
  rshift = vec_splat ((__v4su) __B, 1);
#endif
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v4su) __A, rshift);
  result = vec_sel ((__v4su) shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v2du) __A, rshift);
  result = (__v2du) vec_sel ((__v2df) shmask, (__v2df) result, shmask);

  return (__m128i) result;
}
#endif

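/* The bitwise operations on __m128d and __m128i map directly onto the
   VMX/VSX vec_and, vec_andc, vec_or and vec_xor operations.  As in
   SSE2, _mm_andnot_pd and _mm_andnot_si128 compute (~__A) & __B, which
   is why the operands to vec_andc appear swapped below.  */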
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_xor ((__v2di) __A, (__v2di) __B);
}

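/* The integer comparisons return an element-wise mask: all ones in a
   lane where the predicate holds and all zeros where it does not,
   exactly matching the behavior of the underlying vec_cmp* operations.  */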
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si) __B);
}

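/* _mm_extract_epi16 zero-extends the selected halfword into an int,
   and only the low three bits of __N are used.  For example
   (illustrative only), _mm_extract_epi16 (_mm_set1_epi16 (-1), 0)
   returns 0xFFFF, not -1.  */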
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi result = (__v8hi)__A;

  result[(__N & 7)] = __D;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi) __A, (__v8hi) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu) __B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask from the most significant bit of each of the
   16 bytes in __A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long result;
  static const __vector unsigned char perm_mask =
    {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  result = ((__vector unsigned long long)
            vec_vbpermq ((__vector unsigned char) __A,
                         (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */

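/* _mm_mulhi_epu16 keeps the high 16 bits of each unsigned 16 x 16 ->
   32 bit product; the even/odd multiplies are interleaved back into
   element order by the vec_perm below.  For example (illustrative
   only), _mm_mulhi_epu16 (_mm_set1_epi16 (-1), _mm_set1_epi16 (-1))
   yields 0xFFFE in every halfword, since 0xFFFF * 0xFFFF is
   0xFFFE0001.  */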
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su w0, w1;
  __v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

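/* The shuffle mask for _mm_shufflehi_epi16 (and for _mm_shufflelo_epi16
   and _mm_shuffle_epi32 below) is normally built with the _MM_SHUFFLE
   macro from <xmmintrin.h>.  Only the upper four halfwords are
   rearranged here; the pass-through byte indices for the lower half
   are preloaded into pmask and only pmask[1] is recomputed from
   __mask.  */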
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_98 = __mask & 0x03;
  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL, 0UL };
#else
      { 0x1011121314151617UL, 0UL };
#endif
  __m64_union t;
  __v2du a, r;

  t.as_short[0] = permute_selectors[element_selector_98];
  t.as_short[1] = permute_selectors[element_selector_BA];
  t.as_short[2] = permute_selectors[element_selector_DC];
  t.as_short[3] = permute_selectors[element_selector_FE];
  pmask[1] = t.as_m64;
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0UL, 0x1f1e1d1c1b1a1918UL };
#else
      { 0UL, 0x18191a1b1c1d1e1fUL };
#endif
  __m64_union t;
  __v2du a, r;

  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
  pmask[0] = t.as_m64;
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __v4su t;

  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
  return (__m128i) vec_perm ((__v4si) __A, (__v4si) __A, (__vector unsigned char) t);
}

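/* _mm_maskmoveu_si128 stores a byte of __A to __C wherever the most
   significant bit of the corresponding byte of __B is set.  This
   emulation performs a full 16-byte load, vec_sel and store, so unlike
   the x86 instruction it reads and rewrites all 16 bytes at __C.  */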
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL };
  __v16qu mask, tmp;
  __m128i_u *p = (__m128i_u*)__C;

  tmp = (__v16qu)_mm_loadu_si128(p);
  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
  tmp = vec_sel (tmp, (__v16qu)__A, mask);
  _mm_storeu_si128 (p, (__m128i)tmp);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

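/* _mm_sad_epu8 computes two sums of absolute byte differences, one per
   8-byte half, and returns them zero-extended in bits 15:0 and 79:64
   of the result.  */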
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu a, b;
  __v16qu vmin, vmax, vabsdiff;
  __v4si vsum;
  const __v4su zero = { 0, 0, 0, 0 };
  __v4si result;

  a = (__v16qu) __A;
  b = (__v16qu) __B;
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with two integer results.  */
  result = vec_sum2s (vsum, (__vector signed int) zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  result = vec_sld (result, result, 4);
#else
  result = vec_sld (result, result, 6);
#endif
  return (__m128i) result;
}

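/* The non-temporal stores below are approximated with a "data cache
   block touch for store transient" (dcbtstt) hint followed by an
   ordinary store, so unlike the x86 movnt* forms the data still passes
   through the cache hierarchy.  */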
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync (lwsync) for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync (hwsync) for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

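/* Move a scalar into element 0 of a vector and clear the remaining
   elements; _mm_cvtsi32_si128 (7) is simply _mm_set_epi32 (0, 0, 0, 7).  */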
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do
   not convert values; they only change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* EMMINTRIN_H_ */