/* Copyright (C) 2003-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.
   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However, scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   important differences in the data format and placement of double
   scalars in the vector register.

   For PowerISA, scalar double is held in FPRs (the leftmost 64 bits
   of the low 32 VSRs), while X86_64 SSE2 uses the rightmost 64 bits
   of the XMM register.  These differences require extra steps on POWER
   to match the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable POSIX
   <fenv.h> APIs.  (Illustrative examples follow the #endif below.)  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
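
/* Illustrative sketches of the rewrites recommended above.  These are
   not part of this header's API; __x, __y and handle_overflow are
   hypothetical placeholders.

   SSE2 scalar double intrinsics:

	__m128d __va = _mm_set_sd (__x);
	__m128d __vb = _mm_set_sd (__y);
	double __sum = _mm_cvtsd_f64 (_mm_add_sd (__va, __vb));

   are usually better written, for POWER and most other targets, as
   plain scalar C:

	double __sum = __x + __y;

   Direct MXCSR manipulation is usually better replaced with the
   portable POSIX <fenv.h> APIs, for example:

	fesetround (FE_TOWARDZERO);
	feclearexcept (FE_ALL_EXCEPT);
	if (fetestexcept (FE_OVERFLOW))
	  handle_overflow ();
 */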

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Define two value permute mask.  */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
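
/* For example (illustrative only; __a, __b and __r are placeholders),
   selecting the high element of __a and the low element of __b with
   _mm_shuffle_pd, defined later in this header:

	__m128d __r = _mm_shuffle_pd (__a, __b, _MM_SHUFFLE2 (0, 1));

   places __a[1] in __r[0] and __b[0] in __r[1].  */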

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return (__m128d) vec_splats (0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  __v2df __result = (__v2df) __A;
  __result [0] = ((__v2df) __B)[0];
  return (__m128d) __result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128d)vec_ld(0, (__v16qu*)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return (vec_splats (*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __v2df __tmp = _mm_load_pd (__P);
  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_splat (__A, 0));
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (vec_sqrt (__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __c;
  __c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (vec_min (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = vec_min (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (vec_max (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = vec_max (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  __v2df __temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
  return ((__m128d)vec_nor (__temp, __temp));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

#if _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq (__A, __A);
  __d = (__v2du)vec_cmpeq (__B, __B);
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and (__c, __d));
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  __c = vec_nor (__c, __c);
  return ((__m128d)vec_orc (__c, __d));
#else
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Invert the results so that true ('1's) marks NAN.  */
  __c = vec_nor (__c, __c);
  __d = vec_nor (__d, __d);
  return ((__m128d)vec_or (__c, __d));
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = (__v2df) vec_cmpeq (__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = (__v2df) vec_cmplt (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = (__v2df) vec_cmple (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = (__v2df) vec_cmpgt (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = (__v2df) vec_cmpge (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = (__v2df) vec_cmpeq (__a, __b);
  __c = vec_nor (__c, __c);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  /* Not less than is just greater than or equal.  */
  __c = (__v2df) vec_cmpge (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  __c = (__v2df) vec_cmpgt (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  /* Not greater than is just less than or equal.  */
  __c = (__v2df) vec_cmple (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  /* Not greater than or equal is just less than.  */
  __c = (__v2df) vec_cmplt (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

#if _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  __v2df __r;
  __r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (__r[0], ((__v2df)__A)[1]);
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  __v2df __r;
  __r = (__v2df) _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (__r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
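
/* Illustrative only (not part of the API): with a quiet NaN operand
   both forms return 0 on this target, but on x86 only the comi form
   would also raise the invalid-operation exception:

	__m128d __qnan = _mm_set_sd (__builtin_nan (""));
	__m128d __one = _mm_set_sd (1.0);
	int __r1 = _mm_comieq_sd (__qnan, __one);
	int __r2 = _mm_ucomieq_sd (__qnan, __one);

   Here __r1 == __r2 == 0.  */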
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1,  __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
               short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
              char __q11, char __q10, char __q09, char __q08,
              char __q07, char __q06, char __q05, char __q04,
              char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
                short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
               char __q04, char __q05, char __q06, char __q07,
               char __q08, char __q09, char __q10, char __q11,
               char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i_u __B)
{
  return (__m64) ((__v2di)__B)[0];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  __v2di __val;
  /* For LE we need Vector Unpack Low Signed Word, which the compiler
     generates from vec_unpackh.  */
  __val = (__v2di)vec_unpackh ((__v4si)__A);

  return (__m128d)vec_ctf (__val, 0);
}
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  __v2df __rounded = vec_rint (__A);
  __v4si __result, __temp;
  const __v4si __vzero =
    { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__rounded)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
                                   (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
        0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return (__m128i) __result;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64) __result[0];
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  __asm__(
      "xvcvdpsp %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4sf) vec_vpkudum ((__vector long long) __temp,
                                   (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
        0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4sf) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
                                   (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
        0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif

  return ((__m128i) __result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  __m128i __result = _mm_cvttpd_epi32 (__A);

  return (__m64) __result[0];
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  __v4si __temp;
  __v2di __tmp2;
  __v2df __result;

  __temp = (__v4si)vec_splats (__A);
  __tmp2 = (__v2di)vec_unpackl (__temp);
  __result = vec_ctf ((__vector signed long long) __tmp2, 0);
  return (__m128d)__result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf) __A);
  __result = vec_cts (__rounded, 0);
  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  __v4si __result;

  __result = vec_cts ((__v4sf) __A, 0);
  return (__m128i) __result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  /* Check if vec_doubleh is defined by <altivec.h>.  If so use that.  */
#ifdef vec_doubleh
  return (__m128d) vec_doubleh ((__v4sf)__A);
#else
  /* Otherwise the compiler is older, so we need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift-left-double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi (__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi (__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a vector
     merge high word to get the elements lined up.  */
  __temp = vec_vmrghw (__a, __a);
#endif
  __asm__(
      " xvcvspdp %x0,%x1"
      : "=wa" (__result)
      : "wa" (__temp)
      : );
  return (__m128d) __result;
#endif
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  __v2df __rounded = vec_rint ((__v2df) __A);
  int __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  __v2df __rounded = vec_rint ((__v2df) __A);
  long long __result = ((__v2df) __rounded)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return _mm_cvtsd_si64 ((__v2df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  int __result = ((__v2df)__A)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  long long __result = ((__v2df)__A)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return _mm_cvttsd_si64 (__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  __result = __builtin_vsx_xxsldwi (__result, __result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__(
      "xscvdpsp %x0,%x1"
      : "=wa" (__temp_s)
      : "wa" (__temp_b)
      : );
  /* Shift the resulting scalar into vector element [0].  */
  __result = __builtin_vsx_xxsldwi (__result, __temp_s, 1);
#else
  __result [0] = ((__v2df)__B)[0];
#endif
  return (__m128) __result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result [0] = __db;
  return (__m128d)__result;
}

/* Intel intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result [0] = __db;
  return (__m128d)__result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return _mm_cvtsi64_sd (__A, __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert.  */
  __v4sf __temp = vec_splat ((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector.  */
  __asm__(
      "xscvspdp %x0,%x1"
      : "=wa" (__res)
      : "wa" (__temp)
      : );
  return (__m128d) vec_mergel (__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res [0] = ((__v4sf)__B) [0];
  return (__m128d) __res;
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd (__m128d __A, __m128d __B, const int __mask)
{
  __vector double __result;
  const int __litmsk = __mask & 0x3;

  if (__litmsk == 0)
    __result = vec_mergeh (__A, __B);
#if __GNUC__ < 6
  else if (__litmsk == 1)
    __result = vec_xxpermdi (__B, __A, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi (__B, __A, 1);
#else
  else if (__litmsk == 1)
    __result = vec_xxpermdi (__A, __B, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi (__A, __B, 1);
#endif
  else
    __result = vec_mergel (__A, __B);

  return __result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  __v2df __result = (__v2df)__A;
  __result [1] = *__B;
  return (__m128d)__result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  __v2df __result = (__v2df)__A;
  __result [0] = *__B;
  return (__m128d)__result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
    };

  __result = ((__vector unsigned long long)
              vec_vbpermq ((__vector unsigned char) __A,
                           (__vector unsigned char) __perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
}
#endif /* _ARCH_PWR8 */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__vector long long) __A,
                               (__vector long long) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__vector long long) __A,
                               (__vector long long) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A - (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int __zero = {0, 0, 0, 0};

  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, __zero);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int __w0, __w1;

  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  __w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
  __w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
  return (__m128i) vec_perm (__w0, __w1, __xform1);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  unsigned int __a = __A;
  unsigned int __b = __B;

  return ((__m64)__a * (__m64)__b);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
#if __GNUC__ < 8 || !defined (_ARCH_PWR8)
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__(
      "vmulouw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__(
      "vmuleuw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#endif
  return (__m128i) __result;
#else
  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  __v8hu __lshift;
  __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B >= 0 && __B < 16)
    {
      if (__builtin_constant_p(__B))
        __lshift = (__v8hu) vec_splat_s16(__B);
      else
        __lshift = vec_splats ((unsigned short) __B);

      __result = vec_sl ((__v8hi) __A, __lshift);
    }

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  __v4su __lshift;
  __v4si __result = { 0, 0, 0, 0 };

  if (__B >= 0 && __B < 32)
    {
      if (__builtin_constant_p(__B) && __B < 16)
        __lshift = (__v4su) vec_splat_s32(__B);
      else
        __lshift = vec_splats ((unsigned int) __B);

      __result = vec_sl ((__v4si) __A, __lshift);
    }

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  __v2du __lshift;
  __v2di __result = { 0, 0 };

  if (__B >= 0 && __B < 64)
    {
      if (__builtin_constant_p(__B) && __B < 16)
        __lshift = (__v2du) vec_splat_s32(__B);
      else
        __lshift = (__v2du) vec_splats ((unsigned int) __B);

      __result = vec_sl ((__v2di) __A, __lshift);
    }

  return (__m128i) __result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  __v8hu __rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hi __result;

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
        __rshift = (__v8hu) vec_splat_s16(__B);
      else
        __rshift = vec_splats ((unsigned short) __B);
    }
  __result = vec_sra ((__v8hi) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  __v4su __rshift = { 31, 31, 31, 31 };
  __v4si __result;

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            __rshift = (__v4su) vec_splat_s32(__B);
          else
            __rshift = (__v4su) vec_splats((unsigned int)__B);
        }
      else
        __rshift = vec_splats ((unsigned int) __B);
    }
  __result = vec_sra ((__v4si) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
    __result = vec_sld ((__v16qu) __A, __zeros, __N);
  else
    __result = __zeros;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         load of __N * 8 value into a separate VR.  */
      __result = vec_sld (__zeros, (__v16qu) __A, (16 - __N));
    else
#endif
      {
        __v16qu __shift = vec_splats((unsigned char)(__N*8));
#ifdef __LITTLE_ENDIAN__
        __result = vec_sro ((__v16qu)__A, __shift);
#else
        __result = vec_slo ((__v16qu)__A, __shift);
#endif
      }
  else
    __result = __zeros;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return _mm_bsrli_si128 (__A, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int _imm5)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld ((__v16qu) __A, __zeros, _imm5);
#else
    __result = vec_sld (__zeros, (__v16qu) __A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
1666{
1667  __v8hu __rshift;
1668  __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1669
1670  if (__B < 16)
1671    {
1672      if (__builtin_constant_p(__B))
1673	__rshift = (__v8hu) vec_splat_s16(__B);
1674      else
1675	__rshift = vec_splats ((unsigned short) __B);
1676
1677      __result = vec_sr ((__v8hi) __A, __rshift);
1678    }
1679
1680  return (__m128i) __result;
1681}
1682
1683extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684_mm_srli_epi32 (__m128i __A, int __B)
1685{
1686  __v4su __rshift;
1687  __v4si __result = { 0, 0, 0, 0 };
1688
1689  if (__B < 32)
1690    {
1691      if (__builtin_constant_p(__B))
1692	{
1693	  if (__B < 16)
1694	      __rshift = (__v4su) vec_splat_s32(__B);
1695	    else
1696	      __rshift = (__v4su) vec_splats((unsigned int)__B);
1697	}
1698      else
1699	__rshift = vec_splats ((unsigned int) __B);
1700
1701      __result = vec_sr ((__v4si) __A, __rshift);
1702    }
1703
1704  return (__m128i) __result;
1705}
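
/* Note on the pattern above: when the shift count is a compile-time
   constant below 16, vec_splat_s16 / vec_splat_s32 can be emitted as a
   single splat-immediate instruction, whereas a run-time count must first
   be splatted into a vector register with vec_splats.  A minimal usage
   sketch (__x and __count are hypothetical variables):

     __m128i __r1 = _mm_srli_epi32 (__x, 3);        // immediate splat path
     __m128i __r2 = _mm_srli_epi32 (__x, __count);  // vec_splats path
*/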

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  __v2du __rshift;
  __v2di __result = { 0, 0 };

  if (__B < 64)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            __rshift = (__v2du) vec_splat_s32(__B);
          else
            __rshift = (__v2du) vec_splats((unsigned long long)__B);
        }
      else
        __rshift = (__v2du) vec_splats ((unsigned int) __B);

      __result = vec_sr ((__v2di) __A, __rshift);
    }

  return (__m128i) __result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat ((__v8hu) __B, 0);
#else
  __lshift = vec_splat ((__v8hu) __B, 3);
#endif
  __shmask = vec_cmple (__lshift, __shmax);
  __result = vec_sl ((__v8hu) __A, __lshift);
  __result = vec_sel ((__v8hu) __shmask, __result, __shmask);

  return (__m128i) __result;
}
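
/* Note: VMX vec_sl interprets each shift count modulo the element width,
   but the SSE2 shift-by-count intrinsics must return zero once the count
   exceeds the element size.  The vec_cmple/vec_cmplt mask above keeps the
   shifted value only for in-range counts and zeroes the lanes otherwise.
   A minimal sketch of the intended behaviour, using helpers defined
   elsewhere in this header:

     __m128i __ones = _mm_set1_epi16 (1);
     __m128i __r0 = _mm_sll_epi16 (__ones, _mm_cvtsi32_si128 (3));
     // each 16-bit lane of __r0 is 8
     __m128i __r1 = _mm_sll_epi16 (__ones, _mm_cvtsi32_si128 (20));
     // each 16-bit lane of __r1 is 0, since 20 > 15
*/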

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  __v4su __lshift;
  __vector __bool int __shmask;
  const __v4su __shmax = { 32, 32, 32, 32 };
  __v4su __result;
#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat ((__v4su) __B, 0);
#else
  __lshift = vec_splat ((__v4su) __B, 1);
#endif
  __shmask = vec_cmplt (__lshift, __shmax);
  __result = vec_sl ((__v4su) __A, __lshift);
  __result = vec_sel ((__v4su) __shmask, __result, __shmask);

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du __lshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = { 64, 64 };
  __v2du __result;

  __lshift = vec_splat ((__v2du) __B, 0);
  __shmask = vec_cmplt (__lshift, __shmax);
  __result = vec_sl ((__v2du) __A, __lshift);
  __result = vec_sel ((__v2du) __shmask, __result, __shmask);

  return (__m128i) __result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu __rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __rshift;
  __v8hi __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v8hu)__B, 0);
#else
  __rshift = vec_splat ((__v8hu)__B, 3);
#endif
  __rshift = vec_min (__rshift, __rshmax);
  __result = vec_sra ((__v8hi) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su __rshmax = { 31, 31, 31, 31 };
  __v4su __rshift;
  __v4si __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v4su)__B, 0);
#else
  __rshift = vec_splat ((__v4su)__B, 1);
#endif
  __rshift = vec_min (__rshift, __rshmax);
  __result = vec_sra ((__v4si) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v8hu) __B, 0);
#else
  __rshift = vec_splat ((__v8hu) __B, 3);
#endif
  __shmask = vec_cmple (__rshift, __shmax);
  __result = vec_sr ((__v8hu) __A, __rshift);
  __result = vec_sel ((__v8hu) __shmask, __result, __shmask);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = { 32, 32, 32, 32 };
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v4su) __B, 0);
#else
  __rshift = vec_splat ((__v4su) __B, 1);
#endif
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v4su) __A, __rshift);
  __result = vec_sel ((__v4su) __shmask, __result, __shmask);

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = { 64, 64 };
  __v2du __result;

  __rshift = vec_splat ((__v2du) __B, 0);
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v2du) __A, __rshift);
  __result = vec_sel ((__v2du) __shmask, __result, __shmask);

  return (__m128i) __result;
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi __result = (__v8hi)__A;

  __result [(__N & 7)] = __D;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask from the most significant bits of the 16 signed
   or unsigned 8-bit integers in A and zero extends the upper bits.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask =
    {
        0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
        0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  __result = ((__vector unsigned long long)
            vec_vbpermq ((__vector unsigned char) __A,
                         (__vector unsigned char) __perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
}
#endif /* _ARCH_PWR8 */
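
/* Note: vec_vbpermq gathers the most significant bit of each byte of __A,
   as selected by the bit indices in __perm_mask, into a single 16-bit
   field; the endian-specific element index then extracts that field.  A
   small usage sketch, assuming the _mm_set_epi8 helper defined earlier in
   this header:

     __m128i __v = _mm_set_epi8 (-1, 0, 0, 0, 0, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, -1);
     int __m = _mm_movemask_epi8 (__v);   // expected value: 0x8001
*/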

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  __w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (__w0, __w1, __xform1);
}
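
/* Note: there is no single VMX/VSX operation returning the high halves of
   unsigned 16-bit products, so the even and odd products are formed as
   32-bit values (vec_vmuleuh / vec_vmulouh) and their high halves are
   interleaved back into element order with vec_perm.  For example,
   0xFFFF * 0xFFFF = 0xFFFE0001, so:

     __m128i __r = _mm_mulhi_epu16 (_mm_set1_epi16 (-1),
                                    _mm_set1_epi16 (-1));
     // each 16-bit lane of __r is expected to be 0xFFFE
*/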

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL,  0UL};
#else
      { 0x1011121314151617UL,  0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      { 0UL,  0x1f1e1d1c1b1a1918UL};
#else
      { 0UL,  0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i) vec_perm ((__v4si) __A, (__v4si) __A,
                             (__vector unsigned char) __t);
}
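
/* Note: each 2-bit field of __mask selects the source element for one
   destination element; the byte-level permute control is assembled from
   __permute_selectors accordingly.  A small sketch, assuming the
   _MM_SHUFFLE macro from <xmmintrin.h>:

     __m128i __v = _mm_set_epi32 (3, 2, 1, 0);
     __m128i __r = _mm_shuffle_epi32 (__v, _MM_SHUFFLE (0, 1, 2, 3));
     // element order is reversed; __r equals _mm_set_epi32 (0, 1, 2, 3)
*/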

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du __hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u*)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel (__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128 (__p, (__m128i)__tmp);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __a, __b;
  __v16qu __vmin, __vmax, __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = { 0, 0, 0, 0 };
  __v4si __result;

  __a = (__v16qu) __A;
  __b = (__v16qu) __B;
  __vmin = vec_min (__a, __b);
  __vmax = vec_max (__a, __b);
  __vabsdiff = vec_sub (__vmax, __vmin);
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero);
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s (__vsum, (__vector signed int) __zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  __result = vec_sld (__result, __result, 4);
#else
  __result = vec_sld (__result, __result, 6);
#endif
  return (__m128i) __result;
}
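
/* Note: the sum of absolute differences is composed from VMX primitives:
   per-byte abs(a - b) via min/max and subtract, vec_sum4s to add each
   group of four byte differences into a word, and vec_sum2s to fold those
   words into two 32-bit sums, which are then rotated into the element
   positions SSE2 expects.  A small numeric sketch:

     __m128i __r = _mm_sad_epu8 (_mm_set1_epi8 (10), _mm_set1_epi8 (7));
     // each 64-bit lane is expected to hold 8 * |10 - 7| = 24
*/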

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}
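
/* Note: PowerISA has no direct equivalent of an x86 non-temporal store,
   so these helpers issue dcbtstt (data cache block touch for store,
   transient) as a hint on the target line and then perform an ordinary
   store.  A minimal usage sketch (__buf is a hypothetical valid int
   pointer):

     _mm_stream_si32 (__buf, 42);   // hint transient use, then *__buf = 42
*/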

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}
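
/* Note: the x86 fences are mapped onto the GCC atomic fence builtins,
   which on PowerPC expand to the light weight sync (lwsync) for release
   ordering and to the full sync for sequential consistency.  A minimal
   usage sketch:

     _mm_lfence ();   // order earlier loads before later loads
     _mm_mfence ();   // full barrier between all prior and later accesses
*/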

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
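
/* Note: the casts above only reinterpret the 128-bit register contents;
   no value conversion takes place.  For example:

     __m128i __bits = _mm_castpd_si128 (_mm_set1_pd (1.0));
     // each 64-bit lane of __bits holds 0x3FF0000000000000, the IEEE 754
     // encoding of 1.0, not the integer 1
*/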

#endif /* EMMINTRIN_H_ */