1/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11   User Guide and Reference, version 9.0.  */
12
13#ifndef NO_WARN_X86_INTRINSICS
/* This header file is intended to help port code that uses Intel
   intrinsics explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE intrinsics mainly handle the __m128 type, the
   PowerPC VMX/VSX ISA is a good match for vector float SIMD
   operations.  However, scalar float operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are
   differences in the data format and placement of float scalars in
   the vector register, which require extra steps to match SSE scalar
   float semantics on POWER.
23
   It should be noted that there are significant differences between
   X86_64's MXCSR and the PowerISA's FPSCR/VSCR registers.  It is
   recommended to use the portable <fenv.h> interfaces instead of
   accessing the MXCSR directly.
27
28   Most SSE scalar float intrinsic operations can be performed more
29   efficiently as C language float scalar operations or optimized to
30   use vector SIMD operations. We recommend this for new applications. */
31#error                                                                         \
32    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
33#endif
34
35#ifndef XMMINTRIN_H_
36#define XMMINTRIN_H_
37
38#if defined(__powerpc64__) &&                                                  \
39    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
40
/* Define a four-element permute (shuffle) mask.  */
42#define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
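/* For example, _MM_SHUFFLE (3, 2, 1, 0) expands to
   (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4 (the identity selector),
   while _MM_SHUFFLE (0, 1, 2, 3) == 0x1B selects the elements in
   reverse order.  */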
43
44#include <altivec.h>
45
46/* Avoid collisions between altivec.h and strict adherence to C++ and
47   C11 standards.  This should eventually be done inside altivec.h itself,
48   but only after testing a full distro build.  */
49#if defined(__STRICT_ANSI__) &&                                                \
50    (defined(__cplusplus) ||                                                   \
51     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
52#undef vector
53#undef pixel
54#undef bool
55#endif
56
57/* We need type definitions from the MMX header file.  */
58#include <mmintrin.h>
59
60/* Get _mm_malloc () and _mm_free ().  */
61#if __STDC_HOSTED__
62#include <mm_malloc.h>
63#endif
64
65/* The Intel API is flexible enough that we must allow aliasing with other
66   vector types, and their scalar components.  */
67typedef vector float __m128 __attribute__((__may_alias__));
68
69/* Unaligned version of the same type.  */
70typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
71
72/* Internal data types for implementing the intrinsics.  */
73typedef vector float __v4sf;
74
75/* Create an undefined vector.  */
76extern __inline __m128
77    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78    _mm_undefined_ps(void) {
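  /* The self-assignment below intentionally leaves __Y uninitialized;
     the intrinsic only promises a vector with undefined contents.  */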
79  __m128 __Y = __Y;
80  return __Y;
81}
82
83/* Create a vector of zeros.  */
84extern __inline __m128
85    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86    _mm_setzero_ps(void) {
87  return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
88}
89
90/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
91extern __inline __m128
92    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93    _mm_load_ps(float const *__P) {
94  return ((__m128)vec_ld(0, (__v4sf *)__P));
95}
96
97/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
98extern __inline __m128
99    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100    _mm_loadu_ps(float const *__P) {
101  return (vec_vsx_ld(0, __P));
102}
103
104/* Load four SPFP values in reverse order.  The address must be aligned.  */
105extern __inline __m128
106    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
107    _mm_loadr_ps(float const *__P) {
108  __v4sf __tmp;
109  __m128 __result;
110  static const __vector unsigned char __permute_vector = {
111      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
112      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
113
114  __tmp = vec_ld(0, (__v4sf *)__P);
115  __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
116  return __result;
117}
118
119/* Create a vector with all four elements equal to F.  */
120extern __inline __m128
121    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
122    _mm_set1_ps(float __F) {
123  return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
124}
125
126extern __inline __m128
127    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128    _mm_set_ps1(float __F) {
129  return _mm_set1_ps(__F);
130}
131
132/* Create the vector [Z Y X W].  */
133extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
134                                      __artificial__))
135_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
136  return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
137}
138
139/* Create the vector [W X Y Z].  */
140extern __inline __m128
141    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142    _mm_setr_ps(float __Z, float __Y, float __X, float __W) {
143  return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
144}
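/* Illustrative note: _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f) and
   _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f) build the same vector, with 1.0f
   in element 0 and 4.0f in element 3; _mm_set_ps lists the elements from
   highest to lowest, _mm_setr_ps from lowest to highest.  */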
145
146/* Store four SPFP values.  The address must be 16-byte aligned.  */
147extern __inline void
148    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
149    _mm_store_ps(float *__P, __m128 __A) {
150  vec_st((__v4sf)__A, 0, (__v4sf *)__P);
151}
152
153/* Store four SPFP values.  The address need not be 16-byte aligned.  */
154extern __inline void
155    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
156    _mm_storeu_ps(float *__P, __m128 __A) {
157  *(__m128_u *)__P = __A;
158}
159
160/* Store four SPFP values in reverse order.  The address must be aligned.  */
161extern __inline void
162    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
163    _mm_storer_ps(float *__P, __m128 __A) {
164  __v4sf __tmp;
165  static const __vector unsigned char __permute_vector = {
166      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
167      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
168
169  __tmp = (__m128)vec_perm(__A, __A, __permute_vector);
170
171  _mm_store_ps(__P, __tmp);
172}
173
174/* Store the lower SPFP value across four words.  */
175extern __inline void
176    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177    _mm_store1_ps(float *__P, __m128 __A) {
178  __v4sf __va = vec_splat((__v4sf)__A, 0);
179  _mm_store_ps(__P, __va);
180}
181
182extern __inline void
183    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184    _mm_store_ps1(float *__P, __m128 __A) {
185  _mm_store1_ps(__P, __A);
186}
187
188/* Create a vector with element 0 as F and the rest zero.  */
189extern __inline __m128
190    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191    _mm_set_ss(float __F) {
192  return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
193}
194
195/* Sets the low SPFP value of A from the low value of B.  */
196extern __inline __m128
197    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198    _mm_move_ss(__m128 __A, __m128 __B) {
199  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
200
201  return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
202}
203
204/* Create a vector with element 0 as *P and the rest zero.  */
205extern __inline __m128
206    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207    _mm_load_ss(float const *__P) {
208  return _mm_set_ss(*__P);
209}
210
211/* Stores the lower SPFP value.  */
212extern __inline void
213    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214    _mm_store_ss(float *__P, __m128 __A) {
215  *__P = ((__v4sf)__A)[0];
216}
217
218/* Perform the respective operation on the lower SPFP (single-precision
219   floating-point) values of A and B; the upper three SPFP values are
220   passed through from A.  */
221
222extern __inline __m128
223    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
224    _mm_add_ss(__m128 __A, __m128 __B) {
225#ifdef _ARCH_PWR7
226  __m128 __a, __b, __c;
227  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
232  __a = vec_splat(__A, 0);
233  __b = vec_splat(__B, 0);
234  __c = __a + __b;
235  /* Then we merge the lower float result with the original upper
236     float elements from __A.  */
237  return (vec_sel(__A, __c, __mask));
238#else
239  __A[0] = __A[0] + __B[0];
240  return (__A);
241#endif
242}
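/* Illustrative example (values chosen for exposition): with
   __A == {1.0f, 2.0f, 3.0f, 4.0f} and __B == {10.0f, 20.0f, 30.0f, 40.0f},
   _mm_add_ss (__A, __B) yields {11.0f, 2.0f, 3.0f, 4.0f}; only element 0
   is summed, the upper three elements are copied from __A.  The same
   pattern applies to _mm_sub_ss, _mm_mul_ss and _mm_div_ss below.  */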
243
244extern __inline __m128
245    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
246    _mm_sub_ss(__m128 __A, __m128 __B) {
247#ifdef _ARCH_PWR7
248  __m128 __a, __b, __c;
249  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
254  __a = vec_splat(__A, 0);
255  __b = vec_splat(__B, 0);
256  __c = __a - __b;
257  /* Then we merge the lower float result with the original upper
258     float elements from __A.  */
259  return (vec_sel(__A, __c, __mask));
260#else
261  __A[0] = __A[0] - __B[0];
262  return (__A);
263#endif
264}
265
266extern __inline __m128
267    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268    _mm_mul_ss(__m128 __A, __m128 __B) {
269#ifdef _ARCH_PWR7
270  __m128 __a, __b, __c;
271  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
276  __a = vec_splat(__A, 0);
277  __b = vec_splat(__B, 0);
278  __c = __a * __b;
279  /* Then we merge the lower float result with the original upper
280     float elements from __A.  */
281  return (vec_sel(__A, __c, __mask));
282#else
283  __A[0] = __A[0] * __B[0];
284  return (__A);
285#endif
286}
287
288extern __inline __m128
289    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290    _mm_div_ss(__m128 __A, __m128 __B) {
291#ifdef _ARCH_PWR7
292  __m128 __a, __b, __c;
293  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
298  __a = vec_splat(__A, 0);
299  __b = vec_splat(__B, 0);
300  __c = __a / __b;
301  /* Then we merge the lower float result with the original upper
302     float elements from __A.  */
303  return (vec_sel(__A, __c, __mask));
304#else
305  __A[0] = __A[0] / __B[0];
306  return (__A);
307#endif
308}
309
310extern __inline __m128
311    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312    _mm_sqrt_ss(__m128 __A) {
313  __m128 __a, __c;
314  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
319  __a = vec_splat(__A, 0);
320  __c = vec_sqrt(__a);
321  /* Then we merge the lower float result with the original upper
322   * float elements from __A.  */
323  return (vec_sel(__A, __c, __mask));
324}
325
326/* Perform the respective operation on the four SPFP values in A and B.  */
327extern __inline __m128
328    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329    _mm_add_ps(__m128 __A, __m128 __B) {
330  return (__m128)((__v4sf)__A + (__v4sf)__B);
331}
332
333extern __inline __m128
334    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335    _mm_sub_ps(__m128 __A, __m128 __B) {
336  return (__m128)((__v4sf)__A - (__v4sf)__B);
337}
338
339extern __inline __m128
340    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341    _mm_mul_ps(__m128 __A, __m128 __B) {
342  return (__m128)((__v4sf)__A * (__v4sf)__B);
343}
344
345extern __inline __m128
346    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347    _mm_div_ps(__m128 __A, __m128 __B) {
348  return (__m128)((__v4sf)__A / (__v4sf)__B);
349}
350
351extern __inline __m128
352    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
353    _mm_sqrt_ps(__m128 __A) {
354  return (vec_sqrt((__v4sf)__A));
355}
356
357extern __inline __m128
358    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
359    _mm_rcp_ps(__m128 __A) {
360  return (vec_re((__v4sf)__A));
361}
362
363extern __inline __m128
364    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
365    _mm_rsqrt_ps(__m128 __A) {
366  return (vec_rsqrte(__A));
367}
368
369extern __inline __m128
370    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
371    _mm_rcp_ss(__m128 __A) {
372  __m128 __a, __c;
373  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
378  __a = vec_splat(__A, 0);
379  __c = _mm_rcp_ps(__a);
380  /* Then we merge the lower float result with the original upper
381   * float elements from __A.  */
382  return (vec_sel(__A, __c, __mask));
383}
384
385extern __inline __m128
386    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
387    _mm_rsqrt_ss(__m128 __A) {
388  __m128 __a, __c;
389  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
394  __a = vec_splat(__A, 0);
395  __c = vec_rsqrte(__a);
396  /* Then we merge the lower float result with the original upper
397   * float elements from __A.  */
398  return (vec_sel(__A, __c, __mask));
399}
400
401extern __inline __m128
402    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
403    _mm_min_ss(__m128 __A, __m128 __B) {
404  __v4sf __a, __b, __c;
405  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
410  __a = vec_splat((__v4sf)__A, 0);
411  __b = vec_splat((__v4sf)__B, 0);
412  __c = vec_min(__a, __b);
413  /* Then we merge the lower float result with the original upper
414   * float elements from __A.  */
415  return (vec_sel((__v4sf)__A, __c, __mask));
416}
417
418extern __inline __m128
419    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
420    _mm_max_ss(__m128 __A, __m128 __B) {
421  __v4sf __a, __b, __c;
422  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
427  __a = vec_splat(__A, 0);
428  __b = vec_splat(__B, 0);
429  __c = vec_max(__a, __b);
430  /* Then we merge the lower float result with the original upper
431   * float elements from __A.  */
432  return (vec_sel((__v4sf)__A, __c, __mask));
433}
434
435extern __inline __m128
436    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
437    _mm_min_ps(__m128 __A, __m128 __B) {
438  __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
439  return vec_sel(__B, __A, __m);
440}
441
442extern __inline __m128
443    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
444    _mm_max_ps(__m128 __A, __m128 __B) {
445  __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
446  return vec_sel(__B, __A, __m);
447}
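/* Note: _mm_min_ps and _mm_max_ps use an explicit compare plus vec_sel
   rather than vec_min/vec_max; this returns the second operand (__B)
   whenever the comparison is false, including when either element is a
   NaN, matching the x86 MINPS/MAXPS behavior.  */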
448
449/* Perform logical bit-wise operations on 128-bit values.  */
450extern __inline __m128
451    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452    _mm_and_ps(__m128 __A, __m128 __B) {
453  return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
455}
456
457extern __inline __m128
458    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459    _mm_andnot_ps(__m128 __A, __m128 __B) {
460  return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
461}
462
463extern __inline __m128
464    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465    _mm_or_ps(__m128 __A, __m128 __B) {
466  return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
467}
468
469extern __inline __m128
470    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471    _mm_xor_ps(__m128 __A, __m128 __B) {
472  return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
473}
474
475/* Perform a comparison on the four SPFP values of A and B.  For each
476   element, if the comparison is true, place a mask of all ones in the
477   result, otherwise a mask of zeros.  */
478extern __inline __m128
479    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480    _mm_cmpeq_ps(__m128 __A, __m128 __B) {
481  return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
482}
483
484extern __inline __m128
485    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486    _mm_cmplt_ps(__m128 __A, __m128 __B) {
487  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
488}
489
490extern __inline __m128
491    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492    _mm_cmple_ps(__m128 __A, __m128 __B) {
493  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
494}
495
496extern __inline __m128
497    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
498    _mm_cmpgt_ps(__m128 __A, __m128 __B) {
499  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
500}
501
502extern __inline __m128
503    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
504    _mm_cmpge_ps(__m128 __A, __m128 __B) {
505  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
506}
507
508extern __inline __m128
509    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
510    _mm_cmpneq_ps(__m128 __A, __m128 __B) {
511  __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
512  return ((__m128)vec_nor(__temp, __temp));
513}
514
515extern __inline __m128
516    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517    _mm_cmpnlt_ps(__m128 __A, __m128 __B) {
518  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
519}
520
521extern __inline __m128
522    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
523    _mm_cmpnle_ps(__m128 __A, __m128 __B) {
524  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
525}
526
527extern __inline __m128
528    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529    _mm_cmpngt_ps(__m128 __A, __m128 __B) {
530  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
531}
532
533extern __inline __m128
534    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535    _mm_cmpnge_ps(__m128 __A, __m128 __B) {
536  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
537}
538
539extern __inline __m128
540    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
541    _mm_cmpord_ps(__m128 __A, __m128 __B) {
542  __vector unsigned int __a, __b;
543  __vector unsigned int __c, __d;
544  static const __vector unsigned int __float_exp_mask = {
545      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
546
547  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
548  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
549  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
550  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
551  return ((__m128)vec_and(__c, __d));
552}
553
554extern __inline __m128
555    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556    _mm_cmpunord_ps(__m128 __A, __m128 __B) {
557  __vector unsigned int __a, __b;
558  __vector unsigned int __c, __d;
559  static const __vector unsigned int __float_exp_mask = {
560      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
561
562  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
563  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
564  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
565  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
566  return ((__m128)vec_or(__c, __d));
567}
568
569/* Perform a comparison on the lower SPFP values of A and B.  If the
570   comparison is true, place a mask of all ones in the result, otherwise a
571   mask of zeros.  The upper three SPFP values are passed through from A.  */
572extern __inline __m128
573    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
574    _mm_cmpeq_ss(__m128 __A, __m128 __B) {
575  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
576  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
581  __a = vec_splat((__v4sf)__A, 0);
582  __b = vec_splat((__v4sf)__B, 0);
583  __c = (__v4sf)vec_cmpeq(__a, __b);
584  /* Then we merge the lower float result with the original upper
585   * float elements from __A.  */
586  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
587}
588
589extern __inline __m128
590    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591    _mm_cmplt_ss(__m128 __A, __m128 __B) {
592  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
593  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
598  __a = vec_splat((__v4sf)__A, 0);
599  __b = vec_splat((__v4sf)__B, 0);
600  __c = (__v4sf)vec_cmplt(__a, __b);
601  /* Then we merge the lower float result with the original upper
602   * float elements from __A.  */
603  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
604}
605
606extern __inline __m128
607    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608    _mm_cmple_ss(__m128 __A, __m128 __B) {
609  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
610  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
615  __a = vec_splat((__v4sf)__A, 0);
616  __b = vec_splat((__v4sf)__B, 0);
617  __c = (__v4sf)vec_cmple(__a, __b);
618  /* Then we merge the lower float result with the original upper
619   * float elements from __A.  */
620  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
621}
622
623extern __inline __m128
624    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625    _mm_cmpgt_ss(__m128 __A, __m128 __B) {
626  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
627  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
632  __a = vec_splat((__v4sf)__A, 0);
633  __b = vec_splat((__v4sf)__B, 0);
634  __c = (__v4sf)vec_cmpgt(__a, __b);
635  /* Then we merge the lower float result with the original upper
636   * float elements from __A.  */
637  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
638}
639
640extern __inline __m128
641    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642    _mm_cmpge_ss(__m128 __A, __m128 __B) {
643  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
644  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
649  __a = vec_splat((__v4sf)__A, 0);
650  __b = vec_splat((__v4sf)__B, 0);
651  __c = (__v4sf)vec_cmpge(__a, __b);
652  /* Then we merge the lower float result with the original upper
653   * float elements from __A.  */
654  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
655}
656
657extern __inline __m128
658    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
659    _mm_cmpneq_ss(__m128 __A, __m128 __B) {
660  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
661  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
666  __a = vec_splat((__v4sf)__A, 0);
667  __b = vec_splat((__v4sf)__B, 0);
668  __c = (__v4sf)vec_cmpeq(__a, __b);
669  __c = vec_nor(__c, __c);
670  /* Then we merge the lower float result with the original upper
671   * float elements from __A.  */
672  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
673}
674
675extern __inline __m128
676    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
677    _mm_cmpnlt_ss(__m128 __A, __m128 __B) {
678  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
679  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
684  __a = vec_splat((__v4sf)__A, 0);
685  __b = vec_splat((__v4sf)__B, 0);
686  __c = (__v4sf)vec_cmpge(__a, __b);
687  /* Then we merge the lower float result with the original upper
688   * float elements from __A.  */
689  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
690}
691
692extern __inline __m128
693    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694    _mm_cmpnle_ss(__m128 __A, __m128 __B) {
695  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
696  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
701  __a = vec_splat((__v4sf)__A, 0);
702  __b = vec_splat((__v4sf)__B, 0);
703  __c = (__v4sf)vec_cmpgt(__a, __b);
704  /* Then we merge the lower float result with the original upper
705   * float elements from __A.  */
706  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
707}
708
709extern __inline __m128
710    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
711    _mm_cmpngt_ss(__m128 __A, __m128 __B) {
712  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
713  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
718  __a = vec_splat((__v4sf)__A, 0);
719  __b = vec_splat((__v4sf)__B, 0);
720  __c = (__v4sf)vec_cmple(__a, __b);
721  /* Then we merge the lower float result with the original upper
722   * float elements from __A.  */
723  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
724}
725
726extern __inline __m128
727    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728    _mm_cmpnge_ss(__m128 __A, __m128 __B) {
729  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
730  __v4sf __a, __b, __c;
731  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
733   * (from the upper elements) we splat the lower float
734   * before we do the operation. */
735  __a = vec_splat((__v4sf)__A, 0);
736  __b = vec_splat((__v4sf)__B, 0);
737  __c = (__v4sf)vec_cmplt(__a, __b);
738  /* Then we merge the lower float result with the original upper
739   * float elements from __A.  */
740  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
741}
742
743extern __inline __m128
744    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
745    _mm_cmpord_ss(__m128 __A, __m128 __B) {
746  __vector unsigned int __a, __b;
747  __vector unsigned int __c, __d;
748  static const __vector unsigned int __float_exp_mask = {
749      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
750  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
751
752  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
753  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
754  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
755  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
756  __c = vec_and(__c, __d);
757  /* Then we merge the lower float result with the original upper
758   * float elements from __A.  */
759  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
760}
761
762extern __inline __m128
763    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764    _mm_cmpunord_ss(__m128 __A, __m128 __B) {
765  __vector unsigned int __a, __b;
766  __vector unsigned int __c, __d;
767  static const __vector unsigned int __float_exp_mask = {
768      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
769  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
770
771  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
772  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
773  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
774  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
775  __c = vec_or(__c, __d);
776  /* Then we merge the lower float result with the original upper
777   * float elements from __A.  */
778  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
779}
780
781/* Compare the lower SPFP values of A and B and return 1 if true
782   and 0 if false.  */
783extern __inline int
784    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785    _mm_comieq_ss(__m128 __A, __m128 __B) {
786  return (__A[0] == __B[0]);
787}
788
789extern __inline int
790    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791    _mm_comilt_ss(__m128 __A, __m128 __B) {
792  return (__A[0] < __B[0]);
793}
794
795extern __inline int
796    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797    _mm_comile_ss(__m128 __A, __m128 __B) {
798  return (__A[0] <= __B[0]);
799}
800
801extern __inline int
802    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803    _mm_comigt_ss(__m128 __A, __m128 __B) {
804  return (__A[0] > __B[0]);
805}
806
807extern __inline int
808    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809    _mm_comige_ss(__m128 __A, __m128 __B) {
810  return (__A[0] >= __B[0]);
811}
812
813extern __inline int
814    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815    _mm_comineq_ss(__m128 __A, __m128 __B) {
816  return (__A[0] != __B[0]);
817}
818
/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as the
 * _mm_comi??_ss ones because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal on QNaNs.
 * The _mm_ucomieq_ss et al. should be OK as is.
 */
827extern __inline int
828    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
829    _mm_ucomieq_ss(__m128 __A, __m128 __B) {
830  return (__A[0] == __B[0]);
831}
832
833extern __inline int
834    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
835    _mm_ucomilt_ss(__m128 __A, __m128 __B) {
836  return (__A[0] < __B[0]);
837}
838
839extern __inline int
840    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841    _mm_ucomile_ss(__m128 __A, __m128 __B) {
842  return (__A[0] <= __B[0]);
843}
844
845extern __inline int
846    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
847    _mm_ucomigt_ss(__m128 __A, __m128 __B) {
848  return (__A[0] > __B[0]);
849}
850
851extern __inline int
852    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853    _mm_ucomige_ss(__m128 __A, __m128 __B) {
854  return (__A[0] >= __B[0]);
855}
856
857extern __inline int
858    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859    _mm_ucomineq_ss(__m128 __A, __m128 __B) {
860  return (__A[0] != __B[0]);
861}
862
863extern __inline float
864    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865    _mm_cvtss_f32(__m128 __A) {
866  return ((__v4sf)__A)[0];
867}
868
869/* Convert the lower SPFP value to a 32-bit integer according to the current
870   rounding mode.  */
871extern __inline int
872    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873    _mm_cvtss_si32(__m128 __A) {
874  int __res;
875#ifdef _ARCH_PWR8
876  double __dtmp;
877  __asm__(
878#ifdef __LITTLE_ENDIAN__
879      "xxsldwi %x0,%x0,%x0,3;\n"
880#endif
881      "xscvspdp %x2,%x0;\n"
882      "fctiw  %2,%2;\n"
883      "mfvsrd  %1,%x2;\n"
884      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
885      :);
886#else
887  __res = __builtin_rint(__A[0]);
888#endif
889  return __res;
890}
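/* For example, under the default round-to-nearest-even mode
   _mm_cvtss_si32 (_mm_set_ss (2.5f)) returns 2 and
   _mm_cvtss_si32 (_mm_set_ss (3.5f)) returns 4.  */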
891
892extern __inline int
893    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894    _mm_cvt_ss2si(__m128 __A) {
895  return _mm_cvtss_si32(__A);
896}
897
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */
900
901/* Intel intrinsic.  */
902extern __inline long long
903    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904    _mm_cvtss_si64(__m128 __A) {
905  long long __res;
906#if defined(_ARCH_PWR8) && defined(__powerpc64__)
907  double __dtmp;
908  __asm__(
909#ifdef __LITTLE_ENDIAN__
910      "xxsldwi %x0,%x0,%x0,3;\n"
911#endif
912      "xscvspdp %x2,%x0;\n"
913      "fctid  %2,%2;\n"
914      "mfvsrd  %1,%x2;\n"
915      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
916      :);
917#else
918  __res = __builtin_llrint(__A[0]);
919#endif
920  return __res;
921}
922
923/* Microsoft intrinsic.  */
924extern __inline long long
925    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
926    _mm_cvtss_si64x(__m128 __A) {
927  return _mm_cvtss_si64((__v4sf)__A);
928}
929
930/* Constants for use with _mm_prefetch.  */
931enum _mm_hint {
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
933  _MM_HINT_ET0 = 7,
934  _MM_HINT_ET1 = 6,
935  _MM_HINT_T0 = 3,
936  _MM_HINT_T1 = 2,
937  _MM_HINT_T2 = 1,
938  _MM_HINT_NTA = 0
939};
940
941/* Loads one cache line from address P to a location "closer" to the
942   processor.  The selector I specifies the type of prefetch operation.  */
943extern __inline void
944    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945    _mm_prefetch(const void *__P, enum _mm_hint __I) {
  /* Current PowerPC implementations ignore the hint parameter.  */
947  __builtin_prefetch(__P);
948}
949
950/* Convert the two lower SPFP values to 32-bit integers according to the
951   current rounding mode.  Return the integers in packed form.  */
952extern __inline __m64
953    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
954    _mm_cvtps_pi32(__m128 __A) {
956  __v4sf __temp, __rounded;
957  __vector unsigned long long __result;
958
959  /* Splat two lower SPFP values to both halves.  */
960  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
961  __rounded = vec_rint(__temp);
962  __result = (__vector unsigned long long)vec_cts(__rounded, 0);
963
964  return (__m64)((__vector long long)__result)[0];
965}
966
967extern __inline __m64
968    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969    _mm_cvt_ps2pi(__m128 __A) {
970  return _mm_cvtps_pi32(__A);
971}
972
973/* Truncate the lower SPFP value to a 32-bit integer.  */
974extern __inline int
975    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
976    _mm_cvttss_si32(__m128 __A) {
977  /* Extract the lower float element.  */
978  float __temp = __A[0];
  /* Truncate to a 32-bit integer and return.  */
980  return __temp;
981}
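/* For example, _mm_cvttss_si32 (_mm_set_ss (-1.7f)) returns -1; the
   conversion truncates toward zero regardless of the current rounding
   mode.  */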
982
983extern __inline int
984    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
985    _mm_cvtt_ss2si(__m128 __A) {
986  return _mm_cvttss_si32(__A);
987}
988
989/* Intel intrinsic.  */
990extern __inline long long
991    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992    _mm_cvttss_si64(__m128 __A) {
993  /* Extract the lower float element.  */
994  float __temp = __A[0];
  /* Truncate to a 64-bit integer and return.  */
996  return __temp;
997}
998
999/* Microsoft intrinsic.  */
1000extern __inline long long
1001    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002    _mm_cvttss_si64x(__m128 __A) {
1003  /* Extract the lower float element.  */
1004  float __temp = __A[0];
  /* Truncate to a 64-bit integer and return.  */
1006  return __temp;
1007}
1008
1009/* Truncate the two lower SPFP values to 32-bit integers.  Return the
1010   integers in packed form.  */
1011extern __inline __m64
1012    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013    _mm_cvttps_pi32(__m128 __A) {
1014  __v4sf __temp;
1015  __vector unsigned long long __result;
1016
1017  /* Splat two lower SPFP values to both halves.  */
1018  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
1019  __result = (__vector unsigned long long)vec_cts(__temp, 0);
1020
1021  return (__m64)((__vector long long)__result)[0];
1022}
1023
1024extern __inline __m64
1025    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026    _mm_cvtt_ps2pi(__m128 __A) {
1027  return _mm_cvttps_pi32(__A);
1028}
1029
1030/* Convert B to a SPFP value and insert it as element zero in A.  */
1031extern __inline __m128
1032    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1033    _mm_cvtsi32_ss(__m128 __A, int __B) {
1034  float __temp = __B;
1035  __A[0] = __temp;
1036
1037  return __A;
1038}
1039
1040extern __inline __m128
1041    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042    _mm_cvt_si2ss(__m128 __A, int __B) {
1043  return _mm_cvtsi32_ss(__A, __B);
1044}
1045
1046/* Convert B to a SPFP value and insert it as element zero in A.  */
1047/* Intel intrinsic.  */
1048extern __inline __m128
1049    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050    _mm_cvtsi64_ss(__m128 __A, long long __B) {
1051  float __temp = __B;
1052  __A[0] = __temp;
1053
1054  return __A;
1055}
1056
1057/* Microsoft intrinsic.  */
1058extern __inline __m128
1059    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060    _mm_cvtsi64x_ss(__m128 __A, long long __B) {
1061  return _mm_cvtsi64_ss(__A, __B);
1062}
1063
1064/* Convert the two 32-bit values in B to SPFP form and insert them
1065   as the two lower elements in A.  */
1066extern __inline __m128
1067    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068    _mm_cvtpi32_ps(__m128 __A, __m64 __B) {
1069  __vector signed int __vm1;
1070  __vector float __vf1;
1071
1072  __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
1073  __vf1 = (__vector float)vec_ctf(__vm1, 0);
1074
1075  return ((__m128)(__vector unsigned long long){
1076      ((__vector unsigned long long)__vf1)[0],
1077      ((__vector unsigned long long)__A)[1]});
1078}
1079
1080extern __inline __m128
1081    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1082    _mm_cvt_pi2ps(__m128 __A, __m64 __B) {
1083  return _mm_cvtpi32_ps(__A, __B);
1084}
1085
1086/* Convert the four signed 16-bit values in A to SPFP form.  */
1087extern __inline __m128
1088    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1089    _mm_cvtpi16_ps(__m64 __A) {
1090  __vector signed short __vs8;
1091  __vector signed int __vi4;
1092  __vector float __vf1;
1093
1094  __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
1095  __vi4 = vec_vupklsh(__vs8);
1096  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1097
1098  return (__m128)__vf1;
1099}
1100
1101/* Convert the four unsigned 16-bit values in A to SPFP form.  */
1102extern __inline __m128
1103    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104    _mm_cvtpu16_ps(__m64 __A) {
1105  const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1106  __vector unsigned short __vs8;
1107  __vector unsigned int __vi4;
1108  __vector float __vf1;
1109
1110  __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
1111  __vi4 = (__vector unsigned int)vec_mergel
1112#ifdef __LITTLE_ENDIAN__
1113      (__vs8, __zero);
1114#else
1115      (__zero, __vs8);
1116#endif
1117  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1118
1119  return (__m128)__vf1;
1120}
1121
1122/* Convert the low four signed 8-bit values in A to SPFP form.  */
1123extern __inline __m128
1124    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1125    _mm_cvtpi8_ps(__m64 __A) {
1126  __vector signed char __vc16;
1127  __vector signed short __vs8;
1128  __vector signed int __vi4;
1129  __vector float __vf1;
1130
1131  __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
1132  __vs8 = vec_vupkhsb(__vc16);
1133  __vi4 = vec_vupkhsh(__vs8);
1134  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1135
1136  return (__m128)__vf1;
1137}
1138
1139/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
1140extern __inline __m128
1141    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142
1143    _mm_cvtpu8_ps(__m64 __A) {
1144  const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1145  __vector unsigned char __vc16;
1146  __vector unsigned short __vs8;
1147  __vector unsigned int __vi4;
1148  __vector float __vf1;
1149
1150  __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
1151#ifdef __LITTLE_ENDIAN__
1152  __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
1153  __vi4 =
1154      (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
1155#else
1156  __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
1157  __vi4 =
1158      (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8);
1159#endif
1160  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1161
1162  return (__m128)__vf1;
1163}
1164
1165/* Convert the four signed 32-bit values in A and B to SPFP form.  */
1166extern __inline __m128
1167    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168    _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
1169  __vector signed int __vi4;
1170  __vector float __vf4;
1171
1172  __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
1173  __vf4 = (__vector float)vec_ctf(__vi4, 0);
1174  return (__m128)__vf4;
1175}
1176
1177/* Convert the four SPFP values in A to four signed 16-bit integers.  */
1178extern __inline __m64
1179    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1180    _mm_cvtps_pi16(__m128 __A) {
1181  __v4sf __rounded;
1182  __vector signed int __temp;
1183  __vector unsigned long long __result;
1184
1185  __rounded = vec_rint(__A);
1186  __temp = vec_cts(__rounded, 0);
1187  __result = (__vector unsigned long long)vec_pack(__temp, __temp);
1188
1189  return (__m64)((__vector long long)__result)[0];
1190}
1191
1192/* Convert the four SPFP values in A to four signed 8-bit integers.  */
1193extern __inline __m64
1194    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195    _mm_cvtps_pi8(__m128 __A) {
1196  __v4sf __rounded;
1197  __vector signed int __tmp_i;
1198  static const __vector signed int __zero = {0, 0, 0, 0};
1199  __vector signed short __tmp_s;
1200  __vector signed char __res_v;
1201
1202  __rounded = vec_rint(__A);
1203  __tmp_i = vec_cts(__rounded, 0);
1204  __tmp_s = vec_pack(__tmp_i, __zero);
1205  __res_v = vec_pack(__tmp_s, __tmp_s);
1206  return (__m64)((__vector long long)__res_v)[0];
1207}
1208
1209/* Selects four specific SPFP values from A and B based on MASK.  */
1210extern __inline __m128
1211    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212
1213    _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
1214  unsigned long __element_selector_10 = __mask & 0x03;
1215  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
1216  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
1217  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
1218  static const unsigned int __permute_selectors[4] = {
1219#ifdef __LITTLE_ENDIAN__
1220      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1221#else
1222      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1223#endif
1224  };
1225  __vector unsigned int __t;
1226
1227  __t[0] = __permute_selectors[__element_selector_10];
1228  __t[1] = __permute_selectors[__element_selector_32];
1229  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
1230  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
1231  return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
1232}
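/* Illustrative usage (assuming __m128 variables __a and __b declared by
   the caller):

     __m128 __r = _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (3, 2, 1, 0));

   places elements 0 and 1 of __a in the low half of __r and elements 2
   and 3 of __b in the high half.  Passing the same vector twice permutes
   a single vector, e.g. _MM_SHUFFLE (0, 0, 0, 0) broadcasts element 0.  */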
1233
1234/* Selects and interleaves the upper two SPFP values from A and B.  */
1235extern __inline __m128
1236    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237    _mm_unpackhi_ps(__m128 __A, __m128 __B) {
1238  return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
1239}
1240
1241/* Selects and interleaves the lower two SPFP values from A and B.  */
1242extern __inline __m128
1243    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244    _mm_unpacklo_ps(__m128 __A, __m128 __B) {
1245  return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
1246}
1247
1248/* Sets the upper two SPFP values with 64-bits of data loaded from P;
1249   the lower two values are passed through from A.  */
1250extern __inline __m128
1251    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1252    _mm_loadh_pi(__m128 __A, __m64 const *__P) {
1253  __vector unsigned long long __a = (__vector unsigned long long)__A;
1254  __vector unsigned long long __p = vec_splats(*__P);
1255  __a[1] = __p[1];
1256
1257  return (__m128)__a;
1258}
1259
1260/* Stores the upper two SPFP values of A into P.  */
1261extern __inline void
1262    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263    _mm_storeh_pi(__m64 *__P, __m128 __A) {
1264  __vector unsigned long long __a = (__vector unsigned long long)__A;
1265
1266  *__P = __a[1];
1267}
1268
1269/* Moves the upper two values of B into the lower two values of A.  */
1270extern __inline __m128
1271    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272    _mm_movehl_ps(__m128 __A, __m128 __B) {
1273  return (__m128)vec_mergel((__vector unsigned long long)__B,
1274                            (__vector unsigned long long)__A);
1275}
1276
1277/* Moves the lower two values of B into the upper two values of A.  */
1278extern __inline __m128
1279    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280    _mm_movelh_ps(__m128 __A, __m128 __B) {
1281  return (__m128)vec_mergeh((__vector unsigned long long)__A,
1282                            (__vector unsigned long long)__B);
1283}
1284
1285/* Sets the lower two SPFP values with 64-bits of data loaded from P;
1286   the upper two values are passed through from A.  */
1287extern __inline __m128
1288    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289    _mm_loadl_pi(__m128 __A, __m64 const *__P) {
1290  __vector unsigned long long __a = (__vector unsigned long long)__A;
1291  __vector unsigned long long __p = vec_splats(*__P);
1292  __a[0] = __p[0];
1293
1294  return (__m128)__a;
1295}
1296
1297/* Stores the lower two SPFP values of A into P.  */
1298extern __inline void
1299    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300    _mm_storel_pi(__m64 *__P, __m128 __A) {
1301  __vector unsigned long long __a = (__vector unsigned long long)__A;
1302
1303  *__P = __a[0];
1304}
1305
1306#ifdef _ARCH_PWR8
1307/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1308
1309/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
1310extern __inline int
1311    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312    _mm_movemask_ps(__m128 __A) {
1313#ifdef _ARCH_PWR10
1314  return vec_extractm((__vector unsigned int)__A);
1315#else
1316  __vector unsigned long long __result;
1317  static const __vector unsigned int __perm_mask = {
1318#ifdef __LITTLE_ENDIAN__
1319      0x00204060, 0x80808080, 0x80808080, 0x80808080
1320#else
1321      0x80808080, 0x80808080, 0x80808080, 0x00204060
1322#endif
1323  };
1324
1325  __result = ((__vector unsigned long long)vec_vbpermq(
1326      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1327
1328#ifdef __LITTLE_ENDIAN__
1329  return __result[1];
1330#else
1331  return __result[0];
1332#endif
1333#endif /* !_ARCH_PWR10 */
1334}
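/* For example, _mm_movemask_ps (_mm_set_ps (-4.0f, 3.0f, -2.0f, 1.0f))
   returns 0xA: bit i of the result is the sign bit of element i, and
   here the negative values occupy elements 1 and 3.  */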
1335#endif /* _ARCH_PWR8 */
1336
1337/* Create a vector with all four elements equal to *P.  */
1338extern __inline __m128
1339    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340    _mm_load1_ps(float const *__P) {
1341  return _mm_set1_ps(*__P);
1342}
1343
1344extern __inline __m128
1345    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346    _mm_load_ps1(float const *__P) {
1347  return _mm_load1_ps(__P);
1348}
1349
1350/* Extracts one of the four words of A.  The selector N must be immediate.  */
1351extern __inline int
1352    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353    _mm_extract_pi16(__m64 const __A, int const __N) {
1354  unsigned int __shiftr = __N & 3;
1355#ifdef __BIG_ENDIAN__
1356  __shiftr = 3 - __shiftr;
1357#endif
1358
1359  return ((__A >> (__shiftr * 16)) & 0xffff);
1360}
1361
1362extern __inline int
1363    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364    _m_pextrw(__m64 const __A, int const __N) {
1365  return _mm_extract_pi16(__A, __N);
1366}
1367
1368/* Inserts word D into one of four words of A.  The selector N must be
1369   immediate.  */
1370extern __inline __m64
1371    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372    _mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
1373  const int __shiftl = (__N & 3) * 16;
1374  const __m64 __shiftD = (const __m64)__D << __shiftl;
1375  const __m64 __mask = 0xffffUL << __shiftl;
1376  __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
1377
1378  return __result;
1379}
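/* For example (illustrative values), _mm_insert_pi16 (__A, 0x1234, 2)
   returns a copy of __A with 16-bit element 2 replaced by 0x1234 and the
   other three elements unchanged.  */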
1380
1381extern __inline __m64
1382    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383    _m_pinsrw(__m64 const __A, int const __D, int const __N) {
1384  return _mm_insert_pi16(__A, __D, __N);
1385}
1386
1387/* Compute the element-wise maximum of signed 16-bit values.  */
1388extern __inline __m64
1389    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1390
1391    _mm_max_pi16(__m64 __A, __m64 __B) {
1392#if _ARCH_PWR8
1393  __vector signed short __a, __b, __r;
1394  __vector __bool short __c;
1395
1396  __a = (__vector signed short)vec_splats(__A);
1397  __b = (__vector signed short)vec_splats(__B);
1398  __c = (__vector __bool short)vec_cmpgt(__a, __b);
1399  __r = vec_sel(__b, __a, __c);
1400  return (__m64)((__vector long long)__r)[0];
1401#else
1402  __m64_union __m1, __m2, __res;
1403
1404  __m1.as_m64 = __A;
1405  __m2.as_m64 = __B;
1406
1407  __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
1408                                                            : __m2.as_short[0];
1409  __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
1410                                                            : __m2.as_short[1];
1411  __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
1412                                                            : __m2.as_short[2];
1413  __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
1414                                                            : __m2.as_short[3];
1415
1416  return (__m64)__res.as_m64;
1417#endif
1418}
1419
1420extern __inline __m64
1421    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1422    _m_pmaxsw(__m64 __A, __m64 __B) {
1423  return _mm_max_pi16(__A, __B);
1424}
1425
1426/* Compute the element-wise maximum of unsigned 8-bit values.  */
1427extern __inline __m64
1428    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1429    _mm_max_pu8(__m64 __A, __m64 __B) {
1430#if _ARCH_PWR8
1431  __vector unsigned char __a, __b, __r;
1432  __vector __bool char __c;
1433
1434  __a = (__vector unsigned char)vec_splats(__A);
1435  __b = (__vector unsigned char)vec_splats(__B);
1436  __c = (__vector __bool char)vec_cmpgt(__a, __b);
1437  __r = vec_sel(__b, __a, __c);
1438  return (__m64)((__vector long long)__r)[0];
1439#else
1440  __m64_union __m1, __m2, __res;
1441  long __i;
1442
1443  __m1.as_m64 = __A;
1444  __m2.as_m64 = __B;
1445
1446  for (__i = 0; __i < 8; __i++)
1447    __res.as_char[__i] =
1448        ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
1449            ? __m1.as_char[__i]
1450            : __m2.as_char[__i];
1451
1452  return (__m64)__res.as_m64;
1453#endif
1454}
1455
1456extern __inline __m64
1457    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1458    _m_pmaxub(__m64 __A, __m64 __B) {
1459  return _mm_max_pu8(__A, __B);
1460}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pi16(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __r;
  __vector __bool short __c;

  __a = (__vector signed short)vec_splats(__A);
  __b = (__vector signed short)vec_splats(__B);
  __c = (__vector __bool short)vec_cmplt(__a, __b);
  __r = vec_sel(__b, __a, __c);
  return (__m64)((__vector long long)__r)[0];
#else
  __m64_union __m1, __m2, __res;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
                                                            : __m2.as_short[0];
  __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
                                                            : __m2.as_short[1];
  __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
                                                            : __m2.as_short[2];
  __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
                                                            : __m2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pminsw(__m64 __A, __m64 __B) {
  return _mm_min_pi16(__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pu8(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;

  __a = (__vector unsigned char)vec_splats(__A);
  __b = (__vector unsigned char)vec_splats(__B);
  __c = (__vector __bool char)vec_cmplt(__a, __b);
  __r = vec_sel(__b, __a, __c);
  return (__m64)((__vector long long)__r)[0];
#else
  __m64_union __m1, __m2, __res;
  long __i;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
        ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
            ? __m1.as_char[__i]
            : __m2.as_char[__i];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pminub(__m64 __A, __m64 __B) {
  return _mm_min_pu8(__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pi8(__m64 __A) {
#ifdef __powerpc64__
  unsigned long long __p =
#ifdef __LITTLE_ENDIAN__
      0x0008101820283038UL; // permute control for sign bits
#else
      0x3830282018100800UL; // permute control for sign bits
#endif
  return __builtin_bpermd(__p, __A);
#else
#ifdef __LITTLE_ENDIAN__
  unsigned int __mask = 0x20283038UL;
  unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
  unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
#else
  unsigned int __mask = 0x38302820UL;
  unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
  unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
#endif
  return (__r2 << 4) | __r1;
#endif
}
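
/* The bpermd control words above list, one byte per result bit, the
   big-endian bit positions of the eight byte sign bits (0, 8, 16, ...,
   56); __builtin_bpermd gathers those bits into the low byte of the
   result, in the order pmovmskb expects for the target endianness.
   For example, _mm_movemask_pi8 ((__m64)0xff00ff00ff00ff00UL) is 0xaa
   (bits 1, 3, 5 and 7 set).  */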

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmovmskb(__m64 __A) {
  return _mm_movemask_pi8(__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pu16(__m64 __A, __m64 __B) {
  __vector unsigned short __a, __b;
  __vector unsigned short __c;
  __vector unsigned int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector unsigned short)vec_splats(__A);
  __b = (__vector unsigned short)vec_splats(__B);

  __w0 = vec_vmuleuh(__a, __b);
  __w1 = vec_vmulouh(__a, __b);
  __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}
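
/* vec_vmuleuh/vec_vmulouh above form the full 32-bit products of the
   even and odd halfword pairs, and the vec_perm control keeps only the
   high halfword of each product.  For example, a lane holding 0xffff
   multiplied by a lane holding 0x0002 gives 0x0001fffe, so the result
   lane is 0x0001.  */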

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhuw(__m64 __A, __m64 __B) {
  return _mm_mulhi_pu16(__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi16(__m64 __A, int const __N) {
  unsigned long __element_selector_10 = __N & 0x03;
  unsigned long __element_selector_32 = (__N >> 2) & 0x03;
  unsigned long __element_selector_54 = (__N >> 4) & 0x03;
  unsigned long __element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0607, 0x0405, 0x0203, 0x0001
#endif
  };
  __m64_union __t;
  __vector unsigned long long __a, __p, __r;

#ifdef __LITTLE_ENDIAN__
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
#else
  __t.as_short[3] = __permute_selectors[__element_selector_10];
  __t.as_short[2] = __permute_selectors[__element_selector_32];
  __t.as_short[1] = __permute_selectors[__element_selector_54];
  __t.as_short[0] = __permute_selectors[__element_selector_76];
#endif
  __p = vec_splats(__t.as_m64);
  __a = vec_splats(__A);
  __r = vec_perm(__a, __a, (__vector unsigned char)__p);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pshufw(__m64 __A, int const __N) {
  return _mm_shuffle_pi16(__A, __N);
}
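
/* For illustration: _mm_shuffle_pi16 (__A, _MM_SHUFFLE (3, 2, 1, 0))
   returns __A unchanged, _MM_SHUFFLE (0, 1, 2, 3) reverses the four
   16-bit elements, and _MM_SHUFFLE (0, 0, 0, 0) broadcasts element 0
   to all four positions.  */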

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
  __m64 __hibit = 0x8080808080808080UL;
  __m64 __mask, __tmp;
  __m64 *__p = (__m64 *)__P;

  __tmp = *__p;
  __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);
  __tmp = (__tmp & (~__mask)) | (__A & __mask);
  *__p = __tmp;
}
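
/* For example, a selector of (__m64)0x8080808000000000UL stores the
   upper four bytes of __A and leaves the lower four bytes of *__P
   untouched.  Note that, unlike the x86 non-temporal MASKMOVQ, this
   implementation performs an ordinary cacheable read-modify-write of
   the full 8-byte destination.  */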

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_maskmovq(__m64 __A, __m64 __N, char *__P) {
  _mm_maskmove_si64(__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_pu8(__m64 __A, __m64 __B) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__A);
  __b = (__vector unsigned char)vec_splats(__B);
  __c = vec_avg(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pavgb(__m64 __A, __m64 __B) {
  return _mm_avg_pu8(__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_pu16(__m64 __A, __m64 __B) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__A);
  __b = (__vector unsigned short)vec_splats(__B);
  __c = vec_avg(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pavgw(__m64 __A, __m64 __B) {
  return _mm_avg_pu16(__A, __B);
}
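
/* vec_avg computes each average above as (a + b + 1) >> 1 in
   double-width arithmetic, so the result rounds up on ties; for
   example, averaging the unsigned bytes 1 and 2 gives 2, matching the
   x86 PAVGB/PAVGW behavior.  */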

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_pu8(__m64 __A, __m64 __B) {
  __vector unsigned char __a, __b;
  __vector unsigned char __vmin, __vmax, __vabsdiff;
  __vector signed int __vsum;
  const __vector unsigned int __zero = {0, 0, 0, 0};
  __m64_union __result = {0};

  __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
  __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
  __vmin = vec_min(__a, __b);
  __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
  /* Sum across four integers with integer result.  */
  __vsum = vec_sums(__vsum, (__vector signed int)__zero);
  /* The sum is in the rightmost 32 bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  __result.as_short[0] = __vsum[3];
  return __result.as_m64;
}
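
/* For example, with per-byte values __A = {1, 2, 3, 4, 5, 6, 7, 8} and
   __B = {8, 7, 6, 5, 4, 3, 2, 1}, the absolute differences are
   {7, 5, 3, 1, 1, 3, 5, 7} and the returned __m64 holds 32 in its low
   16-bit element, with the remaining elements zero.  */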

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psadbw(__m64 __A, __m64 __B) {
  return _mm_sad_pu8(__A, __B);
}

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pi(__m64 *__P, __m64 __A) {
  /* Use the data cache block touch for store transient.  */
  __asm__("	dcbtstt	0,%0" : : "b"(__P) : "memory");
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_ps(float *__P, __m128 __A) {
  /* Use the data cache block touch for store transient.  */
  __asm__("	dcbtstt	0,%0" : : "b"(__P) : "memory");
  _mm_store_ps(__P, __A);
}
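
/* dcbtstt only provides a "store transient" cache hint; the stores
   above remain ordinary cacheable stores, so these intrinsics are an
   approximation of the x86 non-temporal store semantics rather than an
   exact match.  */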

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sfence(void) {
  /* Generate a lightweight sync (lwsync).  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}
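
/* A typical use is publishing data with a flag: store the payload,
   call _mm_sfence (), then store the "ready" flag.  The release fence
   keeps the payload stores ordered before the flag store, as the x86
   SFENCE would.  */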

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_pause(void) {
  /* There is no exact match with this construct, but the following is
     close to the desired effect.  */
#if _ARCH_PWR8
  /* On POWER8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PRI setting.  Since we don't
     know what PRI this thread is running at, we: 1) save the current
     PRI from the PPR SPR into a local GPR, 2) set the PRI to "very
     low" via the special or 31,31,31 encoding, and 3) issue an
     "isync" to ensure the PRI change takes effect before we execute
     any more instructions.
     Now we can execute an lwsync (release barrier) while this thread
     runs at "very low" PRI.  Finally we restore the original PRI and
     continue execution.  */
  unsigned long __PPR;

  __asm__ volatile("	mfppr	%0;"
                   "   or 31,31,31;"
                   "   isync;"
                   "   lwsync;"
                   "   isync;"
                   "   mtppr	%0;"
                   : "=r"(__PPR)
                   :
                   : "memory");
#else
  /* For older processors, where we may not even have Program Priority
     controls, we can only depend on a heavyweight sync.  */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                              \
  do {                                                                         \
    __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);         \
    __v4sf __t0 = vec_vmrghw(__r0, __r1);                                      \
    __v4sf __t1 = vec_vmrghw(__r2, __r3);                                      \
    __v4sf __t2 = vec_vmrglw(__r0, __r1);                                      \
    __v4sf __t3 = vec_vmrglw(__r2, __r3);                                      \
    (row0) = (__v4sf)vec_mergeh((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row1) = (__v4sf)vec_mergel((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row2) = (__v4sf)vec_mergeh((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
    (row3) = (__v4sf)vec_mergel((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
  } while (0)
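
/* For illustration: the macro arguments must be lvalues of type __m128
   (or __v4sf), since each row is reassigned in place.  A typical use is

     __m128 __row0 = _mm_loadu_ps (__mat + 0);
     __m128 __row1 = _mm_loadu_ps (__mat + 4);
     __m128 __row2 = _mm_loadu_ps (__mat + 8);
     __m128 __row3 = _mm_loadu_ps (__mat + 12);
     _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);

   after which __row0..__row3 hold the columns of the original 4x4
   matrix (here __mat is assumed to point to 16 floats).  */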

/* For backward source compatibility.  */
//# include <emmintrin.h>

#else
#include_next <xmmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* XMMINTRIN_H_ */