/* Copyright (C) 2002-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are also important
   differences in the data format and placement of float scalars in
   the vector register.  For PowerISA, scalar floats held in FPRs (the
   leftmost 64 bits of the low 32 VSRs) are in double format, while
   X86_64 SSE uses the rightmost 32 bits of the XMM register.  These
   differences require extra steps on POWER to match the SSE scalar
   float semantics.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
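   For example, a simple element-wise loop over plain C floats, such as
   c[i] = a[i] + b[i], can usually be auto-vectorized by GCC into VSX
   code at -O2/-O3 without any intrinsics (a, b and c here are just
   illustrative float arrays).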

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
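/* As a rough sketch (not a drop-in replacement for any particular MXCSR
   usage), code that manipulated MXCSR rounding-control or exception bits
   can typically use the standard C99 <fenv.h> interfaces instead:

     #include <fenv.h>
     fesetround (FE_TOWARDZERO);     // instead of setting MXCSR.RC
     feclearexcept (FE_ALL_EXCEPT);  // instead of clearing MXCSR status flags
 */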
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* Define a four-element permute mask.  */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
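/* For example, _MM_SHUFFLE (3, 2, 1, 0) evaluates to
   (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the identity selector;
   _mm_shuffle_ps interprets the mask two bits per element.  */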

#include <altivec.h>

/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards.  This should eventually be done inside altivec.h itself,
   but only after testing a full distro build.  */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
				 (defined(__STDC_VERSION__) &&	\
				  __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif

#include <assert.h>

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
				       __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
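  /* Self-assignment is the usual GCC idiom for an intentionally
     undefined value: it avoids an uninitialized-use warning while
     leaving __Y with no particular contents.  */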
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128)vec_ld(0, (__v4sf*)__P));
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf   __tmp;
  __m128 __result;
  static const __vector unsigned char __permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
	0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = vec_ld (0, (__v4sf *) __P);
  __result = (__m128) vec_perm (__tmp, __tmp, __permute_vector);
  return __result;
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf   __tmp;
  static const __vector unsigned char __permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
	0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = (__m128) vec_perm (__A, __A, __permute_vector);

  _mm_store_ps (__P, __tmp);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = vec_splat((__v4sf)__A, 0);
  _mm_store_ps (__P, __va);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, __mask));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = __a + __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = __a - __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = __a * __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = __a / __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 __a, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat (__A, 0);
  __c = vec_sqrt (__a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
}

/* Perform the respective operation on the four SPFP values in A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (vec_sqrt ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (vec_re ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (vec_rsqrte (__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  __m128 __a, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat (__A, 0);
  __c = _mm_rcp_ps (__a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  __m128 __a, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat (__A, 0);
  __c = vec_rsqrte (__a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  __v4sf __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf)__A, 0);
  __b = vec_splat ((__v4sf)__B, 0);
  __c = vec_min (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  __v4sf __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat (__A, 0);
  __b = vec_splat (__B, 0);
  __c = vec_max (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  __vector __bool int __m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
  return vec_sel (__B, __A, __m);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  __vector __bool int __m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
  return vec_sel (__B, __A, __m);
}

/* Perform logical bit-wise operations on 128-bit values.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
//  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128  __A, __m128  __B)
{
  __v4sf __temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
  return ((__m128)vec_nor (__temp, __temp));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128  __A, __m128  __B)
{
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a);
  __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b);
  return ((__m128 ) vec_and (__c, __d));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask);
  __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask);
  return ((__m128 ) vec_or (__c, __d));
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */
extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128  __A, __m128  __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpeq (__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmplt(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmple(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpgt(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpge(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpeq(__a, __b);
  __c = vec_nor (__c, __c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpge(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmpgt(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmple(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  __a = vec_splat ((__v4sf) __A, 0);
  __b = vec_splat ((__v4sf) __B, 0);
  __c = (__v4sf) vec_cmplt(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };

  __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a);
  __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b);
  __c = vec_and (__c, __d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int __mask =
    { 0xffffffff, 0, 0, 0 };

  __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask);
  __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask);
  __c = vec_or (__c, __d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask));
}

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * the _mm_comi??_ss ones because GCC for PowerPC only generates
 * unordered compares (scalar and vector).
 * Technically the _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK, as is.
 */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  int __res;
#ifdef _ARCH_PWR8
  double __dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctiw  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
        "=r" (__res),
        "=f" (__dtmp)
      : );
#else
  __res = __builtin_rint(__A[0]);
#endif
  return __res;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  long long __res;
#if defined (_ARCH_PWR8) && defined (__powerpc64__)
  double __dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctid  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
        "=r" (__res),
        "=f" (__dtmp)
      : );
#else
  __res = __builtin_llrint(__A[0]);
#endif
  return __res;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};
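
/* For example, _MM_HINT_ET0 == (_MM_HINT_T0 | 4) and
   _MM_HINT_ET1 == (_MM_HINT_T1 | 4).  */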

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* Current PowerPC implementations ignore the hint parameter.  */
  __builtin_prefetch (__P);
}
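
/* A minimal usage sketch (__buf is just an illustrative pointer): the
   hint values are accepted for source compatibility even though they are
   currently ignored here, e.g.
     _mm_prefetch ((const void *) __buf, _MM_HINT_T0);  */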

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf __temp, __rounded;
  __vector unsigned long long __result;

  /* Splat two lower SPFP values to both halves.  */
  __temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  __rounded = vec_rint (__temp);
  __result = (__vector unsigned long long) vec_cts (__rounded, 0);

  return (__m64) ((__vector long long) __result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element.  */
  float __temp = __A[0];
  /* truncate to 32-bit integer and return.  */
  return __temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element.  */
  float __temp = __A[0];
  /* truncate to 64-bit integer and return.  */
  return __temp;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element.  */
  float __temp = __A[0];
  /* truncate to 64-bit integer and return.  */
  return __temp;
}

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  __v4sf __temp;
  __vector unsigned long long __result;

  /* Splat two lower SPFP values to both halves.  */
  __temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  __result = (__vector unsigned long long) vec_cts (__temp, 0);

  return (__m64) ((__vector long long) __result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  float __temp = __B;
  __A[0] = __temp;

  return __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  float __temp = __B;
  __A[0] = __temp;

  return __A;
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return _mm_cvtsi64_ss (__A, __B);
}

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128        __A, __m64        __B)
{
  __vector signed int __vm1;
  __vector float __vf1;

  __vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
  __vf1 = (__vector float) vec_ctf (__vm1, 0);

  return ((__m128) (__vector unsigned long long)
    { ((__vector unsigned long long)__vf1) [0],
	((__vector unsigned long long)__A) [1]});
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __vector signed short __vs8;
  __vector signed int __vi4;
  __vector float __vf1;

  __vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
  __vi4 = vec_vupklsh (__vs8);
  __vf1 = (__vector float) vec_ctf (__vi4, 0);

  return (__m128) __vf1;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  const __vector unsigned short __zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned short __vs8;
  __vector unsigned int __vi4;
  __vector float __vf1;

  __vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
  __vi4 = (__vector unsigned int) vec_mergel
#ifdef __LITTLE_ENDIAN__
                                           (__vs8, __zero);
#else
                                           (__zero, __vs8);
#endif
  __vf1 = (__vector float) vec_ctf (__vi4, 0);

  return (__m128) __vf1;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __vector signed char __vc16;
  __vector signed short __vs8;
  __vector signed int __vi4;
  __vector float __vf1;

  __vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
  __vs8 = vec_vupkhsb (__vc16);
  __vi4 = vec_vupkhsh (__vs8);
  __vf1 = (__vector float) vec_ctf (__vi4, 0);

  return (__m128) __vf1;
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps (__m64  __A)
{
  const __vector unsigned char __zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned char __vc16;
  __vector unsigned short __vs8;
  __vector unsigned int __vi4;
  __vector float __vf1;

  __vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
  __vs8 = (__vector unsigned short) vec_mergel (__vc16, __zero);
  __vi4 = (__vector unsigned int) vec_mergeh (__vs8,
					    (__vector unsigned short) __zero);
#else
  __vs8 = (__vector unsigned short) vec_mergel (__zero, __vc16);
  __vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) __zero,
                                            __vs8);
#endif
  __vf1 = (__vector float) vec_ctf (__vi4, 0);

  return (__m128) __vf1;
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
{
  __vector signed int __vi4;
  __vector float __vf4;

  __vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
  __vf4 = (__vector float) vec_ctf (__vi4, 0);
  return (__m128) __vf4;
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16 (__m128 __A)
{
  __v4sf __rounded;
  __vector signed int __temp;
  __vector unsigned long long __result;

  __rounded = vec_rint(__A);
  __temp = vec_cts (__rounded, 0);
  __result = (__vector unsigned long long) vec_pack (__temp, __temp);

  return (__m64) ((__vector long long) __result)[0];
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8 (__m128 __A)
{
  __v4sf __rounded;
  __vector signed int __tmp_i;
  static const __vector signed int __zero = {0, 0, 0, 0};
  __vector signed short __tmp_s;
  __vector signed char __res_v;

  __rounded = vec_rint(__A);
  __tmp_i = vec_cts (__rounded, 0);
  __tmp_s = vec_pack (__tmp_i, __zero);
  __res_v = vec_pack (__tmp_s, __tmp_s);
  return (__m64) ((__vector long long) __res_v)[0];
}

/* Selects four specific SPFP values from A and B based on MASK.  */
extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128  __A, __m128  __B, int const __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __vector unsigned int __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)__t);
}
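
/* For example, _mm_shuffle_ps (a, b, _MM_SHUFFLE (1, 0, 3, 2)) yields
   { a[2], a[3], b[0], b[1] }: the two low selector fields index into A
   and the two high fields index into B, matching the SSE definition
   (a and b here are just illustrative __m128 values).  */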

/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64 bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a [1] = __p [1];

  return (__m128)__a;
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[1];
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergel ((__vector unsigned long long)__B,
			      (__vector unsigned long long)__A);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
			      (__vector unsigned long long)__B);
}

/* Sets the lower two SPFP values with 64 bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a [0] = __p [0];

  return (__m128)__a;
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[0];
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128  __A)
{
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
	0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
    };

  __result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) __perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
}
#endif /* _ARCH_PWR8 */

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  unsigned int __shiftr = __N & 3;
#ifdef __BIG_ENDIAN__
  __shiftr = 3 - __shiftr;
#endif

  return ((__A >> (__shiftr * 16)) & 0xffff);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  const int __shiftl = (__N & 3) * 16;
  const __m64 __shiftD = (const __m64) __D << __shiftl;
  const __m64 __mask = 0xffffUL << __shiftl;
  __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);

  return __result;
}
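
/* For example, with __A == 0x0004000300020001 and __N == 2,
   _mm_insert_pi16 replaces bits 47:32, giving 0x0004dddd00020001,
   where dddd stands for the low 16 bits of __D.  */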

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __r;
  __vector __bool short __c;

  __a = (__vector signed short)vec_splats (__A);
  __b = (__vector signed short)vec_splats (__B);
  __c = (__vector __bool short)vec_cmpgt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  __res.as_short[0] =
      (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0];
  __res.as_short[1] =
      (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1];
  __res.as_short[2] =
      (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2];
  __res.as_short[3] =
      (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;

  __a = (__vector unsigned char)vec_splats (__A);
  __b = (__vector unsigned char)vec_splats (__B);
  __c = (__vector __bool char)vec_cmpgt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;
  long __i;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
      ((unsigned char) __m1.as_char[__i] > (unsigned char) __m2.as_char[__i]) ?
	  __m1.as_char[__i] : __m2.as_char[__i];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __r;
  __vector __bool short __c;

  __a = (__vector signed short)vec_splats (__A);
  __b = (__vector signed short)vec_splats (__B);
  __c = (__vector __bool short)vec_cmplt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  __res.as_short[0] =
      (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0];
  __res.as_short[1] =
      (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1];
  __res.as_short[2] =
      (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2];
  __res.as_short[3] =
      (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;

  __a = (__vector unsigned char)vec_splats (__A);
  __b = (__vector unsigned char)vec_splats (__B);
  __c = (__vector __bool char)vec_cmplt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;
  long __i;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;
1558
1559
1560  for (__i = 0; __i < 8; __i++)
1561    __res.as_char[__i] =
1562      ((unsigned char) __m1.as_char[__i] < (unsigned char) __m2.as_char[__i]) ?
1563	  __m1.as_char[__i] : __m2.as_char[__i];
1564
1565  return (__m64) __res.as_m64;
1566#endif
1567}
1568
1569extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1570_m_pminub (__m64 __A, __m64 __B)
1571{
1572  return _mm_min_pu8 (__A, __B);
1573}
1574
1575/* Create an 8-bit mask of the signs of 8-bit values.  */
1576extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577_mm_movemask_pi8 (__m64 __A)
1578{
1579#ifdef __powerpc64__
1580  unsigned long long __p =
1581#ifdef __LITTLE_ENDIAN__
1582                         0x0008101820283038UL; // permute control for sign bits
1583#else
1584                         0x3830282018100800UL; // permute control for sign bits
1585#endif
1586  return __builtin_bpermd (__p, __A);
1587#else
1588#ifdef __LITTLE_ENDIAN__
1589  unsigned int __mask = 0x20283038UL;
1590  unsigned int __r1 = __builtin_bpermd (__mask, __A) & 0xf;
1591  unsigned int __r2 = __builtin_bpermd (__mask, __A >> 32) & 0xf;
1592#else
1593  unsigned int __mask = 0x38302820UL;
1594  unsigned int __r1 = __builtin_bpermd (__mask, __A >> 32) & 0xf;
1595  unsigned int __r2 = __builtin_bpermd (__mask, __A) & 0xf;
1596#endif
1597  return (__r2 << 4) | __r1;
1598#endif
1599}
1600
1601extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1602_m_pmovmskb (__m64 __A)
1603{
1604  return _mm_movemask_pi8 (__A);
1605}
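
/* Illustrative usage sketch (not normative).  Bit i of the result is the
   sign (high) bit of byte element i of the argument:

     __m64 a = _mm_set_pi8 ((char) 0x80, 0, 0, 0, 0, 0, 0, (char) 0x80);
     int m = _mm_movemask_pi8 (a);

   sets m to 0x81, because byte elements 7 and 0 have their high bits
   set.  */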

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short __a, __b;
  __vector unsigned short __c;
  __vector unsigned int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
#endif
    };

  __a = (__vector unsigned short)vec_splats (__A);
  __b = (__vector unsigned short)vec_splats (__B);

  __w0 = vec_vmuleuh (__a, __b);
  __w1 = vec_vmulouh (__a, __b);
  __c = (__vector unsigned short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
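
/* Illustrative usage sketch (not normative).  Each result element is the
   upper 16 bits of the corresponding 32-bit unsigned product, e.g.
   0x8000 * 3 == 0x00018000, whose high half is 0x0001:

     __m64 a = _mm_set_pi16 ((short) 0x8000, (short) 0x8000, 1, 2);
     __m64 b = _mm_set_pi16 (3, 4, 5, 6);
     __m64 h = _mm_mulhi_pu16 (a, b);

   yields h equal to _mm_set_pi16 (1, 2, 0, 0).  */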

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  unsigned long __element_selector_10 = __N & 0x03;
  unsigned long __element_selector_32 = (__N >> 2) & 0x03;
  unsigned long __element_selector_54 = (__N >> 4) & 0x03;
  unsigned long __element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
	      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __m64_union __t;
  __vector unsigned long long __a, __p, __r;

#ifdef __LITTLE_ENDIAN__
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
#else
  __t.as_short[3] = __permute_selectors[__element_selector_10];
  __t.as_short[2] = __permute_selectors[__element_selector_32];
  __t.as_short[1] = __permute_selectors[__element_selector_54];
  __t.as_short[0] = __permute_selectors[__element_selector_76];
#endif
  __p = vec_splats (__t.as_m64);
  __a = vec_splats (__A);
  __r = vec_perm (__a, __a, (__vector unsigned char)__p);
  return (__m64) ((__vector long long) __r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
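
/* Illustrative usage sketch (not normative).  The selector is normally
   built with the _MM_SHUFFLE macro defined above; _MM_SHUFFLE (0, 1, 2, 3)
   reverses the four 16-bit elements:

     __m64 a = _mm_set_pi16 (3, 2, 1, 0);
     __m64 r = _mm_shuffle_pi16 (a, _MM_SHUFFLE (0, 1, 2, 3));

   yields r equal to _mm_set_pi16 (0, 1, 2, 3).  */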

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __m64 __hibit = 0x8080808080808080UL;
  __m64 __mask, __tmp;
  __m64 *__p = (__m64*)__P;

  __tmp = *__p;
  __mask = _mm_cmpeq_pi8 ((__N & __hibit), __hibit);
  __tmp = (__tmp & (~__mask)) | (__A & __mask);
  *__p = __tmp;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
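
/* Illustrative usage sketch (not normative).  Only byte elements whose
   corresponding mask byte has its high bit set are written; the other
   bytes at P keep their previous contents:

     char buf[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
     __m64 data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     __m64 mask = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, (char) 0x80);
     _mm_maskmove_si64 (data, mask, buf);

   replaces only byte element 0 of buf with the value 1; the remaining
   seven bytes of buf stay zero.  */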

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__A);
  __b = (__vector unsigned char)vec_splats (__B);
  __c = vec_avg (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__A);
  __b = (__vector unsigned short)vec_splats (__B);
  __c = vec_avg (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}
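
/* Illustrative note (not normative): for both _mm_avg_pu8 and _mm_avg_pu16
   the rounded average of unsigned elements a and b is (a + b + 1) >> 1,
   so averaging 1 with 2 yields 2, and averaging 7 with 8 yields 8.  */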

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char __a, __b;
  __vector unsigned char __vmin, __vmax, __vabsdiff;
  __vector signed int __vsum;
  const __vector unsigned int __zero =
    { 0, 0, 0, 0 };
  __m64_union __result = {0};

  __a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
  __b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
  __vmin = vec_min (__a, __b);
  __vmax = vec_max (__a, __b);
  __vabsdiff = vec_sub (__vmax, __vmin);
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero);
  /* Sum across four integers with integer result.  */
  __vsum = vec_sums (__vsum, (__vector signed int) __zero);
  /* The sum is in the rightmost 32 bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  __result.as_short[0] = __vsum[3];
  return __result.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}
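
/* Illustrative usage sketch (not normative).  The result is the scalar sum
   of the eight byte-wise absolute differences, returned in the low 16-bit
   element:

     __m64 a = _mm_set_pi8 (10, 10, 10, 10, 10, 10, 10, 10);
     __m64 b = _mm_set_pi8 (7, 7, 7, 7, 7, 7, 7, 7);
     __m64 s = _mm_sad_pu8 (a, b);

   yields s with its low 16-bit element equal to 8 * 3 == 24 and the upper
   elements zero.  */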

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "	dcbtstt	0,%0"
    :
    : "b" (__P)
    : "memory"
  );
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "	dcbtstt	0,%0"
    :
    : "b" (__P)
    : "memory"
  );
  _mm_store_ps (__P, __A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a light weight sync.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
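
/* Illustrative usage sketch (not normative; the variables are hypothetical).
   A typical use is ordering a payload store ahead of a flag store in a
   producer thread:

     extern volatile int payload, ready;

     payload = 42;
     _mm_sfence ();
     ready = 1;

   The release fence orders the payload store before the flag store as seen
   by other processors; the consumer side still needs its own acquire
   ordering when it reads the flag.  */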

/* The execution of the next instruction is delayed by an implementation-
   specific amount of time.  The instruction does not modify the
   architectural state.  In the x86 header this is defined after the
   pop_options pragma because it does not require SSE support in the
   processor--the encoding is a nop on processors that do not support
   it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  /* There is no exact match with this construct, but the following is
     close to the desired effect.  */
#if _ARCH_PWR8
  /* On power8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PRI setting.  Since we don't know
     what PRI this thread is running at we: 1) save the current PRI
     from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution.  */
  unsigned long __PPR;

  __asm__ volatile (
    "	mfppr	%0;"
    "   or 31,31,31;"
    "   isync;"
    "   lwsync;"
    "   isync;"
    "   mtppr	%0;"
    : "=r" (__PPR)
    :
    : "memory"
  );
#else
  /* For older processors where we may not even have Program Priority
     controls we can only depend on Heavy Weight Sync.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}
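
/* Illustrative usage sketch (not normative; the flag is hypothetical).
   _mm_pause is typically placed in the body of a spin-wait loop to reduce
   the priority and resource usage of the waiting thread:

     extern volatile int lock_free;

     while (!lock_free)
       _mm_pause ();
*/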

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = vec_vmrghw (__r0, __r1);				\
  __v4sf __t1 = vec_vmrghw (__r2, __r3);				\
  __v4sf __t2 = vec_vmrglw (__r0, __r1);				\
  __v4sf __t3 = vec_vmrglw (__r2, __r3);				\
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
} while (0)
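
/* Illustrative usage sketch (not normative).  The macro transposes four
   row vectors in place:

     __m128 r0 = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);
     __m128 r1 = _mm_set_ps (7.0f, 6.0f, 5.0f, 4.0f);
     __m128 r2 = _mm_set_ps (11.0f, 10.0f, 9.0f, 8.0f);
     __m128 r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);

   After the macro, r0 holds the first column of the original matrix
   (the values 0, 4, 8, 12), r1 holds 1, 5, 9, 13, r2 holds 2, 6, 10, 14,
   and r3 holds 3, 7, 11, 15.  */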

/* For backward source compatibility.  */
//# include <emmintrin.h>

#endif /* _XMMINTRIN_H_INCLUDED */
