/* Copyright (C) 2002-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
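
/* Illustrative sketch (placeholder names): existing x86 MMX code such as

     #include <mmintrin.h>
     __m64 add_bytes (__m64 a, __m64 b)
     {
       return _mm_add_pi8 (a, b);
     }

   compiles with this header when built with -DNO_WARN_X86_INTRINSICS,
   but on PowerPC the __m64 values may have to be transferred between
   GPRs and the vector unit.  A rewrite using native 128-bit types,
   for example

     vector signed char add_bytes_vec (vector signed char a,
                                       vector signed char b)
     {
       return vec_add (a, b);
     }

   avoids those transfers and processes twice as many elements per
   instruction.  */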

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8),
			__may_alias__)) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
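
/* Illustrative note (placeholder names): the __m64_union type above is
   what the scalar fallback paths in this header use to read and write
   individual lanes of an __m64 value, for instance

     __m64_union u;
     u.as_m64 = m;
     short lane = u.as_short[0];

   extracts lane 0 (on little-endian, the least-significant 16 bits).  */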

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}
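
/* For example (placeholder names), each 16-bit lane is narrowed to 8 bits
   with signed saturation, M1 supplying the low four bytes of the result
   and M2 the high four:

     __m64 m1 = _mm_setr_pi16 (1, 2, 300, -300);
     __m64 m2 = _mm_setr_pi16 (4, 5, 6, 7);
     __m64 r  = _mm_packs_pi16 (m1, m2);

   leaves r equal to _mm_setr_pi8 (1, 2, 127, -128, 4, 5, 6, 7).  */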

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (__vm1, __zero);
  __r = vec_packs ((__vector unsigned short) __vm1, (__vector unsigned short) __vm1);
  __vector __bool char __packsel = vec_pack (__select, __select);
  __r = vec_sel (__r, (const __vector unsigned char) __zero, __packsel);
  return (__m64) ((__vector long long) __r)[0];
}
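
/* Unsigned saturation clamps each 16-bit lane to the range 0..255:
   negative inputs become 0 and values above 255 become 255.  The
   vec_cmplt/vec_sel pair above forces the negative lanes to 0, which a
   plain unsigned vec_packs of the same bit pattern would not do.  For
   example (placeholder names),

     __m64 r = _mm_packs_pu16 (_mm_setr_pi16 (-1, 0, 255, 300),
                               _mm_setr_pi16 (1, 2, 3, 4));

   leaves the bytes of r, least-significant first, as
   0, 0, 255, 255, 1, 2, 3, 4.  */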

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}
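
/* For example (placeholder names), interleaving the high halves:

     __m64 r = _mm_unpackhi_pi8 (_mm_setr_pi8 (0, 1, 2, 3, 4, 5, 6, 7),
                                 _mm_setr_pi8 (10, 11, 12, 13, 14, 15, 16, 17));

   leaves r equal to _mm_setr_pi8 (4, 14, 5, 15, 6, 16, 7, 17).  */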

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return  (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (__res)
      : "r" (__m1),
	"r" (__m2)
      : );
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}
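
/* On POWER6 and later the single cmpb instruction above compares the
   corresponding bytes of two GPRs and writes 0xFF or 0x00 into each byte
   of the result, matching the MMX pcmpeqb semantics directly.  For
   example (placeholder names),

     __m64 r = _mm_cmpeq_pi8 (_mm_set1_pi8 (7),
                              _mm_setr_pi8 (7, 0, 7, 0, 7, 0, 7, 0));

   sets the even-numbered bytes of r to 0xFF and the odd-numbered bytes
   to 0x00.  */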

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = (__vector signed char)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}
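
/* Example of the saturating behaviour (placeholder names): adding 100 to
   100 in each signed 8-bit lane saturates at the maximum value 127,

     __m64 r = _mm_adds_pi8 (_mm_set1_pi8 (100), _mm_set1_pi8 (100));

   leaves every byte of r equal to 127, whereas the wrapping _mm_add_pi8
   would have produced -56 (200 modulo 256, interpreted as signed).  */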

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_vmsumshm (__a, __b, __zero);
  return (__m64) ((__vector long long) __c)[0];
}
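
/* Concretely, result word 0 is M1[0]*M2[0] + M1[1]*M2[1] and result
   word 1 is M1[2]*M2[2] + M1[3]*M2[3]; the vec_vmsumshm above computes
   exactly this multiply-sum with a zero accumulator.  For example
   (placeholder names),

     __m64 r = _mm_madd_pi16 (_mm_setr_pi16 (1, 2, 3, 4),
                              _mm_setr_pi16 (10, 20, 30, 40));

   leaves r equal to _mm_setr_pi32 (1*10 + 2*20, 3*30 + 4*40), that is,
   _mm_setr_pi32 (50, 250).  */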

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
#endif
    };

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);

  __w0 = vec_vmulesh (__a, __b);
  __w1 = vec_vmulosh (__a, __b);
  __c = (__vector signed short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}
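
/* The implementation above relies on vec_vmulesh/vec_vmulosh producing
   the full 32-bit products of the even and odd halfword lanes, and the
   __xform1 permute then gathers the high 16 bits of each product back
   into halfword order.  For example (placeholder names),

     __m64 r = _mm_mulhi_pi16 (_mm_set1_pi16 (0x4000), _mm_set1_pi16 (8));

   leaves every 16-bit lane of r equal to (0x4000 * 8) >> 16, i.e. 2.  */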

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = __a * __b;
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sl (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}
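
/* Note that, matching the x86 psllw behaviour, a shift count greater
   than 15 clears every element; that is why the code above returns 0
   rather than letting vec_sl shift by the count modulo 16.  For example
   (placeholder names), _mm_sll_pi16 (_mm_set1_pi16 (3), 2) has 12 in
   every 16-bit lane, while _mm_sll_pi16 (_mm_set1_pi16 (3), 16) is 0.  */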

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sra (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector unsigned short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sr (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}
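
/* For example (placeholder names), the _mm_set_* arguments are given
   most-significant element first and the _mm_setr_* variants below
   reverse that order, so

     _mm_set_pi8 (7, 6, 5, 4, 3, 2, 1, 0) == _mm_setr_pi8 (0, 1, 2, 3, 4, 5, 6, 7)

   and element 0 (the least-significant byte) of both is 0.  */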

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short __r;

  __r = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) __res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */
