/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to help with porting code that uses Intel
   intrinsics explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target does not support a native 64-bit vector type,
   the MMX intrinsics typedef __m64 to a 64-bit unsigned long long, which
   works well for the _si64 and some _pi32 operations.

   For the _pi16 and _pi8 operations, it is better to transfer __m64 into
   a 128-bit PowerPC vector first.  Power8 introduced direct register
   move instructions, which help make such implementations more efficient.

   It is the user's responsibility to determine whether the results of such
   a port are acceptable or whether further changes are needed.  Please note
   that much code using Intel intrinsics CAN BE REWRITTEN in more portable
   and efficient standard C or GNU C extensions using 64-bit scalar
   operations or 128-bit SSE/Altivec operations, which are the recommended
   alternatives. */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
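
/* Lane numbering follows the x86 MMX convention: element 0 of each array
   member aliases the least significant lane of the 64-bit value.
   Illustrative sketch (assumes the little-endian powerpc64le layout):

     __m64_union u;
     u.as_m64 = 0x0004000300020001ULL;
     // u.as_short[0] == 1, u.as_short[3] == 4  */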

/* Empty the multimedia state.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC.  */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}
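
/* Usage sketch (illustrative): the 32-bit value is zero extended on the way
   in and truncated on the way out, so a round trip preserves the value:

     __m64 v = _mm_cvtsi32_si64(-1);   // v == 0x00000000FFFFFFFFULL
     int i = _mm_cvtsi64_si32(v);      // i == -1  */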

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}
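
/* Illustrative example (values chosen to show the signed saturation;
   _mm_set_pi16 and _mm_setzero_si64 are defined later in this header):

     __m64 lo = _mm_set_pi16(-5, 40, -200, 300);
     __m64 r = _mm_packs_pi16(lo, _mm_setzero_si64());
     // low four 8-bit lanes of r: 127, -128, 40, -5 (300 and -200 saturate)  */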

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(vm1, __zero);
  r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
  __vector __bool char packsel = vec_pack(__select, __select);
  r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
  return (__m64)((__vector long long)r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
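
/* Illustrative example of the unsigned saturation:

     __m64 r = _mm_packs_pu16(_mm_set_pi16(-1, 300, 200, 7),
                              _mm_setzero_si64());
     // low four unsigned 8-bit lanes of r: 7, 200, 255, 0  */
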
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}
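
/* Illustrative example of the high-half interleave (byte lane 0 is the
   least significant byte; _mm_set_pi8 is defined later in this header):

     __m64 a = _mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10);
     __m64 b = _mm_set_pi8(27, 26, 25, 24, 23, 22, 21, 20);
     __m64 r = _mm_unpackhi_pi8(a, b);
     // byte lanes of r, low to high: 14, 24, 15, 25, 16, 26, 17, 27  */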

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}
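
/* Illustrative example (modular, non-saturating arithmetic; _mm_set1_pi8 is
   defined later in this header):

     __m64 r = _mm_add_pi8(_mm_set1_pi8(100), _mm_set1_pi8(100));
     // each signed 8-bit lane wraps: 100 + 100 == 200, stored as -56  */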

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}
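
/* Note: because __m64 is typedef'd here as unsigned long long, the >>
   operator above already performs a logical (zero-filling) shift, matching
   the x86 PSRLQ semantics.  Illustrative:

     __m64 r = _mm_srli_si64((__m64)0x8000000000000000ULL, 63);  // r == 1  */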

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}
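
/* Illustrative example of the per-lane mask result:

     __m64 r = _mm_cmpeq_pi8(_mm_set1_pi8(3),
                             _mm_set_pi8(0, 0, 0, 0, 0, 0, 0, 3));
     // byte lane 0 of r is 0xFF (equal); the other lanes are 0x00  */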

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = (__vector signed char)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}
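
/* Illustrative example of the unsigned saturation (the (signed char) cast is
   needed because _mm_set1_pi8 takes a signed char):

     __m64 r = _mm_adds_pu8(_mm_set1_pi8((signed char)200), _mm_set1_pi8(100));
     // each unsigned 8-bit lane saturates: 200 + 100 -> 255, not 44  */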

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_vmsumshm(a, b, zero);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
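
/* Illustrative example of the multiply-and-add-pairs operation:

     __m64 r = _mm_madd_pi16(_mm_set_pi16(4, 3, 2, 1),
                             _mm_set_pi16(40, 30, 20, 10));
     // 32-bit lane 0: 1*10 + 2*20 == 50;  32-bit lane 1: 3*30 + 4*40 == 250  */
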
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);

  w0 = vec_vmulesh(a, b);
  w1 = vec_vmulosh(a, b);
  c = (__vector signed short)vec_perm(w0, w1, xform1);

  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}
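
/* Illustrative example of taking the high 16 bits of each product:

     __m64 r = _mm_mulhi_pi16(_mm_set1_pi16(0x4000), _mm_set1_pi16(0x0010));
     // each lane: (0x4000 * 0x0010) >> 16 == 0x0004  */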

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = a * b;
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sl(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}
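
/* Illustrative example (per the code above, shift counts greater than 15
   produce an all-zero result):

     __m64 r = _mm_slli_pi16(_mm_set1_pi16(1), 3);   // each 16-bit lane == 8
     __m64 z = _mm_slli_pi16(_mm_set1_pi16(1), 16);  // z == 0  */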

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sra(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector unsigned short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sr(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
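
/* Illustrative example of the _mm_set vs. _mm_setr argument order:

     __m64 a = _mm_set_pi32(2, 1);   // 32-bit lane 0 == 1, lane 1 == 2
     __m64 b = _mm_setr_pi32(1, 2);  // same layout, arguments reversed
     // a and b hold the same 64-bit value  */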

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* _MMINTRIN_H_INCLUDED */