/* mmintrin.h revision 351280 */
1/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11   User Guide and Reference, version 9.0.  */
12
13#ifndef NO_WARN_X86_INTRINSICS
14/* This header file is to help porting code using Intel intrinsics
15   explicitly from x86_64 to powerpc64/powerpc64le.
16
17   Since PowerPC target doesn't support native 64-bit vector type, we
18   typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
19   works well for _si64 and some _pi32 operations.
20
21   For _pi16 and _pi8 operations, it's better to transfer __m64 into
22   128-bit PowerPC vector first. Power8 introduced direct register
23   move instructions which helps for more efficient implementation.
24
25   It's user's responsibility to determine if the results of such port
26   are acceptable or further changes are needed. Please note that much
27   code using Intel intrinsics CAN BE REWRITTEN in more portable and
28   efficient standard C or GNU C extensions with 64-bit scalar
29   operations, or 128-bit SSE/Altivec operations, which are more
30   recommended. */
31#error                                                                         \
32    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
33#endif
34
35#ifndef _MMINTRIN_H_INCLUDED
36#define _MMINTRIN_H_INCLUDED
37
38#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
/* PowerPC has no native 64-bit vector type, so __m64 is a plain 64-bit
   scalar; the 8-byte alignment matches the x86 MMX register layout.  */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;
42
/* Overlay of a __m64 as each element layout the MMX intrinsics operate on;
   used by the scalar fallback paths below.  Element order follows the
   in-memory byte order of the 64-bit value, so lane indices match x86
   semantics on either endianness.  */
typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];               /* eight 8-bit lanes */
  signed char as_signed_char[8]; /* eight signed 8-bit lanes */
  short as_short[4];             /* four 16-bit lanes */
  int as_int[2];                 /* two 32-bit lanes */
  long long as_long_long;        /* whole value, signed view */
  float as_float[2];
  double as_double;
} __m64_union;
53
/* Empty the multimedia state.  On x86 this clears the FP/MMX tag state;
   PowerPC has no shared FP/MMX register file, so it is a no-op here.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC.  */
}

/* Alias for _mm_empty (classic MMX spelling).  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC.  */
}
66
67/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
68extern __inline __m64
69    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70    _mm_cvtsi32_si64(int __i) {
71  return (__m64)(unsigned int)__i;
72}
73
74extern __inline __m64
75    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76    _m_from_int(int __i) {
77  return _mm_cvtsi32_si64(__i);
78}
79
80/* Convert the lower 32 bits of the __m64 object into an integer.  */
81extern __inline int
82    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83    _mm_cvtsi64_si32(__m64 __i) {
84  return ((int)__i);
85}
86
87extern __inline int
88    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
89    _m_to_int(__m64 __i) {
90  return _mm_cvtsi64_si32(__i);
91}
92
93/* Convert I to a __m64 object.  */
94
95/* Intel intrinsic.  */
96extern __inline __m64
97    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98    _m_from_int64(long long __i) {
99  return (__m64)__i;
100}
101
102extern __inline __m64
103    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
104    _mm_cvtsi64_m64(long long __i) {
105  return (__m64)__i;
106}
107
108/* Microsoft intrinsic.  */
109extern __inline __m64
110    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
111    _mm_cvtsi64x_si64(long long __i) {
112  return (__m64)__i;
113}
114
115extern __inline __m64
116    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117    _mm_set_pi64x(long long __i) {
118  return (__m64)__i;
119}
120
121/* Convert the __m64 object to a 64bit integer.  */
122
123/* Intel intrinsic.  */
124extern __inline long long
125    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126    _m_to_int64(__m64 __i) {
127  return (long long)__i;
128}
129
130extern __inline long long
131    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
132    _mm_cvtm64_si64(__m64 __i) {
133  return (long long)__i;
134}
135
136/* Microsoft intrinsic.  */
137extern __inline long long
138    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139    _mm_cvtsi64_si64x(__m64 __i) {
140  return (long long)__i;
141}
142
#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short vm1;
  __vector signed char vresult;

  /* Place both operands in one 128-bit vector; the doubleword order is
     swapped on big endian so that element order matches the x86 layout.  */
  vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

/* Alias for _mm_packs_pi16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int vm1;
  __vector signed short vresult;

  /* Same doubleword-order trick as _mm_packs_pi16.  */
  vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

/* Alias for _mm_packs_pi32 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(vm1, __zero);
  /* Packing the lanes as unsigned would saturate negative inputs to 0xFF;
     the select below forces lanes that started out negative back to zero,
     which is what x86 packuswb produces.  */
  r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
  __vector __bool char packsel = vec_pack(__select, __select);
  r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
  return (__m64)((__vector long long)r)[0];
}

/* Alias for _mm_packs_pu16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */
221
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  /* Each operand is splatted into both halves of a VR, then vec_mergel
     interleaves bytes; doubleword [1] holds the interleave of the high
     halves of the original 64-bit values.  */
  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[1];
#else
  /* Scalar fallback: interleave bytes 4..7 of each operand.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_unpackhi_pi8 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}
258
/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  /* Scalar implementation: interleave 16-bit lanes 2 and 3.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64)res.as_m64;
}

/* Alias for _mm_unpackhi_pi16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  /* Result low lane = high lane of M1, result high lane = high lane of M2.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64)res.as_m64;
}

/* Alias for _mm_unpackhi_pi32 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  /* Same merge as _mm_unpackhi_pi8, but doubleword [0] holds the
     interleave of the low halves of the original 64-bit values.  */
  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: interleave bytes 0..3 of each operand.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_unpacklo_pi8 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  /* Scalar implementation: interleave 16-bit lanes 0 and 1.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64)res.as_m64;
}

/* Alias for _mm_unpacklo_pi16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}
362
/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  /* Result low lane = low lane of M1, result high lane = low lane of M2.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64)res.as_m64;
}

/* Alias for _mm_unpacklo_pi32 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}
384
/* Add the 8-bit values in M1 to the 8-bit values in M2 (modular, no
   saturation).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  /* Splat each operand across a 128-bit VR; only doubleword [0] of the
     lane-wise sum is returned.  */
  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-byte add through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_add_pi8 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}
420
/* Add the 16-bit values in M1 to the 16-bit values in M2 (modular, no
   saturation).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  /* Splat, add lane-wise, keep doubleword [0].  */
  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-halfword add through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_add_pi16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}
452
/* Add the 32-bit values in M1 to the 32-bit values in M2 (modular, no
   saturation).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  /* NOTE(review): unlike the pi8/pi16 variants this vector path is gated
     on _ARCH_PWR9 rather than _ARCH_PWR8 — presumably a deliberate
     codegen-quality choice upstream; confirm before "fixing".  */
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-word add through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_add_pi32 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}
482
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 (modular,
   no saturation).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  /* Splat, subtract lane-wise, keep doubleword [0].  */
  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-byte subtract through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_sub_pi8 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}
518
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 (modular,
   no saturation).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  /* Splat, subtract lane-wise, keep doubleword [0].  */
  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-halfword subtract through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_sub_pi16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}
550
/* Subtract the 32-bit values in M2 from the 32-bit values in M1 (modular,
   no saturation).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  /* NOTE(review): gated on _ARCH_PWR9, matching _mm_add_pi32 — confirm
     this asymmetry with the pi8/pi16 variants is intentional.  */
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-word subtract through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_sub_pi32 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}
580
581extern __inline __m64
582    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
583    _mm_add_si64(__m64 __m1, __m64 __m2) {
584  return (__m1 + __m2);
585}
586
587extern __inline __m64
588    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589    _mm_sub_si64(__m64 __m1, __m64 __m2) {
590  return (__m1 - __m2);
591}
592
593/* Shift the 64-bit value in M left by COUNT.  */
594extern __inline __m64
595    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
596    _mm_sll_si64(__m64 __m, __m64 __count) {
597  return (__m << __count);
598}
599
600extern __inline __m64
601    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
602    _m_psllq(__m64 __m, __m64 __count) {
603  return _mm_sll_si64(__m, __count);
604}
605
606extern __inline __m64
607    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608    _mm_slli_si64(__m64 __m, const int __count) {
609  return (__m << __count);
610}
611
612extern __inline __m64
613    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
614    _m_psllqi(__m64 __m, const int __count) {
615  return _mm_slli_si64(__m, __count);
616}
617
618/* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
619extern __inline __m64
620    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
621    _mm_srl_si64(__m64 __m, __m64 __count) {
622  return (__m >> __count);
623}
624
625extern __inline __m64
626    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
627    _m_psrlq(__m64 __m, __m64 __count) {
628  return _mm_srl_si64(__m, __count);
629}
630
631extern __inline __m64
632    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
633    _mm_srli_si64(__m64 __m, const int __count) {
634  return (__m >> __count);
635}
636
637extern __inline __m64
638    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
639    _m_psrlqi(__m64 __m, const int __count) {
640  return _mm_srli_si64(__m, __count);
641}
642
643/* Bit-wise AND the 64-bit values in M1 and M2.  */
644extern __inline __m64
645    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646    _mm_and_si64(__m64 __m1, __m64 __m2) {
647  return (__m1 & __m2);
648}
649
650extern __inline __m64
651    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
652    _m_pand(__m64 __m1, __m64 __m2) {
653  return _mm_and_si64(__m1, __m2);
654}
655
656/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
657   64-bit value in M2.  */
658extern __inline __m64
659    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
661  return (~__m1 & __m2);
662}
663
664extern __inline __m64
665    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
666    _m_pandn(__m64 __m1, __m64 __m2) {
667  return _mm_andnot_si64(__m1, __m2);
668}
669
670/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
671extern __inline __m64
672    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
673    _mm_or_si64(__m64 __m1, __m64 __m2) {
674  return (__m1 | __m2);
675}
676
677extern __inline __m64
678    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
679    _m_por(__m64 __m1, __m64 __m2) {
680  return _mm_or_si64(__m1, __m2);
681}
682
683/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
684extern __inline __m64
685    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
686    _mm_xor_si64(__m64 __m1, __m64 __m2) {
687  return (__m1 ^ __m2);
688}
689
690extern __inline __m64
691    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692    _m_pxor(__m64 __m1, __m64 __m2) {
693  return _mm_xor_si64(__m1, __m2);
694}
695
696/* Creates a 64-bit zero.  */
697extern __inline __m64
698    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
699    _mm_setzero_si64(void) {
700  return (__m64)0;
701}
702
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  /* Power ISA "cmpb" (POWER6) sets each destination byte to 0xff where the
     corresponding source bytes are equal and 0x00 where they differ —
     exactly the pcmpeqb semantics, in one GPR instruction.  */
  __m64 res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
  return (res);
#else
  /* Scalar fallback: per-byte equality through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_cmpeq_pi8 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}
736
/* Compare eight signed 8-bit values for greater-than; each result byte is
   0xFF if M1's byte is greater, else zero.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  /* Splat, compare lane-wise, keep doubleword [0].  */
  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = (__vector signed char)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-byte signed compare through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_cmpgt_pi8 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}
771
/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  /* Splat, compare lane-wise, keep doubleword [0].  */
  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-halfword equality through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_cmpeq_pi16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}
804
/* Compare four signed 16-bit values for greater-than; each result lane is
   0xFFFF if M1's lane is greater, else zero.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  /* Splat, compare lane-wise, keep doubleword [0].  */
  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-halfword signed compare through the overlay
     union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_cmpgt_pi16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}
835
/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  /* NOTE(review): gated on _ARCH_PWR9 like the other 32-bit-lane helpers
     in this file; older CPUs take the scalar path.  */
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-word equality through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_cmpeq_pi32 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}
866
/* Compare two signed 32-bit values for greater-than; each result lane is
   0xFFFFFFFF if M1's lane is greater, else zero.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  /* Splat, compare lane-wise, keep doubleword [0].  */
  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  /* Scalar fallback: per-word signed compare through the overlay union.  */
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

/* Alias for _mm_cmpgt_pi32 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}
895
#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  /* Splat, saturating add lane-wise, keep doubleword [0].  */
  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

/* Alias for _mm_adds_pi8 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  /* Splat, saturating add lane-wise, keep doubleword [0].  */
  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

/* Alias for _mm_adds_pi16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  /* Splat, unsigned saturating add lane-wise, keep doubleword [0].  */
  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

/* Alias for _mm_adds_pu8 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}
951
/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  /* Splat, unsigned saturating add lane-wise, keep doubleword [0].  */
  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

/* Alias for _mm_adds_pu16 (classic MMX spelling).  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}
970
971/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
972   saturating arithmetic.  */
973extern __inline __m64
974    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
976  __vector signed char a, b, c;
977
978  a = (__vector signed char)vec_splats(__m1);
979  b = (__vector signed char)vec_splats(__m2);
980  c = vec_subs(a, b);
981  return (__m64)((__vector long long)c)[0];
982}
983
/* MMX-mnemonic alias: PSUBSB maps onto _mm_subs_pi8.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}
989
990/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
991   signed saturating arithmetic.  */
992extern __inline __m64
993    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
994    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
995  __vector signed short a, b, c;
996
997  a = (__vector signed short)vec_splats(__m1);
998  b = (__vector signed short)vec_splats(__m2);
999  c = vec_subs(a, b);
1000  return (__m64)((__vector long long)c)[0];
1001}
1002
/* MMX-mnemonic alias: PSUBSW maps onto _mm_subs_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}
1008
1009/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1010   unsigned saturating arithmetic.  */
1011extern __inline __m64
1012    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1014  __vector unsigned char a, b, c;
1015
1016  a = (__vector unsigned char)vec_splats(__m1);
1017  b = (__vector unsigned char)vec_splats(__m2);
1018  c = vec_subs(a, b);
1019  return (__m64)((__vector long long)c)[0];
1020}
1021
/* MMX-mnemonic alias: PSUBUSB maps onto _mm_subs_pu8.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}
1027
1028/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1029   unsigned saturating arithmetic.  */
1030extern __inline __m64
1031    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1032    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1033  __vector unsigned short a, b, c;
1034
1035  a = (__vector unsigned short)vec_splats(__m1);
1036  b = (__vector unsigned short)vec_splats(__m2);
1037  c = vec_subs(a, b);
1038  return (__m64)((__vector long long)c)[0];
1039}
1040
/* MMX-mnemonic alias: PSUBUSW maps onto _mm_subs_pu16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}
1046
1047/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1048   four 32-bit intermediate results, which are then summed by pairs to
1049   produce two 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  /* Splat the operands across full vectors; vmsumshm multiplies the signed
     halfwords pairwise and adds adjacent 32-bit products into the zero
     accumulator — exactly PMADDWD.  Only the low doubleword is kept.  */
  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_vmsumshm(a, b, zero);
  return (__m64)((__vector long long)c)[0];
}
1062
/* MMX-mnemonic alias: PMADDWD maps onto _mm_madd_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
1068/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1069   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  /* Permute control that picks the high 16 bits of each 32-bit product,
     interleaving even-lane products (w0) with odd-lane products (w1).
     NOTE(review): the big-endian table repeats its first eight bytes;
     presumably only those lanes reach the returned doubleword — confirm
     on a big-endian target.  */
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);

  /* Even/odd halfword multiplies produce full 32-bit products; the permute
     retains only each product's high half, giving PMULHW.  */
  w0 = vec_vmulesh(a, b);
  w1 = vec_vmulosh(a, b);
  c = (__vector signed short)vec_perm(w0, w1, xform1);

  return (__m64)((__vector long long)c)[0];
}
1095
/* MMX-mnemonic alias: PMULHW maps onto _mm_mulhi_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}
1101
1102/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1103   the low 16 bits of the results.  */
1104extern __inline __m64
1105    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1106    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1107  __vector signed short a, b, c;
1108
1109  a = (__vector signed short)vec_splats(__m1);
1110  b = (__vector signed short)vec_splats(__m2);
1111  c = a * b;
1112  return (__m64)((__vector long long)c)[0];
1113}
1114
/* MMX-mnemonic alias: PMULLW maps onto _mm_mullo_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}
1120
1121/* Shift four 16-bit values in M left by COUNT.  */
1122extern __inline __m64
1123    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1124    _mm_sll_pi16(__m64 __m, __m64 __count) {
1125  __vector signed short m, r;
1126  __vector unsigned short c;
1127
1128  if (__count <= 15) {
1129    m = (__vector signed short)vec_splats(__m);
1130    c = (__vector unsigned short)vec_splats((unsigned short)__count);
1131    r = vec_sl(m, (__vector unsigned short)c);
1132    return (__m64)((__vector long long)r)[0];
1133  } else
1134    return (0);
1135}
1136
/* MMX-mnemonic alias: PSLLW maps onto _mm_sll_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}
1142
/* Immediate-count form; the int count is widened to __m64 by the call.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16(__m, __count);
}
1149
/* MMX-mnemonic alias: immediate PSLLW maps onto _mm_slli_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}
1155
1156/* Shift two 32-bit values in M left by COUNT.  */
1157extern __inline __m64
1158    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1159    _mm_sll_pi32(__m64 __m, __m64 __count) {
1160  __m64_union m, res;
1161
1162  m.as_m64 = __m;
1163
1164  res.as_int[0] = m.as_int[0] << __count;
1165  res.as_int[1] = m.as_int[1] << __count;
1166  return (res.as_m64);
1167}
1168
/* MMX-mnemonic alias: PSLLD maps onto _mm_sll_pi32.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}
1174
/* Immediate-count form; the int count is widened to __m64 by the call.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32(__m, __count);
}
1181
/* MMX-mnemonic alias: immediate PSLLD maps onto _mm_slli_pi32.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}
1187
1188/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
1189extern __inline __m64
1190    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1191    _mm_sra_pi16(__m64 __m, __m64 __count) {
1192  __vector signed short m, r;
1193  __vector unsigned short c;
1194
1195  if (__count <= 15) {
1196    m = (__vector signed short)vec_splats(__m);
1197    c = (__vector unsigned short)vec_splats((unsigned short)__count);
1198    r = vec_sra(m, (__vector unsigned short)c);
1199    return (__m64)((__vector long long)r)[0];
1200  } else
1201    return (0);
1202}
1203
/* MMX-mnemonic alias: PSRAW maps onto _mm_sra_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}
1209
/* Immediate-count form; the int count is widened to __m64 by the call.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16(__m, __count);
}
1216
/* MMX-mnemonic alias: immediate PSRAW maps onto _mm_srai_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}
1222
1223/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
1224extern __inline __m64
1225    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226    _mm_sra_pi32(__m64 __m, __m64 __count) {
1227  __m64_union m, res;
1228
1229  m.as_m64 = __m;
1230
1231  res.as_int[0] = m.as_int[0] >> __count;
1232  res.as_int[1] = m.as_int[1] >> __count;
1233  return (res.as_m64);
1234}
1235
/* MMX-mnemonic alias: PSRAD maps onto _mm_sra_pi32.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}
1241
/* Immediate-count form; the int count is widened to __m64 by the call.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32(__m, __count);
}
1248
/* MMX-mnemonic alias: immediate PSRAD maps onto _mm_srai_pi32.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}
1254
1255/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
1256extern __inline __m64
1257    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1258    _mm_srl_pi16(__m64 __m, __m64 __count) {
1259  __vector unsigned short m, r;
1260  __vector unsigned short c;
1261
1262  if (__count <= 15) {
1263    m = (__vector unsigned short)vec_splats(__m);
1264    c = (__vector unsigned short)vec_splats((unsigned short)__count);
1265    r = vec_sr(m, (__vector unsigned short)c);
1266    return (__m64)((__vector long long)r)[0];
1267  } else
1268    return (0);
1269}
1270
/* MMX-mnemonic alias: PSRLW maps onto _mm_srl_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}
1276
/* Immediate-count form; the int count is widened to __m64 by the call.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16(__m, __count);
}
1283
/* MMX-mnemonic alias: immediate PSRLW maps onto _mm_srli_pi16.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}
1289
1290/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
1291extern __inline __m64
1292    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293    _mm_srl_pi32(__m64 __m, __m64 __count) {
1294  __m64_union m, res;
1295
1296  m.as_m64 = __m;
1297
1298  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
1299  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
1300  return (res.as_m64);
1301}
1302
/* MMX-mnemonic alias: PSRLD maps onto _mm_srl_pi32.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}
1308
/* Immediate-count form; the int count is widened to __m64 by the call.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32(__m, __count);
}
1315
/* MMX-mnemonic alias: immediate PSRLD maps onto _mm_srli_pi32.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
1321#endif /* _ARCH_PWR8 */
1322
1323/* Creates a vector of two 32-bit values; I0 is least significant.  */
1324extern __inline __m64
1325    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326    _mm_set_pi32(int __i1, int __i0) {
1327  __m64_union res;
1328
1329  res.as_int[0] = __i0;
1330  res.as_int[1] = __i1;
1331  return (res.as_m64);
1332}
1333
1334/* Creates a vector of four 16-bit values; W0 is least significant.  */
1335extern __inline __m64
1336    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1337    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1338  __m64_union res;
1339
1340  res.as_short[0] = __w0;
1341  res.as_short[1] = __w1;
1342  res.as_short[2] = __w2;
1343  res.as_short[3] = __w3;
1344  return (res.as_m64);
1345}
1346
1347/* Creates a vector of eight 8-bit values; B0 is least significant.  */
1348extern __inline __m64
1349    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1350    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1351                char __b2, char __b1, char __b0) {
1352  __m64_union res;
1353
1354  res.as_char[0] = __b0;
1355  res.as_char[1] = __b1;
1356  res.as_char[2] = __b2;
1357  res.as_char[3] = __b3;
1358  res.as_char[4] = __b4;
1359  res.as_char[5] = __b5;
1360  res.as_char[6] = __b6;
1361  res.as_char[7] = __b7;
1362  return (res.as_m64);
1363}
1364
1365/* Similar, but with the arguments in reverse order.  */
1366extern __inline __m64
1367    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1368    _mm_setr_pi32(int __i0, int __i1) {
1369  __m64_union res;
1370
1371  res.as_int[0] = __i0;
1372  res.as_int[1] = __i1;
1373  return (res.as_m64);
1374}
1375
/* Reversed-argument form: forwards to _mm_set_pi16 with operands swapped.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}
1381
/* Reversed-argument form: forwards to _mm_set_pi8 with operands swapped.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
1388
1389/* Creates a vector of two 32-bit values, both elements containing I.  */
1390extern __inline __m64
1391    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1392    _mm_set1_pi32(int __i) {
1393  __m64_union res;
1394
1395  res.as_int[0] = __i;
1396  res.as_int[1] = __i;
1397  return (res.as_m64);
1398}
1399
1400/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  /* Power9 path: splat the scalar across a vector and take the low
     doubleword.  */
  __vector signed short w;

  w = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)w)[0];
#else
  /* Portable path: replicate through the overlay union.  */
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}
1419
1420/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  /* Power8 path: splat the scalar across a vector and take the low
     doubleword.  */
  __vector signed char b;

  b = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)b)[0];
#else
  /* Portable path: replicate through the overlay union.  */
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
1443#endif /* _MMINTRIN_H_INCLUDED */
1444