/* mmintrin.h revision 302408 */
/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007
   Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

31#ifndef _MMINTRIN_H_INCLUDED
32#define _MMINTRIN_H_INCLUDED
33
34#ifndef __MMX__
35# error "MMX instruction set not enabled"
36#else
37/* The Intel API is flexible enough that we must allow aliasing with other
38   vector types, and their scalar components.  */
39typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
40
41/* Internal data types for implementing the intrinsics.  */
42typedef int __v2si __attribute__ ((__vector_size__ (8)));
43typedef short __v4hi __attribute__ ((__vector_size__ (8)));
44typedef char __v8qi __attribute__ ((__vector_size__ (8)));
45
/* Empty the multimedia state (executes the EMMS instruction so the x87
   FPU can be used again after MMX code).  */
static __inline void __attribute__((__always_inline__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}

/* Alternate (classic AMD/Intel) name for _mm_empty.  */
static __inline void __attribute__((__always_inline__))
_m_empty (void)
{
  _mm_empty ();
}

59/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
60static __inline __m64  __attribute__((__always_inline__))
61_mm_cvtsi32_si64 (int __i)
62{
63  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
64}
65
66static __inline __m64  __attribute__((__always_inline__))
67_m_from_int (int __i)
68{
69  return _mm_cvtsi32_si64 (__i);
70}
71
72#ifdef __x86_64__
73/* Convert I to a __m64 object.  */
74
75/* Intel intrinsic.  */
76static __inline __m64  __attribute__((__always_inline__))
77_m_from_int64 (long long __i)
78{
79  return (__m64) __i;
80}
81
82static __inline __m64  __attribute__((__always_inline__))
83_mm_cvtsi64_m64 (long long __i)
84{
85  return (__m64) __i;
86}
87
88/* Microsoft intrinsic.  */
89static __inline __m64  __attribute__((__always_inline__))
90_mm_cvtsi64x_si64 (long long __i)
91{
92  return (__m64) __i;
93}
94
95static __inline __m64  __attribute__((__always_inline__))
96_mm_set_pi64x (long long __i)
97{
98  return (__m64) __i;
99}
100#endif
101
102/* Convert the lower 32 bits of the __m64 object into an integer.  */
103static __inline int __attribute__((__always_inline__))
104_mm_cvtsi64_si32 (__m64 __i)
105{
106  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
107}
108
109static __inline int __attribute__((__always_inline__))
110_m_to_int (__m64 __i)
111{
112  return _mm_cvtsi64_si32 (__i);
113}
114
115#ifdef __x86_64__
116/* Convert the __m64 object to a 64bit integer.  */
117
118/* Intel intrinsic.  */
119static __inline long long __attribute__((__always_inline__))
120_m_to_int64 (__m64 __i)
121{
122  return (long long)__i;
123}
124
125static __inline long long __attribute__((__always_inline__))
126_mm_cvtm64_si64 (__m64 __i)
127{
128  return (long long)__i;
129}
130
131/* Microsoft intrinsic.  */
132static __inline long long __attribute__((__always_inline__))
133_mm_cvtsi64_si64x (__m64 __i)
134{
135  return (long long)__i;
136}
137#endif
138
139/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
140   the result, and the four 16-bit values from M2 into the upper four 8-bit
141   values of the result, all with signed saturation.  */
142static __inline __m64 __attribute__((__always_inline__))
143_mm_packs_pi16 (__m64 __m1, __m64 __m2)
144{
145  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
146}
147
148static __inline __m64 __attribute__((__always_inline__))
149_m_packsswb (__m64 __m1, __m64 __m2)
150{
151  return _mm_packs_pi16 (__m1, __m2);
152}
153
154/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
155   the result, and the two 32-bit values from M2 into the upper two 16-bit
156   values of the result, all with signed saturation.  */
157static __inline __m64 __attribute__((__always_inline__))
158_mm_packs_pi32 (__m64 __m1, __m64 __m2)
159{
160  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
161}
162
163static __inline __m64 __attribute__((__always_inline__))
164_m_packssdw (__m64 __m1, __m64 __m2)
165{
166  return _mm_packs_pi32 (__m1, __m2);
167}
168
169/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
170   the result, and the four 16-bit values from M2 into the upper four 8-bit
171   values of the result, all with unsigned saturation.  */
172static __inline __m64 __attribute__((__always_inline__))
173_mm_packs_pu16 (__m64 __m1, __m64 __m2)
174{
175  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
176}
177
178static __inline __m64 __attribute__((__always_inline__))
179_m_packuswb (__m64 __m1, __m64 __m2)
180{
181  return _mm_packs_pu16 (__m1, __m2);
182}
183
184/* Interleave the four 8-bit values from the high half of M1 with the four
185   8-bit values from the high half of M2.  */
186static __inline __m64 __attribute__((__always_inline__))
187_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
188{
189  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
190}
191
192static __inline __m64 __attribute__((__always_inline__))
193_m_punpckhbw (__m64 __m1, __m64 __m2)
194{
195  return _mm_unpackhi_pi8 (__m1, __m2);
196}
197
198/* Interleave the two 16-bit values from the high half of M1 with the two
199   16-bit values from the high half of M2.  */
200static __inline __m64 __attribute__((__always_inline__))
201_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
202{
203  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
204}
205
206static __inline __m64 __attribute__((__always_inline__))
207_m_punpckhwd (__m64 __m1, __m64 __m2)
208{
209  return _mm_unpackhi_pi16 (__m1, __m2);
210}
211
212/* Interleave the 32-bit value from the high half of M1 with the 32-bit
213   value from the high half of M2.  */
214static __inline __m64 __attribute__((__always_inline__))
215_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
216{
217  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
218}
219
220static __inline __m64 __attribute__((__always_inline__))
221_m_punpckhdq (__m64 __m1, __m64 __m2)
222{
223  return _mm_unpackhi_pi32 (__m1, __m2);
224}
225
226/* Interleave the four 8-bit values from the low half of M1 with the four
227   8-bit values from the low half of M2.  */
228static __inline __m64 __attribute__((__always_inline__))
229_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
230{
231  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
232}
233
234static __inline __m64 __attribute__((__always_inline__))
235_m_punpcklbw (__m64 __m1, __m64 __m2)
236{
237  return _mm_unpacklo_pi8 (__m1, __m2);
238}
239
240/* Interleave the two 16-bit values from the low half of M1 with the two
241   16-bit values from the low half of M2.  */
242static __inline __m64 __attribute__((__always_inline__))
243_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
244{
245  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
246}
247
248static __inline __m64 __attribute__((__always_inline__))
249_m_punpcklwd (__m64 __m1, __m64 __m2)
250{
251  return _mm_unpacklo_pi16 (__m1, __m2);
252}
253
254/* Interleave the 32-bit value from the low half of M1 with the 32-bit
255   value from the low half of M2.  */
256static __inline __m64 __attribute__((__always_inline__))
257_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
258{
259  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
260}
261
262static __inline __m64 __attribute__((__always_inline__))
263_m_punpckldq (__m64 __m1, __m64 __m2)
264{
265  return _mm_unpacklo_pi32 (__m1, __m2);
266}
267
268/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
269static __inline __m64 __attribute__((__always_inline__))
270_mm_add_pi8 (__m64 __m1, __m64 __m2)
271{
272  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
273}
274
275static __inline __m64 __attribute__((__always_inline__))
276_m_paddb (__m64 __m1, __m64 __m2)
277{
278  return _mm_add_pi8 (__m1, __m2);
279}
280
281/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
282static __inline __m64 __attribute__((__always_inline__))
283_mm_add_pi16 (__m64 __m1, __m64 __m2)
284{
285  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
286}
287
288static __inline __m64 __attribute__((__always_inline__))
289_m_paddw (__m64 __m1, __m64 __m2)
290{
291  return _mm_add_pi16 (__m1, __m2);
292}
293
294/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
295static __inline __m64 __attribute__((__always_inline__))
296_mm_add_pi32 (__m64 __m1, __m64 __m2)
297{
298  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
299}
300
301static __inline __m64 __attribute__((__always_inline__))
302_m_paddd (__m64 __m1, __m64 __m2)
303{
304  return _mm_add_pi32 (__m1, __m2);
305}
306
307/* Add the 64-bit values in M1 to the 64-bit values in M2.  */
308#ifdef __SSE2__
309static __inline __m64 __attribute__((__always_inline__))
310_mm_add_si64 (__m64 __m1, __m64 __m2)
311{
312  return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2);
313}
314#endif
315
316/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
317   saturated arithmetic.  */
318static __inline __m64 __attribute__((__always_inline__))
319_mm_adds_pi8 (__m64 __m1, __m64 __m2)
320{
321  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
322}
323
324static __inline __m64 __attribute__((__always_inline__))
325_m_paddsb (__m64 __m1, __m64 __m2)
326{
327  return _mm_adds_pi8 (__m1, __m2);
328}
329
330/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
331   saturated arithmetic.  */
332static __inline __m64 __attribute__((__always_inline__))
333_mm_adds_pi16 (__m64 __m1, __m64 __m2)
334{
335  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
336}
337
338static __inline __m64 __attribute__((__always_inline__))
339_m_paddsw (__m64 __m1, __m64 __m2)
340{
341  return _mm_adds_pi16 (__m1, __m2);
342}
343
344/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
345   saturated arithmetic.  */
346static __inline __m64 __attribute__((__always_inline__))
347_mm_adds_pu8 (__m64 __m1, __m64 __m2)
348{
349  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
350}
351
352static __inline __m64 __attribute__((__always_inline__))
353_m_paddusb (__m64 __m1, __m64 __m2)
354{
355  return _mm_adds_pu8 (__m1, __m2);
356}
357
358/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
359   saturated arithmetic.  */
360static __inline __m64 __attribute__((__always_inline__))
361_mm_adds_pu16 (__m64 __m1, __m64 __m2)
362{
363  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
364}
365
366static __inline __m64 __attribute__((__always_inline__))
367_m_paddusw (__m64 __m1, __m64 __m2)
368{
369  return _mm_adds_pu16 (__m1, __m2);
370}
371
372/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
373static __inline __m64 __attribute__((__always_inline__))
374_mm_sub_pi8 (__m64 __m1, __m64 __m2)
375{
376  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
377}
378
379static __inline __m64 __attribute__((__always_inline__))
380_m_psubb (__m64 __m1, __m64 __m2)
381{
382  return _mm_sub_pi8 (__m1, __m2);
383}
384
385/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
386static __inline __m64 __attribute__((__always_inline__))
387_mm_sub_pi16 (__m64 __m1, __m64 __m2)
388{
389  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
390}
391
392static __inline __m64 __attribute__((__always_inline__))
393_m_psubw (__m64 __m1, __m64 __m2)
394{
395  return _mm_sub_pi16 (__m1, __m2);
396}
397
398/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
399static __inline __m64 __attribute__((__always_inline__))
400_mm_sub_pi32 (__m64 __m1, __m64 __m2)
401{
402  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
403}
404
405static __inline __m64 __attribute__((__always_inline__))
406_m_psubd (__m64 __m1, __m64 __m2)
407{
408  return _mm_sub_pi32 (__m1, __m2);
409}
410
411/* Add the 64-bit values in M1 to the 64-bit values in M2.  */
412#ifdef __SSE2__
413static __inline __m64 __attribute__((__always_inline__))
414_mm_sub_si64 (__m64 __m1, __m64 __m2)
415{
416  return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2);
417}
418#endif
419
420/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
421   saturating arithmetic.  */
422static __inline __m64 __attribute__((__always_inline__))
423_mm_subs_pi8 (__m64 __m1, __m64 __m2)
424{
425  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
426}
427
428static __inline __m64 __attribute__((__always_inline__))
429_m_psubsb (__m64 __m1, __m64 __m2)
430{
431  return _mm_subs_pi8 (__m1, __m2);
432}
433
434/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
435   signed saturating arithmetic.  */
436static __inline __m64 __attribute__((__always_inline__))
437_mm_subs_pi16 (__m64 __m1, __m64 __m2)
438{
439  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
440}
441
442static __inline __m64 __attribute__((__always_inline__))
443_m_psubsw (__m64 __m1, __m64 __m2)
444{
445  return _mm_subs_pi16 (__m1, __m2);
446}
447
448/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
449   unsigned saturating arithmetic.  */
450static __inline __m64 __attribute__((__always_inline__))
451_mm_subs_pu8 (__m64 __m1, __m64 __m2)
452{
453  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
454}
455
456static __inline __m64 __attribute__((__always_inline__))
457_m_psubusb (__m64 __m1, __m64 __m2)
458{
459  return _mm_subs_pu8 (__m1, __m2);
460}
461
462/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
463   unsigned saturating arithmetic.  */
464static __inline __m64 __attribute__((__always_inline__))
465_mm_subs_pu16 (__m64 __m1, __m64 __m2)
466{
467  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
468}
469
470static __inline __m64 __attribute__((__always_inline__))
471_m_psubusw (__m64 __m1, __m64 __m2)
472{
473  return _mm_subs_pu16 (__m1, __m2);
474}
475
476/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
477   four 32-bit intermediate results, which are then summed by pairs to
478   produce two 32-bit results.  */
479static __inline __m64 __attribute__((__always_inline__))
480_mm_madd_pi16 (__m64 __m1, __m64 __m2)
481{
482  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
483}
484
485static __inline __m64 __attribute__((__always_inline__))
486_m_pmaddwd (__m64 __m1, __m64 __m2)
487{
488  return _mm_madd_pi16 (__m1, __m2);
489}
490
491/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
492   M2 and produce the high 16 bits of the 32-bit results.  */
493static __inline __m64 __attribute__((__always_inline__))
494_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
495{
496  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
497}
498
499static __inline __m64 __attribute__((__always_inline__))
500_m_pmulhw (__m64 __m1, __m64 __m2)
501{
502  return _mm_mulhi_pi16 (__m1, __m2);
503}
504
505/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
506   the low 16 bits of the results.  */
507static __inline __m64 __attribute__((__always_inline__))
508_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
509{
510  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
511}
512
513static __inline __m64 __attribute__((__always_inline__))
514_m_pmullw (__m64 __m1, __m64 __m2)
515{
516  return _mm_mullo_pi16 (__m1, __m2);
517}
518
519/* Shift four 16-bit values in M left by COUNT.  */
520static __inline __m64 __attribute__((__always_inline__))
521_mm_sll_pi16 (__m64 __m, __m64 __count)
522{
523  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count);
524}
525
526static __inline __m64 __attribute__((__always_inline__))
527_m_psllw (__m64 __m, __m64 __count)
528{
529  return _mm_sll_pi16 (__m, __count);
530}
531
532static __inline __m64 __attribute__((__always_inline__))
533_mm_slli_pi16 (__m64 __m, int __count)
534{
535  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
536}
537
538static __inline __m64 __attribute__((__always_inline__))
539_m_psllwi (__m64 __m, int __count)
540{
541  return _mm_slli_pi16 (__m, __count);
542}
543
544/* Shift two 32-bit values in M left by COUNT.  */
545static __inline __m64 __attribute__((__always_inline__))
546_mm_sll_pi32 (__m64 __m, __m64 __count)
547{
548  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count);
549}
550
551static __inline __m64 __attribute__((__always_inline__))
552_m_pslld (__m64 __m, __m64 __count)
553{
554  return _mm_sll_pi32 (__m, __count);
555}
556
557static __inline __m64 __attribute__((__always_inline__))
558_mm_slli_pi32 (__m64 __m, int __count)
559{
560  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
561}
562
563static __inline __m64 __attribute__((__always_inline__))
564_m_pslldi (__m64 __m, int __count)
565{
566  return _mm_slli_pi32 (__m, __count);
567}
568
569/* Shift the 64-bit value in M left by COUNT.  */
570static __inline __m64 __attribute__((__always_inline__))
571_mm_sll_si64 (__m64 __m, __m64 __count)
572{
573  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
574}
575
576static __inline __m64 __attribute__((__always_inline__))
577_m_psllq (__m64 __m, __m64 __count)
578{
579  return _mm_sll_si64 (__m, __count);
580}
581
582static __inline __m64 __attribute__((__always_inline__))
583_mm_slli_si64 (__m64 __m, int __count)
584{
585  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
586}
587
588static __inline __m64 __attribute__((__always_inline__))
589_m_psllqi (__m64 __m, int __count)
590{
591  return _mm_slli_si64 (__m, __count);
592}
593
594/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
595static __inline __m64 __attribute__((__always_inline__))
596_mm_sra_pi16 (__m64 __m, __m64 __count)
597{
598  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count);
599}
600
601static __inline __m64 __attribute__((__always_inline__))
602_m_psraw (__m64 __m, __m64 __count)
603{
604  return _mm_sra_pi16 (__m, __count);
605}
606
607static __inline __m64 __attribute__((__always_inline__))
608_mm_srai_pi16 (__m64 __m, int __count)
609{
610  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
611}
612
613static __inline __m64 __attribute__((__always_inline__))
614_m_psrawi (__m64 __m, int __count)
615{
616  return _mm_srai_pi16 (__m, __count);
617}
618
619/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
620static __inline __m64 __attribute__((__always_inline__))
621_mm_sra_pi32 (__m64 __m, __m64 __count)
622{
623  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count);
624}
625
626static __inline __m64 __attribute__((__always_inline__))
627_m_psrad (__m64 __m, __m64 __count)
628{
629  return _mm_sra_pi32 (__m, __count);
630}
631
632static __inline __m64 __attribute__((__always_inline__))
633_mm_srai_pi32 (__m64 __m, int __count)
634{
635  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
636}
637
638static __inline __m64 __attribute__((__always_inline__))
639_m_psradi (__m64 __m, int __count)
640{
641  return _mm_srai_pi32 (__m, __count);
642}
643
644/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
645static __inline __m64 __attribute__((__always_inline__))
646_mm_srl_pi16 (__m64 __m, __m64 __count)
647{
648  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count);
649}
650
651static __inline __m64 __attribute__((__always_inline__))
652_m_psrlw (__m64 __m, __m64 __count)
653{
654  return _mm_srl_pi16 (__m, __count);
655}
656
657static __inline __m64 __attribute__((__always_inline__))
658_mm_srli_pi16 (__m64 __m, int __count)
659{
660  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
661}
662
663static __inline __m64 __attribute__((__always_inline__))
664_m_psrlwi (__m64 __m, int __count)
665{
666  return _mm_srli_pi16 (__m, __count);
667}
668
669/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
670static __inline __m64 __attribute__((__always_inline__))
671_mm_srl_pi32 (__m64 __m, __m64 __count)
672{
673  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count);
674}
675
676static __inline __m64 __attribute__((__always_inline__))
677_m_psrld (__m64 __m, __m64 __count)
678{
679  return _mm_srl_pi32 (__m, __count);
680}
681
682static __inline __m64 __attribute__((__always_inline__))
683_mm_srli_pi32 (__m64 __m, int __count)
684{
685  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
686}
687
688static __inline __m64 __attribute__((__always_inline__))
689_m_psrldi (__m64 __m, int __count)
690{
691  return _mm_srli_pi32 (__m, __count);
692}
693
694/* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
695static __inline __m64 __attribute__((__always_inline__))
696_mm_srl_si64 (__m64 __m, __m64 __count)
697{
698  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
699}
700
701static __inline __m64 __attribute__((__always_inline__))
702_m_psrlq (__m64 __m, __m64 __count)
703{
704  return _mm_srl_si64 (__m, __count);
705}
706
707static __inline __m64 __attribute__((__always_inline__))
708_mm_srli_si64 (__m64 __m, int __count)
709{
710  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
711}
712
713static __inline __m64 __attribute__((__always_inline__))
714_m_psrlqi (__m64 __m, int __count)
715{
716  return _mm_srli_si64 (__m, __count);
717}
718
719/* Bit-wise AND the 64-bit values in M1 and M2.  */
720static __inline __m64 __attribute__((__always_inline__))
721_mm_and_si64 (__m64 __m1, __m64 __m2)
722{
723  return __builtin_ia32_pand (__m1, __m2);
724}
725
726static __inline __m64 __attribute__((__always_inline__))
727_m_pand (__m64 __m1, __m64 __m2)
728{
729  return _mm_and_si64 (__m1, __m2);
730}
731
732/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
733   64-bit value in M2.  */
734static __inline __m64 __attribute__((__always_inline__))
735_mm_andnot_si64 (__m64 __m1, __m64 __m2)
736{
737  return __builtin_ia32_pandn (__m1, __m2);
738}
739
740static __inline __m64 __attribute__((__always_inline__))
741_m_pandn (__m64 __m1, __m64 __m2)
742{
743  return _mm_andnot_si64 (__m1, __m2);
744}
745
746/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
747static __inline __m64 __attribute__((__always_inline__))
748_mm_or_si64 (__m64 __m1, __m64 __m2)
749{
750  return __builtin_ia32_por (__m1, __m2);
751}
752
753static __inline __m64 __attribute__((__always_inline__))
754_m_por (__m64 __m1, __m64 __m2)
755{
756  return _mm_or_si64 (__m1, __m2);
757}
758
759/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
760static __inline __m64 __attribute__((__always_inline__))
761_mm_xor_si64 (__m64 __m1, __m64 __m2)
762{
763  return __builtin_ia32_pxor (__m1, __m2);
764}
765
766static __inline __m64 __attribute__((__always_inline__))
767_m_pxor (__m64 __m1, __m64 __m2)
768{
769  return _mm_xor_si64 (__m1, __m2);
770}
771
772/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
773   test is true and zero if false.  */
774static __inline __m64 __attribute__((__always_inline__))
775_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
776{
777  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
778}
779
780static __inline __m64 __attribute__((__always_inline__))
781_m_pcmpeqb (__m64 __m1, __m64 __m2)
782{
783  return _mm_cmpeq_pi8 (__m1, __m2);
784}
785
786static __inline __m64 __attribute__((__always_inline__))
787_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
788{
789  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
790}
791
792static __inline __m64 __attribute__((__always_inline__))
793_m_pcmpgtb (__m64 __m1, __m64 __m2)
794{
795  return _mm_cmpgt_pi8 (__m1, __m2);
796}
797
798/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
799   the test is true and zero if false.  */
800static __inline __m64 __attribute__((__always_inline__))
801_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
802{
803  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
804}
805
806static __inline __m64 __attribute__((__always_inline__))
807_m_pcmpeqw (__m64 __m1, __m64 __m2)
808{
809  return _mm_cmpeq_pi16 (__m1, __m2);
810}
811
812static __inline __m64 __attribute__((__always_inline__))
813_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
814{
815  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
816}
817
818static __inline __m64 __attribute__((__always_inline__))
819_m_pcmpgtw (__m64 __m1, __m64 __m2)
820{
821  return _mm_cmpgt_pi16 (__m1, __m2);
822}
823
824/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
825   the test is true and zero if false.  */
826static __inline __m64 __attribute__((__always_inline__))
827_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
828{
829  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
830}
831
832static __inline __m64 __attribute__((__always_inline__))
833_m_pcmpeqd (__m64 __m1, __m64 __m2)
834{
835  return _mm_cmpeq_pi32 (__m1, __m2);
836}
837
838static __inline __m64 __attribute__((__always_inline__))
839_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
840{
841  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
842}
843
844static __inline __m64 __attribute__((__always_inline__))
845_m_pcmpgtd (__m64 __m1, __m64 __m2)
846{
847  return _mm_cmpgt_pi32 (__m1, __m2);
848}
849
850/* Creates a 64-bit zero.  */
851static __inline __m64 __attribute__((__always_inline__))
852_mm_setzero_si64 (void)
853{
854  return (__m64)0LL;
855}
856
857/* Creates a vector of two 32-bit values; I0 is least significant.  */
858static __inline __m64 __attribute__((__always_inline__))
859_mm_set_pi32 (int __i1, int __i0)
860{
861  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
862}
863
864/* Creates a vector of four 16-bit values; W0 is least significant.  */
865static __inline __m64 __attribute__((__always_inline__))
866_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
867{
868  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
869}
870
871/* Creates a vector of eight 8-bit values; B0 is least significant.  */
872static __inline __m64 __attribute__((__always_inline__))
873_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
874	     char __b3, char __b2, char __b1, char __b0)
875{
876  return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
877					       __b4, __b5, __b6, __b7);
878}
879
880/* Similar, but with the arguments in reverse order.  */
881static __inline __m64 __attribute__((__always_inline__))
882_mm_setr_pi32 (int __i0, int __i1)
883{
884  return _mm_set_pi32 (__i1, __i0);
885}
886
887static __inline __m64 __attribute__((__always_inline__))
888_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
889{
890  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
891}
892
893static __inline __m64 __attribute__((__always_inline__))
894_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
895	      char __b4, char __b5, char __b6, char __b7)
896{
897  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
898}
899
900/* Creates a vector of two 32-bit values, both elements containing I.  */
901static __inline __m64 __attribute__((__always_inline__))
902_mm_set1_pi32 (int __i)
903{
904  return _mm_set_pi32 (__i, __i);
905}
906
907/* Creates a vector of four 16-bit values, all elements containing W.  */
908static __inline __m64 __attribute__((__always_inline__))
909_mm_set1_pi16 (short __w)
910{
911  return _mm_set_pi16 (__w, __w, __w, __w);
912}
913
914/* Creates a vector of eight 8-bit values, all elements containing B.  */
915static __inline __m64 __attribute__((__always_inline__))
916_mm_set1_pi8 (char __b)
917{
918  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
919}
920
921#endif /* __MMX__ */
922#endif /* _MMINTRIN_H_INCLUDED */
923