1/* Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.
2
3   This file is part of GCC.
4
5   GCC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   GCC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with GCC; see the file COPYING.  If not, write to
17   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
18   Boston, MA 02110-1301, USA.  */
19
20/* As a special exception, if you include this header file into source
21   files compiled by GCC, this header file does not by itself cause
22   the resulting executable to be covered by the GNU General Public
23   License.  This exception does not however invalidate any other
24   reasons why the executable file might be covered by the GNU General
25   Public License.  */
26
27/* Implemented from the specification included in the Intel C++ Compiler
28   User Guide and Reference, version 8.0.  */
29
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#ifndef __MMX__
# error "MMX instruction set not enabled"
#else
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));	/* two 32-bit ints */
typedef short __v4hi __attribute__ ((__vector_size__ (8)));	/* four 16-bit shorts */
typedef char __v8qi __attribute__ ((__vector_size__ (8)));	/* eight 8-bit chars */
44
/* Empty the multimedia state (issues EMMS, clearing the MMX/x87 tag
   word).  Must be executed after MMX code, before using x87 FP.  */
static __inline void __attribute__((__always_inline__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}

/* Original MMX intrinsic name for _mm_empty.  */
static __inline void __attribute__((__always_inline__))
_m_empty (void)
{
  _mm_empty ();
}
57
/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
static __inline __m64  __attribute__((__always_inline__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
}

/* Original MMX intrinsic name for _mm_cvtsi32_si64.  */
static __inline __m64  __attribute__((__always_inline__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}
70
#ifdef __x86_64__
/* Convert I to a __m64 object.  */
static __inline __m64  __attribute__((__always_inline__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

/* Convert I to a __m64 object.  Alternate name for _mm_cvtsi64x_si64.  */
static __inline __m64  __attribute__((__always_inline__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
#endif
86
/* Convert the lower 32 bits of the __m64 object into an integer.  */
static __inline int __attribute__((__always_inline__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
}

/* Original MMX intrinsic name for _mm_cvtsi64_si32.  */
static __inline int __attribute__((__always_inline__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

#ifdef __x86_64__
/* Convert the __m64 object to a 64-bit integer.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long)__i;
}
#endif
108
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_packs_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
}

/* Original MMX intrinsic name for _mm_packs_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_packs_pu16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
153
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_unpackhi_pi8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_unpackhi_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
}

/* Original MMX intrinsic name for _mm_unpackhi_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_unpacklo_pi8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_unpacklo_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
}

/* Original MMX intrinsic name for _mm_unpacklo_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
237
/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_add_pi8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_add_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
}

/* Original MMX intrinsic name for _mm_add_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Add the 64-bit values in M1 to the 64-bit values in M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_adds_pi8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_adds_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_adds_pu8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_adds_pu16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}
339
/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_sub_pi8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_sub_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
}

/* Original MMX intrinsic name for _mm_sub_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

/* Subtract the 64-bit value in M2 from the 64-bit value in M1.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_subs_pi8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_subs_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_subs_pu8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_subs_pu16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}
441
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_madd_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_mulhi_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_mullo_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
484
/* Shift four 16-bit values in M left by COUNT.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_sll_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

/* Shift four 16-bit values in M left by COUNT, given as an int.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
}

/* Original MMX intrinsic name for _mm_slli_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_sll_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT, given as an int.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
}

/* Original MMX intrinsic name for _mm_slli_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M left by COUNT.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_sll_si64.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

/* Shift the 64-bit value in M left by COUNT, given as an int.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_slli_si64.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psllqi (__m64 __m, int __count)
{
  return _mm_slli_si64 (__m, __count);
}
559
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_sra_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT, given as an int; shift in
   the sign bit.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
}

/* Original MMX intrinsic name for _mm_srai_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_sra_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT, given as an int; shift in
   the sign bit.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
}

/* Original MMX intrinsic name for _mm_srai_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}
609
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_srl_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT, given as an int; shift in
   zeros.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
}

/* Original MMX intrinsic name for _mm_srli_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_srl_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT, given as an int; shift in
   zeros.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
}

/* Original MMX intrinsic name for _mm_srli_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_srl_si64.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT, given as an int; shift in
   zeros.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
}

/* Original MMX intrinsic name for _mm_srli_si64.  */
static __inline __m64 __attribute__((__always_inline__))
_m_psrlqi (__m64 __m, int __count)
{
  return _mm_srli_si64 (__m, __count);
}
684
/* Bit-wise AND the 64-bit values in M1 and M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pand (__m1, __m2);
}

/* Original MMX intrinsic name for _mm_and_si64.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pandn (__m1, __m2);
}

/* Original MMX intrinsic name for _mm_andnot_si64.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_por (__m1, __m2);
}

/* Original MMX intrinsic name for _mm_or_si64.  */
static __inline __m64 __attribute__((__always_inline__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pxor (__m1, __m2);
}

/* Original MMX intrinsic name for _mm_xor_si64.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}
737
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_cmpeq_pi8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

/* Compare eight signed 8-bit values for M1 greater than M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Original MMX intrinsic name for _mm_cmpgt_pi8.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_cmpeq_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

/* Compare four signed 16-bit values for M1 greater than M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Original MMX intrinsic name for _mm_cmpgt_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
}

/* Original MMX intrinsic name for _mm_cmpeq_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

/* Compare two signed 32-bit values for M1 greater than M2.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
}

/* Original MMX intrinsic name for _mm_cmpgt_pi32.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
815
/* Creates a 64-bit zero.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_setzero_si64 (void)
{
  return (__m64)0LL;
}

/* Creates a vector of two 32-bit values; I0 is least significant.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_set_pi32 (int __i1, int __i0)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
					       __b4, __b5, __b6, __b7);
}

/* Similar, but with the arguments in reverse order.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_setr_pi32 (int __i0, int __i1)
{
  return _mm_set_pi32 (__i1, __i0);
}

/* Similar to _mm_set_pi16, but with the arguments in reverse order.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

/* Similar to _mm_set_pi8, but with the arguments in reverse order.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_set1_pi16 (short __w)
{
  return _mm_set_pi16 (__w, __w, __w, __w);
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_set1_pi8 (char __b)
{
  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
}

#endif /* __MMX__ */
#endif /* _MMINTRIN_H_INCLUDED */
889