1/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
2   Free Software Foundation, Inc.
3
4   This file is part of GCC.
5
6   GCC is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 3, or (at your option)
9   any later version.
10
11   GCC is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   Under Section 7 of GPL version 3, you are granted additional
17   permissions described in the GCC Runtime Library Exception, version
18   3.1, as published by the Free Software Foundation.
19
20   You should have received a copy of the GNU General Public License and
21   a copy of the GCC Runtime Library Exception along with this program;
22   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23   <http://www.gnu.org/licenses/>.  */
24
25/* Implemented from the specification included in the Intel C++ Compiler
26   User Guide and Reference, version 9.0.  */
27
28#ifndef _MMINTRIN_H_INCLUDED
29#define _MMINTRIN_H_INCLUDED
30
31#ifndef __MMX__
32# error "MMX instruction set not enabled"
33#else
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));	/* Two 32-bit ints.  */
typedef short __v4hi __attribute__ ((__vector_size__ (8)));	/* Four 16-bit shorts.  */
typedef char __v8qi __attribute__ ((__vector_size__ (8)));	/* Eight 8-bit chars.  */
typedef long long __v1di __attribute__ ((__vector_size__ (8)));	/* One 64-bit int.  */
typedef float __v2sf __attribute__ ((__vector_size__ (8)));	/* Two 32-bit floats.  */
44
/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}

/* Alias for _mm_empty (legacy EMMS mnemonic naming).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  _mm_empty ();
}
57
/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  /* The upper element is explicitly zero, giving zero-extension.  */
  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
}

/* Alias for _mm_cvtsi32_si64.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}
70
#ifdef __x86_64__
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

/* Same conversion under the Intel _mm_ naming.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

/* Same conversion, historical _mm_set-style name.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
#endif
100
/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  /* Extract element 0, i.e. the low 32 bits.  */
  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
}

/* Alias for _mm_cvtsi64_si32.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
113
#ifdef __x86_64__
/* Convert the __m64 object to a 64bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

/* Same conversion under the Intel _mm_ naming.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long)__i;
}
#endif
137
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_packs_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_packs_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_packs_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
182
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_unpackhi_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_unpackhi_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_unpackhi_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_unpacklo_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_unpacklo_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_unpacklo_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
266
/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_add_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_add_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_add_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Add the 64-bit values in M1 to the 64-bit values in M2.  */
#ifdef __SSE2__
/* PADDQ is an SSE2 instruction, hence the guard.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
}
#endif
314
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_adds_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_adds_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_adds_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_adds_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}
370
/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_sub_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_sub_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_sub_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}
409
/* Subtract the 64-bit value in M2 from the 64-bit value in M1.
   (The comment previously said "Add", but this wraps PSUBQ.)  */
#ifdef __SSE2__
/* PSUBQ is an SSE2 instruction, hence the guard.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
}
#endif
418
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_subs_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_subs_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_subs_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_subs_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}
474
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_madd_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_mulhi_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_mullo_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
517
/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
}

/* Alias for _mm_sll_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

/* As _mm_sll_pi16, but COUNT is an integer rather than an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
}

/* Alias for _mm_slli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
}

/* Alias for _mm_sll_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

/* As _mm_sll_pi32, but COUNT is an integer rather than an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
}

/* Alias for _mm_slli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
}

/* Alias for _mm_sll_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

/* As _mm_sll_si64, but COUNT is an integer rather than an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
}

/* Alias for _mm_slli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, int __count)
{
  return _mm_slli_si64 (__m, __count);
}
592
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
}

/* Alias for _mm_sra_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

/* As _mm_sra_pi16, but COUNT is an integer rather than an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
}

/* Alias for _mm_srai_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
}

/* Alias for _mm_sra_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

/* As _mm_sra_pi32, but COUNT is an integer rather than an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
}

/* Alias for _mm_srai_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}
642
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
}

/* Alias for _mm_srl_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

/* As _mm_srl_pi16, but COUNT is an integer rather than an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
}

/* Alias for _mm_srli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
}

/* Alias for _mm_srl_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

/* As _mm_srl_pi32, but COUNT is an integer rather than an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
}

/* Alias for _mm_srli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
692
/* Shift the 64-bit value in M right by COUNT; shift in zeros.
   (The comment previously said "left", but this wraps PSRLQ,
   a logical right shift.)  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
}

/* Alias for _mm_srl_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

/* As _mm_srl_si64, but COUNT is an integer rather than an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
}

/* Alias for _mm_srli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, int __count)
{
  return _mm_srli_si64 (__m, __count);
}
717
/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pand (__m1, __m2);
}

/* Alias for _mm_and_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pandn (__m1, __m2);
}

/* Alias for _mm_andnot_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_por (__m1, __m2);
}

/* Alias for _mm_or_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pxor (__m1, __m2);
}

/* Alias for _mm_xor_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}
770
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_cmpeq_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

/* Signed greater-than comparison of eight 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_cmpgt_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_cmpeq_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

/* Signed greater-than comparison of four 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_cmpgt_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_cmpeq_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

/* Signed greater-than comparison of two 32-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_cmpgt_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
848
/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64)0LL;
}

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  /* The builtin takes elements in memory (ascending) order, so the
     least-significant argument goes first.  */
  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
					       __b4, __b5, __b6, __b7);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  return _mm_set_pi32 (__i1, __i0);
}

/* As _mm_set_pi16, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

/* As _mm_set_pi8, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
  return _mm_set_pi16 (__w, __w, __w, __w);
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (char __b)
{
  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
}
919
920#endif /* __MMX__ */
921#endif /* _MMINTRIN_H_INCLUDED */
922