mmintrin.h revision 122180
1292068Ssjg/* Copyright (C) 2002, 2003 Free Software Foundation, Inc.
2246149Ssjg
3246149Ssjg   This file is part of GNU CC.
4246149Ssjg
5246149Ssjg   GNU CC is free software; you can redistribute it and/or modify
6246149Ssjg   it under the terms of the GNU General Public License as published by
7246149Ssjg   the Free Software Foundation; either version 2, or (at your option)
8246149Ssjg   any later version.
9246149Ssjg
10246149Ssjg   GNU CC is distributed in the hope that it will be useful,
11246149Ssjg   but WITHOUT ANY WARRANTY; without even the implied warranty of
12246149Ssjg   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13246149Ssjg   GNU General Public License for more details.
14246149Ssjg
15246149Ssjg   You should have received a copy of the GNU General Public License
16246149Ssjg   along with GNU CC; see the file COPYING.  If not, write to
17246149Ssjg   the Free Software Foundation, 59 Temple Place - Suite 330,
18246149Ssjg   Boston, MA 02111-1307, USA.  */
19246149Ssjg
20246149Ssjg/* As a special exception, if you include this header file into source
21246149Ssjg   files compiled by GCC, this header file does not by itself cause
22246149Ssjg   the resulting executable to be covered by the GNU General Public
23292068Ssjg   License.  This exception does not however invalidate any other
24246149Ssjg   reasons why the executable file might be covered by the GNU General
25292068Ssjg   Public License.  */
26292068Ssjg
27292068Ssjg/* Implemented from the specification included in the Intel C++ Compiler
28246149Ssjg   User Guide and Reference, version 8.0.  */
29246149Ssjg
30246149Ssjg#ifndef _MMINTRIN_H_INCLUDED
31246149Ssjg#define _MMINTRIN_H_INCLUDED
32246149Ssjg
33246149Ssjg#ifndef __MMX__
34246149Ssjg# error "MMX instruction set not enabled"
35246149Ssjg#else
36246149Ssjg/* The data type intended for user use.  */
37246149Ssjgtypedef int __m64 __attribute__ ((__mode__ (__V2SI__)));
38246149Ssjg
39246149Ssjg/* Internal data types for implementing the intrinsics.  */
40246149Ssjgtypedef int __v2si __attribute__ ((__mode__ (__V2SI__)));
41246149Ssjgtypedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
42246149Ssjgtypedef int __v8qi __attribute__ ((__mode__ (__V8QI__)));
43246149Ssjg
44246149Ssjg/* Empty the multimedia state.  */
45246149Ssjgstatic __inline void
46246149Ssjg_mm_empty (void)
47246149Ssjg{
48246149Ssjg  __builtin_ia32_emms ();
49246149Ssjg}
50246149Ssjg
51246149Ssjgstatic __inline void
52246149Ssjg_m_empty (void)
53246149Ssjg{
54246149Ssjg  _mm_empty ();
55246149Ssjg}
56246149Ssjg
57246149Ssjg/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
58246149Ssjgstatic __inline __m64
59246149Ssjg_mm_cvtsi32_si64 (int __i)
60246149Ssjg{
61246149Ssjg  long long __tmp = (unsigned int)__i;
62246149Ssjg  return (__m64) __tmp;
63246149Ssjg}
64246149Ssjg
65292068Ssjgstatic __inline __m64
66246149Ssjg_m_from_int (int __i)
67246149Ssjg{
68246149Ssjg  return _mm_cvtsi32_si64 (__i);
69246149Ssjg}
70246149Ssjg
71246149Ssjg#ifdef __x86_64__
72246149Ssjg/* Convert I to a __m64 object.  */
73246149Ssjgstatic __inline __m64
74246149Ssjg_mm_cvtsi64x_si64 (long long __i)
75246149Ssjg{
76246149Ssjg  return (__m64) __i;
77246149Ssjg}
78246149Ssjg
79246149Ssjg/* Convert I to a __m64 object.  */
80246149Ssjgstatic __inline __m64
81246149Ssjg_mm_set_pi64x (long long __i)
82246149Ssjg{
83246149Ssjg  return (__m64) __i;
84246149Ssjg}
85246149Ssjg#endif
86246149Ssjg
87246149Ssjg/* Convert the lower 32 bits of the __m64 object into an integer.  */
88246149Ssjgstatic __inline int
89246149Ssjg_mm_cvtsi64_si32 (__m64 __i)
90246149Ssjg{
91246149Ssjg  long long __tmp = (long long)__i;
92246149Ssjg  return __tmp;
93246149Ssjg}
94246149Ssjg
95246149Ssjgstatic __inline int
96246149Ssjg_m_to_int (__m64 __i)
97246149Ssjg{
98246149Ssjg  return _mm_cvtsi64_si32 (__i);
99246149Ssjg}
100246149Ssjg
101292068Ssjg#ifdef __x86_64__
102292068Ssjg/* Convert the lower 32 bits of the __m64 object into an integer.  */
103292068Ssjgstatic __inline long long
104246149Ssjg_mm_cvtsi64_si64x (__m64 __i)
105246149Ssjg{
106292068Ssjg  return (long long)__i;
107246149Ssjg}
108292068Ssjg#endif
109292068Ssjg
110292068Ssjg/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
111292068Ssjg   the result, and the four 16-bit values from M2 into the upper four 8-bit
112292068Ssjg   values of the result, all with signed saturation.  */
113292068Ssjgstatic __inline __m64
114292068Ssjg_mm_packs_pi16 (__m64 __m1, __m64 __m2)
115292068Ssjg{
116292068Ssjg  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
117292068Ssjg}
118292068Ssjg
119292068Ssjgstatic __inline __m64
120292068Ssjg_m_packsswb (__m64 __m1, __m64 __m2)
121292068Ssjg{
122292068Ssjg  return _mm_packs_pi16 (__m1, __m2);
123292068Ssjg}
124292068Ssjg
125292068Ssjg/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
126292068Ssjg   the result, and the two 32-bit values from M2 into the upper two 16-bit
127292068Ssjg   values of the result, all with signed saturation.  */
128292068Ssjgstatic __inline __m64
129292068Ssjg_mm_packs_pi32 (__m64 __m1, __m64 __m2)
130292068Ssjg{
131292068Ssjg  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
132292068Ssjg}
133246149Ssjg
134246149Ssjgstatic __inline __m64
135246149Ssjg_m_packssdw (__m64 __m1, __m64 __m2)
136246149Ssjg{
137246149Ssjg  return _mm_packs_pi32 (__m1, __m2);
138246149Ssjg}
139246149Ssjg
140246149Ssjg/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
141246149Ssjg   the result, and the four 16-bit values from M2 into the upper four 8-bit
142246149Ssjg   values of the result, all with unsigned saturation.  */
143246149Ssjgstatic __inline __m64
144246149Ssjg_mm_packs_pu16 (__m64 __m1, __m64 __m2)
145246149Ssjg{
146292068Ssjg  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
147246149Ssjg}
148246149Ssjg
149246149Ssjgstatic __inline __m64
150246149Ssjg_m_packuswb (__m64 __m1, __m64 __m2)
151246149Ssjg{
152246149Ssjg  return _mm_packs_pu16 (__m1, __m2);
153246149Ssjg}
154
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
static __inline __m64
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_unpackhi_pi8, named after the PUNPCKHBW instruction.  */
static __inline __m64
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
static __inline __m64
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_unpackhi_pi16, named after the PUNPCKHWD instruction.  */
static __inline __m64
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
static __inline __m64
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_unpackhi_pi32, named after the PUNPCKHDQ instruction.  */
static __inline __m64
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
static __inline __m64
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_unpacklo_pi8, named after the PUNPCKLBW instruction.  */
static __inline __m64
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
static __inline __m64
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_unpacklo_pi16, named after the PUNPCKLWD instruction.  */
static __inline __m64
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
static __inline __m64
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_unpacklo_pi32, named after the PUNPCKLDQ instruction.  */
static __inline __m64
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
238
/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
static __inline __m64
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_add_pi8, named after the PADDB instruction.  */
static __inline __m64
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
static __inline __m64
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_add_pi16, named after the PADDW instruction.  */
static __inline __m64
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
static __inline __m64
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_add_pi32, named after the PADDD instruction.  */
static __inline __m64
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Add the 64-bit values in M1 to the 64-bit values in M2.  */
static __inline __m64
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_adds_pi8, named after the PADDSB instruction.  */
static __inline __m64
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_adds_pi16, named after the PADDSW instruction.  */
static __inline __m64
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_adds_pu8, named after the PADDUSB instruction.  */
static __inline __m64
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_adds_pu16, named after the PADDUSW instruction.  */
static __inline __m64
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}
340
/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
static __inline __m64
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_sub_pi8, named after the PSUBB instruction.  */
static __inline __m64
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
static __inline __m64
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_sub_pi16, named after the PSUBW instruction.  */
static __inline __m64
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
static __inline __m64
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_sub_pi32, named after the PSUBD instruction.  */
static __inline __m64
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

/* Subtract the 64-bit value in M2 from the 64-bit value in M1.  */
static __inline __m64
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
static __inline __m64
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_subs_pi8, named after the PSUBSB instruction.  */
static __inline __m64
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
static __inline __m64
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_subs_pi16, named after the PSUBSW instruction.  */
static __inline __m64
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_subs_pu8, named after the PSUBUSB instruction.  */
static __inline __m64
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_subs_pu16, named after the PSUBUSW instruction.  */
static __inline __m64
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}
442
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
static __inline __m64
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_madd_pi16, named after the PMADDWD instruction.  */
static __inline __m64
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
static __inline __m64
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_mulhi_pi16, named after the PMULHW instruction.  */
static __inline __m64
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
static __inline __m64
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_mullo_pi16, named after the PMULLW instruction.  */
static __inline __m64
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
485
/* Shift four 16-bit values in M left by COUNT.  */
static __inline __m64
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count);
}

/* Alias for _mm_sll_pi16, named after the PSLLW instruction.  */
static __inline __m64
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

/* Shift four 16-bit values in M left by COUNT, with COUNT given as an
   int.  */
static __inline __m64
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
}

/* Alias for _mm_slli_pi16.  */
static __inline __m64
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
static __inline __m64
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count);
}

/* Alias for _mm_sll_pi32, named after the PSLLD instruction.  */
static __inline __m64
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT, with COUNT given as an
   int.  */
static __inline __m64
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
}

/* Alias for _mm_slli_pi32.  */
static __inline __m64
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M left by COUNT.  */
static __inline __m64
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
}

/* Alias for _mm_sll_si64, named after the PSLLQ instruction.  */
static __inline __m64
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

/* Shift the 64-bit value in M left by COUNT, with COUNT given as an
   int.  */
static __inline __m64
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
}

/* Alias for _mm_slli_si64.  */
static __inline __m64
_m_psllqi (__m64 __m, int __count)
{
  return _mm_slli_si64 (__m, __count);
}
560
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
static __inline __m64
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count);
}

/* Alias for _mm_sra_pi16, named after the PSRAW instruction.  */
static __inline __m64
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT (given as an int); shift
   in the sign bit.  */
static __inline __m64
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
}

/* Alias for _mm_srai_pi16.  */
static __inline __m64
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
static __inline __m64
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count);
}

/* Alias for _mm_sra_pi32, named after the PSRAD instruction.  */
static __inline __m64
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT (given as an int); shift
   in the sign bit.  */
static __inline __m64
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
}

/* Alias for _mm_srai_pi32.  */
static __inline __m64
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}
610
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
static __inline __m64
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count);
}

/* Alias for _mm_srl_pi16, named after the PSRLW instruction.  */
static __inline __m64
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT (given as an int); shift
   in zeros.  */
static __inline __m64
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
}

/* Alias for _mm_srli_pi16.  */
static __inline __m64
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
static __inline __m64
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count);
}

/* Alias for _mm_srl_pi32, named after the PSRLD instruction.  */
static __inline __m64
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT (given as an int); shift
   in zeros.  */
static __inline __m64
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
}

/* Alias for _mm_srli_pi32.  */
static __inline __m64
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
static __inline __m64
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
}

/* Alias for _mm_srl_si64, named after the PSRLQ instruction.  */
static __inline __m64
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT (given as an int); shift
   in zeros.  */
static __inline __m64
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
}

/* Alias for _mm_srli_si64.  */
static __inline __m64
_m_psrlqi (__m64 __m, int __count)
{
  return _mm_srli_si64 (__m, __count);
}
685
/* Bit-wise AND the 64-bit values in M1 and M2.  */
static __inline __m64
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pand ((long long)__m1, (long long)__m2);
}

/* Alias for _mm_and_si64, named after the PAND instruction.  */
static __inline __m64
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
static __inline __m64
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pandn ((long long)__m1, (long long)__m2);
}

/* Alias for _mm_andnot_si64, named after the PANDN instruction.  */
static __inline __m64
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
static __inline __m64
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64)__builtin_ia32_por ((long long)__m1, (long long)__m2);
}

/* Alias for _mm_or_si64, named after the POR instruction.  */
static __inline __m64
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
static __inline __m64
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64)__builtin_ia32_pxor ((long long)__m1, (long long)__m2);
}

/* Alias for _mm_xor_si64, named after the PXOR instruction.  */
static __inline __m64
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}
738
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
static __inline __m64
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_cmpeq_pi8, named after the PCMPEQB instruction.  */
static __inline __m64
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

/* Compare eight 8-bit values for M1 greater than M2 (signed).  */
static __inline __m64
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias for _mm_cmpgt_pi8, named after the PCMPGTB instruction.  */
static __inline __m64
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
static __inline __m64
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_cmpeq_pi16, named after the PCMPEQW instruction.  */
static __inline __m64
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

/* Compare four 16-bit values for M1 greater than M2 (signed).  */
static __inline __m64
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias for _mm_cmpgt_pi16, named after the PCMPGTW instruction.  */
static __inline __m64
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
static __inline __m64
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_cmpeq_pi32, named after the PCMPEQD instruction.  */
static __inline __m64
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

/* Compare two 32-bit values for M1 greater than M2 (signed).  */
static __inline __m64
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias for _mm_cmpgt_pi32, named after the PCMPGTD instruction.  */
static __inline __m64
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
816
/* Creates a 64-bit zero.  */
static __inline __m64
_mm_setzero_si64 (void)
{
  return (__m64)__builtin_ia32_mmx_zero ();
}
823
824/* Creates a vector of two 32-bit values; I0 is least significant.  */
825static __inline __m64
826_mm_set_pi32 (int __i1, int __i0)
827{
828  union {
829    __m64 __q;
830    struct {
831      unsigned int __i0;
832      unsigned int __i1;
833    } __s;
834  } __u;
835
836  __u.__s.__i0 = __i0;
837  __u.__s.__i1 = __i1;
838
839  return __u.__q;
840}
841
/* Creates a vector of four 16-bit values; W0 is least significant.  */
static __inline __m64
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  /* Assemble each 32-bit half from two unsigned 16-bit values.  */
  unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
  unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
  return _mm_set_pi32 (__i1, __i0);

}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
static __inline __m64
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  unsigned int __i1, __i0;

  /* Assemble each 32-bit half byte by byte, most significant first.  */
  __i1 = (unsigned char)__b7;
  __i1 = __i1 << 8 | (unsigned char)__b6;
  __i1 = __i1 << 8 | (unsigned char)__b5;
  __i1 = __i1 << 8 | (unsigned char)__b4;

  __i0 = (unsigned char)__b3;
  __i0 = __i0 << 8 | (unsigned char)__b2;
  __i0 = __i0 << 8 | (unsigned char)__b1;
  __i0 = __i0 << 8 | (unsigned char)__b0;

  return _mm_set_pi32 (__i1, __i0);
}

/* Similar, but with the arguments in reverse order.  */
static __inline __m64
_mm_setr_pi32 (int __i0, int __i1)
{
  return _mm_set_pi32 (__i1, __i0);
}

/* Similar to _mm_set_pi16, but with the arguments in reverse order.  */
static __inline __m64
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

/* Similar to _mm_set_pi8, but with the arguments in reverse order.  */
static __inline __m64
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
static __inline __m64
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
static __inline __m64
_mm_set1_pi16 (short __w)
{
  /* Replicate W into both halves of a 32-bit value, then broadcast.  */
  unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
  return _mm_set1_pi32 (__i);
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
static __inline __m64
_mm_set1_pi8 (char __b)
{
  /* Replicate B into a 16-bit value, then into 32 bits, then broadcast.  */
  unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
  unsigned int __i = __w << 16 | __w;
  return _mm_set1_pi32 (__i);
}
915
916#endif /* __MMX__ */
917#endif /* _MMINTRIN_H_INCLUDED */
918