mmintrin.h revision 107590
1/* Copyright (C) 2002 Free Software Foundation, Inc.
2
3   This file is part of GNU CC.
4
5   GNU CC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   GNU CC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with GNU CC; see the file COPYING.  If not, write to
17   the Free Software Foundation, 59 Temple Place - Suite 330,
18   Boston, MA 02111-1307, USA.  */
19
20/* As a special exception, if you include this header file into source
21   files compiled by GCC, this header file does not by itself cause
22   the resulting executable to be covered by the GNU General Public
23   License.  This exception does not however invalidate any other
24   reasons why the executable file might be covered by the GNU General
25   Public License.  */
26
27/* Implemented from the specification included in the Intel C++ Compiler
28   User Guide and Reference, version 5.0.  */
29
30#ifndef _MMINTRIN_H_INCLUDED
31#define _MMINTRIN_H_INCLUDED
32
/* The data type intended for user use.  It is declared as an int carrying
   the V2SI machine mode; the intrinsics below cast it to whichever view
   (__v2si, __v4hi, __v8qi or long long) the underlying builtin takes.  */
typedef int __m64 __attribute__ ((__mode__ (__V2SI__)));

/* Internal data types for implementing the intrinsics.  Each one is an int
   given the named vector machine mode (V2SI, V4HI, V8QI) and is only used
   as a cast target when calling the builtins.  */
typedef int __v2si __attribute__ ((__mode__ (__V2SI__)));
typedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
typedef int __v8qi __attribute__ ((__mode__ (__V8QI__)));
40
/* Empty the multimedia state by issuing the emms builtin.  Takes no
   arguments and produces no value.  */
static __inline void
_mm_empty (void)
{
  __builtin_ia32_emms ();
  return;
}
47
48/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
49static __inline __m64
50_mm_cvtsi32_si64 (int __i)
51{
52  long long __tmp = (unsigned int)__i;
53  return (__m64) __tmp;
54}
55
56/* Convert the lower 32 bits of the __m64 object into an integer.  */
57static __inline int
58_mm_cvtsi64_si32 (__m64 __i)
59{
60  long long __tmp = (long long)__i;
61  return __tmp;
62}
63
64/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
65   the result, and the four 16-bit values from M2 into the upper four 8-bit
66   values of the result, all with signed saturation.  */
67static __inline __m64
68_mm_packs_pi16 (__m64 __m1, __m64 __m2)
69{
70  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
71}
72
73/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
74   the result, and the two 32-bit values from M2 into the upper two 16-bit
75   values of the result, all with signed saturation.  */
76static __inline __m64
77_mm_packs_pi32 (__m64 __m1, __m64 __m2)
78{
79  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
80}
81
82/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
83   the result, and the four 16-bit values from M2 into the upper four 8-bit
84   values of the result, all with unsigned saturation.  */
85static __inline __m64
86_mm_packs_pu16 (__m64 __m1, __m64 __m2)
87{
88  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
89}
90
91/* Interleave the four 8-bit values from the high half of M1 with the four
92   8-bit values from the high half of M2.  */
93static __inline __m64
94_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
95{
96  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
97}
98
99/* Interleave the two 16-bit values from the high half of M1 with the two
100   16-bit values from the high half of M2.  */
101static __inline __m64
102_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
103{
104  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
105}
106
107/* Interleave the 32-bit value from the high half of M1 with the 32-bit
108   value from the high half of M2.  */
109static __inline __m64
110_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
111{
112  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
113}
114
115/* Interleave the four 8-bit values from the low half of M1 with the four
116   8-bit values from the low half of M2.  */
117static __inline __m64
118_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
119{
120  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
121}
122
123/* Interleave the two 16-bit values from the low half of M1 with the two
124   16-bit values from the low half of M2.  */
125static __inline __m64
126_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
127{
128  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
129}
130
131/* Interleave the 32-bit value from the low half of M1 with the 32-bit
132   value from the low half of M2.  */
133static __inline __m64
134_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
135{
136  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
137}
138
139/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
140static __inline __m64
141_mm_add_pi8 (__m64 __m1, __m64 __m2)
142{
143  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
144}
145
146/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
147static __inline __m64
148_mm_add_pi16 (__m64 __m1, __m64 __m2)
149{
150  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
151}
152
153/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
154static __inline __m64
155_mm_add_pi32 (__m64 __m1, __m64 __m2)
156{
157  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
158}
159
160/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
161   saturated arithmetic.  */
162static __inline __m64
163_mm_adds_pi8 (__m64 __m1, __m64 __m2)
164{
165  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
166}
167
168/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
169   saturated arithmetic.  */
170static __inline __m64
171_mm_adds_pi16 (__m64 __m1, __m64 __m2)
172{
173  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
174}
175
176/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
177   saturated arithmetic.  */
178static __inline __m64
179_mm_adds_pu8 (__m64 __m1, __m64 __m2)
180{
181  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
182}
183
184/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
185   saturated arithmetic.  */
186static __inline __m64
187_mm_adds_pu16 (__m64 __m1, __m64 __m2)
188{
189  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
190}
191
192/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
193static __inline __m64
194_mm_sub_pi8 (__m64 __m1, __m64 __m2)
195{
196  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
197}
198
199/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
200static __inline __m64
201_mm_sub_pi16 (__m64 __m1, __m64 __m2)
202{
203  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
204}
205
206/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
207static __inline __m64
208_mm_sub_pi32 (__m64 __m1, __m64 __m2)
209{
210  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
211}
212
213/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
214   saturating arithmetic.  */
215static __inline __m64
216_mm_subs_pi8 (__m64 __m1, __m64 __m2)
217{
218  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
219}
220
221/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
222   signed saturating arithmetic.  */
223static __inline __m64
224_mm_subs_pi16 (__m64 __m1, __m64 __m2)
225{
226  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
227}
228
229/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
230   unsigned saturating arithmetic.  */
231static __inline __m64
232_mm_subs_pu8 (__m64 __m1, __m64 __m2)
233{
234  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
235}
236
237/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
238   unsigned saturating arithmetic.  */
239static __inline __m64
240_mm_subs_pu16 (__m64 __m1, __m64 __m2)
241{
242  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
243}
244
245/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
246   four 32-bit intermediate results, which are then summed by pairs to
247   produce two 32-bit results.  */
248static __inline __m64
249_mm_madd_pi16 (__m64 __m1, __m64 __m2)
250{
251  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
252}
253
254/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
255   M2 and produce the high 16 bits of the 32-bit results.  */
256static __inline __m64
257_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
258{
259  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
260}
261
262/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
263   the low 16 bits of the results.  */
264static __inline __m64
265_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
266{
267  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
268}
269
270/* Shift four 16-bit values in M left by COUNT.  */
271static __inline __m64
272_mm_sll_pi16 (__m64 __m, __m64 __count)
273{
274  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count);
275}
276
277static __inline __m64
278_mm_slli_pi16 (__m64 __m, int __count)
279{
280  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
281}
282
283/* Shift two 32-bit values in M left by COUNT.  */
284static __inline __m64
285_mm_sll_pi32 (__m64 __m, __m64 __count)
286{
287  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count);
288}
289
290static __inline __m64
291_mm_slli_pi32 (__m64 __m, int __count)
292{
293  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
294}
295
296/* Shift the 64-bit value in M left by COUNT.  */
297static __inline __m64
298_mm_sll_si64 (__m64 __m, __m64 __count)
299{
300  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
301}
302
303static __inline __m64
304_mm_slli_si64 (__m64 __m, int __count)
305{
306  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
307}
308
309/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
310static __inline __m64
311_mm_sra_pi16 (__m64 __m, __m64 __count)
312{
313  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count);
314}
315
316static __inline __m64
317_mm_srai_pi16 (__m64 __m, int __count)
318{
319  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
320}
321
322/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
323static __inline __m64
324_mm_sra_pi32 (__m64 __m, __m64 __count)
325{
326  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count);
327}
328
329static __inline __m64
330_mm_srai_pi32 (__m64 __m, int __count)
331{
332  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
333}
334
335/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
336static __inline __m64
337_mm_srl_pi16 (__m64 __m, __m64 __count)
338{
339  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count);
340}
341
342static __inline __m64
343_mm_srli_pi16 (__m64 __m, int __count)
344{
345  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
346}
347
348/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
349static __inline __m64
350_mm_srl_pi32 (__m64 __m, __m64 __count)
351{
352  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count);
353}
354
355static __inline __m64
356_mm_srli_pi32 (__m64 __m, int __count)
357{
358  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
359}
360
361/* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
362static __inline __m64
363_mm_srl_si64 (__m64 __m, __m64 __count)
364{
365  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
366}
367
368static __inline __m64
369_mm_srli_si64 (__m64 __m, int __count)
370{
371  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
372}
373
374/* Bit-wise AND the 64-bit values in M1 and M2.  */
375static __inline __m64
376_mm_and_si64 (__m64 __m1, __m64 __m2)
377{
378  return (__m64) __builtin_ia32_pand ((long long)__m1, (long long)__m2);
379}
380
381/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
382   64-bit value in M2.  */
383static __inline __m64
384_mm_andnot_si64 (__m64 __m1, __m64 __m2)
385{
386  return (__m64) __builtin_ia32_pandn ((long long)__m1, (long long)__m2);
387}
388
389/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
390static __inline __m64
391_mm_or_si64 (__m64 __m1, __m64 __m2)
392{
393  return (__m64)__builtin_ia32_por ((long long)__m1, (long long)__m2);
394}
395
396/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
397static __inline __m64
398_mm_xor_si64 (__m64 __m1, __m64 __m2)
399{
400  return (__m64)__builtin_ia32_pxor ((long long)__m1, (long long)__m2);
401}
402
403/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
404   test is true and zero if false.  */
405static __inline __m64
406_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
407{
408  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
409}
410
411static __inline __m64
412_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
413{
414  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
415}
416
417/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
418   the test is true and zero if false.  */
419static __inline __m64
420_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
421{
422  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
423}
424
425static __inline __m64
426_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
427{
428  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
429}
430
431/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
432   the test is true and zero if false.  */
433static __inline __m64
434_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
435{
436  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
437}
438
439static __inline __m64
440_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
441{
442  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
443}
444
445/* Creates a 64-bit zero.  */
446static __inline __m64
447_mm_setzero_si64 (void)
448{
449  return (__m64)__builtin_ia32_mmx_zero ();
450}
451
452/* Creates a vector of two 32-bit values; I0 is least significant.  */
453static __inline __m64
454_mm_set_pi32 (int __i1, int __i0)
455{
456  union {
457    __m64 __q;
458    struct {
459      unsigned int __i0;
460      unsigned int __i1;
461    } __s;
462  } __u;
463
464  __u.__s.__i0 = __i0;
465  __u.__s.__i1 = __i1;
466
467  return __u.__q;
468}
469
470/* Creates a vector of four 16-bit values; W0 is least significant.  */
471static __inline __m64
472_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
473{
474  unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
475  unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
476  return _mm_set_pi32 (__i1, __i0);
477
478}
479
480/* Creates a vector of eight 8-bit values; B0 is least significant.  */
481static __inline __m64
482_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
483	     char __b3, char __b2, char __b1, char __b0)
484{
485  unsigned int __i1, __i0;
486
487  __i1 = (unsigned char)__b7;
488  __i1 = __i1 << 8 | (unsigned char)__b6;
489  __i1 = __i1 << 8 | (unsigned char)__b5;
490  __i1 = __i1 << 8 | (unsigned char)__b4;
491
492  __i0 = (unsigned char)__b3;
493  __i0 = __i0 << 8 | (unsigned char)__b2;
494  __i0 = __i0 << 8 | (unsigned char)__b1;
495  __i0 = __i0 << 8 | (unsigned char)__b0;
496
497  return _mm_set_pi32 (__i1, __i0);
498}
499
500/* Similar, but with the arguments in reverse order.  */
501static __inline __m64
502_mm_setr_pi32 (int __i0, int __i1)
503{
504  return _mm_set_pi32 (__i1, __i0);
505}
506
507static __inline __m64
508_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
509{
510  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
511}
512
513static __inline __m64
514_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
515	      char __b4, char __b5, char __b6, char __b7)
516{
517  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
518}
519
520/* Creates a vector of two 32-bit values, both elements containing I.  */
521static __inline __m64
522_mm_set1_pi32 (int __i)
523{
524  return _mm_set_pi32 (__i, __i);
525}
526
527/* Creates a vector of four 16-bit values, all elements containing W.  */
528static __inline __m64
529_mm_set1_pi16 (short __w)
530{
531  unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
532  return _mm_set1_pi32 (__i);
533}
534
535/* Creates a vector of four 16-bit values, all elements containing B.  */
536static __inline __m64
537_mm_set1_pi8 (char __b)
538{
539  unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
540  unsigned int __i = __w << 16 | __w;
541  return _mm_set1_pi32 (__i);
542}
543
544#endif /* _MMINTRIN_H_INCLUDED */
545