mmintrin.h revision 102780
1/* Copyright (C) 2002 Free Software Foundation, Inc.
2
3   This file is part of GNU CC.
4
5   GNU CC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   GNU CC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with GNU CC; see the file COPYING.  If not, write to
17   the Free Software Foundation, 59 Temple Place - Suite 330,
18   Boston, MA 02111-1307, USA.  */
19
20/* As a special exception, if you include this header file into source
21   files compiled by GCC, this header file does not by itself cause
22   the resulting executable to be covered by the GNU General Public
23   License.  This exception does not however invalidate any other
24   reasons why the executable file might be covered by the GNU General
25   Public License.  */
26
27/* Implemented from the specification included in the Intel C++ Compiler
28   User Guide and Reference, version 5.0.  */
29
30#ifndef _MMINTRIN_H_INCLUDED
31#define _MMINTRIN_H_INCLUDED
32
/* Public 64-bit MMX value type; this is the type user code passes to and
   receives from the intrinsics below.  */
typedef unsigned long long __m64 __attribute__ ((__aligned__ (8)));

/* Vector views used only inside this header when calling the builtins:
   eight 8-bit, four 16-bit and two 32-bit elements respectively.  */
typedef int __v8qi __attribute__ ((__mode__ (__V8QI__)));
typedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
typedef int __v2si __attribute__ ((__mode__ (__V2SI__)));
40
/* Empty the multimedia (MMX) state by issuing the emms builtin.  Takes no
   arguments and produces no value.  */
static __inline void
_mm_empty (void)
{
  __builtin_ia32_emms ();
  return;
}
47
48/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
49static __inline __m64
50_mm_cvtsi32_si64 (int __i)
51{
52  return (unsigned int) __i;
53}
54
55/* Convert the lower 32 bits of the __m64 object into an integer.  */
56static __inline int
57_mm_cvtsi64_si32 (__m64 __i)
58{
59  return __i;
60}
61
62/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
63   the result, and the four 16-bit values from M2 into the upper four 8-bit
64   values of the result, all with signed saturation.  */
65static __inline __m64
66_mm_packs_pi16 (__m64 __m1, __m64 __m2)
67{
68  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
69}
70
71/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
72   the result, and the two 32-bit values from M2 into the upper two 16-bit
73   values of the result, all with signed saturation.  */
74static __inline __m64
75_mm_packs_pi32 (__m64 __m1, __m64 __m2)
76{
77  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
78}
79
80/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
81   the result, and the four 16-bit values from M2 into the upper four 8-bit
82   values of the result, all with unsigned saturation.  */
83static __inline __m64
84_mm_packs_pu16 (__m64 __m1, __m64 __m2)
85{
86  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
87}
88
89/* Interleave the four 8-bit values from the high half of M1 with the four
90   8-bit values from the high half of M2.  */
91static __inline __m64
92_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
93{
94  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
95}
96
97/* Interleave the two 16-bit values from the high half of M1 with the two
98   16-bit values from the high half of M2.  */
99static __inline __m64
100_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
101{
102  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
103}
104
105/* Interleave the 32-bit value from the high half of M1 with the 32-bit
106   value from the high half of M2.  */
107static __inline __m64
108_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
109{
110  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
111}
112
113/* Interleave the four 8-bit values from the low half of M1 with the four
114   8-bit values from the low half of M2.  */
115static __inline __m64
116_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
117{
118  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
119}
120
121/* Interleave the two 16-bit values from the low half of M1 with the two
122   16-bit values from the low half of M2.  */
123static __inline __m64
124_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
125{
126  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
127}
128
129/* Interleave the 32-bit value from the low half of M1 with the 32-bit
130   value from the low half of M2.  */
131static __inline __m64
132_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
133{
134  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
135}
136
137/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
138static __inline __m64
139_mm_add_pi8 (__m64 __m1, __m64 __m2)
140{
141  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
142}
143
144/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
145static __inline __m64
146_mm_add_pi16 (__m64 __m1, __m64 __m2)
147{
148  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
149}
150
151/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
152static __inline __m64
153_mm_add_pi32 (__m64 __m1, __m64 __m2)
154{
155  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
156}
157
158/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
159   saturated arithmetic.  */
160static __inline __m64
161_mm_adds_pi8 (__m64 __m1, __m64 __m2)
162{
163  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
164}
165
166/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
167   saturated arithmetic.  */
168static __inline __m64
169_mm_adds_pi16 (__m64 __m1, __m64 __m2)
170{
171  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
172}
173
174/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
175   saturated arithmetic.  */
176static __inline __m64
177_mm_adds_pu8 (__m64 __m1, __m64 __m2)
178{
179  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
180}
181
182/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
183   saturated arithmetic.  */
184static __inline __m64
185_mm_adds_pu16 (__m64 __m1, __m64 __m2)
186{
187  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
188}
189
190/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
191static __inline __m64
192_mm_sub_pi8 (__m64 __m1, __m64 __m2)
193{
194  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
195}
196
197/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
198static __inline __m64
199_mm_sub_pi16 (__m64 __m1, __m64 __m2)
200{
201  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
202}
203
204/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
205static __inline __m64
206_mm_sub_pi32 (__m64 __m1, __m64 __m2)
207{
208  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
209}
210
211/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
212   saturating arithmetic.  */
213static __inline __m64
214_mm_subs_pi8 (__m64 __m1, __m64 __m2)
215{
216  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
217}
218
219/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
220   signed saturating arithmetic.  */
221static __inline __m64
222_mm_subs_pi16 (__m64 __m1, __m64 __m2)
223{
224  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
225}
226
227/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
228   unsigned saturating arithmetic.  */
229static __inline __m64
230_mm_subs_pu8 (__m64 __m1, __m64 __m2)
231{
232  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
233}
234
235/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
236   unsigned saturating arithmetic.  */
237static __inline __m64
238_mm_subs_pu16 (__m64 __m1, __m64 __m2)
239{
240  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
241}
242
243/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
244   four 32-bit intermediate results, which are then summed by pairs to
245   produce two 32-bit results.  */
246static __inline __m64
247_mm_madd_pi16 (__m64 __m1, __m64 __m2)
248{
249  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
250}
251
252/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
253   M2 and produce the high 16 bits of the 32-bit results.  */
254static __inline __m64
255_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
256{
257  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
258}
259
260/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
261   the low 16 bits of the results.  */
262static __inline __m64
263_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
264{
265  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
266}
267
268/* Shift four 16-bit values in M left by COUNT.  */
269static __inline __m64
270_mm_sll_pi16 (__m64 __m, __m64 __count)
271{
272  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
273}
274
275static __inline __m64
276_mm_slli_pi16 (__m64 __m, int __count)
277{
278  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
279}
280
281/* Shift two 32-bit values in M left by COUNT.  */
282static __inline __m64
283_mm_sll_pi32 (__m64 __m, __m64 __count)
284{
285  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
286}
287
288static __inline __m64
289_mm_slli_pi32 (__m64 __m, int __count)
290{
291  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
292}
293
294/* Shift the 64-bit value in M left by COUNT.  */
295static __inline __m64
296_mm_sll_pi64 (__m64 __m, __m64 __count)
297{
298  return (__m64) __builtin_ia32_psllq (__m, __count);
299}
300
301static __inline __m64
302_mm_slli_pi64 (__m64 __m, int __count)
303{
304  return (__m64) __builtin_ia32_psllq (__m, __count);
305}
306
307/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
308static __inline __m64
309_mm_sra_pi16 (__m64 __m, __m64 __count)
310{
311  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
312}
313
314static __inline __m64
315_mm_srai_pi16 (__m64 __m, int __count)
316{
317  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
318}
319
320/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
321static __inline __m64
322_mm_sra_pi32 (__m64 __m, __m64 __count)
323{
324  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
325}
326
327static __inline __m64
328_mm_srai_pi32 (__m64 __m, int __count)
329{
330  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
331}
332
333/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
334static __inline __m64
335_mm_srl_pi16 (__m64 __m, __m64 __count)
336{
337  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
338}
339
340static __inline __m64
341_mm_srli_pi16 (__m64 __m, int __count)
342{
343  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
344}
345
346/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
347static __inline __m64
348_mm_srl_pi32 (__m64 __m, __m64 __count)
349{
350  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
351}
352
353static __inline __m64
354_mm_srli_pi32 (__m64 __m, int __count)
355{
356  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
357}
358
359/* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
360static __inline __m64
361_mm_srl_pi64 (__m64 __m, __m64 __count)
362{
363  return (__m64) __builtin_ia32_psrlq (__m, __count);
364}
365
366static __inline __m64
367_mm_srli_pi64 (__m64 __m, int __count)
368{
369  return (__m64) __builtin_ia32_psrlq (__m, __count);
370}
371
372/* Bit-wise AND the 64-bit values in M1 and M2.  */
373static __inline __m64
374_mm_and_si64 (__m64 __m1, __m64 __m2)
375{
376  return __builtin_ia32_pand (__m1, __m2);
377}
378
379/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
380   64-bit value in M2.  */
381static __inline __m64
382_mm_andnot_si64 (__m64 __m1, __m64 __m2)
383{
384  return __builtin_ia32_pandn (__m1, __m2);
385}
386
387/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
388static __inline __m64
389_mm_or_si64 (__m64 __m1, __m64 __m2)
390{
391  return __builtin_ia32_por (__m1, __m2);
392}
393
394/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
395static __inline __m64
396_mm_xor_si64 (__m64 __m1, __m64 __m2)
397{
398  return __builtin_ia32_pxor (__m1, __m2);
399}
400
401/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
402   test is true and zero if false.  */
403static __inline __m64
404_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
405{
406  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
407}
408
409static __inline __m64
410_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
411{
412  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
413}
414
415/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
416   the test is true and zero if false.  */
417static __inline __m64
418_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
419{
420  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
421}
422
423static __inline __m64
424_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
425{
426  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
427}
428
429/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
430   the test is true and zero if false.  */
431static __inline __m64
432_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
433{
434  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
435}
436
437static __inline __m64
438_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
439{
440  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
441}
442
443/* Creates a 64-bit zero.  */
444static __inline __m64
445_mm_setzero_si64 (void)
446{
447  return __builtin_ia32_mmx_zero ();
448}
449
450/* Creates a vector of two 32-bit values; I0 is least significant.  */
451static __inline __m64
452_mm_set_pi32 (int __i1, int __i0)
453{
454  union {
455    __m64 __q;
456    struct {
457      unsigned int __i0;
458      unsigned int __i1;
459    } __s;
460  } __u;
461
462  __u.__s.__i0 = __i0;
463  __u.__s.__i1 = __i1;
464
465  return __u.__q;
466}
467
468/* Creates a vector of four 16-bit values; W0 is least significant.  */
469static __inline __m64
470_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
471{
472  unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
473  unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
474  return _mm_set_pi32 (__i1, __i0);
475
476}
477
478/* Creates a vector of eight 8-bit values; B0 is least significant.  */
479static __inline __m64
480_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
481	     char __b3, char __b2, char __b1, char __b0)
482{
483  unsigned int __i1, __i0;
484
485  __i1 = (unsigned char)__b7;
486  __i1 = __i1 << 8 | (unsigned char)__b6;
487  __i1 = __i1 << 8 | (unsigned char)__b5;
488  __i1 = __i1 << 8 | (unsigned char)__b4;
489
490  __i0 = (unsigned char)__b3;
491  __i0 = __i0 << 8 | (unsigned char)__b2;
492  __i0 = __i0 << 8 | (unsigned char)__b1;
493  __i0 = __i0 << 8 | (unsigned char)__b0;
494
495  return _mm_set_pi32 (__i1, __i0);
496}
497
498/* Similar, but with the arguments in reverse order.  */
499static __inline __m64
500_mm_setr_pi32 (int __i0, int __i1)
501{
502  return _mm_set_pi32 (__i1, __i0);
503}
504
505static __inline __m64
506_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
507{
508  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
509}
510
511static __inline __m64
512_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
513	      char __b4, char __b5, char __b6, char __b7)
514{
515  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
516}
517
518/* Creates a vector of two 32-bit values, both elements containing I.  */
519static __inline __m64
520_mm_set1_pi32 (int __i)
521{
522  return _mm_set_pi32 (__i, __i);
523}
524
525/* Creates a vector of four 16-bit values, all elements containing W.  */
526static __inline __m64
527_mm_set1_pi16 (short __w)
528{
529  unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
530  return _mm_set1_pi32 (__i);
531}
532
533/* Creates a vector of four 16-bit values, all elements containing B.  */
534static __inline __m64
535_mm_set1_pi8 (char __b)
536{
537  unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
538  unsigned int __i = __w << 16 | __w;
539  return _mm_set1_pi32 (__i);
540}
541
542#endif /* _MMINTRIN_H_INCLUDED */
543