mmintrin.h revision 90075
/* Copyright (C) 2002 Free Software Foundation, Inc.

   This file is part of GNU CC.

   GNU CC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GNU CC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GNU CC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */
19327Sjkh
/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */
26327Sjkh
/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 5.0.  */
29327Sjkh
30327Sjkh#ifndef _MMINTRIN_H_INCLUDED
31327Sjkh#define _MMINTRIN_H_INCLUDED
32327Sjkh
/* Public 64-bit MMX value type; this is the type user code is meant to
   declare and pass to the intrinsics below.  */
typedef unsigned long long __m64;

/* Vector-mode views used only inside this header when handing operands
   to the builtins: eight 8-bit, four 16-bit and two 32-bit elements.  */
typedef int __v8qi __attribute__ ((__mode__ (__V8QI__)));
typedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
typedef int __v2si __attribute__ ((__mode__ (__V2SI__)));
407937Sjkh
/* Empty the multimedia state by issuing the emms builtin.  Takes no
   arguments and returns nothing.  */
static __inline void
_mm_empty (void)
{
  __builtin_ia32_emms ();
}
4716404Sjkh
4816404Sjkh/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
4916404Sjkhstatic __inline __m64
5016404Sjkh_mm_cvtsi32_si64 (int __i)
5116404Sjkh{
5216404Sjkh  return (unsigned int) __i;
538857Srgrimes}
5416404Sjkh
5516404Sjkh/* Convert the lower 32 bits of the __m64 object into an integer.  */
5616404Sjkhstatic __inline int
5716404Sjkh_mm_cvtsi64_si32 (__m64 __i)
5816404Sjkh{
5916404Sjkh  return __i;
6016404Sjkh}
6116404Sjkh
6216404Sjkh/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
63327Sjkh   the result, and the four 16-bit values from M2 into the upper four 8-bit
6416404Sjkh   values of the result, all with signed saturation.  */
6516404Sjkhstatic __inline __m64
6616404Sjkh_mm_packs_pi16 (__m64 __m1, __m64 __m2)
67327Sjkh{
68327Sjkh  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
69327Sjkh}
7011780Sjkh
7111780Sjkh/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
72327Sjkh   the result, and the two 32-bit values from M2 into the upper two 16-bit
73327Sjkh   values of the result, all with signed saturation.  */
74327Sjkhstatic __inline __m64
758086Sjkh_mm_packs_pi32 (__m64 __m1, __m64 __m2)
76327Sjkh{
778086Sjkh  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
78327Sjkh}
79327Sjkh
808086Sjkh/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
818142Sjkh   the result, and the four 16-bit values from M2 into the upper four 8-bit
828086Sjkh   values of the result, all with unsigned saturation.  */
83327Sjkhstatic __inline __m64
848086Sjkh_mm_packs_pu16 (__m64 __m1, __m64 __m2)
8511780Sjkh{
868086Sjkh  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
878086Sjkh}
888086Sjkh
898086Sjkh/* Interleave the four 8-bit values from the high half of M1 with the four
909782Sache   8-bit values from the high half of M2.  */
918086Sjkhstatic __inline __m64
92327Sjkh_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
938423Sjkh{
948423Sjkh  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
958423Sjkh}
968423Sjkh
978423Sjkh/* Interleave the two 16-bit values from the high half of M1 with the two
988423Sjkh   16-bit values from the high half of M2.  */
998423Sjkhstatic __inline __m64
1008423Sjkh_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
1018086Sjkh{
1028086Sjkh  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
1038086Sjkh}
10411780Sjkh
1058086Sjkh/* Interleave the 32-bit value from the high half of M1 with the 32-bit
1068086Sjkh   value from the high half of M2.  */
1078086Sjkhstatic __inline __m64
1083364Sjkh_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
1093364Sjkh{
1103364Sjkh  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
1113577Sjkh}
1123577Sjkh
1133364Sjkh/* Interleave the four 8-bit values from the low half of M1 with the four
1143364Sjkh   8-bit values from the low half of M2.  */
1153364Sjkhstatic __inline __m64
1168086Sjkh_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
1178086Sjkh{
1183364Sjkh  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
11911780Sjkh}
120327Sjkh
121327Sjkh/* Interleave the two 16-bit values from the low half of M1 with the two
1228086Sjkh   16-bit values from the low half of M2.  */
1238086Sjkhstatic __inline __m64
124327Sjkh_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
125327Sjkh{
1268086Sjkh  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
127327Sjkh}
1287937Sjkh
1297937Sjkh/* Interleave the 32-bit value from the low half of M1 with the 32-bit
1307937Sjkh   value from the low half of M2.  */
1317937Sjkhstatic __inline __m64
132327Sjkh_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
1338086Sjkh{
134327Sjkh  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
135327Sjkh}
136327Sjkh
137327Sjkh/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
138327Sjkhstatic __inline __m64
139327Sjkh_mm_add_pi8 (__m64 __m1, __m64 __m2)
140327Sjkh{
141327Sjkh  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
142327Sjkh}
143327Sjkh
144327Sjkh/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
145327Sjkhstatic __inline __m64
146327Sjkh_mm_add_pi16 (__m64 __m1, __m64 __m2)
147327Sjkh{
1488086Sjkh  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
1498086Sjkh}
150327Sjkh
151327Sjkh/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
152327Sjkhstatic __inline __m64
153327Sjkh_mm_add_pi32 (__m64 __m1, __m64 __m2)
154327Sjkh{
155327Sjkh  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
156327Sjkh}
157327Sjkh
158327Sjkh/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
159327Sjkh   saturated arithmetic.  */
1608086Sjkhstatic __inline __m64
161327Sjkh_mm_adds_pi8 (__m64 __m1, __m64 __m2)
1628086Sjkh{
1638086Sjkh  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
164327Sjkh}
165327Sjkh
166327Sjkh/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
167411Sjkh   saturated arithmetic.  */
168411Sjkhstatic __inline __m64
169327Sjkh_mm_adds_pi16 (__m64 __m1, __m64 __m2)
170379Sjkh{
1714996Sjkh  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
1724996Sjkh}
173327Sjkh
174379Sjkh/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
1754996Sjkh   saturated arithmetic.  */
1764996Sjkhstatic __inline __m64
177327Sjkh_mm_adds_pu8 (__m64 __m1, __m64 __m2)
178379Sjkh{
179327Sjkh  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
180379Sjkh}
181327Sjkh
182379Sjkh/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
1834996Sjkh   saturated arithmetic.  */
1844996Sjkhstatic __inline __m64
185327Sjkh_mm_adds_pu16 (__m64 __m1, __m64 __m2)
186379Sjkh{
187411Sjkh  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
188411Sjkh}
189411Sjkh
190411Sjkh/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
191327Sjkhstatic __inline __m64
192327Sjkh_mm_sub_pi8 (__m64 __m1, __m64 __m2)
1938086Sjkh{
19411780Sjkh  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
1958086Sjkh}
1968086Sjkh
1978086Sjkh/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
198327Sjkhstatic __inline __m64
199327Sjkh_mm_sub_pi16 (__m64 __m1, __m64 __m2)
200327Sjkh{
201327Sjkh  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
202327Sjkh}
20311780Sjkh
204327Sjkh/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
205static __inline __m64
206_mm_sub_pi32 (__m64 __m1, __m64 __m2)
207{
208  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
209}
210
211/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
212   saturating arithmetic.  */
213static __inline __m64
214_mm_subs_pi8 (__m64 __m1, __m64 __m2)
215{
216  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
217}
218
219/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
220   signed saturating arithmetic.  */
221static __inline __m64
222_mm_subs_pi16 (__m64 __m1, __m64 __m2)
223{
224  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
225}
226
227/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
228   unsigned saturating arithmetic.  */
229static __inline __m64
230_mm_subs_pu8 (__m64 __m1, __m64 __m2)
231{
232  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
233}
234
235/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
236   unsigned saturating arithmetic.  */
237static __inline __m64
238_mm_subs_pu16 (__m64 __m1, __m64 __m2)
239{
240  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
241}
242
243/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
244   four 32-bit intermediate results, which are then summed by pairs to
245   produce two 32-bit results.  */
246static __inline __m64
247_mm_madd_pi16 (__m64 __m1, __m64 __m2)
248{
249  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
250}
251
252/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
253   M2 and produce the high 16 bits of the 32-bit results.  */
254static __inline __m64
255_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
256{
257  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
258}
259
260/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
261   the low 16 bits of the results.  */
262static __inline __m64
263_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
264{
265  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
266}
267
268/* Shift four 16-bit values in M left by COUNT.  */
269static __inline __m64
270_mm_sll_pi16 (__m64 __m, __m64 __count)
271{
272  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
273}
274
275static __inline __m64
276_mm_slli_pi16 (__m64 __m, int __count)
277{
278  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
279}
280
281/* Shift two 32-bit values in M left by COUNT.  */
282static __inline __m64
283_mm_sll_pi32 (__m64 __m, __m64 __count)
284{
285  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
286}
287
288static __inline __m64
289_mm_slli_pi32 (__m64 __m, int __count)
290{
291  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
292}
293
294/* Shift the 64-bit value in M left by COUNT.  */
295static __inline __m64
296_mm_sll_pi64 (__m64 __m, __m64 __count)
297{
298  return (__m64) __builtin_ia32_psllq (__m, __count);
299}
300
301static __inline __m64
302_mm_slli_pi64 (__m64 __m, int __count)
303{
304  return (__m64) __builtin_ia32_psllq (__m, __count);
305}
306
307/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
308static __inline __m64
309_mm_sra_pi16 (__m64 __m, __m64 __count)
310{
311  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
312}
313
314static __inline __m64
315_mm_srai_pi16 (__m64 __m, int __count)
316{
317  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
318}
319
320/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
321static __inline __m64
322_mm_sra_pi32 (__m64 __m, __m64 __count)
323{
324  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
325}
326
327static __inline __m64
328_mm_srai_pi32 (__m64 __m, int __count)
329{
330  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
331}
332
333/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
334static __inline __m64
335_mm_srl_pi16 (__m64 __m, __m64 __count)
336{
337  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
338}
339
340static __inline __m64
341_mm_srli_pi16 (__m64 __m, int __count)
342{
343  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
344}
345
346/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
347static __inline __m64
348_mm_srl_pi32 (__m64 __m, __m64 __count)
349{
350  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
351}
352
353static __inline __m64
354_mm_srli_pi32 (__m64 __m, int __count)
355{
356  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
357}
358
359/* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
360static __inline __m64
361_mm_srl_pi64 (__m64 __m, __m64 __count)
362{
363  return (__m64) __builtin_ia32_psrlq (__m, __count);
364}
365
366static __inline __m64
367_mm_srli_pi64 (__m64 __m, int __count)
368{
369  return (__m64) __builtin_ia32_psrlq (__m, __count);
370}
371
372/* Bit-wise AND the 64-bit values in M1 and M2.  */
373static __inline __m64
374_mm_and_si64 (__m64 __m1, __m64 __m2)
375{
376  return __builtin_ia32_pand (__m1, __m2);
377}
378
379/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
380   64-bit value in M2.  */
381static __inline __m64
382_mm_andnot_si64 (__m64 __m1, __m64 __m2)
383{
384  return __builtin_ia32_pandn (__m1, __m2);
385}
386
387/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
388static __inline __m64
389_mm_or_si64 (__m64 __m1, __m64 __m2)
390{
391  return __builtin_ia32_por (__m1, __m2);
392}
393
394/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
395static __inline __m64
396_mm_xor_si64 (__m64 __m1, __m64 __m2)
397{
398  return __builtin_ia32_pxor (__m1, __m2);
399}
400
401/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
402   test is true and zero if false.  */
403static __inline __m64
404_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
405{
406  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
407}
408
409static __inline __m64
410_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
411{
412  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
413}
414
415/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
416   the test is true and zero if false.  */
417static __inline __m64
418_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
419{
420  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
421}
422
423static __inline __m64
424_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
425{
426  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
427}
428
429/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
430   the test is true and zero if false.  */
431static __inline __m64
432_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
433{
434  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
435}
436
437static __inline __m64
438_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
439{
440  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
441}
442
443/* Creates a 64-bit zero.  */
444static __inline __m64
445_mm_setzero_si64 (void)
446{
447  return __builtin_ia32_mmx_zero ();
448}
449
450/* Creates a vector of two 32-bit values; I0 is least significant.  */
451static __inline __m64
452_mm_set_pi32 (int __i1, int __i0)
453{
454  union {
455    __m64 __q;
456    struct {
457      unsigned int __i0;
458      unsigned int __i1;
459    } __s;
460  } __u;
461
462  __u.__s.__i0 = __i0;
463  __u.__s.__i1 = __i1;
464
465  return __u.__q;
466}
467
468/* Creates a vector of four 16-bit values; W0 is least significant.  */
469static __inline __m64
470_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
471{
472  unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
473  unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
474  return _mm_set_pi32 (__i1, __i0);
475
476}
477
478/* Creates a vector of eight 8-bit values; B0 is least significant.  */
479static __inline __m64
480_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
481	     char __b3, char __b2, char __b1, char __b0)
482{
483  unsigned int __i1, __i0;
484
485  __i1 = (unsigned char)__b7;
486  __i1 = __i1 << 8 | (unsigned char)__b6;
487  __i1 = __i1 << 8 | (unsigned char)__b5;
488  __i1 = __i1 << 8 | (unsigned char)__b4;
489
490  __i0 = (unsigned char)__b3;
491  __i0 = __i0 << 8 | (unsigned char)__b2;
492  __i0 = __i0 << 8 | (unsigned char)__b1;
493  __i0 = __i0 << 8 | (unsigned char)__b0;
494
495  return _mm_set_pi32 (__i1, __i0);
496}
497
498/* Similar, but with the arguments in reverse order.  */
499static __inline __m64
500_mm_setr_pi32 (int __i0, int __i1)
501{
502  return _mm_set_pi32 (__i1, __i0);
503}
504
505static __inline __m64
506_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
507{
508  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
509}
510
511static __inline __m64
512_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
513	      char __b4, char __b5, char __b6, char __b7)
514{
515  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
516}
517
518/* Creates a vector of two 32-bit values, both elements containing I.  */
519static __inline __m64
520_mm_set1_pi32 (int __i)
521{
522  return _mm_set_pi32 (__i, __i);
523}
524
525/* Creates a vector of four 16-bit values, all elements containing W.  */
526static __inline __m64
527_mm_set1_pi16 (short __w)
528{
529  unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
530  return _mm_set1_pi32 (__i);
531}
532
533/* Creates a vector of four 16-bit values, all elements containing B.  */
534static __inline __m64
535_mm_set1_pi8 (char __b)
536{
537  unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
538  unsigned int __i = __w << 16 | __w;
539  return _mm_set1_pi32 (__i);
540}
541
542#endif /* _MMINTRIN_H_INCLUDED */
543