1/* Copyright (C) 2002-2022 Free Software Foundation, Inc.
2
3   This file is part of GCC.
4
5   GCC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3, or (at your option)
8   any later version.
9
10   GCC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   Under Section 7 of GPL version 3, you are granted additional
16   permissions described in the GCC Runtime Library Exception, version
17   3.1, as published by the Free Software Foundation.
18
19   You should have received a copy of the GNU General Public License and
20   a copy of the GCC Runtime Library Exception along with this program;
21   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22   <http://www.gnu.org/licenses/>.  */
23
24/* Implemented from the specification included in the Intel C++ Compiler
25   User Guide and Reference, version 9.0.  */
26
27#ifndef _MMINTRIN_H_INCLUDED
28#define _MMINTRIN_H_INCLUDED
29
30#if defined __x86_64__ && !defined __SSE__ || !defined __MMX__
31#pragma GCC push_options
32#ifdef __MMX_WITH_SSE__
33#pragma GCC target("sse2")
34#elif defined __x86_64__
35#pragma GCC target("sse,mmx")
36#else
37#pragma GCC target("mmx")
38#endif
39#define __DISABLE_MMX__
40#endif /* __MMX__ */
41
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__));
typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__));

/* Unaligned versions of the same types.  */
typedef int __m64_u __attribute__ ((__vector_size__ (8),
				    __may_alias__, __aligned__ (1)));
typedef int __m32_u __attribute__ ((__vector_size__ (4),
				    __may_alias__, __aligned__ (1)));
typedef short __m16_u __attribute__ ((__vector_size__ (2),
				      __may_alias__, __aligned__ (1)));
54
/* Internal data types for implementing the intrinsics.  These are the
   element-typed views that the __builtin_ia32_* functions expect; the
   public __m64 values are cast to them at each call site.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));	/* 2 x 32-bit.  */
typedef short __v4hi __attribute__ ((__vector_size__ (8)));	/* 4 x 16-bit.  */
typedef char __v8qi __attribute__ ((__vector_size__ (8)));	/* 8 x 8-bit.  */
typedef long long __v1di __attribute__ ((__vector_size__ (8)));	/* 1 x 64-bit.  */
typedef float __v2sf __attribute__ ((__vector_size__ (8)));	/* 2 x float.  */
61
/* Empty the multimedia state (emits the EMMS instruction, releasing the
   MMX registers back to the x87 FPU).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}

/* Mnemonic-named alias for _mm_empty.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  _mm_empty ();
}
74
/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits
   (I becomes the low element, the high element is 0).  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
}

/* Alias for _mm_cvtsi32_si64.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}
87
#ifdef __x86_64__
/* Convert I to a __m64 object.  All four entry points below perform the
   same bit-preserving reinterpretation of a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

/* Alternate Intel name for the same conversion.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

/* Same conversion under yet another historical name.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
#endif
117
/* Convert the lower 32 bits of the __m64 object into an integer
   (extracts element 0 of the 2 x 32-bit view).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
}

/* Alias for _mm_cvtsi64_si32.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
130
#ifdef __x86_64__
/* Convert the __m64 object to a 64bit integer.  All three entry points
   below perform the same bit-preserving reinterpretation.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

/* Alternate Intel name for the same conversion.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long)__i;
}
#endif
154
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_packs_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
}

/* Mnemonic-named alias for _mm_packs_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_packs_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
199
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_unpackhi_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_unpackhi_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
}

/* Mnemonic-named alias for _mm_unpackhi_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_unpacklo_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_unpacklo_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
}

/* Mnemonic-named alias for _mm_unpacklo_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
283
/* Add the 8-bit values in M1 to the 8-bit values in M2 (wrapping,
   element-wise).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_add_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_add_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
}

/* Mnemonic-named alias for _mm_add_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}
322
/* Add the 64-bit value in M1 to the 64-bit value in M2.  */
#ifndef __SSE2__
#pragma GCC push_options
#ifdef __MMX_WITH_SSE__
#pragma GCC target("sse2")
#else
#pragma GCC target("sse2,mmx")
#endif
#define __DISABLE_SSE2__
#endif /* __SSE2__ */

/* PADDQ needs SSE2, hence the target("sse2") region opened above.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
}
339#ifdef __DISABLE_SSE2__
340#undef __DISABLE_SSE2__
341#pragma GCC pop_options
342#endif /* __DISABLE_SSE2__ */
343
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_adds_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_adds_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_adds_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_adds_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}
399
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 (wrapping,
   element-wise).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_sub_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_sub_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
}

/* Mnemonic-named alias for _mm_sub_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}
438
/* Subtract the 64-bit value in M2 from the 64-bit value in M1.  */
#ifndef __SSE2__
#pragma GCC push_options
#ifdef __MMX_WITH_SSE__
#pragma GCC target("sse2")
#else
#pragma GCC target("sse2,mmx")
#endif
#define __DISABLE_SSE2__
#endif /* __SSE2__ */

/* PSUBQ needs SSE2, hence the target("sse2") region opened above.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
}
455#ifdef __DISABLE_SSE2__
456#undef __DISABLE_SSE2__
457#pragma GCC pop_options
458#endif /* __DISABLE_SSE2__ */
459
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_subs_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_subs_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_subs_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_subs_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}
515
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_madd_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_mulhi_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_mullo_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
558
/* Shift four 16-bit values in M left by COUNT (COUNT held in a __m64).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
}

/* Mnemonic-named alias for _mm_sll_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

/* Shift four 16-bit values in M left by the integer COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
}

/* Mnemonic-named alias for _mm_slli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
}

/* Mnemonic-named alias for _mm_sll_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

/* Shift two 32-bit values in M left by the integer COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
}

/* Mnemonic-named alias for _mm_slli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
}

/* Mnemonic-named alias for _mm_sll_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

/* Shift the 64-bit value in M left by the integer COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
}

/* Mnemonic-named alias for _mm_slli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, int __count)
{
  return _mm_slli_si64 (__m, __count);
}
633
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
}

/* Mnemonic-named alias for _mm_sra_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

/* Shift four 16-bit values in M right by the integer COUNT; shift in the
   sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
}

/* Mnemonic-named alias for _mm_srai_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
}

/* Mnemonic-named alias for _mm_sra_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

/* Shift two 32-bit values in M right by the integer COUNT; shift in the
   sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
}

/* Mnemonic-named alias for _mm_srai_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}
683
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
}

/* Mnemonic-named alias for _mm_srl_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

/* Shift four 16-bit values in M right by the integer COUNT; shift in
   zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
}

/* Mnemonic-named alias for _mm_srli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
}

/* Mnemonic-named alias for _mm_srl_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

/* Shift two 32-bit values in M right by the integer COUNT; shift in
   zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
}

/* Mnemonic-named alias for _mm_srli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
}

/* Mnemonic-named alias for _mm_srl_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by the integer COUNT; shift in
   zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
}

/* Mnemonic-named alias for _mm_srli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, int __count)
{
  return _mm_srli_si64 (__m, __count);
}
758
/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pand (__m1, __m2);
}

/* Mnemonic-named alias for _mm_and_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pandn (__m1, __m2);
}

/* Mnemonic-named alias for _mm_andnot_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_por (__m1, __m2);
}

/* Mnemonic-named alias for _mm_or_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pxor (__m1, __m2);
}

/* Mnemonic-named alias for _mm_xor_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}
811
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_cmpeq_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

/* Compare eight signed 8-bit values for M1 greater than M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Mnemonic-named alias for _mm_cmpgt_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_cmpeq_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

/* Compare four signed 16-bit values for M1 greater than M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Mnemonic-named alias for _mm_cmpgt_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
}

/* Mnemonic-named alias for _mm_cmpeq_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

/* Compare two signed 32-bit values for M1 greater than M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
}

/* Mnemonic-named alias for _mm_cmpgt_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
889
/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64)0LL;
}

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
					       __b4, __b5, __b6, __b7);
}
919
920/* Similar, but with the arguments in reverse order.  */
921extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
922_mm_setr_pi32 (int __i0, int __i1)
923{
924  return _mm_set_pi32 (__i1, __i0);
925}
926
927extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
928_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
929{
930  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
931}
932
933extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
934_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
935	      char __b4, char __b5, char __b6, char __b7)
936{
937  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
938}
939
/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
  return _mm_set_pi16 (__w, __w, __w, __w);
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (char __b)
{
  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
}
960#ifdef __DISABLE_MMX__
961#undef __DISABLE_MMX__
962#pragma GCC pop_options
963#endif /* __DISABLE_MMX__ */
964
965#endif /* _MMINTRIN_H_INCLUDED */
966