emmintrin.h revision 122180
1/* Copyright (C) 2003 Free Software Foundation, Inc.
2
3   This file is part of GNU CC.
4
5   GNU CC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   GNU CC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with GNU CC; see the file COPYING.  If not, write to
17   the Free Software Foundation, 59 Temple Place - Suite 330,
18   Boston, MA 02111-1307, USA.  */
19
20/* As a special exception, if you include this header file into source
21   files compiled by GCC, this header file does not by itself cause
22   the resulting executable to be covered by the GNU General Public
23   License.  This exception does not however invalidate any other
24   reasons why the executable file might be covered by the GNU General
25   Public License.  */
26
27/* Implemented from the specification included in the Intel C++ Compiler
28   User Guide and Reference, version 8.0.  */
29
30#ifndef _EMMINTRIN_H_INCLUDED
31#define _EMMINTRIN_H_INCLUDED
32
33#ifdef __SSE2__
34#include <xmmintrin.h>
35
36/* SSE2 */
37typedef int __v2df __attribute__ ((mode (V2DF)));
38typedef int __v2di __attribute__ ((mode (V2DI)));
39typedef int __v4si __attribute__ ((mode (V4SI)));
40typedef int __v8hi __attribute__ ((mode (V8HI)));
41typedef int __v16qi __attribute__ ((mode (V16QI)));
42
/* Build the immediate selector used with the SHUFPD instruction:
   FP0 occupies bit 0 and FP1 occupies bit 1.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 ((fp0) | ((fp1) << 1))
46
47#define __m128i __v2di
48#define __m128d __v2df
49
50/* Create a vector with element 0 as *P and the rest zero.  */
51static __inline __m128d
52_mm_load_sd (double const *__P)
53{
54  return (__m128d) __builtin_ia32_loadsd (__P);
55}
56
57/* Create a vector with all two elements equal to *P.  */
58static __inline __m128d
59_mm_load1_pd (double const *__P)
60{
61  __v2df __tmp = __builtin_ia32_loadsd (__P);
62  return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
63}
64
65static __inline __m128d
66_mm_load_pd1 (double const *__P)
67{
68  return _mm_load1_pd (__P);
69}
70
71/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
72static __inline __m128d
73_mm_load_pd (double const *__P)
74{
75  return (__m128d) __builtin_ia32_loadapd (__P);
76}
77
78/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
79static __inline __m128d
80_mm_loadu_pd (double const *__P)
81{
82  return (__m128d) __builtin_ia32_loadupd (__P);
83}
84
85/* Load two DPFP values in reverse order.  The address must be aligned.  */
86static __inline __m128d
87_mm_loadr_pd (double const *__P)
88{
89  __v2df __tmp = __builtin_ia32_loadapd (__P);
90  return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
91}
92
93/* Create a vector with element 0 as F and the rest zero.  */
94static __inline __m128d
95_mm_set_sd (double __F)
96{
97  return (__m128d) __builtin_ia32_loadsd (&__F);
98}
99
100/* Create a vector with all two elements equal to F.  */
101static __inline __m128d
102_mm_set1_pd (double __F)
103{
104  __v2df __tmp = __builtin_ia32_loadsd (&__F);
105  return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
106}
107
108static __inline __m128d
109_mm_set_pd1 (double __F)
110{
111  return _mm_set1_pd (__F);
112}
113
114/* Create the vector [Z Y].  */
115static __inline __m128d
116_mm_set_pd (double __Z, double __Y)
117{
118  union {
119    double __a[2];
120    __m128d __v;
121  } __u;
122
123  __u.__a[0] = __Y;
124  __u.__a[1] = __Z;
125
126  return __u.__v;
127}
128
129/* Create the vector [Y Z].  */
130static __inline __m128d
131_mm_setr_pd (double __Z, double __Y)
132{
133  return _mm_set_pd (__Y, __Z);
134}
135
136/* Create a vector of zeros.  */
137static __inline __m128d
138_mm_setzero_pd (void)
139{
140  return (__m128d) __builtin_ia32_setzeropd ();
141}
142
143/* Stores the lower DPFP value.  */
144static __inline void
145_mm_store_sd (double *__P, __m128d __A)
146{
147  __builtin_ia32_storesd (__P, (__v2df)__A);
148}
149
/* Store the lower DPFP value across two words.  */
151static __inline void
152_mm_store1_pd (double *__P, __m128d __A)
153{
154  __v2df __va = (__v2df)__A;
155  __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,0));
156  __builtin_ia32_storeapd (__P, __tmp);
157}
158
159static __inline void
160_mm_store_pd1 (double *__P, __m128d __A)
161{
162  _mm_store1_pd (__P, __A);
163}
164
165/* Store two DPFP values.  The address must be 16-byte aligned.  */
166static __inline void
167_mm_store_pd (double *__P, __m128d __A)
168{
169  __builtin_ia32_storeapd (__P, (__v2df)__A);
170}
171
172/* Store two DPFP values.  The address need not be 16-byte aligned.  */
173static __inline void
174_mm_storeu_pd (double *__P, __m128d __A)
175{
176  __builtin_ia32_storeupd (__P, (__v2df)__A);
177}
178
179/* Store two DPFP values in reverse order.  The address must be aligned.  */
180static __inline void
181_mm_storer_pd (double *__P, __m128d __A)
182{
183  __v2df __va = (__v2df)__A;
184  __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,1));
185  __builtin_ia32_storeapd (__P, __tmp);
186}
187
188/* Sets the low DPFP value of A from the low value of B.  */
189static __inline __m128d
190_mm_move_sd (__m128d __A, __m128d __B)
191{
192  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
193}
194
195
196static __inline __m128d
197_mm_add_pd (__m128d __A, __m128d __B)
198{
199  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
200}
201
202static __inline __m128d
203_mm_add_sd (__m128d __A, __m128d __B)
204{
205  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
206}
207
208static __inline __m128d
209_mm_sub_pd (__m128d __A, __m128d __B)
210{
211  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
212}
213
214static __inline __m128d
215_mm_sub_sd (__m128d __A, __m128d __B)
216{
217  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
218}
219
220static __inline __m128d
221_mm_mul_pd (__m128d __A, __m128d __B)
222{
223  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
224}
225
226static __inline __m128d
227_mm_mul_sd (__m128d __A, __m128d __B)
228{
229  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
230}
231
232static __inline __m128d
233_mm_div_pd (__m128d __A, __m128d __B)
234{
235  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
236}
237
238static __inline __m128d
239_mm_div_sd (__m128d __A, __m128d __B)
240{
241  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
242}
243
244static __inline __m128d
245_mm_sqrt_pd (__m128d __A)
246{
247  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
248}
249
/* Return pair {sqrt (B[0]), A[1]}.  */
251static __inline __m128d
252_mm_sqrt_sd (__m128d __A, __m128d __B)
253{
254  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
255  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
256}
257
258static __inline __m128d
259_mm_min_pd (__m128d __A, __m128d __B)
260{
261  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
262}
263
264static __inline __m128d
265_mm_min_sd (__m128d __A, __m128d __B)
266{
267  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
268}
269
270static __inline __m128d
271_mm_max_pd (__m128d __A, __m128d __B)
272{
273  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
274}
275
276static __inline __m128d
277_mm_max_sd (__m128d __A, __m128d __B)
278{
279  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
280}
281
282static __inline __m128d
283_mm_and_pd (__m128d __A, __m128d __B)
284{
285  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
286}
287
288static __inline __m128d
289_mm_andnot_pd (__m128d __A, __m128d __B)
290{
291  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
292}
293
294static __inline __m128d
295_mm_or_pd (__m128d __A, __m128d __B)
296{
297  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
298}
299
300static __inline __m128d
301_mm_xor_pd (__m128d __A, __m128d __B)
302{
303  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
304}
305
306static __inline __m128d
307_mm_cmpeq_pd (__m128d __A, __m128d __B)
308{
309  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
310}
311
312static __inline __m128d
313_mm_cmplt_pd (__m128d __A, __m128d __B)
314{
315  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
316}
317
318static __inline __m128d
319_mm_cmple_pd (__m128d __A, __m128d __B)
320{
321  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
322}
323
324static __inline __m128d
325_mm_cmpgt_pd (__m128d __A, __m128d __B)
326{
327  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
328}
329
330static __inline __m128d
331_mm_cmpge_pd (__m128d __A, __m128d __B)
332{
333  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
334}
335
336static __inline __m128d
337_mm_cmpneq_pd (__m128d __A, __m128d __B)
338{
339  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
340}
341
342static __inline __m128d
343_mm_cmpnlt_pd (__m128d __A, __m128d __B)
344{
345  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
346}
347
348static __inline __m128d
349_mm_cmpnle_pd (__m128d __A, __m128d __B)
350{
351  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
352}
353
354static __inline __m128d
355_mm_cmpngt_pd (__m128d __A, __m128d __B)
356{
357  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
358}
359
360static __inline __m128d
361_mm_cmpnge_pd (__m128d __A, __m128d __B)
362{
363  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
364}
365
366static __inline __m128d
367_mm_cmpord_pd (__m128d __A, __m128d __B)
368{
369  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
370}
371
372static __inline __m128d
373_mm_cmpunord_pd (__m128d __A, __m128d __B)
374{
375  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
376}
377
378static __inline __m128d
379_mm_cmpeq_sd (__m128d __A, __m128d __B)
380{
381  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
382}
383
384static __inline __m128d
385_mm_cmplt_sd (__m128d __A, __m128d __B)
386{
387  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
388}
389
390static __inline __m128d
391_mm_cmple_sd (__m128d __A, __m128d __B)
392{
393  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
394}
395
396static __inline __m128d
397_mm_cmpgt_sd (__m128d __A, __m128d __B)
398{
399  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
400					 (__v2df)
401					 __builtin_ia32_cmpltsd ((__v2df) __B,
402								 (__v2df)
403								 __A));
404}
405
406static __inline __m128d
407_mm_cmpge_sd (__m128d __A, __m128d __B)
408{
409  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
410					 (__v2df)
411					 __builtin_ia32_cmplesd ((__v2df) __B,
412								 (__v2df)
413								 __A));
414}
415
416static __inline __m128d
417_mm_cmpneq_sd (__m128d __A, __m128d __B)
418{
419  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
420}
421
422static __inline __m128d
423_mm_cmpnlt_sd (__m128d __A, __m128d __B)
424{
425  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
426}
427
428static __inline __m128d
429_mm_cmpnle_sd (__m128d __A, __m128d __B)
430{
431  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
432}
433
434static __inline __m128d
435_mm_cmpngt_sd (__m128d __A, __m128d __B)
436{
437  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
438					 (__v2df)
439					 __builtin_ia32_cmpnltsd ((__v2df) __B,
440								  (__v2df)
441								  __A));
442}
443
444static __inline __m128d
445_mm_cmpnge_sd (__m128d __A, __m128d __B)
446{
447  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
448					 (__v2df)
449					 __builtin_ia32_cmpnlesd ((__v2df) __B,
450								  (__v2df)
451								  __A));
452}
453
454static __inline __m128d
455_mm_cmpord_sd (__m128d __A, __m128d __B)
456{
457  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
458}
459
460static __inline __m128d
461_mm_cmpunord_sd (__m128d __A, __m128d __B)
462{
463  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
464}
465
466static __inline int
467_mm_comieq_sd (__m128d __A, __m128d __B)
468{
469  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
470}
471
472static __inline int
473_mm_comilt_sd (__m128d __A, __m128d __B)
474{
475  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
476}
477
478static __inline int
479_mm_comile_sd (__m128d __A, __m128d __B)
480{
481  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
482}
483
484static __inline int
485_mm_comigt_sd (__m128d __A, __m128d __B)
486{
487  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
488}
489
490static __inline int
491_mm_comige_sd (__m128d __A, __m128d __B)
492{
493  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
494}
495
496static __inline int
497_mm_comineq_sd (__m128d __A, __m128d __B)
498{
499  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
500}
501
502static __inline int
503_mm_ucomieq_sd (__m128d __A, __m128d __B)
504{
505  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
506}
507
508static __inline int
509_mm_ucomilt_sd (__m128d __A, __m128d __B)
510{
511  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
512}
513
514static __inline int
515_mm_ucomile_sd (__m128d __A, __m128d __B)
516{
517  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
518}
519
520static __inline int
521_mm_ucomigt_sd (__m128d __A, __m128d __B)
522{
523  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
524}
525
526static __inline int
527_mm_ucomige_sd (__m128d __A, __m128d __B)
528{
529  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
530}
531
532static __inline int
533_mm_ucomineq_sd (__m128d __A, __m128d __B)
534{
535  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
536}
537
/* Load a 128-bit integer vector from P (loaddqa form; see
   _mm_loadu_si128 for the loaddqu form).  */
539
540static __inline __m128i
541_mm_load_si128 (__m128i const *__P)
542{
543  return (__m128i) __builtin_ia32_loaddqa ((char const *)__P);
544}
545
546static __inline __m128i
547_mm_loadu_si128 (__m128i const *__P)
548{
549  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
550}
551
552static __inline __m128i
553_mm_loadl_epi64 (__m128i const *__P)
554{
555  return (__m128i) __builtin_ia32_movq2dq (*(unsigned long long *)__P);
556}
557
558static __inline void
559_mm_store_si128 (__m128i *__P, __m128i __B)
560{
561  __builtin_ia32_storedqa ((char *)__P, (__v16qi)__B);
562}
563
564static __inline void
565_mm_storeu_si128 (__m128i *__P, __m128i __B)
566{
567  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
568}
569
570static __inline void
571_mm_storel_epi64 (__m128i *__P, __m128i __B)
572{
573  *(long long *)__P = __builtin_ia32_movdq2q ((__v2di)__B);
574}
575
576static __inline __m64
577_mm_movepi64_pi64 (__m128i __B)
578{
579  return (__m64) __builtin_ia32_movdq2q ((__v2di)__B);
580}
581
582static __inline __m128i
583_mm_move_epi64 (__m128i __A)
584{
585  return (__m128i) __builtin_ia32_movq ((__v2di)__A);
586}
587
588/* Create a vector of zeros.  */
589static __inline __m128i
590_mm_setzero_si128 (void)
591{
592  return (__m128i) __builtin_ia32_setzero128 ();
593}
594
595static __inline __m128i
596_mm_set_epi64 (__m64 __A,  __m64 __B)
597{
598  __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
599  __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B);
600  return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp2, __tmp);
601}
602
603/* Create the vector [Z Y X W].  */
604static __inline __m128i
605_mm_set_epi32 (int __Z, int __Y, int __X, int __W)
606{
607  union {
608    int __a[4];
609    __m128i __v;
610  } __u;
611
612  __u.__a[0] = __W;
613  __u.__a[1] = __X;
614  __u.__a[2] = __Y;
615  __u.__a[3] = __Z;
616
617  return __u.__v;
618}
619
620#ifdef __x86_64__
621/* Create the vector [Z Y].  */
622static __inline __m128i
623_mm_set_epi64x (long long __Z, long long __Y)
624{
625  union {
626    long __a[2];
627    __m128i __v;
628  } __u;
629
630  __u.__a[0] = __Y;
631  __u.__a[1] = __Z;
632
633  return __u.__v;
634}
635#endif
636
/* Create the vector [Z Y X W V U T S].  */
638static __inline __m128i
639_mm_set_epi16 (short __Z, short __Y, short __X, short __W,
640	       short __V, short __U, short __T, short __S)
641{
642  union {
643    short __a[8];
644    __m128i __v;
645  } __u;
646
647  __u.__a[0] = __S;
648  __u.__a[1] = __T;
649  __u.__a[2] = __U;
650  __u.__a[3] = __V;
651  __u.__a[4] = __W;
652  __u.__a[5] = __X;
653  __u.__a[6] = __Y;
654  __u.__a[7] = __Z;
655
656  return __u.__v;
657}
658
/* Create the vector [Z Y X W V U T S Z1 Y1 X1 W1 V1 U1 T1 S1].  */
660static __inline __m128i
661_mm_set_epi8 (char __Z, char __Y, char __X, char __W,
662	      char __V, char __U, char __T, char __S,
663	      char __Z1, char __Y1, char __X1, char __W1,
664	      char __V1, char __U1, char __T1, char __S1)
665{
666  union {
667    char __a[16];
668    __m128i __v;
669  } __u;
670
671  __u.__a[0] = __S1;
672  __u.__a[1] = __T1;
673  __u.__a[2] = __U1;
674  __u.__a[3] = __V1;
675  __u.__a[4] = __W1;
676  __u.__a[5] = __X1;
677  __u.__a[6] = __Y1;
678  __u.__a[7] = __Z1;
679  __u.__a[8] = __S;
680  __u.__a[9] = __T;
681  __u.__a[10] = __U;
682  __u.__a[11] = __V;
683  __u.__a[12] = __W;
684  __u.__a[13] = __X;
685  __u.__a[14] = __Y;
686  __u.__a[15] = __Z;
687
688  return __u.__v;
689}
690
691static __inline __m128i
692_mm_set1_epi64 (__m64 __A)
693{
694  __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
695  return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp);
696}
697
698static __inline __m128i
699_mm_set1_epi32 (int __A)
700{
701  __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__A);
702  return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
703}
704
705#ifdef __x86_64__
706static __inline __m128i
707_mm_set1_epi64x (long long __A)
708{
709  __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
710  return (__m128i) __builtin_ia32_shufpd ((__v2df)__tmp, (__v2df)__tmp, _MM_SHUFFLE2 (0,0));
711}
712#endif
713
714static __inline __m128i
715_mm_set1_epi16 (short __A)
716{
717  int __Acopy = (unsigned short)__A;
718  __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy);
719  __tmp = (__v4si)__builtin_ia32_punpcklwd128 ((__v8hi)__tmp, (__v8hi)__tmp);
720  return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
721}
722
723static __inline __m128i
724_mm_set1_epi8 (char __A)
725{
726  int __Acopy = (unsigned char)__A;
727  __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy);
728  __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp);
729  __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp);
730  return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
731}
732
733static __inline __m128i
734_mm_setr_epi64 (__m64 __A,  __m64 __B)
735{
736  __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
737  __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B);
738  return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp2);
739}
740
741/* Create the vector [Z Y X W].  */
742static __inline __m128i
743_mm_setr_epi32 (int __W, int __X, int __Y, int __Z)
744{
745  union {
746    int __a[4];
747    __m128i __v;
748  } __u;
749
750  __u.__a[0] = __W;
751  __u.__a[1] = __X;
752  __u.__a[2] = __Y;
753  __u.__a[3] = __Z;
754
755  return __u.__v;
756}
/* Create the vector [Z Y X W V U T S].  */
758static __inline __m128i
759_mm_setr_epi16 (short __S, short __T, short __U, short __V,
760	        short __W, short __X, short __Y, short __Z)
761{
762  union {
763    short __a[8];
764    __m128i __v;
765  } __u;
766
767  __u.__a[0] = __S;
768  __u.__a[1] = __T;
769  __u.__a[2] = __U;
770  __u.__a[3] = __V;
771  __u.__a[4] = __W;
772  __u.__a[5] = __X;
773  __u.__a[6] = __Y;
774  __u.__a[7] = __Z;
775
776  return __u.__v;
777}
778
/* Create the vector [Z Y X W V U T S Z1 Y1 X1 W1 V1 U1 T1 S1].  */
780static __inline __m128i
781_mm_setr_epi8 (char __S1, char __T1, char __U1, char __V1,
782	       char __W1, char __X1, char __Y1, char __Z1,
783	       char __S, char __T, char __U, char __V,
784	       char __W, char __X, char __Y, char __Z)
785{
786  union {
787    char __a[16];
788    __m128i __v;
789  } __u;
790
791  __u.__a[0] = __S1;
792  __u.__a[1] = __T1;
793  __u.__a[2] = __U1;
794  __u.__a[3] = __V1;
795  __u.__a[4] = __W1;
796  __u.__a[5] = __X1;
797  __u.__a[6] = __Y1;
798  __u.__a[7] = __Z1;
799  __u.__a[8] = __S;
800  __u.__a[9] = __T;
801  __u.__a[10] = __U;
802  __u.__a[11] = __V;
803  __u.__a[12] = __W;
804  __u.__a[13] = __X;
805  __u.__a[14] = __Y;
806  __u.__a[15] = __Z;
807
808  return __u.__v;
809}
810
811static __inline __m128d
812_mm_cvtepi32_pd (__m128i __A)
813{
814  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
815}
816
817static __inline __m128
818_mm_cvtepi32_ps (__m128i __A)
819{
820  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
821}
822
823static __inline __m128i
824_mm_cvtpd_epi32 (__m128d __A)
825{
826  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
827}
828
829static __inline __m64
830_mm_cvtpd_pi32 (__m128d __A)
831{
832  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
833}
834
835static __inline __m128
836_mm_cvtpd_ps (__m128d __A)
837{
838  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
839}
840
841static __inline __m128i
842_mm_cvttpd_epi32 (__m128d __A)
843{
844  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
845}
846
847static __inline __m64
848_mm_cvttpd_pi32 (__m128d __A)
849{
850  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
851}
852
853static __inline __m128d
854_mm_cvtpi32_pd (__m64 __A)
855{
856  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
857}
858
859static __inline __m128i
860_mm_cvtps_epi32 (__m128 __A)
861{
862  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
863}
864
865static __inline __m128i
866_mm_cvttps_epi32 (__m128 __A)
867{
868  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
869}
870
871static __inline __m128d
872_mm_cvtps_pd (__m128 __A)
873{
874  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
875}
876
877static __inline int
878_mm_cvtsd_si32 (__m128d __A)
879{
880  return __builtin_ia32_cvtsd2si ((__v2df) __A);
881}
882
883#ifdef __x86_64__
884static __inline long long
885_mm_cvtsd_si64x (__m128d __A)
886{
887  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
888}
889#endif
890
891static __inline int
892_mm_cvttsd_si32 (__m128d __A)
893{
894  return __builtin_ia32_cvttsd2si ((__v2df) __A);
895}
896
897#ifdef __x86_64__
898static __inline long long
899_mm_cvttsd_si64x (__m128d __A)
900{
901  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
902}
903#endif
904
905static __inline __m128
906_mm_cvtsd_ss (__m128 __A, __m128d __B)
907{
908  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
909}
910
911static __inline __m128d
912_mm_cvtsi32_sd (__m128d __A, int __B)
913{
914  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
915}
916
917#ifdef __x86_64__
918static __inline __m128d
919_mm_cvtsi64x_sd (__m128d __A, long long __B)
920{
921  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
922}
923#endif
924
925static __inline __m128d
926_mm_cvtss_sd (__m128d __A, __m128 __B)
927{
928  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
929}
930
/* Apply the shufpd builtin to A and B with selector C (see
   _MM_SHUFFLE2).  Every macro argument is parenthesized so the casts
   apply to the whole argument expression, not just its first operand.  */
#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)(__A), (__v2df)(__B), (__C)))
932
933static __inline __m128d
934_mm_unpackhi_pd (__m128d __A, __m128d __B)
935{
936  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
937}
938
939static __inline __m128d
940_mm_unpacklo_pd (__m128d __A, __m128d __B)
941{
942  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
943}
944
945static __inline __m128d
946_mm_loadh_pd (__m128d __A, double const *__B)
947{
948  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, (__v2si *)__B);
949}
950
951static __inline void
952_mm_storeh_pd (double *__A, __m128d __B)
953{
954  __builtin_ia32_storehpd ((__v2si *)__A, (__v2df)__B);
955}
956
957static __inline __m128d
958_mm_loadl_pd (__m128d __A, double const *__B)
959{
960  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, (__v2si *)__B);
961}
962
963static __inline void
964_mm_storel_pd (double *__A, __m128d __B)
965{
966  __builtin_ia32_storelpd ((__v2si *)__A, (__v2df)__B);
967}
968
969static __inline int
970_mm_movemask_pd (__m128d __A)
971{
972  return __builtin_ia32_movmskpd ((__v2df)__A);
973}
974
975static __inline __m128i
976_mm_packs_epi16 (__m128i __A, __m128i __B)
977{
978  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
979}
980
981static __inline __m128i
982_mm_packs_epi32 (__m128i __A, __m128i __B)
983{
984  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
985}
986
987static __inline __m128i
988_mm_packus_epi16 (__m128i __A, __m128i __B)
989{
990  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
991}
992
993static __inline __m128i
994_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
995{
996  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
997}
998
999static __inline __m128i
1000_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1001{
1002  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
1003}
1004
1005static __inline __m128i
1006_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1007{
1008  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
1009}
1010
1011static __inline __m128i
1012_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1013{
1014  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
1015}
1016
1017static __inline __m128i
1018_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1019{
1020  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
1021}
1022
1023static __inline __m128i
1024_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1025{
1026  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
1027}
1028
1029static __inline __m128i
1030_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1031{
1032  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
1033}
1034
1035static __inline __m128i
1036_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1037{
1038  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
1039}
1040
1041static __inline __m128i
1042_mm_add_epi8 (__m128i __A, __m128i __B)
1043{
1044  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
1045}
1046
1047static __inline __m128i
1048_mm_add_epi16 (__m128i __A, __m128i __B)
1049{
1050  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
1051}
1052
1053static __inline __m128i
1054_mm_add_epi32 (__m128i __A, __m128i __B)
1055{
1056  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
1057}
1058
1059static __inline __m128i
1060_mm_add_epi64 (__m128i __A, __m128i __B)
1061{
1062  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
1063}
1064
1065static __inline __m128i
1066_mm_adds_epi8 (__m128i __A, __m128i __B)
1067{
1068  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
1069}
1070
1071static __inline __m128i
1072_mm_adds_epi16 (__m128i __A, __m128i __B)
1073{
1074  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
1075}
1076
1077static __inline __m128i
1078_mm_adds_epu8 (__m128i __A, __m128i __B)
1079{
1080  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
1081}
1082
1083static __inline __m128i
1084_mm_adds_epu16 (__m128i __A, __m128i __B)
1085{
1086  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
1087}
1088
1089static __inline __m128i
1090_mm_sub_epi8 (__m128i __A, __m128i __B)
1091{
1092  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
1093}
1094
1095static __inline __m128i
1096_mm_sub_epi16 (__m128i __A, __m128i __B)
1097{
1098  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
1099}
1100
1101static __inline __m128i
1102_mm_sub_epi32 (__m128i __A, __m128i __B)
1103{
1104  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
1105}
1106
1107static __inline __m128i
1108_mm_sub_epi64 (__m128i __A, __m128i __B)
1109{
1110  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
1111}
1112
1113static __inline __m128i
1114_mm_subs_epi8 (__m128i __A, __m128i __B)
1115{
1116  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
1117}
1118
1119static __inline __m128i
1120_mm_subs_epi16 (__m128i __A, __m128i __B)
1121{
1122  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
1123}
1124
1125static __inline __m128i
1126_mm_subs_epu8 (__m128i __A, __m128i __B)
1127{
1128  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
1129}
1130
1131static __inline __m128i
1132_mm_subs_epu16 (__m128i __A, __m128i __B)
1133{
1134  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
1135}
1136
1137static __inline __m128i
1138_mm_madd_epi16 (__m128i __A, __m128i __B)
1139{
1140  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
1141}
1142
1143static __inline __m128i
1144_mm_mulhi_epi16 (__m128i __A, __m128i __B)
1145{
1146  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
1147}
1148
1149static __inline __m128i
1150_mm_mullo_epi16 (__m128i __A, __m128i __B)
1151{
1152  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
1153}
1154
1155static __inline __m64
1156_mm_mul_su32 (__m64 __A, __m64 __B)
1157{
1158  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
1159}
1160
1161static __inline __m128i
1162_mm_mul_epu32 (__m128i __A, __m128i __B)
1163{
1164  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
1165}
1166
1167static __inline __m128i
1168_mm_sll_epi16 (__m128i __A, __m128i __B)
1169{
1170  return (__m128i)__builtin_ia32_psllw128 ((__v8hi)__A, (__v2di)__B);
1171}
1172
1173static __inline __m128i
1174_mm_sll_epi32 (__m128i __A, __m128i __B)
1175{
1176  return (__m128i)__builtin_ia32_pslld128 ((__v4si)__A, (__v2di)__B);
1177}
1178
1179static __inline __m128i
1180_mm_sll_epi64 (__m128i __A, __m128i __B)
1181{
1182  return (__m128i)__builtin_ia32_psllq128 ((__v2di)__A, (__v2di)__B);
1183}
1184
1185static __inline __m128i
1186_mm_sra_epi16 (__m128i __A, __m128i __B)
1187{
1188  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v2di)__B);
1189}
1190
1191static __inline __m128i
1192_mm_sra_epi32 (__m128i __A, __m128i __B)
1193{
1194  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v2di)__B);
1195}
1196
1197static __inline __m128i
1198_mm_srl_epi16 (__m128i __A, __m128i __B)
1199{
1200  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v2di)__B);
1201}
1202
1203static __inline __m128i
1204_mm_srl_epi32 (__m128i __A, __m128i __B)
1205{
1206  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v2di)__B);
1207}
1208
1209static __inline __m128i
1210_mm_srl_epi64 (__m128i __A, __m128i __B)
1211{
1212  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
1213}
1214
1215static __inline __m128i
1216_mm_slli_epi16 (__m128i __A, int __B)
1217{
1218  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
1219}
1220
1221static __inline __m128i
1222_mm_slli_epi32 (__m128i __A, int __B)
1223{
1224  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
1225}
1226
1227static __inline __m128i
1228_mm_slli_epi64 (__m128i __A, int __B)
1229{
1230  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
1231}
1232
1233static __inline __m128i
1234_mm_srai_epi16 (__m128i __A, int __B)
1235{
1236  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
1237}
1238
1239static __inline __m128i
1240_mm_srai_epi32 (__m128i __A, int __B)
1241{
1242  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
1243}
1244
#if 0
/* Function forms of the whole-register shifts, kept for reference only;
   the macros defined right after this disabled region are what callers
   actually get.  NOTE(review): presumably disabled because the builtins
   need __B to be a compile-time constant -- confirm before enabling.  */
static __m128i __attribute__((__always_inline__))
_mm_srli_si128 (__m128i __A, const int __B)
{
  return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B));
}

/* Left-shift counterpart; it wraps the PSLLDQ builtin and is therefore
   named _mm_slli_si128, matching the macro of the same name.  */
static __m128i __attribute__((__always_inline__))
_mm_slli_si128 (__m128i __A, const int __B)
{
  return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B));
}
#endif
/* Shift the whole 128-bit value __A right (PSRLDQ) or left (PSLLDQ) by
   __B.  These are macros, so __B reaches the builtin unchanged.  */
#define _mm_srli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__A), (__B)))
#define _mm_slli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__A), (__B)))
1260
1261static __inline __m128i
1262_mm_srli_epi16 (__m128i __A, int __B)
1263{
1264  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
1265}
1266
1267static __inline __m128i
1268_mm_srli_epi32 (__m128i __A, int __B)
1269{
1270  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
1271}
1272
1273static __inline __m128i
1274_mm_srli_epi64 (__m128i __A, int __B)
1275{
1276  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
1277}
1278
1279static __inline __m128i
1280_mm_and_si128 (__m128i __A, __m128i __B)
1281{
1282  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
1283}
1284
1285static __inline __m128i
1286_mm_andnot_si128 (__m128i __A, __m128i __B)
1287{
1288  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
1289}
1290
1291static __inline __m128i
1292_mm_or_si128 (__m128i __A, __m128i __B)
1293{
1294  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
1295}
1296
1297static __inline __m128i
1298_mm_xor_si128 (__m128i __A, __m128i __B)
1299{
1300  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
1301}
1302
1303static __inline __m128i
1304_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1305{
1306  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
1307}
1308
1309static __inline __m128i
1310_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1311{
1312  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
1313}
1314
1315static __inline __m128i
1316_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1317{
1318  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
1319}
1320
1321static __inline __m128i
1322_mm_cmplt_epi8 (__m128i __A, __m128i __B)
1323{
1324  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
1325}
1326
1327static __inline __m128i
1328_mm_cmplt_epi16 (__m128i __A, __m128i __B)
1329{
1330  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
1331}
1332
1333static __inline __m128i
1334_mm_cmplt_epi32 (__m128i __A, __m128i __B)
1335{
1336  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
1337}
1338
1339static __inline __m128i
1340_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1341{
1342  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
1343}
1344
1345static __inline __m128i
1346_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1347{
1348  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
1349}
1350
1351static __inline __m128i
1352_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1353{
1354  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
1355}
1356
/* Extract the 16-bit element of __A selected by __B (PEXTRW).  The
   arguments are parenthesized so that the cast applies to the whole
   __A expression rather than only to its first operand, and the
   expansion is wrapped so it behaves as a single operand.  */
#define _mm_extract_epi16(__A, __B) \
  (__builtin_ia32_pextrw128 ((__v8hi)(__A), (__B)))
1358
/* Return __A with the 16-bit element selected by __C replaced by __B
   (PINSRW).  The arguments are parenthesized so that the cast applies
   to the whole __A expression rather than only to its first operand.  */
#define _mm_insert_epi16(__A, __B, __C) \
  ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)(__A), (__B), (__C)))
1360
1361static __inline __m128i
1362_mm_max_epi16 (__m128i __A, __m128i __B)
1363{
1364  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
1365}
1366
1367static __inline __m128i
1368_mm_max_epu8 (__m128i __A, __m128i __B)
1369{
1370  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
1371}
1372
1373static __inline __m128i
1374_mm_min_epi16 (__m128i __A, __m128i __B)
1375{
1376  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
1377}
1378
1379static __inline __m128i
1380_mm_min_epu8 (__m128i __A, __m128i __B)
1381{
1382  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
1383}
1384
1385static __inline int
1386_mm_movemask_epi8 (__m128i __A)
1387{
1388  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
1389}
1390
1391static __inline __m128i
1392_mm_mulhi_epu16 (__m128i __A, __m128i __B)
1393{
1394  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
1395}
1396
/* Shuffle elements of __A under control of the selector __B: the high
   four 16-bit elements (PSHUFHW), the low four 16-bit elements
   (PSHUFLW), or all four 32-bit elements (PSHUFD).  The arguments are
   parenthesized so that the cast applies to the whole __A expression
   rather than only to its first operand.  */
#define _mm_shufflehi_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__A), (__B)))
#define _mm_shufflelo_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__A), (__B)))
#define _mm_shuffle_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__A), (__B)))
1400
1401static __inline void
1402_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
1403{
1404  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
1405}
1406
1407static __inline __m128i
1408_mm_avg_epu8 (__m128i __A, __m128i __B)
1409{
1410  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
1411}
1412
1413static __inline __m128i
1414_mm_avg_epu16 (__m128i __A, __m128i __B)
1415{
1416  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
1417}
1418
1419static __inline __m128i
1420_mm_sad_epu8 (__m128i __A, __m128i __B)
1421{
1422  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
1423}
1424
/* Store the integer __B to *__A using the non-temporal MOVNTI store.  */
static __inline void
_mm_stream_si32 (int *__A, int __B)
{
  int *__dst = __A;
  int __value = __B;

  __builtin_ia32_movnti (__dst, __value);
}
1430
1431static __inline void
1432_mm_stream_si128 (__m128i *__A, __m128i __B)
1433{
1434  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
1435}
1436
1437static __inline void
1438_mm_stream_pd (double *__A, __m128d __B)
1439{
1440  __builtin_ia32_movntpd (__A, (__v2df)__B);
1441}
1442
1443static __inline __m128i
1444_mm_movpi64_epi64 (__m64 __A)
1445{
1446  return (__m128i)__builtin_ia32_movq2dq ((unsigned long long)__A);
1447}
1448
/* Flush the cache line containing the address __A (CLFLUSH).  The
   builtin is invoked as a plain statement: ISO C does not allow a
   return statement with an expression in a function returning void.  */
static __inline void
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}
1454
/* Issue a load fence (LFENCE).  */
static __inline void _mm_lfence (void)
{
  __builtin_ia32_lfence ();
}
1460
/* Issue a full memory fence (MFENCE).  */
static __inline void _mm_mfence (void)
{
  __builtin_ia32_mfence ();
}
1466
1467static __inline __m128i
1468_mm_cvtsi32_si128 (int __A)
1469{
1470  return (__m128i) __builtin_ia32_loadd (&__A);
1471}
1472
1473#ifdef __x86_64__
1474static __inline __m128i
1475_mm_cvtsi64x_si128 (long long __A)
1476{
1477  return (__m128i) __builtin_ia32_movq2dq (__A);
1478}
1479#endif
1480
1481static __inline int
1482_mm_cvtsi128_si32 (__m128i __A)
1483{
1484  int __tmp;
1485  __builtin_ia32_stored (&__tmp, (__v4si)__A);
1486  return __tmp;
1487}
1488
1489#ifdef __x86_64__
1490static __inline long long
1491_mm_cvtsi128_si64x (__m128i __A)
1492{
1493  return __builtin_ia32_movdq2q ((__v2di)__A);
1494}
1495#endif
1496
1497#endif /* __SSE2__  */
1498
1499#endif /* _EMMINTRIN_H_INCLUDED */
1500