emmintrin.h revision 146895
1194955Strasz/* Copyright (C) 2003, 2004 Free Software Foundation, Inc.
2194955Strasz
3194955Strasz   This file is part of GCC.
4194955Strasz
5194955Strasz   GCC is free software; you can redistribute it and/or modify
6194955Strasz   it under the terms of the GNU General Public License as published by
7194955Strasz   the Free Software Foundation; either version 2, or (at your option)
8194955Strasz   any later version.
9194955Strasz
10194955Strasz   GCC is distributed in the hope that it will be useful,
11194955Strasz   but WITHOUT ANY WARRANTY; without even the implied warranty of
12194955Strasz   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13194955Strasz   GNU General Public License for more details.
14194955Strasz
15194955Strasz   You should have received a copy of the GNU General Public License
16194955Strasz   along with GCC; see the file COPYING.  If not, write to
17194955Strasz   the Free Software Foundation, 59 Temple Place - Suite 330,
18194955Strasz   Boston, MA 02111-1307, USA.  */
19194955Strasz
20194955Strasz/* As a special exception, if you include this header file into source
21194955Strasz   files compiled by GCC, this header file does not by itself cause
22194955Strasz   the resulting executable to be covered by the GNU General Public
23194955Strasz   License.  This exception does not however invalidate any other
24194955Strasz   reasons why the executable file might be covered by the GNU General
25194955Strasz   Public License.  */
26194955Strasz
27194955Strasz/* Implemented from the specification included in the Intel C++ Compiler
28194955Strasz   User Guide and Reference, version 8.0.  */
29194955Strasz
30194955Strasz#ifndef _EMMINTRIN_H_INCLUDED
31194955Strasz#define _EMMINTRIN_H_INCLUDED
32194955Strasz
33194955Strasz#ifdef __SSE2__
34194955Strasz#include <xmmintrin.h>
35194955Strasz
/* SSE2 vector types, declared through GCC machine modes.  The mode
   name gives the element count and element mode of each vector.  */
typedef int __v16qi __attribute__ ((mode (V16QI)));	/* 16 x QImode */
typedef int __v8hi __attribute__ ((mode (V8HI)));	/* 8 x HImode */
typedef int __v4si __attribute__ ((mode (V4SI)));	/* 4 x SImode */
typedef int __v2di __attribute__ ((mode (V2DI)));	/* 2 x DImode */
typedef double __v2df __attribute__ ((mode (V2DF)));	/* 2 x DFmode */
42194955Strasz
/* Build the two-bit selector used with the SHUFPD instruction:
   FP1 is placed in bit 1 and FP0 in bit 0.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 ((fp0) | ((fp1) << 1))
46194955Strasz
/* The public SSE2 type names are plain aliases of the internal
   vector types: __m128d is __v2df and __m128i is __v2di.  */
#define __m128d __v2df
#define __m128i __v2di
49194955Strasz
50194955Strasz/* Create a vector with element 0 as *P and the rest zero.  */
51290893Sngiestatic __inline __m128d
52194955Strasz_mm_load_sd (double const *__P)
53194955Strasz{
54194955Strasz  return (__m128d) __builtin_ia32_loadsd (__P);
55194955Strasz}
56194955Strasz
57194955Strasz/* Create a vector with all two elements equal to *P.  */
58194955Straszstatic __inline __m128d
59194955Strasz_mm_load1_pd (double const *__P)
60194955Strasz{
61194955Strasz  __v2df __tmp = __builtin_ia32_loadsd (__P);
62194955Strasz  return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
63194955Strasz}
64194955Strasz
65194955Straszstatic __inline __m128d
66194955Strasz_mm_load_pd1 (double const *__P)
67194955Strasz{
68194955Strasz  return _mm_load1_pd (__P);
69194955Strasz}
70194955Strasz
71194955Strasz/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
72194955Straszstatic __inline __m128d
73194955Strasz_mm_load_pd (double const *__P)
74220465Strasz{
75220465Strasz  return (__m128d) __builtin_ia32_loadapd (__P);
76220465Strasz}
77220465Strasz
78194955Strasz/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
79194955Straszstatic __inline __m128d
80194955Strasz_mm_loadu_pd (double const *__P)
81194955Strasz{
82194955Strasz  return (__m128d) __builtin_ia32_loadupd (__P);
83194955Strasz}
84309485Sngie
85194955Strasz/* Load two DPFP values in reverse order.  The address must be aligned.  */
86194955Straszstatic __inline __m128d
87194955Strasz_mm_loadr_pd (double const *__P)
88194955Strasz{
89194955Strasz  __v2df __tmp = __builtin_ia32_loadapd (__P);
90194955Strasz  return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
91194955Strasz}
92194955Strasz
93194955Strasz/* Create a vector with element 0 as F and the rest zero.  */
94194955Straszstatic __inline __m128d
95194955Strasz_mm_set_sd (double __F)
96194955Strasz{
97194955Strasz  return (__m128d) __builtin_ia32_loadsd (&__F);
98194955Strasz}
99194955Strasz
100194955Strasz/* Create a vector with all two elements equal to F.  */
101194955Straszstatic __inline __m128d
102194955Strasz_mm_set1_pd (double __F)
103194955Strasz{
104194955Strasz  __v2df __tmp = __builtin_ia32_loadsd (&__F);
105194955Strasz  return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
106194955Strasz}
107194955Strasz
108194955Straszstatic __inline __m128d
109194955Strasz_mm_set_pd1 (double __F)
110194955Strasz{
111194955Strasz  return _mm_set1_pd (__F);
112194955Strasz}
113194955Strasz
114194955Strasz/* Create the vector [Z Y].  */
115194955Straszstatic __inline __m128d
116194955Strasz_mm_set_pd (double __Z, double __Y)
117194955Strasz{
118194955Strasz  return (__v2df) {__Y, __Z};
119194955Strasz}
120194955Strasz
121194955Strasz/* Create the vector [Y Z].  */
122194955Straszstatic __inline __m128d
123194955Strasz_mm_setr_pd (double __Z, double __Y)
124194955Strasz{
125220465Strasz  return _mm_set_pd (__Y, __Z);
126194955Strasz}
127194955Strasz
128194955Strasz/* Create a vector of zeros.  */
129194955Straszstatic __inline __m128d
130194955Strasz_mm_setzero_pd (void)
131194955Strasz{
132194955Strasz  return (__m128d) __builtin_ia32_setzeropd ();
133194955Strasz}
134194955Strasz
135194955Strasz/* Stores the lower DPFP value.  */
136194955Straszstatic __inline void
137194955Strasz_mm_store_sd (double *__P, __m128d __A)
138194955Strasz{
139194955Strasz  __builtin_ia32_storesd (__P, (__v2df)__A);
140194955Strasz}
141194955Strasz
142194955Strasz/* Store the lower DPFP value across two words.  */
143194955Straszstatic __inline void
144194955Strasz_mm_store1_pd (double *__P, __m128d __A)
145194955Strasz{
146194955Strasz  __v2df __va = (__v2df)__A;
147194955Strasz  __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,0));
148194955Strasz  __builtin_ia32_storeapd (__P, __tmp);
149194955Strasz}
150194955Strasz
151194955Straszstatic __inline void
152194955Strasz_mm_store_pd1 (double *__P, __m128d __A)
153194955Strasz{
154194955Strasz  _mm_store1_pd (__P, __A);
155194955Strasz}
156194955Strasz
157194955Strasz/* Store two DPFP values.  The address must be 16-byte aligned.  */
158194955Straszstatic __inline void
159194955Strasz_mm_store_pd (double *__P, __m128d __A)
160194955Strasz{
161194955Strasz  __builtin_ia32_storeapd (__P, (__v2df)__A);
162194955Strasz}
163194955Strasz
164194955Strasz/* Store two DPFP values.  The address need not be 16-byte aligned.  */
165194955Straszstatic __inline void
166194955Strasz_mm_storeu_pd (double *__P, __m128d __A)
167194955Strasz{
168194955Strasz  __builtin_ia32_storeupd (__P, (__v2df)__A);
169194955Strasz}
170194955Strasz
171194955Strasz/* Store two DPFP values in reverse order.  The address must be aligned.  */
172194955Straszstatic __inline void
173194955Strasz_mm_storer_pd (double *__P, __m128d __A)
174194955Strasz{
175194955Strasz  __v2df __va = (__v2df)__A;
176194955Strasz  __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,1));
177194955Strasz  __builtin_ia32_storeapd (__P, __tmp);
178194955Strasz}
179194955Strasz
180194955Strasz/* Sets the low DPFP value of A from the low value of B.  */
181194955Straszstatic __inline __m128d
182194955Strasz_mm_move_sd (__m128d __A, __m128d __B)
183194955Strasz{
184194955Strasz  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
185194955Strasz}
186194955Strasz
187194955Strasz
188194955Straszstatic __inline __m128d
189194955Strasz_mm_add_pd (__m128d __A, __m128d __B)
190194955Strasz{
191194955Strasz  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
192194955Strasz}
193194955Strasz
194194955Straszstatic __inline __m128d
195194955Strasz_mm_add_sd (__m128d __A, __m128d __B)
196194955Strasz{
197194955Strasz  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
198194955Strasz}
199194955Strasz
200194955Straszstatic __inline __m128d
201194955Strasz_mm_sub_pd (__m128d __A, __m128d __B)
202194955Strasz{
203194955Strasz  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
204194955Strasz}
205194955Strasz
206194955Straszstatic __inline __m128d
207194955Strasz_mm_sub_sd (__m128d __A, __m128d __B)
208194955Strasz{
209194955Strasz  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
210194955Strasz}
211194955Strasz
212194955Straszstatic __inline __m128d
213194955Strasz_mm_mul_pd (__m128d __A, __m128d __B)
214194955Strasz{
215194955Strasz  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
216194955Strasz}
217194955Strasz
218194955Straszstatic __inline __m128d
219194955Strasz_mm_mul_sd (__m128d __A, __m128d __B)
220194955Strasz{
221194955Strasz  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
222194955Strasz}
223194955Strasz
224194955Straszstatic __inline __m128d
225194955Strasz_mm_div_pd (__m128d __A, __m128d __B)
226194955Strasz{
227194955Strasz  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
228194955Strasz}
229194955Strasz
230194955Straszstatic __inline __m128d
231194955Strasz_mm_div_sd (__m128d __A, __m128d __B)
232194955Strasz{
233194955Strasz  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
234194955Strasz}
235194955Strasz
236194955Straszstatic __inline __m128d
237194955Strasz_mm_sqrt_pd (__m128d __A)
238194955Strasz{
239194955Strasz  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
240194955Strasz}
241194955Strasz
/* Return pair {sqrt (B[0]), A[1]}.  */
243194955Straszstatic __inline __m128d
244194955Strasz_mm_sqrt_sd (__m128d __A, __m128d __B)
245194955Strasz{
246194955Strasz  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
247194955Strasz  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
248194955Strasz}
249194955Strasz
250194955Straszstatic __inline __m128d
251194955Strasz_mm_min_pd (__m128d __A, __m128d __B)
252194955Strasz{
253194955Strasz  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
254194955Strasz}
255194955Strasz
256194955Straszstatic __inline __m128d
257194955Strasz_mm_min_sd (__m128d __A, __m128d __B)
258194955Strasz{
259194955Strasz  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
260194955Strasz}
261194955Strasz
262static __inline __m128d
263_mm_max_pd (__m128d __A, __m128d __B)
264{
265  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
266}
267
268static __inline __m128d
269_mm_max_sd (__m128d __A, __m128d __B)
270{
271  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
272}
273
274static __inline __m128d
275_mm_and_pd (__m128d __A, __m128d __B)
276{
277  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
278}
279
280static __inline __m128d
281_mm_andnot_pd (__m128d __A, __m128d __B)
282{
283  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
284}
285
286static __inline __m128d
287_mm_or_pd (__m128d __A, __m128d __B)
288{
289  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
290}
291
292static __inline __m128d
293_mm_xor_pd (__m128d __A, __m128d __B)
294{
295  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
296}
297
298static __inline __m128d
299_mm_cmpeq_pd (__m128d __A, __m128d __B)
300{
301  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
302}
303
304static __inline __m128d
305_mm_cmplt_pd (__m128d __A, __m128d __B)
306{
307  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
308}
309
310static __inline __m128d
311_mm_cmple_pd (__m128d __A, __m128d __B)
312{
313  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
314}
315
316static __inline __m128d
317_mm_cmpgt_pd (__m128d __A, __m128d __B)
318{
319  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
320}
321
322static __inline __m128d
323_mm_cmpge_pd (__m128d __A, __m128d __B)
324{
325  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
326}
327
328static __inline __m128d
329_mm_cmpneq_pd (__m128d __A, __m128d __B)
330{
331  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
332}
333
334static __inline __m128d
335_mm_cmpnlt_pd (__m128d __A, __m128d __B)
336{
337  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
338}
339
340static __inline __m128d
341_mm_cmpnle_pd (__m128d __A, __m128d __B)
342{
343  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
344}
345
346static __inline __m128d
347_mm_cmpngt_pd (__m128d __A, __m128d __B)
348{
349  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
350}
351
352static __inline __m128d
353_mm_cmpnge_pd (__m128d __A, __m128d __B)
354{
355  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
356}
357
358static __inline __m128d
359_mm_cmpord_pd (__m128d __A, __m128d __B)
360{
361  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
362}
363
364static __inline __m128d
365_mm_cmpunord_pd (__m128d __A, __m128d __B)
366{
367  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
368}
369
370static __inline __m128d
371_mm_cmpeq_sd (__m128d __A, __m128d __B)
372{
373  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
374}
375
376static __inline __m128d
377_mm_cmplt_sd (__m128d __A, __m128d __B)
378{
379  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
380}
381
382static __inline __m128d
383_mm_cmple_sd (__m128d __A, __m128d __B)
384{
385  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
386}
387
388static __inline __m128d
389_mm_cmpgt_sd (__m128d __A, __m128d __B)
390{
391  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
392					 (__v2df)
393					 __builtin_ia32_cmpltsd ((__v2df) __B,
394								 (__v2df)
395								 __A));
396}
397
398static __inline __m128d
399_mm_cmpge_sd (__m128d __A, __m128d __B)
400{
401  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
402					 (__v2df)
403					 __builtin_ia32_cmplesd ((__v2df) __B,
404								 (__v2df)
405								 __A));
406}
407
408static __inline __m128d
409_mm_cmpneq_sd (__m128d __A, __m128d __B)
410{
411  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
412}
413
414static __inline __m128d
415_mm_cmpnlt_sd (__m128d __A, __m128d __B)
416{
417  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
418}
419
420static __inline __m128d
421_mm_cmpnle_sd (__m128d __A, __m128d __B)
422{
423  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
424}
425
426static __inline __m128d
427_mm_cmpngt_sd (__m128d __A, __m128d __B)
428{
429  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
430					 (__v2df)
431					 __builtin_ia32_cmpnltsd ((__v2df) __B,
432								  (__v2df)
433								  __A));
434}
435
436static __inline __m128d
437_mm_cmpnge_sd (__m128d __A, __m128d __B)
438{
439  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
440					 (__v2df)
441					 __builtin_ia32_cmpnlesd ((__v2df) __B,
442								  (__v2df)
443								  __A));
444}
445
446static __inline __m128d
447_mm_cmpord_sd (__m128d __A, __m128d __B)
448{
449  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
450}
451
452static __inline __m128d
453_mm_cmpunord_sd (__m128d __A, __m128d __B)
454{
455  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
456}
457
458static __inline int
459_mm_comieq_sd (__m128d __A, __m128d __B)
460{
461  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
462}
463
464static __inline int
465_mm_comilt_sd (__m128d __A, __m128d __B)
466{
467  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
468}
469
470static __inline int
471_mm_comile_sd (__m128d __A, __m128d __B)
472{
473  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
474}
475
476static __inline int
477_mm_comigt_sd (__m128d __A, __m128d __B)
478{
479  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
480}
481
482static __inline int
483_mm_comige_sd (__m128d __A, __m128d __B)
484{
485  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
486}
487
488static __inline int
489_mm_comineq_sd (__m128d __A, __m128d __B)
490{
491  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
492}
493
494static __inline int
495_mm_ucomieq_sd (__m128d __A, __m128d __B)
496{
497  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
498}
499
500static __inline int
501_mm_ucomilt_sd (__m128d __A, __m128d __B)
502{
503  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
504}
505
506static __inline int
507_mm_ucomile_sd (__m128d __A, __m128d __B)
508{
509  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
510}
511
512static __inline int
513_mm_ucomigt_sd (__m128d __A, __m128d __B)
514{
515  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
516}
517
518static __inline int
519_mm_ucomige_sd (__m128d __A, __m128d __B)
520{
521  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
522}
523
524static __inline int
525_mm_ucomineq_sd (__m128d __A, __m128d __B)
526{
527  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
528}
529
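/* Load and store 128-bit integer vectors.  The _si128 forms without
   the "u" use the aligned (MOVDQA) builtins; the "u" forms use the
   unaligned (MOVDQU) builtins.  */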
531
532static __inline __m128i
533_mm_load_si128 (__m128i const *__P)
534{
535  return (__m128i) __builtin_ia32_loaddqa ((char const *)__P);
536}
537
538static __inline __m128i
539_mm_loadu_si128 (__m128i const *__P)
540{
541  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
542}
543
544static __inline __m128i
545_mm_loadl_epi64 (__m128i const *__P)
546{
547  return (__m128i) __builtin_ia32_movq2dq (*(unsigned long long *)__P);
548}
549
550static __inline void
551_mm_store_si128 (__m128i *__P, __m128i __B)
552{
553  __builtin_ia32_storedqa ((char *)__P, (__v16qi)__B);
554}
555
556static __inline void
557_mm_storeu_si128 (__m128i *__P, __m128i __B)
558{
559  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
560}
561
562static __inline void
563_mm_storel_epi64 (__m128i *__P, __m128i __B)
564{
565  *(long long *)__P = __builtin_ia32_movdq2q ((__v2di)__B);
566}
567
568static __inline __m64
569_mm_movepi64_pi64 (__m128i __B)
570{
571  return (__m64) __builtin_ia32_movdq2q ((__v2di)__B);
572}
573
574static __inline __m128i
575_mm_move_epi64 (__m128i __A)
576{
577  return (__m128i) __builtin_ia32_movq ((__v2di)__A);
578}
579
580/* Create a vector of zeros.  */
581static __inline __m128i
582_mm_setzero_si128 (void)
583{
584  return (__m128i) __builtin_ia32_setzero128 ();
585}
586
587static __inline __m128i
588_mm_set_epi64 (__m64 __A,  __m64 __B)
589{
590  __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
591  __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B);
592  return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp2, __tmp);
593}
594
595/* Create the vector [Z Y X W].  */
596static __inline __m128i
597_mm_set_epi32 (int __Z, int __Y, int __X, int __W)
598{
599  union {
600    int __a[4];
601    __m128i __v;
602  } __u;
603
604  __u.__a[0] = __W;
605  __u.__a[1] = __X;
606  __u.__a[2] = __Y;
607  __u.__a[3] = __Z;
608
609  return __u.__v;
610}
611
612#ifdef __x86_64__
613/* Create the vector [Z Y].  */
614static __inline __m128i
615_mm_set_epi64x (long long __Z, long long __Y)
616{
617  union {
618    long __a[2];
619    __m128i __v;
620  } __u;
621
622  __u.__a[0] = __Y;
623  __u.__a[1] = __Z;
624
625  return __u.__v;
626}
627#endif
628
/* Create the vector [Z Y X W V U T S].  */
630static __inline __m128i
631_mm_set_epi16 (short __Z, short __Y, short __X, short __W,
632	       short __V, short __U, short __T, short __S)
633{
634  union {
635    short __a[8];
636    __m128i __v;
637  } __u;
638
639  __u.__a[0] = __S;
640  __u.__a[1] = __T;
641  __u.__a[2] = __U;
642  __u.__a[3] = __V;
643  __u.__a[4] = __W;
644  __u.__a[5] = __X;
645  __u.__a[6] = __Y;
646  __u.__a[7] = __Z;
647
648  return __u.__v;
649}
650
/* Create the vector [Z Y X W V U T S Z1 Y1 X1 W1 V1 U1 T1 S1].  */
652static __inline __m128i
653_mm_set_epi8 (char __Z, char __Y, char __X, char __W,
654	      char __V, char __U, char __T, char __S,
655	      char __Z1, char __Y1, char __X1, char __W1,
656	      char __V1, char __U1, char __T1, char __S1)
657{
658  union {
659    char __a[16];
660    __m128i __v;
661  } __u;
662
663  __u.__a[0] = __S1;
664  __u.__a[1] = __T1;
665  __u.__a[2] = __U1;
666  __u.__a[3] = __V1;
667  __u.__a[4] = __W1;
668  __u.__a[5] = __X1;
669  __u.__a[6] = __Y1;
670  __u.__a[7] = __Z1;
671  __u.__a[8] = __S;
672  __u.__a[9] = __T;
673  __u.__a[10] = __U;
674  __u.__a[11] = __V;
675  __u.__a[12] = __W;
676  __u.__a[13] = __X;
677  __u.__a[14] = __Y;
678  __u.__a[15] = __Z;
679
680  return __u.__v;
681}
682
683static __inline __m128i
684_mm_set1_epi64 (__m64 __A)
685{
686  __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
687  return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp);
688}
689
690static __inline __m128i
691_mm_set1_epi32 (int __A)
692{
693  __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__A);
694  return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
695}
696
697#ifdef __x86_64__
698static __inline __m128i
699_mm_set1_epi64x (long long __A)
700{
701  __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
702  return (__m128i) __builtin_ia32_shufpd ((__v2df)__tmp, (__v2df)__tmp, _MM_SHUFFLE2 (0,0));
703}
704#endif
705
706static __inline __m128i
707_mm_set1_epi16 (short __A)
708{
709  int __Acopy = (unsigned short)__A;
710  __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy);
711  __tmp = (__v4si)__builtin_ia32_punpcklwd128 ((__v8hi)__tmp, (__v8hi)__tmp);
712  return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
713}
714
715static __inline __m128i
716_mm_set1_epi8 (char __A)
717{
718  int __Acopy = (unsigned char)__A;
719  __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy);
720  __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp);
721  __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp);
722  return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
723}
724
725static __inline __m128i
726_mm_setr_epi64 (__m64 __A,  __m64 __B)
727{
728  __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
729  __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B);
730  return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp2);
731}
732
733/* Create the vector [Z Y X W].  */
734static __inline __m128i
735_mm_setr_epi32 (int __W, int __X, int __Y, int __Z)
736{
737  union {
738    int __a[4];
739    __m128i __v;
740  } __u;
741
742  __u.__a[0] = __W;
743  __u.__a[1] = __X;
744  __u.__a[2] = __Y;
745  __u.__a[3] = __Z;
746
747  return __u.__v;
748}
/* Create the vector [Z Y X W V U T S].  */
750static __inline __m128i
751_mm_setr_epi16 (short __S, short __T, short __U, short __V,
752	        short __W, short __X, short __Y, short __Z)
753{
754  union {
755    short __a[8];
756    __m128i __v;
757  } __u;
758
759  __u.__a[0] = __S;
760  __u.__a[1] = __T;
761  __u.__a[2] = __U;
762  __u.__a[3] = __V;
763  __u.__a[4] = __W;
764  __u.__a[5] = __X;
765  __u.__a[6] = __Y;
766  __u.__a[7] = __Z;
767
768  return __u.__v;
769}
770
/* Create the vector [Z Y X W V U T S Z1 Y1 X1 W1 V1 U1 T1 S1].  */
772static __inline __m128i
773_mm_setr_epi8 (char __S1, char __T1, char __U1, char __V1,
774	       char __W1, char __X1, char __Y1, char __Z1,
775	       char __S, char __T, char __U, char __V,
776	       char __W, char __X, char __Y, char __Z)
777{
778  union {
779    char __a[16];
780    __m128i __v;
781  } __u;
782
783  __u.__a[0] = __S1;
784  __u.__a[1] = __T1;
785  __u.__a[2] = __U1;
786  __u.__a[3] = __V1;
787  __u.__a[4] = __W1;
788  __u.__a[5] = __X1;
789  __u.__a[6] = __Y1;
790  __u.__a[7] = __Z1;
791  __u.__a[8] = __S;
792  __u.__a[9] = __T;
793  __u.__a[10] = __U;
794  __u.__a[11] = __V;
795  __u.__a[12] = __W;
796  __u.__a[13] = __X;
797  __u.__a[14] = __Y;
798  __u.__a[15] = __Z;
799
800  return __u.__v;
801}
802
803static __inline __m128d
804_mm_cvtepi32_pd (__m128i __A)
805{
806  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
807}
808
809static __inline __m128
810_mm_cvtepi32_ps (__m128i __A)
811{
812  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
813}
814
815static __inline __m128i
816_mm_cvtpd_epi32 (__m128d __A)
817{
818  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
819}
820
821static __inline __m64
822_mm_cvtpd_pi32 (__m128d __A)
823{
824  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
825}
826
827static __inline __m128
828_mm_cvtpd_ps (__m128d __A)
829{
830  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
831}
832
833static __inline __m128i
834_mm_cvttpd_epi32 (__m128d __A)
835{
836  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
837}
838
839static __inline __m64
840_mm_cvttpd_pi32 (__m128d __A)
841{
842  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
843}
844
845static __inline __m128d
846_mm_cvtpi32_pd (__m64 __A)
847{
848  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
849}
850
851static __inline __m128i
852_mm_cvtps_epi32 (__m128 __A)
853{
854  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
855}
856
857static __inline __m128i
858_mm_cvttps_epi32 (__m128 __A)
859{
860  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
861}
862
863static __inline __m128d
864_mm_cvtps_pd (__m128 __A)
865{
866  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
867}
868
869static __inline int
870_mm_cvtsd_si32 (__m128d __A)
871{
872  return __builtin_ia32_cvtsd2si ((__v2df) __A);
873}
874
875#ifdef __x86_64__
876static __inline long long
877_mm_cvtsd_si64x (__m128d __A)
878{
879  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
880}
881#endif
882
883static __inline int
884_mm_cvttsd_si32 (__m128d __A)
885{
886  return __builtin_ia32_cvttsd2si ((__v2df) __A);
887}
888
889#ifdef __x86_64__
890static __inline long long
891_mm_cvttsd_si64x (__m128d __A)
892{
893  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
894}
895#endif
896
897static __inline __m128
898_mm_cvtsd_ss (__m128 __A, __m128d __B)
899{
900  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
901}
902
903static __inline __m128d
904_mm_cvtsi32_sd (__m128d __A, int __B)
905{
906  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
907}
908
909#ifdef __x86_64__
910static __inline __m128d
911_mm_cvtsi64x_sd (__m128d __A, long long __B)
912{
913  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
914}
915#endif
916
917static __inline __m128d
918_mm_cvtss_sd (__m128d __A, __m128 __B)
919{
920  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
921}
922
/* Shuffle the DPFP values of A and B under the selector C, which is
   normally built with _MM_SHUFFLE2.  This is a macro rather than a
   function so that C reaches the builtin as written.  Every argument
   is parenthesized so that the casts apply to the whole argument
   expression.  */
#define _mm_shuffle_pd(__A, __B, __C) \
  ((__m128d)__builtin_ia32_shufpd ((__v2df)(__A), (__v2df)(__B), (__C)))
924
925static __inline __m128d
926_mm_unpackhi_pd (__m128d __A, __m128d __B)
927{
928  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
929}
930
931static __inline __m128d
932_mm_unpacklo_pd (__m128d __A, __m128d __B)
933{
934  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
935}
936
937static __inline __m128d
938_mm_loadh_pd (__m128d __A, double const *__B)
939{
940  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, (__v2si *)__B);
941}
942
943static __inline void
944_mm_storeh_pd (double *__A, __m128d __B)
945{
946  __builtin_ia32_storehpd ((__v2si *)__A, (__v2df)__B);
947}
948
949static __inline __m128d
950_mm_loadl_pd (__m128d __A, double const *__B)
951{
952  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, (__v2si *)__B);
953}
954
955static __inline void
956_mm_storel_pd (double *__A, __m128d __B)
957{
958  __builtin_ia32_storelpd ((__v2si *)__A, (__v2df)__B);
959}
960
961static __inline int
962_mm_movemask_pd (__m128d __A)
963{
964  return __builtin_ia32_movmskpd ((__v2df)__A);
965}
966
967static __inline __m128i
968_mm_packs_epi16 (__m128i __A, __m128i __B)
969{
970  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
971}
972
973static __inline __m128i
974_mm_packs_epi32 (__m128i __A, __m128i __B)
975{
976  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
977}
978
979static __inline __m128i
980_mm_packus_epi16 (__m128i __A, __m128i __B)
981{
982  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
983}
984
985static __inline __m128i
986_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
987{
988  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
989}
990
991static __inline __m128i
992_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
993{
994  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
995}
996
997static __inline __m128i
998_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
999{
1000  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
1001}
1002
1003static __inline __m128i
1004_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1005{
1006  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
1007}
1008
1009static __inline __m128i
1010_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1011{
1012  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
1013}
1014
1015static __inline __m128i
1016_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1017{
1018  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
1019}
1020
1021static __inline __m128i
1022_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1023{
1024  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
1025}
1026
1027static __inline __m128i
1028_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1029{
1030  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
1031}
1032
1033static __inline __m128i
1034_mm_add_epi8 (__m128i __A, __m128i __B)
1035{
1036  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
1037}
1038
1039static __inline __m128i
1040_mm_add_epi16 (__m128i __A, __m128i __B)
1041{
1042  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
1043}
1044
1045static __inline __m128i
1046_mm_add_epi32 (__m128i __A, __m128i __B)
1047{
1048  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
1049}
1050
1051static __inline __m128i
1052_mm_add_epi64 (__m128i __A, __m128i __B)
1053{
1054  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
1055}
1056
1057static __inline __m128i
1058_mm_adds_epi8 (__m128i __A, __m128i __B)
1059{
1060  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
1061}
1062
1063static __inline __m128i
1064_mm_adds_epi16 (__m128i __A, __m128i __B)
1065{
1066  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
1067}
1068
1069static __inline __m128i
1070_mm_adds_epu8 (__m128i __A, __m128i __B)
1071{
1072  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
1073}
1074
1075static __inline __m128i
1076_mm_adds_epu16 (__m128i __A, __m128i __B)
1077{
1078  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
1079}
1080
1081static __inline __m128i
1082_mm_sub_epi8 (__m128i __A, __m128i __B)
1083{
1084  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
1085}
1086
1087static __inline __m128i
1088_mm_sub_epi16 (__m128i __A, __m128i __B)
1089{
1090  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
1091}
1092
1093static __inline __m128i
1094_mm_sub_epi32 (__m128i __A, __m128i __B)
1095{
1096  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
1097}
1098
1099static __inline __m128i
1100_mm_sub_epi64 (__m128i __A, __m128i __B)
1101{
1102  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
1103}
1104
1105static __inline __m128i
1106_mm_subs_epi8 (__m128i __A, __m128i __B)
1107{
1108  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
1109}
1110
1111static __inline __m128i
1112_mm_subs_epi16 (__m128i __A, __m128i __B)
1113{
1114  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
1115}
1116
1117static __inline __m128i
1118_mm_subs_epu8 (__m128i __A, __m128i __B)
1119{
1120  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
1121}
1122
1123static __inline __m128i
1124_mm_subs_epu16 (__m128i __A, __m128i __B)
1125{
1126  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
1127}
1128
1129static __inline __m128i
1130_mm_madd_epi16 (__m128i __A, __m128i __B)
1131{
1132  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
1133}
1134
1135static __inline __m128i
1136_mm_mulhi_epi16 (__m128i __A, __m128i __B)
1137{
1138  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
1139}
1140
1141static __inline __m128i
1142_mm_mullo_epi16 (__m128i __A, __m128i __B)
1143{
1144  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
1145}
1146
1147static __inline __m64
1148_mm_mul_su32 (__m64 __A, __m64 __B)
1149{
1150  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
1151}
1152
1153static __inline __m128i
1154_mm_mul_epu32 (__m128i __A, __m128i __B)
1155{
1156  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
1157}
1158
1159static __inline __m128i
1160_mm_sll_epi16 (__m128i __A, __m128i __B)
1161{
1162  return (__m128i)__builtin_ia32_psllw128 ((__v8hi)__A, (__v2di)__B);
1163}
1164
1165static __inline __m128i
1166_mm_sll_epi32 (__m128i __A, __m128i __B)
1167{
1168  return (__m128i)__builtin_ia32_pslld128 ((__v4si)__A, (__v2di)__B);
1169}
1170
1171static __inline __m128i
1172_mm_sll_epi64 (__m128i __A, __m128i __B)
1173{
1174  return (__m128i)__builtin_ia32_psllq128 ((__v2di)__A, (__v2di)__B);
1175}
1176
1177static __inline __m128i
1178_mm_sra_epi16 (__m128i __A, __m128i __B)
1179{
1180  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v2di)__B);
1181}
1182
1183static __inline __m128i
1184_mm_sra_epi32 (__m128i __A, __m128i __B)
1185{
1186  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v2di)__B);
1187}
1188
1189static __inline __m128i
1190_mm_srl_epi16 (__m128i __A, __m128i __B)
1191{
1192  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v2di)__B);
1193}
1194
1195static __inline __m128i
1196_mm_srl_epi32 (__m128i __A, __m128i __B)
1197{
1198  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v2di)__B);
1199}
1200
1201static __inline __m128i
1202_mm_srl_epi64 (__m128i __A, __m128i __B)
1203{
1204  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
1205}
1206
1207static __inline __m128i
1208_mm_slli_epi16 (__m128i __A, int __B)
1209{
1210  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
1211}
1212
1213static __inline __m128i
1214_mm_slli_epi32 (__m128i __A, int __B)
1215{
1216  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
1217}
1218
1219static __inline __m128i
1220_mm_slli_epi64 (__m128i __A, int __B)
1221{
1222  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
1223}
1224
1225static __inline __m128i
1226_mm_srai_epi16 (__m128i __A, int __B)
1227{
1228  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
1229}
1230
1231static __inline __m128i
1232_mm_srai_epi32 (__m128i __A, int __B)
1233{
1234  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
1235}
1236
/* Shifts of the whole 128-bit value, right (psrldqi128) and left
   (pslldqi128).  The inline-function forms are disabled reference code;
   the macros after the #endif are the definitions actually in effect,
   presumably so that the count reaches the builtin as a literal
   constant -- TODO confirm against the builtin's requirements.  */
#if 0
static __m128i __attribute__((__always_inline__))
_mm_srli_si128 (__m128i __A, const int __B)
{
  return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B));
}

static __m128i __attribute__((__always_inline__))
_mm_slli_si128 (__m128i __A, const int __B)
{
  return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B));
}
#endif
#define _mm_srli_si128(__A, __B) ((__m128i)__builtin_ia32_psrldqi128 (__A, __B))
#define _mm_slli_si128(__A, __B) ((__m128i)__builtin_ia32_pslldqi128 (__A, __B))
1252
1253static __inline __m128i
1254_mm_srli_epi16 (__m128i __A, int __B)
1255{
1256  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
1257}
1258
1259static __inline __m128i
1260_mm_srli_epi32 (__m128i __A, int __B)
1261{
1262  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
1263}
1264
1265static __inline __m128i
1266_mm_srli_epi64 (__m128i __A, int __B)
1267{
1268  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
1269}
1270
1271static __inline __m128i
1272_mm_and_si128 (__m128i __A, __m128i __B)
1273{
1274  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
1275}
1276
1277static __inline __m128i
1278_mm_andnot_si128 (__m128i __A, __m128i __B)
1279{
1280  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
1281}
1282
1283static __inline __m128i
1284_mm_or_si128 (__m128i __A, __m128i __B)
1285{
1286  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
1287}
1288
1289static __inline __m128i
1290_mm_xor_si128 (__m128i __A, __m128i __B)
1291{
1292  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
1293}
1294
1295static __inline __m128i
1296_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1297{
1298  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
1299}
1300
1301static __inline __m128i
1302_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1303{
1304  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
1305}
1306
1307static __inline __m128i
1308_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1309{
1310  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
1311}
1312
1313static __inline __m128i
1314_mm_cmplt_epi8 (__m128i __A, __m128i __B)
1315{
1316  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
1317}
1318
1319static __inline __m128i
1320_mm_cmplt_epi16 (__m128i __A, __m128i __B)
1321{
1322  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
1323}
1324
1325static __inline __m128i
1326_mm_cmplt_epi32 (__m128i __A, __m128i __B)
1327{
1328  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
1329}
1330
1331static __inline __m128i
1332_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1333{
1334  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
1335}
1336
1337static __inline __m128i
1338_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1339{
1340  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
1341}
1342
1343static __inline __m128i
1344_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1345{
1346  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
1347}
1348
/* Calls the pextrw128 builtin on the __v8hi view of __A with selector __B.
   The arguments are parenthesized so that the cast applies to the whole
   argument expression rather than to its first operand only.  */
#define _mm_extract_epi16(__A, __B) (__builtin_ia32_pextrw128 ((__v8hi)(__A), (__B)))
1350
/* Calls the pinsrw128 builtin on the __v8hi view of __A with value __B and
   selector __C, returning an __m128i.  The arguments are parenthesized so
   that the cast applies to the whole argument expression rather than to
   its first operand only.  */
#define _mm_insert_epi16(__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)(__A), (__B), (__C)))
1352
1353static __inline __m128i
1354_mm_max_epi16 (__m128i __A, __m128i __B)
1355{
1356  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
1357}
1358
1359static __inline __m128i
1360_mm_max_epu8 (__m128i __A, __m128i __B)
1361{
1362  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
1363}
1364
1365static __inline __m128i
1366_mm_min_epi16 (__m128i __A, __m128i __B)
1367{
1368  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
1369}
1370
1371static __inline __m128i
1372_mm_min_epu8 (__m128i __A, __m128i __B)
1373{
1374  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
1375}
1376
1377static __inline int
1378_mm_movemask_epi8 (__m128i __A)
1379{
1380  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
1381}
1382
1383static __inline __m128i
1384_mm_mulhi_epu16 (__m128i __A, __m128i __B)
1385{
1386  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
1387}
1388
/* Shuffle macros: each passes __A (viewed as __v8hi or __v4si) together
   with the selector __B to the pshufhw, pshuflw or pshufd builtin.  The
   arguments are parenthesized so that the cast applies to the whole
   argument expression rather than to its first operand only.  */
#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__A), (__B)))
#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__A), (__B)))
#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)(__A), (__B)))
1392
1393static __inline void
1394_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
1395{
1396  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
1397}
1398
1399static __inline __m128i
1400_mm_avg_epu8 (__m128i __A, __m128i __B)
1401{
1402  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
1403}
1404
1405static __inline __m128i
1406_mm_avg_epu16 (__m128i __A, __m128i __B)
1407{
1408  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
1409}
1410
1411static __inline __m128i
1412_mm_sad_epu8 (__m128i __A, __m128i __B)
1413{
1414  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
1415}
1416
/* Write the int __B through the pointer __A by way of the movnti
   builtin.  Nothing is returned.  */
static __inline void
_mm_stream_si32 (int *__A, int __B)
{
  int *__dest = __A;
  int __value = __B;

  __builtin_ia32_movnti (__dest, __value);
}
1422
1423static __inline void
1424_mm_stream_si128 (__m128i *__A, __m128i __B)
1425{
1426  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
1427}
1428
1429static __inline void
1430_mm_stream_pd (double *__A, __m128d __B)
1431{
1432  __builtin_ia32_movntpd (__A, (__v2df)__B);
1433}
1434
1435static __inline __m128i
1436_mm_movpi64_epi64 (__m64 __A)
1437{
1438  return (__m128i)__builtin_ia32_movq2dq ((unsigned long long)__A);
1439}
1440
/* Invoke the clflush builtin on the address __A.  Nothing is returned:
   the builtin is called as a plain statement, because a void function
   may not return an expression under ISO C.  */
static __inline void
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}
1446
/* Invoke the lfence builtin.  Takes no arguments and returns nothing.  */
static __inline void
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}
1452
/* Invoke the mfence builtin.  Takes no arguments and returns nothing.  */
static __inline void
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}
1458
1459static __inline __m128i
1460_mm_cvtsi32_si128 (int __A)
1461{
1462  return (__m128i) __builtin_ia32_loadd (&__A);
1463}
1464
1465#ifdef __x86_64__
1466static __inline __m128i
1467_mm_cvtsi64x_si128 (long long __A)
1468{
1469  return (__m128i) __builtin_ia32_movq2dq (__A);
1470}
1471#endif
1472
1473static __inline int
1474_mm_cvtsi128_si32 (__m128i __A)
1475{
1476  int __tmp;
1477  __builtin_ia32_stored (&__tmp, (__v4si)__A);
1478  return __tmp;
1479}
1480
1481#ifdef __x86_64__
1482static __inline long long
1483_mm_cvtsi128_si64x (__m128i __A)
1484{
1485  return __builtin_ia32_movdq2q ((__v2di)__A);
1486}
1487#endif
1488
1489#endif /* __SSE2__  */
1490
1491#endif /* _EMMINTRIN_H_INCLUDED */
1492