emmintrin.h: diff of revision 169690 against revision 251212
1/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING. If not, write to
17 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
18 Boston, MA 02110-1301, USA. */
19
20/* As a special exception, if you include this header file into source
21 files compiled by GCC, this header file does not by itself cause
22 the resulting executable to be covered by the GNU General Public
23 License. This exception does not however invalidate any other
24 reasons why the executable file might be covered by the GNU General
25 Public License. */
26
27/* Implemented from the specification included in the Intel C++ Compiler
28 User Guide and Reference, version 9.0. */
29
30#ifndef _EMMINTRIN_H_INCLUDED
31#define _EMMINTRIN_H_INCLUDED
32
-33#ifdef __SSE2__
+33#ifndef __SSE2__
+34# error "SSE2 instruction set not enabled"
+35#else
+36
+37/* We need definitions from the SSE header files.  */
34#include <xmmintrin.h>
35
36/* SSE2 */
37typedef double __v2df __attribute__ ((__vector_size__ (16)));
38typedef long long __v2di __attribute__ ((__vector_size__ (16)));
39typedef int __v4si __attribute__ ((__vector_size__ (16)));
40typedef short __v8hi __attribute__ ((__vector_size__ (16)));
41typedef char __v16qi __attribute__ ((__vector_size__ (16)));
42
43/* The Intel API is flexible enough that we must allow aliasing with other
44 vector types, and their scalar components. */
45typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
46typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
47
48/* Create a selector for use with the SHUFPD instruction. */
49#define _MM_SHUFFLE2(fp1,fp0) \
50 (((fp1) << 1) | (fp0))
51
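/* Usage sketch (editorial addition, not in the GCC sources): the selector
   packs one index bit per result element.  Bit 0 selects the low result
   element from the first operand and bit 1 selects the high result element
   from the second, so with the _mm_shuffle_pd macro defined further below:

     __m128d r0 = _mm_shuffle_pd (a, b, _MM_SHUFFLE2 (0, 1));   r0 = {a[1], b[0]}
     __m128d r1 = _mm_shuffle_pd (a, b, _MM_SHUFFLE2 (1, 0));   r1 = {a[0], b[1]}  */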
52/* Create a vector with element 0 as F and the rest zero. */
53static __inline __m128d __attribute__((__always_inline__))
54_mm_set_sd (double __F)
55{
56 return __extension__ (__m128d){ __F, 0 };
57}
58
59/* Create a vector with both elements equal to F. */
60static __inline __m128d __attribute__((__always_inline__))
61_mm_set1_pd (double __F)
62{
63 return __extension__ (__m128d){ __F, __F };
64}
65
66static __inline __m128d __attribute__((__always_inline__))
67_mm_set_pd1 (double __F)
68{
69 return _mm_set1_pd (__F);
70}
71
72/* Create a vector with the lower value X and upper value W. */
73static __inline __m128d __attribute__((__always_inline__))
74_mm_set_pd (double __W, double __X)
75{
76 return __extension__ (__m128d){ __X, __W };
77}
78
79/* Create a vector with the lower value W and upper value X. */
80static __inline __m128d __attribute__((__always_inline__))
81_mm_setr_pd (double __W, double __X)
82{
83 return __extension__ (__m128d){ __W, __X };
84}
85
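/* Usage sketch (editorial addition): _mm_set_pd lists elements from the
   highest down to element 0, while _mm_setr_pd lists them from element 0
   up, so the two calls below build the same vector:

     __m128d a = _mm_set_pd (2.0, 1.0);     a[0] == 1.0, a[1] == 2.0
     __m128d b = _mm_setr_pd (1.0, 2.0);    same contents as a  */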
86/* Create a vector of zeros. */
87static __inline __m128d __attribute__((__always_inline__))
88_mm_setzero_pd (void)
89{
90 return __extension__ (__m128d){ 0.0, 0.0 };
91}
92
93/* Sets the low DPFP value of A from the low value of B. */
94static __inline __m128d __attribute__((__always_inline__))
95_mm_move_sd (__m128d __A, __m128d __B)
96{
97 return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
98}
99
100/* Load two DPFP values from P. The address must be 16-byte aligned. */
101static __inline __m128d __attribute__((__always_inline__))
102_mm_load_pd (double const *__P)
103{
104 return *(__m128d *)__P;
105}
106
107/* Load two DPFP values from P. The address need not be 16-byte aligned. */
108static __inline __m128d __attribute__((__always_inline__))
109_mm_loadu_pd (double const *__P)
110{
111 return __builtin_ia32_loadupd (__P);
112}
113
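/* Usage sketch (editorial addition): _mm_load_pd compiles to MOVAPD and
   faults on a misaligned address; _mm_loadu_pd compiles to the
   alignment-tolerant MOVUPD.  Assuming a suitably aligned local buffer:

     double buf[2] __attribute__ ((__aligned__ (16))) = { 1.0, 2.0 };
     __m128d a = _mm_load_pd (buf);     requires the 16-byte alignment
     __m128d u = _mm_loadu_pd (buf);    works for any address  */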
114/* Create a vector with both elements equal to *P.  */
115static __inline __m128d __attribute__((__always_inline__))
116_mm_load1_pd (double const *__P)
117{
118 return _mm_set1_pd (*__P);
119}
120
121/* Create a vector with element 0 as *P and the rest zero. */
122static __inline __m128d __attribute__((__always_inline__))
123_mm_load_sd (double const *__P)
124{
125 return _mm_set_sd (*__P);
126}
127
128static __inline __m128d __attribute__((__always_inline__))
129_mm_load_pd1 (double const *__P)
130{
131 return _mm_load1_pd (__P);
132}
133
134/* Load two DPFP values in reverse order. The address must be aligned. */
135static __inline __m128d __attribute__((__always_inline__))
136_mm_loadr_pd (double const *__P)
137{
138 __m128d __tmp = _mm_load_pd (__P);
139 return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
140}
141
142/* Store two DPFP values. The address must be 16-byte aligned. */
143static __inline void __attribute__((__always_inline__))
144_mm_store_pd (double *__P, __m128d __A)
145{
146 *(__m128d *)__P = __A;
147}
148
149/* Store two DPFP values. The address need not be 16-byte aligned. */
150static __inline void __attribute__((__always_inline__))
151_mm_storeu_pd (double *__P, __m128d __A)
152{
153 __builtin_ia32_storeupd (__P, __A);
154}
155
156/* Stores the lower DPFP value. */
157static __inline void __attribute__((__always_inline__))
158_mm_store_sd (double *__P, __m128d __A)
159{
160 *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
161}
162
163static __inline double __attribute__((__always_inline__))
164_mm_cvtsd_f64 (__m128d __A)
165{
166 return __builtin_ia32_vec_ext_v2df (__A, 0);
167}
168
169static __inline void __attribute__((__always_inline__))
170_mm_storel_pd (double *__P, __m128d __A)
171{
172 _mm_store_sd (__P, __A);
173}
174
175/* Stores the upper DPFP value. */
176static __inline void __attribute__((__always_inline__))
177_mm_storeh_pd (double *__P, __m128d __A)
178{
179 *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
180}
181
182/* Store the lower DPFP value into both elements.
183 The address must be 16-byte aligned. */
184static __inline void __attribute__((__always_inline__))
185_mm_store1_pd (double *__P, __m128d __A)
186{
187 _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
188}
189
190static __inline void __attribute__((__always_inline__))
191_mm_store_pd1 (double *__P, __m128d __A)
192{
193 _mm_store1_pd (__P, __A);
194}
195
196/* Store two DPFP values in reverse order. The address must be aligned. */
197static __inline void __attribute__((__always_inline__))
198_mm_storer_pd (double *__P, __m128d __A)
199{
200 _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
201}
202
203static __inline int __attribute__((__always_inline__))
204_mm_cvtsi128_si32 (__m128i __A)
205{
206 return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
207}
208
209#ifdef __x86_64__
210/* Intel intrinsic. */
211static __inline long long __attribute__((__always_inline__))
212_mm_cvtsi128_si64 (__m128i __A)
213{
214 return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
215}
216
217/* Microsoft intrinsic. */
218static __inline long long __attribute__((__always_inline__))
219_mm_cvtsi128_si64x (__m128i __A)
220{
221 return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
222}
223#endif
224
225static __inline __m128d __attribute__((__always_inline__))
226_mm_add_pd (__m128d __A, __m128d __B)
227{
228 return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
229}
230
231static __inline __m128d __attribute__((__always_inline__))
232_mm_add_sd (__m128d __A, __m128d __B)
233{
234 return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
235}
236
237static __inline __m128d __attribute__((__always_inline__))
238_mm_sub_pd (__m128d __A, __m128d __B)
239{
240 return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
241}
242
243static __inline __m128d __attribute__((__always_inline__))
244_mm_sub_sd (__m128d __A, __m128d __B)
245{
246 return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
247}
248
249static __inline __m128d __attribute__((__always_inline__))
250_mm_mul_pd (__m128d __A, __m128d __B)
251{
252 return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
253}
254
255static __inline __m128d __attribute__((__always_inline__))
256_mm_mul_sd (__m128d __A, __m128d __B)
257{
258 return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
259}
260
261static __inline __m128d __attribute__((__always_inline__))
262_mm_div_pd (__m128d __A, __m128d __B)
263{
264 return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
265}
266
267static __inline __m128d __attribute__((__always_inline__))
268_mm_div_sd (__m128d __A, __m128d __B)
269{
270 return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
271}
272
273static __inline __m128d __attribute__((__always_inline__))
274_mm_sqrt_pd (__m128d __A)
275{
276 return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
277}
278
279/* Return pair {sqrt (B[0]), A[1]}.  */
280static __inline __m128d __attribute__((__always_inline__))
281_mm_sqrt_sd (__m128d __A, __m128d __B)
282{
283 __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
284 return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
285}
286
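/* Usage sketch (editorial addition): the MOVSD step above places B's low
   element under A's unchanged upper element before SQRTSD runs, hence the
   {sqrt (B[0]), A[1]} result described in the comment:

     __m128d r = _mm_sqrt_sd (_mm_set_pd (7.0, 0.0), _mm_set_sd (4.0));
     r[0] == 2.0 and r[1] == 7.0  */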
287static __inline __m128d __attribute__((__always_inline__))
288_mm_min_pd (__m128d __A, __m128d __B)
289{
290 return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
291}
292
293static __inline __m128d __attribute__((__always_inline__))
294_mm_min_sd (__m128d __A, __m128d __B)
295{
296 return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
297}
298
299static __inline __m128d __attribute__((__always_inline__))
300_mm_max_pd (__m128d __A, __m128d __B)
301{
302 return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
303}
304
305static __inline __m128d __attribute__((__always_inline__))
306_mm_max_sd (__m128d __A, __m128d __B)
307{
308 return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
309}
310
311static __inline __m128d __attribute__((__always_inline__))
312_mm_and_pd (__m128d __A, __m128d __B)
313{
314 return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
315}
316
317static __inline __m128d __attribute__((__always_inline__))
318_mm_andnot_pd (__m128d __A, __m128d __B)
319{
320 return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
321}
322
323static __inline __m128d __attribute__((__always_inline__))
324_mm_or_pd (__m128d __A, __m128d __B)
325{
326 return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
327}
328
329static __inline __m128d __attribute__((__always_inline__))
330_mm_xor_pd (__m128d __A, __m128d __B)
331{
332 return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
333}
334
335static __inline __m128d __attribute__((__always_inline__))
336_mm_cmpeq_pd (__m128d __A, __m128d __B)
337{
338 return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
339}
340
341static __inline __m128d __attribute__((__always_inline__))
342_mm_cmplt_pd (__m128d __A, __m128d __B)
343{
344 return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
345}
346
347static __inline __m128d __attribute__((__always_inline__))
348_mm_cmple_pd (__m128d __A, __m128d __B)
349{
350 return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
351}
352
353static __inline __m128d __attribute__((__always_inline__))
354_mm_cmpgt_pd (__m128d __A, __m128d __B)
355{
356 return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
357}
358
359static __inline __m128d __attribute__((__always_inline__))
360_mm_cmpge_pd (__m128d __A, __m128d __B)
361{
362 return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
363}
364
365static __inline __m128d __attribute__((__always_inline__))
366_mm_cmpneq_pd (__m128d __A, __m128d __B)
367{
368 return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
369}
370
371static __inline __m128d __attribute__((__always_inline__))
372_mm_cmpnlt_pd (__m128d __A, __m128d __B)
373{
374 return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
375}
376
377static __inline __m128d __attribute__((__always_inline__))
378_mm_cmpnle_pd (__m128d __A, __m128d __B)
379{
380 return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
381}
382
383static __inline __m128d __attribute__((__always_inline__))
384_mm_cmpngt_pd (__m128d __A, __m128d __B)
385{
386 return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
387}
388
389static __inline __m128d __attribute__((__always_inline__))
390_mm_cmpnge_pd (__m128d __A, __m128d __B)
391{
392 return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
393}
394
395static __inline __m128d __attribute__((__always_inline__))
396_mm_cmpord_pd (__m128d __A, __m128d __B)
397{
398 return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
399}
400
401static __inline __m128d __attribute__((__always_inline__))
402_mm_cmpunord_pd (__m128d __A, __m128d __B)
403{
404 return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
405}
406
407static __inline __m128d __attribute__((__always_inline__))
408_mm_cmpeq_sd (__m128d __A, __m128d __B)
409{
410 return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
411}
412
413static __inline __m128d __attribute__((__always_inline__))
414_mm_cmplt_sd (__m128d __A, __m128d __B)
415{
416 return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
417}
418
419static __inline __m128d __attribute__((__always_inline__))
420_mm_cmple_sd (__m128d __A, __m128d __B)
421{
422 return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
423}
424
425static __inline __m128d __attribute__((__always_inline__))
426_mm_cmpgt_sd (__m128d __A, __m128d __B)
427{
428 return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
429 (__v2df)
430 __builtin_ia32_cmpltsd ((__v2df) __B,
431 (__v2df)
432 __A));
433}
434
435static __inline __m128d __attribute__((__always_inline__))
436_mm_cmpge_sd (__m128d __A, __m128d __B)
437{
438 return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
439 (__v2df)
440 __builtin_ia32_cmplesd ((__v2df) __B,
441 (__v2df)
442 __A));
443}
444
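/* Editorial note: there is no CMPGTSD or CMPGESD encoding, so the two
   functions above synthesize the comparison by running CMPLTSD/CMPLESD
   with the operands swapped and merging the all-ones/all-zeros mask back
   under the unchanged upper element of __A via MOVSD.  The net effect of
   r = _mm_cmpgt_sd (a, b) is:

     r[0] = (a[0] > b[0]) ? all-ones mask : all-zeros;  r[1] = a[1];  */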
445static __inline __m128d __attribute__((__always_inline__))
446_mm_cmpneq_sd (__m128d __A, __m128d __B)
447{
448 return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
449}
450
451static __inline __m128d __attribute__((__always_inline__))
452_mm_cmpnlt_sd (__m128d __A, __m128d __B)
453{
454 return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
455}
456
457static __inline __m128d __attribute__((__always_inline__))
458_mm_cmpnle_sd (__m128d __A, __m128d __B)
459{
460 return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
461}
462
463static __inline __m128d __attribute__((__always_inline__))
464_mm_cmpngt_sd (__m128d __A, __m128d __B)
465{
466 return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
467 (__v2df)
468 __builtin_ia32_cmpnltsd ((__v2df) __B,
469 (__v2df)
470 __A));
471}
472
473static __inline __m128d __attribute__((__always_inline__))
474_mm_cmpnge_sd (__m128d __A, __m128d __B)
475{
476 return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
477 (__v2df)
478 __builtin_ia32_cmpnlesd ((__v2df) __B,
479 (__v2df)
480 __A));
481}
482
483static __inline __m128d __attribute__((__always_inline__))
484_mm_cmpord_sd (__m128d __A, __m128d __B)
485{
486 return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
487}
488
489static __inline __m128d __attribute__((__always_inline__))
490_mm_cmpunord_sd (__m128d __A, __m128d __B)
491{
492 return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
493}
494
495static __inline int __attribute__((__always_inline__))
496_mm_comieq_sd (__m128d __A, __m128d __B)
497{
498 return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
499}
500
501static __inline int __attribute__((__always_inline__))
502_mm_comilt_sd (__m128d __A, __m128d __B)
503{
504 return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
505}
506
507static __inline int __attribute__((__always_inline__))
508_mm_comile_sd (__m128d __A, __m128d __B)
509{
510 return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
511}
512
513static __inline int __attribute__((__always_inline__))
514_mm_comigt_sd (__m128d __A, __m128d __B)
515{
516 return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
517}
518
519static __inline int __attribute__((__always_inline__))
520_mm_comige_sd (__m128d __A, __m128d __B)
521{
522 return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
523}
524
525static __inline int __attribute__((__always_inline__))
526_mm_comineq_sd (__m128d __A, __m128d __B)
527{
528 return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
529}
530
531static __inline int __attribute__((__always_inline__))
532_mm_ucomieq_sd (__m128d __A, __m128d __B)
533{
534 return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
535}
536
537static __inline int __attribute__((__always_inline__))
538_mm_ucomilt_sd (__m128d __A, __m128d __B)
539{
540 return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
541}
542
543static __inline int __attribute__((__always_inline__))
544_mm_ucomile_sd (__m128d __A, __m128d __B)
545{
546 return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
547}
548
549static __inline int __attribute__((__always_inline__))
550_mm_ucomigt_sd (__m128d __A, __m128d __B)
551{
552 return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
553}
554
555static __inline int __attribute__((__always_inline__))
556_mm_ucomige_sd (__m128d __A, __m128d __B)
557{
558 return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
559}
560
561static __inline int __attribute__((__always_inline__))
562_mm_ucomineq_sd (__m128d __A, __m128d __B)
563{
564 return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
565}
566
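/* Editorial note: the _mm_comi* functions compile to COMISD and the
   _mm_ucomi* functions to UCOMISD.  Both return the result as an int,
   but COMISD signals the invalid-operation exception for QNaN operands
   while UCOMISD signals it only for SNaNs.  For example:

     int lt = _mm_comilt_sd (_mm_set_sd (1.0), _mm_set_sd (2.0));   lt == 1  */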
567/* Create a vector of Qi, where i is the element number. */
568
569static __inline __m128i __attribute__((__always_inline__))
570_mm_set_epi64x (long long __q1, long long __q0)
571{
572 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
573}
574
575static __inline __m128i __attribute__((__always_inline__))
576_mm_set_epi64 (__m64 __q1, __m64 __q0)
577{
578 return _mm_set_epi64x ((long long)__q1, (long long)__q0);
579}
580
581static __inline __m128i __attribute__((__always_inline__))
582_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
583{
584 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
585}
586
587static __inline __m128i __attribute__((__always_inline__))
588_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
589 short __q3, short __q2, short __q1, short __q0)
590{
591 return __extension__ (__m128i)(__v8hi){
592 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
593}
594
595static __inline __m128i __attribute__((__always_inline__))
596_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
597 char __q11, char __q10, char __q09, char __q08,
598 char __q07, char __q06, char __q05, char __q04,
599 char __q03, char __q02, char __q01, char __q00)
600{
601 return __extension__ (__m128i)(__v16qi){
602 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
603 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
604 };
605}
606
607/* Set all of the elements of the vector to A. */
608
609static __inline __m128i __attribute__((__always_inline__))
610_mm_set1_epi64x (long long __A)
611{
612 return _mm_set_epi64x (__A, __A);
613}
614
615static __inline __m128i __attribute__((__always_inline__))
616_mm_set1_epi64 (__m64 __A)
617{
618 return _mm_set_epi64 (__A, __A);
619}
620
621static __inline __m128i __attribute__((__always_inline__))
622_mm_set1_epi32 (int __A)
623{
624 return _mm_set_epi32 (__A, __A, __A, __A);
625}
626
627static __inline __m128i __attribute__((__always_inline__))
628_mm_set1_epi16 (short __A)
629{
630 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
631}
632
633static __inline __m128i __attribute__((__always_inline__))
634_mm_set1_epi8 (char __A)
635{
636 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
637 __A, __A, __A, __A, __A, __A, __A, __A);
638}
639
640/* Create a vector of Qi, where i is the element number.
641 The parameter order is reversed from the _mm_set_epi* functions. */
642
643static __inline __m128i __attribute__((__always_inline__))
644_mm_setr_epi64 (__m64 __q0, __m64 __q1)
645{
646 return _mm_set_epi64 (__q1, __q0);
647}
648
649static __inline __m128i __attribute__((__always_inline__))
650_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
651{
652 return _mm_set_epi32 (__q3, __q2, __q1, __q0);
653}
654
655static __inline __m128i __attribute__((__always_inline__))
656_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
657 short __q4, short __q5, short __q6, short __q7)
658{
659 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
660}
661
662static __inline __m128i __attribute__((__always_inline__))
663_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
664 char __q04, char __q05, char __q06, char __q07,
665 char __q08, char __q09, char __q10, char __q11,
666 char __q12, char __q13, char __q14, char __q15)
667{
668 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
669 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
670}
671
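/* Usage sketch (editorial addition): the ordering rule mirrors the
   double-precision case above; _mm_set_epi32 takes the highest element
   first, _mm_setr_epi32 takes element 0 first:

     __m128i x = _mm_set_epi32 (3, 2, 1, 0);    element i of x is i
     __m128i y = _mm_setr_epi32 (0, 1, 2, 3);   same vector as x  */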
672/* Load 128 bits of integer data.  The address must be 16-byte aligned
    for _mm_load_si128 but not for _mm_loadu_si128; _mm_loadl_epi64 loads
    64 bits into element 0 and zeroes the upper half. */
673
674static __inline __m128i __attribute__((__always_inline__))
675_mm_load_si128 (__m128i const *__P)
676{
677 return *__P;
678}
679
680static __inline __m128i __attribute__((__always_inline__))
681_mm_loadu_si128 (__m128i const *__P)
682{
683 return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
684}
685
686static __inline __m128i __attribute__((__always_inline__))
687_mm_loadl_epi64 (__m128i const *__P)
688{
689 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
690}
691
692static __inline void __attribute__((__always_inline__))
693_mm_store_si128 (__m128i *__P, __m128i __B)
694{
695 *__P = __B;
696}
697
698static __inline void __attribute__((__always_inline__))
699_mm_storeu_si128 (__m128i *__P, __m128i __B)
700{
701 __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
702}
703
704static __inline void __attribute__((__always_inline__))
705_mm_storel_epi64 (__m128i *__P, __m128i __B)
706{
707 *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
708}
709
710static __inline __m64 __attribute__((__always_inline__))
711_mm_movepi64_pi64 (__m128i __B)
712{
713 return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
714}
715
716static __inline __m128i __attribute__((__always_inline__))
717_mm_movpi64_epi64 (__m64 __A)
718{
719 return _mm_set_epi64 ((__m64)0LL, __A);
720}
721
722static __inline __m128i __attribute__((__always_inline__))
723_mm_move_epi64 (__m128i __A)
724{
725 return _mm_set_epi64 ((__m64)0LL, _mm_movepi64_pi64 (__A));
726}
727
728/* Create a vector of zeros. */
729static __inline __m128i __attribute__((__always_inline__))
730_mm_setzero_si128 (void)
731{
732 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
733}
734
735static __inline __m128d __attribute__((__always_inline__))
736_mm_cvtepi32_pd (__m128i __A)
737{
738 return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
739}
740
741static __inline __m128 __attribute__((__always_inline__))
742_mm_cvtepi32_ps (__m128i __A)
743{
744 return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
745}
746
747static __inline __m128i __attribute__((__always_inline__))
748_mm_cvtpd_epi32 (__m128d __A)
749{
750 return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
751}
752
753static __inline __m64 __attribute__((__always_inline__))
754_mm_cvtpd_pi32 (__m128d __A)
755{
756 return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
757}
758
759static __inline __m128 __attribute__((__always_inline__))
760_mm_cvtpd_ps (__m128d __A)
761{
762 return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
763}
764
765static __inline __m128i __attribute__((__always_inline__))
766_mm_cvttpd_epi32 (__m128d __A)
767{
768 return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
769}
770
771static __inline __m64 __attribute__((__always_inline__))
772_mm_cvttpd_pi32 (__m128d __A)
773{
774 return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
775}
776
777static __inline __m128d __attribute__((__always_inline__))
778_mm_cvtpi32_pd (__m64 __A)
779{
780 return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
781}
782
783static __inline __m128i __attribute__((__always_inline__))
784_mm_cvtps_epi32 (__m128 __A)
785{
786 return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
787}
788
789static __inline __m128i __attribute__((__always_inline__))
790_mm_cvttps_epi32 (__m128 __A)
791{
792 return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
793}
794
795static __inline __m128d __attribute__((__always_inline__))
796_mm_cvtps_pd (__m128 __A)
797{
798 return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
799}
800
801static __inline int __attribute__((__always_inline__))
802_mm_cvtsd_si32 (__m128d __A)
803{
804 return __builtin_ia32_cvtsd2si ((__v2df) __A);
805}
806
807#ifdef __x86_64__
808/* Intel intrinsic. */
809static __inline long long __attribute__((__always_inline__))
810_mm_cvtsd_si64 (__m128d __A)
811{
812 return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
813}
814
815/* Microsoft intrinsic. */
816static __inline long long __attribute__((__always_inline__))
817_mm_cvtsd_si64x (__m128d __A)
818{
819 return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
820}
821#endif
822
823static __inline int __attribute__((__always_inline__))
824_mm_cvttsd_si32 (__m128d __A)
825{
826 return __builtin_ia32_cvttsd2si ((__v2df) __A);
827}
828
829#ifdef __x86_64__
830/* Intel intrinsic. */
831static __inline long long __attribute__((__always_inline__))
832_mm_cvttsd_si64 (__m128d __A)
833{
834 return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
835}
836
837/* Microsoft intrinsic. */
838static __inline long long __attribute__((__always_inline__))
839_mm_cvttsd_si64x (__m128d __A)
840{
841 return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
842}
843#endif
844
845static __inline __m128 __attribute__((__always_inline__))
846_mm_cvtsd_ss (__m128 __A, __m128d __B)
847{
848 return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
849}
850
851static __inline __m128d __attribute__((__always_inline__))
852_mm_cvtsi32_sd (__m128d __A, int __B)
853{
854 return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
855}
856
857#ifdef __x86_64__
858/* Intel intrinsic. */
859static __inline __m128d __attribute__((__always_inline__))
860_mm_cvtsi64_sd (__m128d __A, long long __B)
861{
862 return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
863}
864
865/* Microsoft intrinsic. */
866static __inline __m128d __attribute__((__always_inline__))
867_mm_cvtsi64x_sd (__m128d __A, long long __B)
868{
869 return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
870}
871#endif
872
873static __inline __m128d __attribute__((__always_inline__))
874_mm_cvtss_sd (__m128d __A, __m128 __B)
875{
876 return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
877}
878
879#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C)))
880
881static __inline __m128d __attribute__((__always_inline__))
882_mm_unpackhi_pd (__m128d __A, __m128d __B)
883{
884 return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
885}
886
887static __inline __m128d __attribute__((__always_inline__))
888_mm_unpacklo_pd (__m128d __A, __m128d __B)
889{
890 return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
891}
892
893static __inline __m128d __attribute__((__always_inline__))
894_mm_loadh_pd (__m128d __A, double const *__B)
895{
896 return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
897}
898
899static __inline __m128d __attribute__((__always_inline__))
900_mm_loadl_pd (__m128d __A, double const *__B)
901{
902 return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
903}
904
905static __inline int __attribute__((__always_inline__))
906_mm_movemask_pd (__m128d __A)
907{
908 return __builtin_ia32_movmskpd ((__v2df)__A);
909}
910
911static __inline __m128i __attribute__((__always_inline__))
912_mm_packs_epi16 (__m128i __A, __m128i __B)
913{
914 return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
915}
916
917static __inline __m128i __attribute__((__always_inline__))
918_mm_packs_epi32 (__m128i __A, __m128i __B)
919{
920 return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
921}
922
923static __inline __m128i __attribute__((__always_inline__))
924_mm_packus_epi16 (__m128i __A, __m128i __B)
925{
926 return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
927}
928
929static __inline __m128i __attribute__((__always_inline__))
930_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
931{
932 return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
933}
934
935static __inline __m128i __attribute__((__always_inline__))
936_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
937{
938 return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
939}
940
941static __inline __m128i __attribute__((__always_inline__))
942_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
943{
944 return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
945}
946
947static __inline __m128i __attribute__((__always_inline__))
948_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
949{
950 return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
951}
952
953static __inline __m128i __attribute__((__always_inline__))
954_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
955{
956 return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
957}
958
959static __inline __m128i __attribute__((__always_inline__))
960_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
961{
962 return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
963}
964
965static __inline __m128i __attribute__((__always_inline__))
966_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
967{
968 return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
969}
970
971static __inline __m128i __attribute__((__always_inline__))
972_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
973{
974 return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
975}
976
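/* Editorial note: the unpack intrinsics interleave rather than
   concatenate.  With 16-byte vectors a = {a0, ..., a15} and
   b = {b0, ..., b15}, _mm_unpacklo_epi8 (a, b) yields
   {a0, b0, a1, b1, ..., a7, b7} and _mm_unpackhi_epi8 (a, b) does the
   same with bytes 8 through 15.  */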
977static __inline __m128i __attribute__((__always_inline__))
978_mm_add_epi8 (__m128i __A, __m128i __B)
979{
980 return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
981}
982
983static __inline __m128i __attribute__((__always_inline__))
984_mm_add_epi16 (__m128i __A, __m128i __B)
985{
986 return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
987}
988
989static __inline __m128i __attribute__((__always_inline__))
990_mm_add_epi32 (__m128i __A, __m128i __B)
991{
992 return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
993}
994
995static __inline __m128i __attribute__((__always_inline__))
996_mm_add_epi64 (__m128i __A, __m128i __B)
997{
998 return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
999}
1000
1001static __inline __m128i __attribute__((__always_inline__))
1002_mm_adds_epi8 (__m128i __A, __m128i __B)
1003{
1004 return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
1005}
1006
1007static __inline __m128i __attribute__((__always_inline__))
1008_mm_adds_epi16 (__m128i __A, __m128i __B)
1009{
1010 return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
1011}
1012
1013static __inline __m128i __attribute__((__always_inline__))
1014_mm_adds_epu8 (__m128i __A, __m128i __B)
1015{
1016 return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
1017}
1018
1019static __inline __m128i __attribute__((__always_inline__))
1020_mm_adds_epu16 (__m128i __A, __m128i __B)
1021{
1022 return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
1023}
1024
1025static __inline __m128i __attribute__((__always_inline__))
1026_mm_sub_epi8 (__m128i __A, __m128i __B)
1027{
1028 return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
1029}
1030
1031static __inline __m128i __attribute__((__always_inline__))
1032_mm_sub_epi16 (__m128i __A, __m128i __B)
1033{
1034 return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
1035}
1036
1037static __inline __m128i __attribute__((__always_inline__))
1038_mm_sub_epi32 (__m128i __A, __m128i __B)
1039{
1040 return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
1041}
1042
1043static __inline __m128i __attribute__((__always_inline__))
1044_mm_sub_epi64 (__m128i __A, __m128i __B)
1045{
1046 return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
1047}
1048
1049static __inline __m128i __attribute__((__always_inline__))
1050_mm_subs_epi8 (__m128i __A, __m128i __B)
1051{
1052 return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
1053}
1054
1055static __inline __m128i __attribute__((__always_inline__))
1056_mm_subs_epi16 (__m128i __A, __m128i __B)
1057{
1058 return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
1059}
1060
1061static __inline __m128i __attribute__((__always_inline__))
1062_mm_subs_epu8 (__m128i __A, __m128i __B)
1063{
1064 return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
1065}
1066
1067static __inline __m128i __attribute__((__always_inline__))
1068_mm_subs_epu16 (__m128i __A, __m128i __B)
1069{
1070 return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
1071}
1072
1073static __inline __m128i __attribute__((__always_inline__))
1074_mm_madd_epi16 (__m128i __A, __m128i __B)
1075{
1076 return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
1077}
1078
1079static __inline __m128i __attribute__((__always_inline__))
1080_mm_mulhi_epi16 (__m128i __A, __m128i __B)
1081{
1082 return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
1083}
1084
1085static __inline __m128i __attribute__((__always_inline__))
1086_mm_mullo_epi16 (__m128i __A, __m128i __B)
1087{
1088 return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
1089}
1090
1091static __inline __m64 __attribute__((__always_inline__))
1092_mm_mul_su32 (__m64 __A, __m64 __B)
1093{
1094 return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
1095}
1096
1097static __inline __m128i __attribute__((__always_inline__))
1098_mm_mul_epu32 (__m128i __A, __m128i __B)
1099{
1100 return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
1101}
1102
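/* Editorial note: _mm_mul_epu32 (PMULUDQ) multiplies only the even
   32-bit elements of each operand and widens the products, leaving two
   unsigned 64-bit results:

     r[0] = (unsigned long long) a[0] * b[0];
     r[1] = (unsigned long long) a[2] * b[2];  */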
1103#if 0
1104static __inline __m128i __attribute__((__always_inline__))
1105_mm_slli_epi16 (__m128i __A, int __B)
1106{
1107 return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
1108}
1109
1110static __inline __m128i __attribute__((__always_inline__))
1111_mm_slli_epi32 (__m128i __A, int __B)
1112{
1113 return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
1114}
1115
1116static __inline __m128i __attribute__((__always_inline__))
1117_mm_slli_epi64 (__m128i __A, int __B)
1118{
1119 return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
1120}
1121#else
1122#define _mm_slli_epi16(__A, __B) \
1123 ((__m128i)__builtin_ia32_psllwi128 ((__v8hi)(__A), __B))
1124#define _mm_slli_epi32(__A, __B) \
1125 ((__m128i)__builtin_ia32_pslldi128 ((__v4si)(__A), __B))
1126#define _mm_slli_epi64(__A, __B) \
1127 ((__m128i)__builtin_ia32_psllqi128 ((__v2di)(__A), __B))
1128#endif
1129
1130#if 0
1131static __inline __m128i __attribute__((__always_inline__))
1132_mm_srai_epi16 (__m128i __A, int __B)
1133{
1134 return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
1135}
1136
1137static __inline __m128i __attribute__((__always_inline__))
1138_mm_srai_epi32 (__m128i __A, int __B)
1139{
1140 return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
1141}
1142#else
1143#define _mm_srai_epi16(__A, __B) \
1144 ((__m128i)__builtin_ia32_psrawi128 ((__v8hi)(__A), __B))
1145#define _mm_srai_epi32(__A, __B) \
1146 ((__m128i)__builtin_ia32_psradi128 ((__v4si)(__A), __B))
1147#endif
1148
1149#if 0
1150static __m128i __attribute__((__always_inline__))
1151_mm_srli_si128 (__m128i __A, int __B)
1152{
1153 return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B * 8));
1154}
1155
1156static __m128i __attribute__((__always_inline__))
1157_mm_slli_si128 (__m128i __A, int __B)
1158{
1159 return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B * 8));
1160}
1161#else
1162#define _mm_srli_si128(__A, __B) \
1163 ((__m128i)__builtin_ia32_psrldqi128 (__A, (__B) * 8))
1164#define _mm_slli_si128(__A, __B) \
1165 ((__m128i)__builtin_ia32_pslldqi128 (__A, (__B) * 8))
1166#endif
1167
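/* Editorial note: _mm_slli_si128 and _mm_srli_si128 shift the whole
   128-bit value by __B bytes, while the underlying PSLLDQ/PSRLDQ
   builtins count bits, hence the "* 8" above.  For example,
   _mm_srli_si128 (x, 4) drops the low four bytes of x and shifts in
   four zero bytes at the top.  */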
1168#if 0
1169static __inline __m128i __attribute__((__always_inline__))
1170_mm_srli_epi16 (__m128i __A, int __B)
1171{
1172 return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
1173}
1174
1175static __inline __m128i __attribute__((__always_inline__))
1176_mm_srli_epi32 (__m128i __A, int __B)
1177{
1178 return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
1179}
1180
1181static __inline __m128i __attribute__((__always_inline__))
1182_mm_srli_epi64 (__m128i __A, int __B)
1183{
1184 return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
1185}
1186#else
1187#define _mm_srli_epi16(__A, __B) \
1188 ((__m128i)__builtin_ia32_psrlwi128 ((__v8hi)(__A), __B))
1189#define _mm_srli_epi32(__A, __B) \
1190 ((__m128i)__builtin_ia32_psrldi128 ((__v4si)(__A), __B))
1191#define _mm_srli_epi64(__A, __B) \
1192 ((__m128i)__builtin_ia32_psrlqi128 ((__v2di)(__A), __B))
1193#endif
1194
1195static __inline __m128i __attribute__((__always_inline__))
1196_mm_sll_epi16 (__m128i __A, __m128i __B)
1197{
1198 return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
1199}
1200
1201static __inline __m128i __attribute__((__always_inline__))
1202_mm_sll_epi32 (__m128i __A, __m128i __B)
1203{
1204 return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
1205}
1206
1207static __inline __m128i __attribute__((__always_inline__))
1208_mm_sll_epi64 (__m128i __A, __m128i __B)
1209{
1210 return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
1211}
1212
1213static __inline __m128i __attribute__((__always_inline__))
1214_mm_sra_epi16 (__m128i __A, __m128i __B)
1215{
1216 return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
1217}
1218
1219static __inline __m128i __attribute__((__always_inline__))
1220_mm_sra_epi32 (__m128i __A, __m128i __B)
1221{
1222 return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
1223}
1224
1225static __inline __m128i __attribute__((__always_inline__))
1226_mm_srl_epi16 (__m128i __A, __m128i __B)
1227{
1228 return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
1229}
1230
1231static __inline __m128i __attribute__((__always_inline__))
1232_mm_srl_epi32 (__m128i __A, __m128i __B)
1233{
1234 return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
1235}
1236
1237static __inline __m128i __attribute__((__always_inline__))
1238_mm_srl_epi64 (__m128i __A, __m128i __B)
1239{
1240 return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
1241}
1242
1243static __inline __m128i __attribute__((__always_inline__))
1244_mm_and_si128 (__m128i __A, __m128i __B)
1245{
1246 return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
1247}
1248
1249static __inline __m128i __attribute__((__always_inline__))
1250_mm_andnot_si128 (__m128i __A, __m128i __B)
1251{
1252 return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
1253}
1254
1255static __inline __m128i __attribute__((__always_inline__))
1256_mm_or_si128 (__m128i __A, __m128i __B)
1257{
1258 return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
1259}
1260
1261static __inline __m128i __attribute__((__always_inline__))
1262_mm_xor_si128 (__m128i __A, __m128i __B)
1263{
1264 return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
1265}
1266
1267static __inline __m128i __attribute__((__always_inline__))
1268_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1269{
1270 return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
1271}
1272
1273static __inline __m128i __attribute__((__always_inline__))
1274_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1275{
1276 return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
1277}
1278
1279static __inline __m128i __attribute__((__always_inline__))
1280_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1281{
1282 return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
1283}
1284
1285static __inline __m128i __attribute__((__always_inline__))
1286_mm_cmplt_epi8 (__m128i __A, __m128i __B)
1287{
1288 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
1289}
1290
1291static __inline __m128i __attribute__((__always_inline__))
1292_mm_cmplt_epi16 (__m128i __A, __m128i __B)
1293{
1294 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
1295}
1296
1297static __inline __m128i __attribute__((__always_inline__))
1298_mm_cmplt_epi32 (__m128i __A, __m128i __B)
1299{
1300 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
1301}
1302
1303static __inline __m128i __attribute__((__always_inline__))
1304_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1305{
1306 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
1307}
1308
1309static __inline __m128i __attribute__((__always_inline__))
1310_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1311{
1312 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
1313}
1314
1315static __inline __m128i __attribute__((__always_inline__))
1316_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1317{
1318 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
1319}
1320
1321#if 0
1322static __inline int __attribute__((__always_inline__))
1323_mm_extract_epi16 (__m128i const __A, int const __N)
1324{
1325 return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
1326}
1327
1328static __inline __m128i __attribute__((__always_inline__))
1329_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
1330{
1331 return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
1332}
1333#else
1334#define _mm_extract_epi16(A, N) \
1335 ((int) __builtin_ia32_vec_ext_v8hi ((__v8hi)(A), (N)))
1336#define _mm_insert_epi16(A, D, N) \
1337 ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(A), (D), (N)))
1338#endif
1339
1340static __inline __m128i __attribute__((__always_inline__))
1341_mm_max_epi16 (__m128i __A, __m128i __B)
1342{
1343 return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
1344}
1345
1346static __inline __m128i __attribute__((__always_inline__))
1347_mm_max_epu8 (__m128i __A, __m128i __B)
1348{
1349 return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
1350}
1351
1352static __inline __m128i __attribute__((__always_inline__))
1353_mm_min_epi16 (__m128i __A, __m128i __B)
1354{
1355 return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
1356}
1357
1358static __inline __m128i __attribute__((__always_inline__))
1359_mm_min_epu8 (__m128i __A, __m128i __B)
1360{
1361 return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
1362}
1363
1364static __inline int __attribute__((__always_inline__))
1365_mm_movemask_epi8 (__m128i __A)
1366{
1367 return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
1368}
1369
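/* Usage sketch (editorial addition): _mm_movemask_epi8 (PMOVMSKB) packs
   the most significant bit of each of the 16 bytes into bits 0..15 of
   the result, so a byte compare plus a movemask yields a scalar test:

     int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b));
     mask == 0xffff exactly when every byte of a equals that of b  */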
1370static __inline __m128i __attribute__((__always_inline__))
1371_mm_mulhi_epu16 (__m128i __A, __m128i __B)
1372{
1373 return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
1374}
1375
1376#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __B))
1377#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __B))
1378#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)__A, __B))
1379
1380static __inline void __attribute__((__always_inline__))
1381_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
1382{
1383 __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
1384}
1385
1386static __inline __m128i __attribute__((__always_inline__))
1387_mm_avg_epu8 (__m128i __A, __m128i __B)
1388{
1389 return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
1390}
1391
1392static __inline __m128i __attribute__((__always_inline__))
1393_mm_avg_epu16 (__m128i __A, __m128i __B)
1394{
1395 return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
1396}
1397
1398static __inline __m128i __attribute__((__always_inline__))
1399_mm_sad_epu8 (__m128i __A, __m128i __B)
1400{
1401 return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
1402}
1403
1404static __inline void __attribute__((__always_inline__))
1405_mm_stream_si32 (int *__A, int __B)
1406{
1407 __builtin_ia32_movnti (__A, __B);
1408}
1409
1410static __inline void __attribute__((__always_inline__))
1411_mm_stream_si128 (__m128i *__A, __m128i __B)
1412{
1413 __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
1414}
1415
1416static __inline void __attribute__((__always_inline__))
1417_mm_stream_pd (double *__A, __m128d __B)
1418{
1419 __builtin_ia32_movntpd (__A, (__v2df)__B);
1420}
1421
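/* Editorial note: the three _mm_stream_* functions above emit
   non-temporal stores (MOVNTI, MOVNTDQ, MOVNTPD) that bypass the cache
   and are weakly ordered; code publishing such a buffer to another
   thread normally issues _mm_sfence (from <xmmintrin.h>) or the
   _mm_mfence below first.  */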
1422static __inline void __attribute__((__always_inline__))
1423_mm_clflush (void const *__A)
1424{
1425 __builtin_ia32_clflush (__A);
1426}
1427
1428static __inline void __attribute__((__always_inline__))
1429_mm_lfence (void)
1430{
1431 __builtin_ia32_lfence ();
1432}
1433
1434static __inline void __attribute__((__always_inline__))
1435_mm_mfence (void)
1436{
1437 __builtin_ia32_mfence ();
1438}
1439
1440static __inline __m128i __attribute__((__always_inline__))
1441_mm_cvtsi32_si128 (int __A)
1442{
1443 return _mm_set_epi32 (0, 0, 0, __A);
1444}
1445
1446#ifdef __x86_64__
1447/* Intel intrinsic. */
1448static __inline __m128i __attribute__((__always_inline__))
1449_mm_cvtsi64_si128 (long long __A)
1450{
1451 return _mm_set_epi64x (0, __A);
1452}
1453
1454/* Microsoft intrinsic. */
1455static __inline __m128i __attribute__((__always_inline__))
1456_mm_cvtsi64x_si128 (long long __A)
1457{
1458 return _mm_set_epi64x (0, __A);
1459}
1460#endif
1461
1462/* Casts between various SP, DP, INT vector types. Note that these do no
1463 conversion of values, they just change the type. */
1464static __inline __m128 __attribute__((__always_inline__))
1465_mm_castpd_ps(__m128d __A)
1466{
1467 return (__m128) __A;
1468}
1469
1470static __inline __m128i __attribute__((__always_inline__))
1471_mm_castpd_si128(__m128d __A)
1472{
1473 return (__m128i) __A;
1474}
1475
1476static __inline __m128d __attribute__((__always_inline__))
1477_mm_castps_pd(__m128 __A)
1478{
1479 return (__m128d) __A;
1480}
1481
1482static __inline __m128i __attribute__((__always_inline__))
1483_mm_castps_si128(__m128 __A)
1484{
1485 return (__m128i) __A;
1486}
1487
1488static __inline __m128 __attribute__((__always_inline__))
1489_mm_castsi128_ps(__m128i __A)
1490{
1491 return (__m128) __A;
1492}
1493
1494static __inline __m128d __attribute__((__always_inline__))
1495_mm_castsi128_pd(__m128i __A)
1496{
1497 return (__m128d) __A;
1498}
1499
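/* Usage sketch (editorial addition): the casts reinterpret bits without
   emitting any instruction.  For example:

     __m128i bits = _mm_castpd_si128 (_mm_set1_pd (1.0));
     each 64-bit element of bits is now 0x3ff0000000000000  */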
1500#endif /* __SSE2__ */
1501
1502#endif /* _EMMINTRIN_H_INCLUDED */
38#include <xmmintrin.h>
39
40/* SSE2 */
41typedef double __v2df __attribute__ ((__vector_size__ (16)));
42typedef long long __v2di __attribute__ ((__vector_size__ (16)));
43typedef int __v4si __attribute__ ((__vector_size__ (16)));
44typedef short __v8hi __attribute__ ((__vector_size__ (16)));
45typedef char __v16qi __attribute__ ((__vector_size__ (16)));
46
47/* The Intel API is flexible enough that we must allow aliasing with other
48 vector types, and their scalar components. */
49typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
50typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
51
52/* Create a selector for use with the SHUFPD instruction. */
53#define _MM_SHUFFLE2(fp1,fp0) \
54 (((fp1) << 1) | (fp0))
55
56/* Create a vector with element 0 as F and the rest zero. */
57static __inline __m128d __attribute__((__always_inline__))
58_mm_set_sd (double __F)
59{
60 return __extension__ (__m128d){ __F, 0 };
61}
62
63/* Create a vector with both elements equal to F. */
64static __inline __m128d __attribute__((__always_inline__))
65_mm_set1_pd (double __F)
66{
67 return __extension__ (__m128d){ __F, __F };
68}
69
70static __inline __m128d __attribute__((__always_inline__))
71_mm_set_pd1 (double __F)
72{
73 return _mm_set1_pd (__F);
74}
75
76/* Create a vector with the lower value X and upper value W. */
77static __inline __m128d __attribute__((__always_inline__))
78_mm_set_pd (double __W, double __X)
79{
80 return __extension__ (__m128d){ __X, __W };
81}
82
83/* Create a vector with the lower value W and upper value X. */
84static __inline __m128d __attribute__((__always_inline__))
85_mm_setr_pd (double __W, double __X)
86{
87 return __extension__ (__m128d){ __W, __X };
88}
89
90/* Create a vector of zeros. */
91static __inline __m128d __attribute__((__always_inline__))
92_mm_setzero_pd (void)
93{
94 return __extension__ (__m128d){ 0.0, 0.0 };
95}
96
97/* Sets the low DPFP value of A from the low value of B. */
98static __inline __m128d __attribute__((__always_inline__))
99_mm_move_sd (__m128d __A, __m128d __B)
100{
101 return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
102}
103
104/* Load two DPFP values from P. The address must be 16-byte aligned. */
105static __inline __m128d __attribute__((__always_inline__))
106_mm_load_pd (double const *__P)
107{
108 return *(__m128d *)__P;
109}
110
111/* Load two DPFP values from P. The address need not be 16-byte aligned. */
112static __inline __m128d __attribute__((__always_inline__))
113_mm_loadu_pd (double const *__P)
114{
115 return __builtin_ia32_loadupd (__P);
116}
117
118/* Create a vector with all two elements equal to *P. */
119static __inline __m128d __attribute__((__always_inline__))
120_mm_load1_pd (double const *__P)
121{
122 return _mm_set1_pd (*__P);
123}
124
125/* Create a vector with element 0 as *P and the rest zero. */
126static __inline __m128d __attribute__((__always_inline__))
127_mm_load_sd (double const *__P)
128{
129 return _mm_set_sd (*__P);
130}
131
132static __inline __m128d __attribute__((__always_inline__))
133_mm_load_pd1 (double const *__P)
134{
135 return _mm_load1_pd (__P);
136}
137
138/* Load two DPFP values in reverse order. The address must be aligned. */
139static __inline __m128d __attribute__((__always_inline__))
140_mm_loadr_pd (double const *__P)
141{
142 __m128d __tmp = _mm_load_pd (__P);
143 return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
144}
145
146/* Store two DPFP values. The address must be 16-byte aligned. */
147static __inline void __attribute__((__always_inline__))
148_mm_store_pd (double *__P, __m128d __A)
149{
150 *(__m128d *)__P = __A;
151}
152
153/* Store two DPFP values. The address need not be 16-byte aligned. */
154static __inline void __attribute__((__always_inline__))
155_mm_storeu_pd (double *__P, __m128d __A)
156{
157 __builtin_ia32_storeupd (__P, __A);
158}
159
160/* Stores the lower DPFP value. */
161static __inline void __attribute__((__always_inline__))
162_mm_store_sd (double *__P, __m128d __A)
163{
164 *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
165}
166
167static __inline double __attribute__((__always_inline__))
168_mm_cvtsd_f64 (__m128d __A)
169{
170 return __builtin_ia32_vec_ext_v2df (__A, 0);
171}
172
173static __inline void __attribute__((__always_inline__))
174_mm_storel_pd (double *__P, __m128d __A)
175{
176 _mm_store_sd (__P, __A);
177}
178
179/* Stores the upper DPFP value. */
180static __inline void __attribute__((__always_inline__))
181_mm_storeh_pd (double *__P, __m128d __A)
182{
183 *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
184}
185
186/* Store the lower DPFP value across two words.
187 The address must be 16-byte aligned. */
188static __inline void __attribute__((__always_inline__))
189_mm_store1_pd (double *__P, __m128d __A)
190{
191 _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
192}
193
194static __inline void __attribute__((__always_inline__))
195_mm_store_pd1 (double *__P, __m128d __A)
196{
197 _mm_store1_pd (__P, __A);
198}
199
200/* Store two DPFP values in reverse order. The address must be aligned. */
201static __inline void __attribute__((__always_inline__))
202_mm_storer_pd (double *__P, __m128d __A)
203{
204 _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
205}
206
207static __inline int __attribute__((__always_inline__))
208_mm_cvtsi128_si32 (__m128i __A)
209{
210 return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
211}
212
213#ifdef __x86_64__
214/* Intel intrinsic. */
215static __inline long long __attribute__((__always_inline__))
216_mm_cvtsi128_si64 (__m128i __A)
217{
218 return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
219}
220
221/* Microsoft intrinsic. */
222static __inline long long __attribute__((__always_inline__))
223_mm_cvtsi128_si64x (__m128i __A)
224{
225 return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
226}
227#endif
228
229static __inline __m128d __attribute__((__always_inline__))
230_mm_add_pd (__m128d __A, __m128d __B)
231{
232 return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
233}
234
235static __inline __m128d __attribute__((__always_inline__))
236_mm_add_sd (__m128d __A, __m128d __B)
237{
238 return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
239}
240
241static __inline __m128d __attribute__((__always_inline__))
242_mm_sub_pd (__m128d __A, __m128d __B)
243{
244 return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
245}
246
247static __inline __m128d __attribute__((__always_inline__))
248_mm_sub_sd (__m128d __A, __m128d __B)
249{
250 return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
251}
252
253static __inline __m128d __attribute__((__always_inline__))
254_mm_mul_pd (__m128d __A, __m128d __B)
255{
256 return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
257}
258
259static __inline __m128d __attribute__((__always_inline__))
260_mm_mul_sd (__m128d __A, __m128d __B)
261{
262 return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
263}
264
265static __inline __m128d __attribute__((__always_inline__))
266_mm_div_pd (__m128d __A, __m128d __B)
267{
268 return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
269}
270
271static __inline __m128d __attribute__((__always_inline__))
272_mm_div_sd (__m128d __A, __m128d __B)
273{
274 return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
275}
276
277static __inline __m128d __attribute__((__always_inline__))
278_mm_sqrt_pd (__m128d __A)
279{
280 return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
281}
282
283/* Return pair {sqrt (A[0), B[1]}. */
static __inline __m128d __attribute__((__always_inline__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd
    ((__v2df) __A,
     (__v2df) __builtin_ia32_cmpltsd ((__v2df) __B, (__v2df) __A));
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd
    ((__v2df) __A,
     (__v2df) __builtin_ia32_cmplesd ((__v2df) __B, (__v2df) __A));
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd
    ((__v2df) __A,
     (__v2df) __builtin_ia32_cmpnltsd ((__v2df) __B, (__v2df) __A));
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd
    ((__v2df) __A,
     (__v2df) __builtin_ia32_cmpnlesd ((__v2df) __B, (__v2df) __A));
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
}

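/* The packed compares above return an all-ones or all-zero mask per lane,
   which combines with the logical ops above to select values without
   branches.  Usage sketch, not compiled: clamp_nonneg_pd is a hypothetical
   name for code in a file that includes this header, and _mm_setzero_pd
   comes from earlier in this header.  */
#if 0
static __m128d
clamp_nonneg_pd (__m128d __X)
{
  __m128d __mask = _mm_cmpge_pd (__X, _mm_setzero_pd ());
  return _mm_and_pd (__mask, __X);	/* lanes with X < 0.0 become 0.0 */
}
#endif
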
static __inline int __attribute__((__always_inline__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
}

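/* Unlike the packed compares, the comi/ucomi forms return an int and so
   drive ordinary control flow; the ucomi variants differ only in staying
   quiet on QNaN operands.  Usage sketch, not compiled: scalar_max is a
   hypothetical name, and _mm_set_sd comes from earlier in this header.  */
#if 0
static double
scalar_max (double __a, double __b)
{
  return _mm_comigt_sd (_mm_set_sd (__a), _mm_set_sd (__b)) ? __a : __b;
}
#endif
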
/* Create a vector of Qi, where i is the element number.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi64 (__m64 __q1, __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
		short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

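/* Illustration, not compiled: _mm_set_epi32 lists elements from most to
   least significant, while _mm_setr_epi32 lists them in memory order, so
   the two vectors below are identical (set_vs_setr is a hypothetical
   name).  */
#if 0
static void
set_vs_setr (void)
{
  __m128i __a = _mm_set_epi32 (3, 2, 1, 0);
  __m128i __b = _mm_setr_epi32 (0, 1, 2, 3);
  (void) __a; (void) __b;
}
#endif
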
/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

/* Load 128 bits of integer data.  The address need not be aligned.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_loadu_si128 (__m128i const *__P)
{
  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
}

/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_loadl_epi64 (__m128i const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

static __inline void __attribute__((__always_inline__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  *__P = __B;
}

static __inline void __attribute__((__always_inline__))
_mm_storeu_si128 (__m128i *__P, __m128i __B)
{
  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
}

static __inline void __attribute__((__always_inline__))
_mm_storel_epi64 (__m128i *__P, __m128i __B)
{
  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}

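/* Usage sketch, not compiled: _mm_loadu_si128 tolerates any alignment,
   while _mm_load_si128 and _mm_store_si128 require 16-byte alignment
   (copy16 is a hypothetical name).  */
#if 0
static void
copy16 (__m128i *__dst /* 16-byte aligned */, void const *__src)
{
  _mm_store_si128 (__dst, _mm_loadu_si128 ((__m128i const *) __src));
}
#endif
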
static __inline __m64 __attribute__((__always_inline__))
_mm_movepi64_pi64 (__m128i __B)
{
  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, _mm_movepi64_pi64 (__A));
}

/* Create a vector of zeros.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cvtepi32_pd (__m128i __A)
{
  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvtepi32_ps (__m128i __A)
{
  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cvtpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvtpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpd_ps (__m128d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cvttpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvttpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cvtpi32_pd (__m64 __A)
{
  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cvtps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cvttps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cvtps_pd (__m128 __A)
{
  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
}

static __inline int __attribute__((__always_inline__))
_mm_cvtsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}
#endif

static __inline int __attribute__((__always_inline__))
_mm_cvttsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvttsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvttsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}
#endif

static __inline __m128 __attribute__((__always_inline__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}

/* Microsoft intrinsic.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}
#endif

static __inline __m128d __attribute__((__always_inline__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
}

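/* The cvt conversions round according to the current rounding mode
   (round-to-nearest by default); the cvtt forms truncate toward zero like
   a C cast.  Illustration, not compiled (round_vs_truncate is a
   hypothetical name).  */
#if 0
static void
round_vs_truncate (void)
{
  __m128d __x = _mm_set_sd (2.7);
  int __r = _mm_cvtsd_si32 (__x);	/* 3 under the default mode */
  int __t = _mm_cvttsd_si32 (__x);	/* 2, as in (int) 2.7 */
  (void) __r; (void) __t;
}
#endif
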
#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C)))

static __inline __m128d __attribute__((__always_inline__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
}

static __inline int __attribute__((__always_inline__))
_mm_movemask_pd (__m128d __A)
{
  return __builtin_ia32_movmskpd ((__v2df)__A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}

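/* The adds/subs forms clamp to the element type's range instead of
   wrapping modulo 2^N.  Illustration, not compiled (wrap_vs_saturate is a
   hypothetical name).  */
#if 0
static void
wrap_vs_saturate (void)
{
  __m128i __a = _mm_set1_epi8 ((char) 200);
  __m128i __b = _mm_set1_epi8 ((char) 100);
  __m128i __wrap = _mm_add_epi8 (__a, __b);	/* 300 wraps to 44 per byte */
  __m128i __sat = _mm_adds_epu8 (__a, __b);	/* clamps to 255 per byte */
  (void) __wrap; (void) __sat;
}
#endif
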
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}

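/* _mm_madd_epi16 multiplies eight pairs of signed 16-bit values and sums
   adjacent products into four 32-bit lanes, the core of an integer dot
   product.  Usage sketch, not compiled: dot8_epi16 is a hypothetical name
   and both pointers are assumed 16-byte aligned.  */
#if 0
static int
dot8_epi16 (short const *__a, short const *__b)
{
  int __sums[4] __attribute__ ((__aligned__ (16)));
  __m128i __p = _mm_madd_epi16 (_mm_load_si128 ((__m128i const *) __a),
				_mm_load_si128 ((__m128i const *) __b));
  _mm_store_si128 ((__m128i *) __sums, __p);
  return __sums[0] + __sums[1] + __sums[2] + __sums[3];
}
#endif
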
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
}
#else
#define _mm_slli_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psllwi128 ((__v8hi)(__A), __B))
#define _mm_slli_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_pslldi128 ((__v4si)(__A), __B))
#define _mm_slli_epi64(__A, __B) \
  ((__m128i)__builtin_ia32_psllqi128 ((__v2di)(__A), __B))
#endif

#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
}
#else
#define _mm_srai_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psrawi128 ((__v8hi)(__A), __B))
#define _mm_srai_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_psradi128 ((__v4si)(__A), __B))
#endif

#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srli_si128 (__m128i __A, int __B)
{
  return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B * 8));
}

static __inline __m128i __attribute__((__always_inline__))
_mm_slli_si128 (__m128i __A, int __B)
{
  return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B * 8));
}
#else
#define _mm_srli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_psrldqi128 (__A, (__B) * 8))
#define _mm_slli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_pslldqi128 (__A, (__B) * 8))
#endif

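/* The si128 shifts move whole bytes, and the count must be a compile-time
   constant.  Usage sketch, not compiled: swapping the two 64-bit halves
   of a vector (swap_halves is a hypothetical name).  */
#if 0
static __m128i
swap_halves (__m128i __X)
{
  return _mm_or_si128 (_mm_srli_si128 (__X, 8), _mm_slli_si128 (__X, 8));
}
#endif
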
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
}
#else
#define _mm_srli_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psrlwi128 ((__v8hi)(__A), __B))
#define _mm_srli_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_psrldi128 ((__v4si)(__A), __B))
#define _mm_srli_epi64(__A, __B) \
  ((__m128i)__builtin_ia32_psrlqi128 ((__v2di)(__A), __B))
#endif

static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pslld128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
}

#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
}
#else
#define _mm_extract_epi16(A, N) \
  ((int) __builtin_ia32_vec_ext_v8hi ((__v8hi)(A), (N)))
#define _mm_insert_epi16(A, D, N) \
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(A), (D), (N)))
#endif

static __inline __m128i __attribute__((__always_inline__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}

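/* A common pattern in code using this header: byte-compare, then collapse
   the 16 lane masks into an int with _mm_movemask_epi8 for scalar testing.
   Sketch, not compiled: find_byte16 is a hypothetical name and __P is
   assumed 16-byte aligned.  */
#if 0
static int
find_byte16 (char const *__P, char __C)
{
  __m128i __eq = _mm_cmpeq_epi8 (_mm_load_si128 ((__m128i const *) __P),
				 _mm_set1_epi8 (__C));
  int __mask = _mm_movemask_epi8 (__eq);
  return __mask ? __builtin_ctz (__mask) : -1;	/* index of first match */
}
#endif
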
static __inline __m128i __attribute__((__always_inline__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}

#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __B))
#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __B))
#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)__A, __B))

static __inline void __attribute__((__always_inline__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}

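/* _mm_sad_epu8 yields two sums of absolute byte differences, one in the
   low 16 bits of each 64-bit half; adding the halves gives the SAD of all
   16 bytes.  Sketch, not compiled (sad16 is a hypothetical name).  */
#if 0
static int
sad16 (unsigned char const *__a, unsigned char const *__b)
{
  __m128i __s = _mm_sad_epu8 (_mm_loadu_si128 ((__m128i const *) __a),
			      _mm_loadu_si128 ((__m128i const *) __b));
  return _mm_cvtsi128_si32 (__s)
    + _mm_cvtsi128_si32 (_mm_srli_si128 (__s, 8));
}
#endif
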
static __inline void __attribute__((__always_inline__))
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}

static __inline void __attribute__((__always_inline__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}

static __inline void __attribute__((__always_inline__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}

static __inline void __attribute__((__always_inline__))
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}

static __inline void __attribute__((__always_inline__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

static __inline void __attribute__((__always_inline__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}

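/* The stream stores write around the cache; a fence then orders them with
   respect to later ordinary accesses.  Sketch, not compiled: stream_fill
   is a hypothetical name, __dst is assumed 16-byte aligned and __n a
   multiple of 4.  */
#if 0
static void
stream_fill (int *__dst, int __v, int __n)
{
  __m128i __x = _mm_set1_epi32 (__v);
  int __i;
  for (__i = 0; __i < __n; __i += 4)
    _mm_stream_si128 ((__m128i *) (__dst + __i), __x);
  _mm_mfence ();
}
#endif
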
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi64_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}

/* Microsoft intrinsic.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi64x_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}
#endif

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

static __inline __m128i __attribute__((__always_inline__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

static __inline __m128d __attribute__((__always_inline__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

static __inline __m128i __attribute__((__always_inline__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

static __inline __m128 __attribute__((__always_inline__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

static __inline __m128d __attribute__((__always_inline__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

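/* Because the casts cost nothing at run time, code using this header can
   do integer bit manipulation on floating-point data.  Sketch, not
   compiled: clearing the sign bits of two packed doubles (abs_pd is a
   hypothetical name).  */
#if 0
static __m128d
abs_pd (__m128d __X)
{
  __m128i __ones = _mm_cmpeq_epi32 (_mm_setzero_si128 (),
				    _mm_setzero_si128 ());
  __m128i __mask = _mm_srli_epi64 (__ones, 1);	/* 0x7fff...f per lane */
  return _mm_castsi128_pd (_mm_and_si128 (__mask, _mm_castpd_si128 (__X)));
}
#endif
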
#endif /* __SSE2__ */

#endif /* _EMMINTRIN_H_INCLUDED */