/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

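/* Both types below are 128-bit vectors of four floats. Clang's vector
 * extensions allow element access with [], which several of the scalar
 * intrinsics in this file rely on (e.g. a[0] is the low element of an
 * __m128). */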
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

#include <mm_malloc.h>

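/* Naming convention: the "_ss" forms operate on the low (scalar) element
 * only and pass the upper three elements of the first operand through,
 * while the "_ps" forms operate on all four packed elements. A hedged
 * usage sketch (variable names are illustrative only):
 *
 *   __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   __m128 y = _mm_set1_ps(10.0f);
 *   __m128 s = _mm_add_ss(x, y);   // { 11, 2, 3, 4 }
 *   __m128 p = _mm_add_ps(x, y);   // { 11, 12, 13, 14 }
 */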
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a & (__v4si)b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a | (__v4si)b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a ^ (__v4si)b);
}

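/* The comparison intrinsics below return element-wise masks: all one bits
 * (0xffffffff, a NaN when viewed as a float) where the predicate holds and
 * all zero bits where it does not. The third argument to the cmpss/cmpps
 * builtins selects the predicate: 0 = eq, 1 = lt, 2 = le, 3 = unord,
 * 4 = neq, 5 = nlt, 6 = nle, 7 = ord. gt/ge have no immediate of their
 * own, so they are expressed as lt/le with the operands swapped. */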
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  /* cmpss only has lt/le forms, so compare with the operands swapped, then
     merge the scalar result back into a so that the upper three elements
     of the return value come from a (not b), as the intrinsic requires. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 1),
                                         4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 2),
                                         4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  /* Like _mm_cmpgt_ss: the compare is done with the operands swapped, and
     the scalar result is merged back so the upper three elements of the
     return value come from a. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 5),
                                         4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 6),
                                         4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}

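/* The _mm_comi*_ss and _mm_ucomi*_ss intrinsics compare the low elements
 * and return the result as an int (0 or 1). The comiss forms signal an
 * invalid-operation exception on any NaN operand; the ucomiss ("unordered")
 * forms signal only on a signaling NaN. */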
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

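/* Float/int conversions: the cvt forms round according to the current
 * MXCSR rounding mode, while the cvtt forms truncate toward zero (written
 * below as a plain C cast, which the compiler lowers to cvttss2si). */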
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

#ifdef __x86_64__

static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

#ifdef __x86_64__

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static inline float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, __m64 const *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, __m64 const *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
}

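/* Loads: _mm_load_ps requires p to be 16-byte aligned; _mm_loadu_ps has no
 * alignment requirement. _mm_load_ss and _mm_load1_ps load a single float,
 * zero-filling or broadcasting the remaining elements, respectively. */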
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(float *p)
{
  return *(__m128*)p;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(float *p)
{
  return __builtin_ia32_loadups(p);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

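/* Note the argument order: _mm_set_ps takes elements from the highest to
 * the lowest, _mm_setr_ps ("reversed") from the lowest to the highest.
 * For example, _mm_set_ps(3, 2, 1, 0) and _mm_setr_ps(0, 1, 2, 3) both
 * produce the vector whose element i equals i. */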
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
  return _mm_set1_ps(w);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

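/* Stores mirror the loads: _mm_store_ps requires 16-byte alignment,
 * _mm_storeu_ps does not. _mm_storeh_pi/_mm_storel_pi write the high or
 * low pair of floats to a 64-bit location. */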
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_ss(float *p, __m128 a)
{
  *p = a[0];
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}

#define _MM_HINT_T0 1
#define _MM_HINT_T1 2
#define _MM_HINT_T2 3
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))

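/* Non-temporal ("streaming") stores bypass the cache. They are weakly
 * ordered, so a writer should issue _mm_sfence() after the stores before
 * another agent reads the data. */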
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  __v4hi b = (__v4hi)a;
  b[n & 3] = d;
  return (__m64)b;
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \
                                  (n) & 0x3, ((n) & 0xc) >> 2, \
                                  ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6))

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static inline unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

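/* The 8-bit shuffle mask selects, two bits per lane, which source element
 * lands in each result position; _MM_SHUFFLE (defined below) builds such a
 * mask. For _mm_shuffle_ps the low two result elements come from a and the
 * high two from b, e.g. _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0))
 * yields { a[0], a[1], b[2], b[3] }. */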
#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector(a, b, (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);
  c = _mm_cvtps_pi32(a);

  /* b and c each hold two 32-bit integers, so packing them down to four
     16-bit values takes _mm_packs_pi32 (packssdw), not _mm_packs_pi16. */
  return _mm_packs_pi32(b, c);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
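
/* A hedged usage sketch: enable flush-to-zero and round-toward-zero for a
 * code region, then restore the previous control/status word.
 *
 *   unsigned int csr = _mm_getcsr();
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   ...
 *   _mm_setcsr(csr);
 */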
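/* Transposes a 4x4 matrix held in four __m128 rows, in place, using four
 * unpacks followed by four 64-bit moves. */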
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */
