/* xmmintrin.h — revision 193576 */
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __XMMINTRIN_H
25#define __XMMINTRIN_H
26
27#ifndef __SSE__
28#error "SSE instruction set not enabled"
29#else
30
31#include <mmintrin.h>
32
33typedef float __v4sf __attribute__((__vector_size__(16)));
34typedef float __m128 __attribute__((__vector_size__(16)));
35
36#include <mm_malloc.h>
37
/*
 * Arithmetic.  The _ss forms operate on the low (scalar) element only and
 * pass the upper three elements of the first operand through unchanged;
 * the _ps forms operate element-wise on all four lanes.  Written with
 * vector-extension operators so clang selects ADDSS/ADDPS etc. directly.
 */

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  /* ADDSS: a[0] + b[0]; a[1..3] preserved. */
  a[0] += b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  /* ADDPS: element-wise addition. */
  return a + b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  /* SUBSS: a[0] - b[0]; a[1..3] preserved. */
  a[0] -= b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  /* SUBPS: element-wise subtraction. */
  return a - b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  /* MULSS: a[0] * b[0]; a[1..3] preserved. */
  a[0] *= b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  /* MULPS: element-wise multiplication. */
  return a * b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  /* DIVSS: a[0] / b[0]; a[1..3] preserved. */
  a[0] /= b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  /* DIVPS: element-wise division. */
  return a / b;
}
89
/*
 * Square root, reciprocal, and reciprocal square root.  RCP* and RSQRT*
 * are fast hardware approximations (roughly 12 bits of precision), not
 * exact results.
 */

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  /* SQRTSS: sqrt of the low element; upper lanes preserved by the insn. */
  return __builtin_ia32_sqrtss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  /* SQRTPS: element-wise square root. */
  return __builtin_ia32_sqrtps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  /* RCPSS: approximate 1/a[0]; upper lanes preserved. */
  return __builtin_ia32_rcpss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  /* RCPPS: element-wise approximate reciprocal. */
  return __builtin_ia32_rcpps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  /* RSQRTSS: approximate 1/sqrt(a[0]); upper lanes preserved. */
  return __builtin_ia32_rsqrtss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  /* RSQRTPS: element-wise approximate reciprocal square root. */
  return __builtin_ia32_rsqrtps(a);
}
125
/*
 * Min/max.  Note the SSE semantics: if either operand is NaN, or both are
 * zero (of either sign), the SECOND operand is returned.
 */

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  /* MINSS: min of the low elements; a's upper lanes preserved. */
  return __builtin_ia32_minss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  /* MINPS: element-wise minimum. */
  return __builtin_ia32_minps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  /* MAXSS: max of the low elements; a's upper lanes preserved. */
  return __builtin_ia32_maxss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  /* MAXPS: element-wise maximum. */
  return __builtin_ia32_maxps(a, b);
}
149
/*
 * Bitwise logic on the full 128 bits.  The & | operators are not defined
 * on float vectors, so each function casts through a local integer vector
 * type and back.
 */

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  /* ANDPS: a & b. */
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a & (__v4si)b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  /* ANDNPS: (~a) & b — note the FIRST operand is the complemented one. */
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  /* ORPS: a | b. */
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a | (__v4si)b);
}
170
171static inline __m128 __attribute__((__always_inline__, __nodebug__))
172_mm_xor_ps(__m128 a, __m128 b)
173{
174  typedef int __v4si __attribute__((__vector_size__(16)));
175  return (__m128)((__v4si)a ^ ~(__v4si)b);
176}
177
/*
 * Comparisons.  CMPSS/CMPPS take a predicate immediate:
 * 0 = eq, 1 = lt, 2 = le, 3 = unord, 4 = neq, 5 = nlt, 6 = nle, 7 = ord.
 * Results are all-ones (true) / all-zeros (false) masks per element; the
 * _ss forms compare only the low element and keep a's upper lanes.
 */

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  /* CMPEQSS: mask = (a[0] == b[0]). */
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  /* CMPEQPS: element-wise ==. */
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  /* CMPLTSS: mask = (a[0] < b[0]). */
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  /* CMPLTPS: element-wise <. */
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  /* CMPLESS: mask = (a[0] <= b[0]). */
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  /* CMPLEPS: element-wise <=. */
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}
213
214static inline __m128 __attribute__((__always_inline__, __nodebug__))
215_mm_cmpgt_ss(__m128 a, __m128 b)
216{
217  return (__m128)__builtin_ia32_cmpss(b, a, 1);
218}
219
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  /* a > b implemented as b < a; for the packed form the operand swap is
     harmless since every lane is a compare result. */
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}
225
226static inline __m128 __attribute__((__always_inline__, __nodebug__))
227_mm_cmpge_ss(__m128 a, __m128 b)
228{
229  return (__m128)__builtin_ia32_cmpss(b, a, 2);
230}
231
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  /* a >= b as b <= a; safe for the packed form. */
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  /* CMPNEQSS: mask = !(a[0] == b[0]); true for unordered operands. */
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  /* CMPNEQPS: element-wise !=. */
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  /* CMPNLTSS: not-less-than; differs from >= when operands are NaN. */
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  /* CMPNLTPS: element-wise not-less-than. */
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  /* CMPNLESS: not-less-than-or-equal on the low elements. */
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  /* CMPNLEPS: element-wise not-less-than-or-equal. */
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}
273
274static inline __m128 __attribute__((__always_inline__, __nodebug__))
275_mm_cmpngt_ss(__m128 a, __m128 b)
276{
277  return (__m128)__builtin_ia32_cmpss(b, a, 5);
278}
279
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  /* not(a > b) as not(b < a); safe for the packed form. */
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}
285
286static inline __m128 __attribute__((__always_inline__, __nodebug__))
287_mm_cmpnge_ss(__m128 a, __m128 b)
288{
289  return (__m128)__builtin_ia32_cmpss(b, a, 6);
290}
291
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  /* not(a >= b) as not(b <= a); safe for the packed form. */
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  /* CMPORDSS: true when neither low element is NaN. */
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  /* CMPORDPS: element-wise "both operands are not NaN". */
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  /* CMPUNORDSS: true when either low element is NaN. */
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  /* CMPUNORDPS: element-wise "either operand is NaN". */
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}
321
/*
 * Scalar compares returning 0/1 in an integer register.  COMISS signals an
 * invalid-operation exception on QNaN operands; UCOMISS does not ("u" =
 * unordered-quiet).  Both compare only the low elements.
 */

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  /* COMISS: 1 if a[0] == b[0]. */
  return __builtin_ia32_comieq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  /* COMISS: 1 if a[0] < b[0]. */
  return __builtin_ia32_comilt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  /* COMISS: 1 if a[0] <= b[0]. */
  return __builtin_ia32_comile(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  /* COMISS: 1 if a[0] > b[0]. */
  return __builtin_ia32_comigt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  /* COMISS: 1 if a[0] >= b[0]. */
  return __builtin_ia32_comige(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  /* COMISS: 1 if a[0] != b[0]. */
  return __builtin_ia32_comineq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  /* UCOMISS (quiet on QNaN): 1 if a[0] == b[0]. */
  return __builtin_ia32_ucomieq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  /* UCOMISS: 1 if a[0] < b[0]. */
  return __builtin_ia32_ucomilt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  /* UCOMISS: 1 if a[0] <= b[0]. */
  return __builtin_ia32_ucomile(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  /* UCOMISS: 1 if a[0] > b[0]. */
  return __builtin_ia32_ucomigt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  /* UCOMISS: 1 if a[0] >= b[0]. */
  return __builtin_ia32_ucomige(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  /* UCOMISS: 1 if a[0] != b[0]. */
  return __builtin_ia32_ucomineq(a, b);
}
393
/* Conversions between __m128 lanes and scalar / MMX integer values. */

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  /* CVTSS2SI: rounds per the current MXCSR rounding mode. */
  return __builtin_ia32_cvtss2si(a);
}

#ifdef __x86_64__

static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  /* CVTSS2SI (64-bit destination): rounds per the MXCSR rounding mode. */
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  /* CVTPS2PI: low two floats -> two 32-bit ints, current rounding mode. */
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  /* C float->int conversion truncates toward zero, matching CVTTSS2SI. */
  return a[0];
}

static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  /* Truncating conversion of the low element to a 64-bit integer. */
  return a[0];
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  /* CVTTPS2PI: low two floats -> two 32-bit ints, truncating. */
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  /* CVTSI2SS: replace the low element with (float)b; upper lanes kept. */
  a[0] = b;
  return a;
}

#ifdef __x86_64__

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  /* CVTSI2SS with a 64-bit integer source. */
  a[0] = b;
  return a;
}

#endif

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  /* CVTPI2PS: two 32-bit ints -> low two floats; high lanes come from a. */
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static inline float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  /* Extract the low element as a plain float (no conversion). */
  return a[0];
}
463
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, __m64 const *p)
{
  /* MOVHPS: load two floats from p into the high half; low half from a. */
  return __builtin_ia32_loadhps(a, (__v2si *)p);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, __m64 const *p)
{
  /* MOVLPS: load two floats from p into the low half; high half from a. */
#if 0
  // FIXME: This should work, but gives really crappy code at the moment
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
#endif
  return __builtin_ia32_loadlps(a, (__v2si *)p);
}
482
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(float *p)
{
  /* MOVSS: load one float into lane 0; upper lanes zeroed. */
  return (__m128){ *p, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(float *p)
{
  /* Load one float and broadcast it to all four lanes. */
  return (__m128){ *p, *p, *p, *p };
}

/* Alternate (older) spelling of _mm_load1_ps. */
#define        _mm_load_ps1(p) _mm_load1_ps(p)

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(float *p)
{
  /* MOVAPS: aligned load — p must be 16-byte aligned. */
  return *(__m128*)p;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(float *p)
{
  /* MOVUPS: unaligned load. */
  return __builtin_ia32_loadups(p);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(float *p)
{
  /* Aligned load with the four elements reversed. */
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}
515
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  /* Low lane = w; upper lanes zeroed. */
  return (__m128){ w, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  /* Broadcast w to all four lanes. */
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
    return _mm_set1_ps(w);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  /* NOTE: arguments are highest-lane first, so w lands in lane 0. */
  return (__m128){ w, x, y, z };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  /* Reversed (memory-order) variant: first argument lands in lane 0. */
  return (__m128){ z, y, x, w };
}

static inline __m128 __attribute__((__always_inline__))
_mm_setzero_ps(void)
{
  /* XORPS idiom: all four lanes zero. */
  return (__m128){ 0, 0, 0, 0 };
}
552
static inline void __attribute__((__always_inline__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  /* MOVHPS (store form): write a's high two floats to *p. */
  __builtin_ia32_storehps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  /* MOVLPS (store form): write a's low two floats to *p. */
  __builtin_ia32_storelps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__))
_mm_store_ss(float *p, __m128 a)
{
  /* MOVSS (store form): write the low element only. */
  *p = a[0];
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  /* MOVUPS: unaligned store of all four elements. */
  __builtin_ia32_storeups(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  /* Broadcast lane 0 to all lanes, then store (unaligned). */
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  /* MOVAPS: aligned store — p must be 16-byte aligned. */
  *(__m128 *)p = a;
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  /* Aligned store with the four elements reversed. */
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}
596
/* Locality hints for _mm_prefetch (select PREFETCHT0/T1/T2/NTA). */
#define _MM_HINT_T0 1
#define _MM_HINT_T1 2
#define _MM_HINT_T2 3
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/* Prefetch the cache line containing a for reading, with locality sel. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel))
606
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  /* MOVNTQ: non-temporal (cache-bypassing) 64-bit store. */
  __builtin_ia32_movntq(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  /* MOVNTPS: non-temporal 128-bit store; p must be 16-byte aligned. */
  __builtin_ia32_movntps(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  /* SFENCE: order all prior stores (incl. non-temporal) before later ones. */
  __builtin_ia32_sfence();
}
624
/* SSE extensions to the MMX (__m64) integer operations. */

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  /* PEXTRW: zero-extend 16-bit element n (mod 4) into an int. */
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  /* PINSRW: replace 16-bit element n (mod 4) with the low word of d. */
   __v4hi b = (__v4hi)a;
   b[n & 3] = d;
   return (__m64)b;
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  /* PMAXSW: element-wise signed 16-bit maximum. */
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  /* PMAXUB: element-wise unsigned 8-bit maximum. */
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  /* PMINSW: element-wise signed 16-bit minimum. */
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  /* PMINUB: element-wise unsigned 8-bit minimum. */
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  /* PMOVMSKB: gather the sign bit of each byte into the low 8 bits. */
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  /* PMULHUW: high 16 bits of the unsigned 16x16-bit products. */
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

/* PSHUFW: permute the four 16-bit elements of a by the immediate n
   (two selector bits per destination element). */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \
                                  (n) & 0x3, ((n) & 0xc) >> 2, \
                                  ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6))

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  /* MASKMOVQ: store bytes of d to p where the mask byte's MSB is set. */
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  /* PAVGB: unsigned 8-bit rounded average, (a + b + 1) >> 1. */
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  /* PAVGW: unsigned 16-bit rounded average. */
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  /* PSADBW: sum of absolute byte differences, result in the low word. */
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}
704
static inline unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  /* STMXCSR: read the SSE control/status register. */
  return __builtin_ia32_stmxcsr();
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  /* LDMXCSR: write the SSE control/status register. */
  __builtin_ia32_ldmxcsr(i);
}
716
/* SHUFPS: low two result elements selected from a, high two from b, per
   the 8-bit immediate mask (2 selector bits per element). */
#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector(a, b, (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  /* UNPCKHPS: interleave the high halves -> { a2, b2, a3, b3 }. */
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  /* UNPCKLPS: interleave the low halves -> { a0, b0, a1, b1 }. */
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  /* MOVSS: lane 0 from b, lanes 1-3 from a. */
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  /* MOVHLPS: low half from b's high half, high half from a's high half. */
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  /* MOVLHPS: low half from a's low half, high half from b's low half. */
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}
751
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  /* Convert four signed 16-bit ints to four floats: build the sign
     extension with a compare mask, then convert high and low pairs. */
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);       /* b = sign mask of a */
  c = _mm_unpackhi_pi16(a, b);    /* high two elements, sign-extended */
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);        /* move converted pair to the high half */
  c = _mm_unpacklo_pi16(a, b);    /* low two elements, sign-extended */
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  /* Convert four unsigned 16-bit ints to floats (zero-extend, same
     high/low pair scheme as _mm_cvtpi16_ps). */
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  /* Convert the low four signed 8-bit ints to floats via sign-extension
     to 16 bits. */
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);        /* sign mask of a */
  b = _mm_unpacklo_pi8(a, b);     /* low four bytes, sign-extended */

  return _mm_cvtpi16_ps(b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  /* Convert the low four unsigned 8-bit ints to floats (zero-extend). */
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  /* Convert two pairs of 32-bit ints: a -> low two lanes, b -> high two. */
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}
821
822static inline __m64 __attribute__((__always_inline__, __nodebug__))
823_mm_cvtps_pi16(__m128 a)
824{
825  __m64 b, c;
826
827  b = _mm_cvtps_pi32(a);
828  a = _mm_movehl_ps(a, a);
829  c = _mm_cvtps_pi32(a);
830
831  return _mm_packs_pi16(b, c);
832}
833
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  /* Convert four floats to four signed 8-bit ints with saturation:
     floats -> 16-bit ints, then pack 16->8 against zero. */
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  /* MOVMSKPS: gather the sign bit of each lane into the low 4 bits. */
  return __builtin_ia32_movmskps(a);
}
850
/* Build a SHUFPS/PSHUFW immediate: element w selects lane 0 up through
   element z selecting lane 3. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* MXCSR exception status flags. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* MXCSR exception mask (suppression) bits. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* MXCSR rounding-control field. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

/* MXCSR flush-to-zero control.
   Fixed: _MM_FLUSH_ZERO_OFF was defined as 0x8000 — identical to
   _MM_FLUSH_ZERO_ON — so _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF)
   enabled flush-to-zero instead of disabling it.  "Off" clears the bit. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

/* Read the individual MXCSR fields. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Update one MXCSR field, preserving the others. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
888
/* Transpose the 4x4 float matrix held in row0..row3, in place.
   Fixed: row3 previously used _mm_movelh_ps(tmp3, tmp1), which yields
   { tmp3[0], tmp3[1], tmp1[0], tmp1[1] } — the wrong elements (it
   duplicated parts of rows 0/2's data).  The fourth output row must take
   the HIGH halves of tmp1/tmp3, i.e. _mm_movehl_ps(tmp3, tmp1), matching
   the GCC/ICC definition. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
901
902#include <emmintrin.h>
903
904#endif /* __SSE__ */
905
906#endif /* __XMMINTRIN_H */
907