/* xmmintrin.h, revision 204643 */

/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

#include <mm_malloc.h>

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}
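
/* A note on naming: the "_ss" (scalar single) forms operate only on element 0
 * and pass the upper three elements of the first operand through unchanged,
 * while the "_ps" (packed single) forms operate on all four elements. An
 * illustrative sketch (variable names are examples only):
 *
 *   __m128 a = _mm_set_ps(8.0f, 6.0f, 4.0f, 2.0f);  // elements {2, 4, 6, 8}
 *   __m128 b = _mm_set1_ps(1.0f);
 *   __m128 s = _mm_add_ss(a, b);  // {3, 4, 6, 8}: only element 0 changed
 *   __m128 p = _mm_add_ps(a, b);  // {3, 5, 7, 9}: all four elements
 */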

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a & (__v4si)b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a | (__v4si)b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  typedef int __v4si __attribute__((__vector_size__(16)));
  return (__m128)((__v4si)a ^ (__v4si)b);
}
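
/* The bitwise operations treat each vector as 128 raw bits; a common use is
 * sign-bit manipulation. A sketch, assuming x is some __m128 value:
 *
 *   __m128 sign = _mm_set1_ps(-0.0f);       // 0x80000000 in every element
 *   __m128 mag  = _mm_andnot_ps(sign, x);   // clears sign bits: fabsf(x)
 *   __m128 neg  = _mm_xor_ps(sign, x);      // flips sign bits: -x
 */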

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  /* Compare with the operands swapped, then splice the result's low element
     onto a's upper three elements, matching the scalar cmpss semantics. */
  return __builtin_shufflevector(a, (__m128)__builtin_ia32_cmpss(b, a, 1),
                                 4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, (__m128)__builtin_ia32_cmpss(b, a, 2),
                                 4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  /* As with _mm_cmpgt_ss, restore a's upper elements after the swapped
     compare. */
  return __builtin_shufflevector(a, (__m128)__builtin_ia32_cmpss(b, a, 5),
                                 4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, (__m128)__builtin_ia32_cmpss(b, a, 6),
                                 4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}
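
/* The cmp* intrinsics above return per-element masks (all-ones for true,
 * all-zeros for false) rather than booleans, which enables branchless
 * selection. A sketch, assuming __m128 values a, b, and t:
 *
 *   __m128 m = _mm_cmplt_ps(a, t);               // mask of elements a < t
 *   __m128 r = _mm_or_ps(_mm_and_ps(m, a),       // take a where a < t,
 *                        _mm_andnot_ps(m, b));   // b elsewhere
 */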

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}
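
/* Unlike the cmp* mask forms, the comi and ucomi intrinsics above compare
 * only element 0 and return an int usable directly in a branch; the ucomi
 * forms differ in that they do not signal an invalid-operation exception on
 * quiet NaNs. A sketch, assuming __m128 values a and b:
 *
 *   if (_mm_comilt_ss(a, b)) {
 *     // taken when a[0] < b[0]
 *   }
 */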

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

#ifdef __x86_64__

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static inline float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}
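
/* _mm_cvtss_si32 and friends round according to the current MXCSR rounding
 * mode (round-to-nearest-even by default), while the _mm_cvttss_* forms,
 * implemented here with C's float-to-int conversion, truncate toward zero.
 * A sketch:
 *
 *   __m128 v = _mm_set_ss(2.75f);
 *   int r = _mm_cvtss_si32(v);    // 3 under the default rounding mode
 *   int t = _mm_cvttss_si32(v);   // 2: truncation
 */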

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(const float*)p;
  b[1] = *((const float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(const float*)p;
  b[1] = *((const float*)p+1);
  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(const __m128*)p;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  return __builtin_ia32_loadups(p);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}
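
/* _mm_load_ps requires p to be 16-byte aligned; _mm_loadu_ps accepts any
 * alignment at some performance cost. A sketch with a hypothetical buffer:
 *
 *   float buf[4] __attribute__((aligned(16))) = { 1, 2, 3, 4 };
 *   __m128 v = _mm_load_ps(buf);    // {1, 2, 3, 4}
 *   __m128 r = _mm_loadr_ps(buf);   // {4, 3, 2, 1}: element order reversed
 */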

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

/* Microsoft specific. */
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
  return _mm_set1_ps(w);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}
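
/* _mm_set_ps takes its arguments from the highest element down, so the last
 * argument lands in element 0; _mm_setr_ps takes them in element (memory)
 * order. A sketch showing that the two calls build the same vector:
 *
 *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);    // elements {1, 2, 3, 4}
 *   __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);   // elements {1, 2, 3, 4}
 */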

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_ss(float *p, __m128 a)
{
  *p = a[0];
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}
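
/* Mirroring the loads, _mm_store_ps requires a 16-byte aligned pointer while
 * _mm_storeu_ps does not. A sketch, assuming v is some __m128 value:
 *
 *   float out[4] __attribute__((aligned(16)));
 *   _mm_store_ps(out, v);    // out[i] = v[i]
 *   _mm_storer_ps(out, v);   // out[i] = v[3 - i]
 */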

#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
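
/* Because sel must be a compile-time constant, pass one of the _MM_HINT_*
 * macros directly. A sketch, assuming p points into an array that is being
 * streamed through once:
 *
 *   _mm_prefetch((char *)p + 64, _MM_HINT_NTA);   // hint: minimize caching
 */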

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  __v4hi b = (__v4hi)a;
  b[n & 3] = d;
  return (__m64)b;
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \
                                  (n) & 0x3, ((n) & 0xc) >> 2, \
                                  ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6))

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static inline unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static inline void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector(a, b, (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))
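
/* The shuffle mask encodes one source element index per two bits, low result
 * element first; the low two result elements come from a and the high two
 * from b. A sketch, assuming __m128 values a and b:
 *
 *   __m128 lo = _mm_shuffle_ps(a, b, 0x44);   // {a[0], a[1], b[0], b[1]}
 */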

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);
  c = _mm_cvtps_pi32(a);

  /* Pack the two pairs of 32-bit results down to four 16-bit values. */
  return _mm_packs_pi32(b, c);
}

static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
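
/* _MM_SHUFFLE packs four 2-bit element indices into a shuffle mask, highest
 * result element first. _MM_SHUFFLE(1, 0, 1, 0) == 0x44, so this sketch is
 * equivalent to the _mm_shuffle_ps example above:
 *
 *   __m128 lo = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 1, 0));
 */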

#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
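
/* These macros read-modify-write the MXCSR control/status register through
 * _mm_getcsr and _mm_setcsr. A sketch that enables flush-to-zero and selects
 * truncating rounding for the calling thread:
 *
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   unsigned mode = _MM_GET_ROUNDING_MODE();  // now _MM_ROUND_TOWARD_ZERO
 */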

#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
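
/* _MM_TRANSPOSE4_PS transposes a 4x4 matrix held in four row vectors in
 * place, built from the unpack/movelh/movehl primitives above. A sketch,
 * assuming row0..row3 already hold the rows of the matrix:
 *
 *   _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
 *   // row0 now holds the original column 0, row1 column 1, and so on.
 */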

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */