xmmintrin.h revision 205408
1193326Sed/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2193326Sed *
3193326Sed * Permission is hereby granted, free of charge, to any person obtaining a copy
4193326Sed * of this software and associated documentation files (the "Software"), to deal
5193326Sed * in the Software without restriction, including without limitation the rights
6193326Sed * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7193326Sed * copies of the Software, and to permit persons to whom the Software is
8193326Sed * furnished to do so, subject to the following conditions:
9193326Sed *
10193326Sed * The above copyright notice and this permission notice shall be included in
11193326Sed * all copies or substantial portions of the Software.
12193326Sed *
13193326Sed * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14193326Sed * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15193326Sed * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16193326Sed * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17193326Sed * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18193326Sed * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19193326Sed * THE SOFTWARE.
20193326Sed *
21193326Sed *===-----------------------------------------------------------------------===
22193326Sed */
23193326Sed
24193326Sed#ifndef __XMMINTRIN_H
25193326Sed#define __XMMINTRIN_H
26193326Sed
27193326Sed#ifndef __SSE__
28193326Sed#error "SSE instruction set not enabled"
29193326Sed#else
30193326Sed
31193326Sed#include <mmintrin.h>
32193326Sed
/* 16-byte vector types used throughout this header: __m128 is the public
   packed-float type, __v4sf and __v4si are the internal float/int views. */
typedef float __m128 __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
36193326Sed
37193326Sed#include <mm_malloc.h>
38193326Sed
39193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
40193326Sed_mm_add_ss(__m128 a, __m128 b)
41193326Sed{
42193576Sed  a[0] += b[0];
43193576Sed  return a;
44193326Sed}
45193326Sed
46193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
47193326Sed_mm_add_ps(__m128 a, __m128 b)
48193326Sed{
49193326Sed  return a + b;
50193326Sed}
51193326Sed
52193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
53193326Sed_mm_sub_ss(__m128 a, __m128 b)
54193326Sed{
55193576Sed  a[0] -= b[0];
56193576Sed  return a;
57193326Sed}
58193326Sed
59193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
60193326Sed_mm_sub_ps(__m128 a, __m128 b)
61193326Sed{
62193326Sed  return a - b;
63193326Sed}
64193326Sed
65193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
66193326Sed_mm_mul_ss(__m128 a, __m128 b)
67193326Sed{
68193576Sed  a[0] *= b[0];
69193576Sed  return a;
70193326Sed}
71193326Sed
72193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
73193326Sed_mm_mul_ps(__m128 a, __m128 b)
74193326Sed{
75193326Sed  return a * b;
76193326Sed}
77193326Sed
78193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
79193326Sed_mm_div_ss(__m128 a, __m128 b)
80193326Sed{
81193576Sed  a[0] /= b[0];
82193576Sed  return a;
83193326Sed}
84193326Sed
85193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
86193326Sed_mm_div_ps(__m128 a, __m128 b)
87193326Sed{
88193326Sed  return a / b;
89193326Sed}
90193326Sed
91193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
92193326Sed_mm_sqrt_ss(__m128 a)
93193326Sed{
94193326Sed  return __builtin_ia32_sqrtss(a);
95193326Sed}
96193326Sed
97193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
98193326Sed_mm_sqrt_ps(__m128 a)
99193326Sed{
100193326Sed  return __builtin_ia32_sqrtps(a);
101193326Sed}
102193326Sed
103193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
104193326Sed_mm_rcp_ss(__m128 a)
105193326Sed{
106193326Sed  return __builtin_ia32_rcpss(a);
107193326Sed}
108193326Sed
109193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
110193326Sed_mm_rcp_ps(__m128 a)
111193326Sed{
112193326Sed  return __builtin_ia32_rcpps(a);
113193326Sed}
114193326Sed
115193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
116193326Sed_mm_rsqrt_ss(__m128 a)
117193326Sed{
118193326Sed  return __builtin_ia32_rsqrtss(a);
119193326Sed}
120193326Sed
121193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
122193326Sed_mm_rsqrt_ps(__m128 a)
123193326Sed{
124193326Sed  return __builtin_ia32_rsqrtps(a);
125193326Sed}
126193326Sed
127193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
128193326Sed_mm_min_ss(__m128 a, __m128 b)
129193326Sed{
130193326Sed  return __builtin_ia32_minss(a, b);
131193326Sed}
132193326Sed
133193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
134193326Sed_mm_min_ps(__m128 a, __m128 b)
135193326Sed{
136193326Sed  return __builtin_ia32_minps(a, b);
137193326Sed}
138193326Sed
139193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
140193326Sed_mm_max_ss(__m128 a, __m128 b)
141193326Sed{
142193326Sed  return __builtin_ia32_maxss(a, b);
143193326Sed}
144193326Sed
145193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
146193326Sed_mm_max_ps(__m128 a, __m128 b)
147193326Sed{
148193326Sed  return __builtin_ia32_maxps(a, b);
149193326Sed}
150193326Sed
151193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
152193326Sed_mm_and_ps(__m128 a, __m128 b)
153193326Sed{
154193576Sed  return (__m128)((__v4si)a & (__v4si)b);
155193326Sed}
156193326Sed
157193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
158193326Sed_mm_andnot_ps(__m128 a, __m128 b)
159193326Sed{
160193576Sed  return (__m128)(~(__v4si)a & (__v4si)b);
161193326Sed}
162193326Sed
163193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
164193326Sed_mm_or_ps(__m128 a, __m128 b)
165193326Sed{
166193576Sed  return (__m128)((__v4si)a | (__v4si)b);
167193326Sed}
168193326Sed
169193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
170193326Sed_mm_xor_ps(__m128 a, __m128 b)
171193326Sed{
172202379Srdivacky  return (__m128)((__v4si)a ^ (__v4si)b);
173193326Sed}
174193326Sed
175193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
176193326Sed_mm_cmpeq_ss(__m128 a, __m128 b)
177193326Sed{
178193326Sed  return (__m128)__builtin_ia32_cmpss(a, b, 0);
179193326Sed}
180193326Sed
181193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
182193326Sed_mm_cmpeq_ps(__m128 a, __m128 b)
183193326Sed{
184193326Sed  return (__m128)__builtin_ia32_cmpps(a, b, 0);
185193326Sed}
186193326Sed
187193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
188193326Sed_mm_cmplt_ss(__m128 a, __m128 b)
189193326Sed{
190193326Sed  return (__m128)__builtin_ia32_cmpss(a, b, 1);
191193326Sed}
192193326Sed
193193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
194193326Sed_mm_cmplt_ps(__m128 a, __m128 b)
195193326Sed{
196193326Sed  return (__m128)__builtin_ia32_cmpps(a, b, 1);
197193326Sed}
198193326Sed
199193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
200193326Sed_mm_cmple_ss(__m128 a, __m128 b)
201193326Sed{
202193326Sed  return (__m128)__builtin_ia32_cmpss(a, b, 2);
203193326Sed}
204193326Sed
205193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
206193326Sed_mm_cmple_ps(__m128 a, __m128 b)
207193326Sed{
208193326Sed  return (__m128)__builtin_ia32_cmpps(a, b, 2);
209193326Sed}
210193326Sed
211193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
212193326Sed_mm_cmpgt_ss(__m128 a, __m128 b)
213193326Sed{
214193326Sed  return (__m128)__builtin_ia32_cmpss(b, a, 1);
215193326Sed}
216193326Sed
217193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
218193326Sed_mm_cmpgt_ps(__m128 a, __m128 b)
219193326Sed{
220193326Sed  return (__m128)__builtin_ia32_cmpps(b, a, 1);
221193326Sed}
222193326Sed
223193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
224193326Sed_mm_cmpge_ss(__m128 a, __m128 b)
225193326Sed{
226193326Sed  return (__m128)__builtin_ia32_cmpss(b, a, 2);
227193326Sed}
228193326Sed
229193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
230193326Sed_mm_cmpge_ps(__m128 a, __m128 b)
231193326Sed{
232193326Sed  return (__m128)__builtin_ia32_cmpps(b, a, 2);
233193326Sed}
234193326Sed
235193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
236193326Sed_mm_cmpneq_ss(__m128 a, __m128 b)
237193326Sed{
238193326Sed  return (__m128)__builtin_ia32_cmpss(a, b, 4);
239193326Sed}
240193326Sed
241193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
242193326Sed_mm_cmpneq_ps(__m128 a, __m128 b)
243193326Sed{
244193326Sed  return (__m128)__builtin_ia32_cmpps(a, b, 4);
245193326Sed}
246193326Sed
247193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
248193326Sed_mm_cmpnlt_ss(__m128 a, __m128 b)
249193326Sed{
250193326Sed  return (__m128)__builtin_ia32_cmpss(a, b, 5);
251193326Sed}
252193326Sed
253193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
254193326Sed_mm_cmpnlt_ps(__m128 a, __m128 b)
255193326Sed{
256193326Sed  return (__m128)__builtin_ia32_cmpps(a, b, 5);
257193326Sed}
258193326Sed
259193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
260193326Sed_mm_cmpnle_ss(__m128 a, __m128 b)
261193326Sed{
262193326Sed  return (__m128)__builtin_ia32_cmpss(a, b, 6);
263193326Sed}
264193326Sed
265193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
266193326Sed_mm_cmpnle_ps(__m128 a, __m128 b)
267193326Sed{
268193326Sed  return (__m128)__builtin_ia32_cmpps(a, b, 6);
269193326Sed}
270193326Sed
271193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
272193326Sed_mm_cmpngt_ss(__m128 a, __m128 b)
273193326Sed{
274193326Sed  return (__m128)__builtin_ia32_cmpss(b, a, 5);
275193326Sed}
276193326Sed
277193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
278193326Sed_mm_cmpngt_ps(__m128 a, __m128 b)
279193326Sed{
280193326Sed  return (__m128)__builtin_ia32_cmpps(b, a, 5);
281193326Sed}
282193326Sed
283193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
284193326Sed_mm_cmpnge_ss(__m128 a, __m128 b)
285193326Sed{
286193326Sed  return (__m128)__builtin_ia32_cmpss(b, a, 6);
287193326Sed}
288193326Sed
289193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
290193326Sed_mm_cmpnge_ps(__m128 a, __m128 b)
291193326Sed{
292193326Sed  return (__m128)__builtin_ia32_cmpps(b, a, 6);
293193326Sed}
294193326Sed
295193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
296193326Sed_mm_cmpord_ss(__m128 a, __m128 b)
297193326Sed{
298193326Sed  return (__m128)__builtin_ia32_cmpss(a, b, 7);
299193326Sed}
300193326Sed
301193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
302193326Sed_mm_cmpord_ps(__m128 a, __m128 b)
303193326Sed{
304193326Sed  return (__m128)__builtin_ia32_cmpps(a, b, 7);
305193326Sed}
306193326Sed
307193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
308193326Sed_mm_cmpunord_ss(__m128 a, __m128 b)
309193326Sed{
310193326Sed  return (__m128)__builtin_ia32_cmpss(a, b, 3);
311193326Sed}
312193326Sed
313193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
314193326Sed_mm_cmpunord_ps(__m128 a, __m128 b)
315193326Sed{
316193326Sed  return (__m128)__builtin_ia32_cmpps(a, b, 3);
317193326Sed}
318193326Sed
319193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
320193326Sed_mm_comieq_ss(__m128 a, __m128 b)
321193326Sed{
322193326Sed  return __builtin_ia32_comieq(a, b);
323193326Sed}
324193326Sed
325193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
326193326Sed_mm_comilt_ss(__m128 a, __m128 b)
327193326Sed{
328193326Sed  return __builtin_ia32_comilt(a, b);
329193326Sed}
330193326Sed
331193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
332193326Sed_mm_comile_ss(__m128 a, __m128 b)
333193326Sed{
334193326Sed  return __builtin_ia32_comile(a, b);
335193326Sed}
336193326Sed
337193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
338193326Sed_mm_comigt_ss(__m128 a, __m128 b)
339193326Sed{
340193326Sed  return __builtin_ia32_comigt(a, b);
341193326Sed}
342193326Sed
343193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
344193326Sed_mm_comige_ss(__m128 a, __m128 b)
345193326Sed{
346193326Sed  return __builtin_ia32_comige(a, b);
347193326Sed}
348193326Sed
349193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
350193326Sed_mm_comineq_ss(__m128 a, __m128 b)
351193326Sed{
352193326Sed  return __builtin_ia32_comineq(a, b);
353193326Sed}
354193326Sed
355193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
356193326Sed_mm_ucomieq_ss(__m128 a, __m128 b)
357193326Sed{
358193326Sed  return __builtin_ia32_ucomieq(a, b);
359193326Sed}
360193326Sed
361193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
362193326Sed_mm_ucomilt_ss(__m128 a, __m128 b)
363193326Sed{
364193326Sed  return __builtin_ia32_ucomilt(a, b);
365193326Sed}
366193326Sed
367193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
368193326Sed_mm_ucomile_ss(__m128 a, __m128 b)
369193326Sed{
370193326Sed  return __builtin_ia32_ucomile(a, b);
371193326Sed}
372193326Sed
373193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
374193326Sed_mm_ucomigt_ss(__m128 a, __m128 b)
375193326Sed{
376193326Sed  return __builtin_ia32_ucomigt(a, b);
377193326Sed}
378193326Sed
379193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
380193326Sed_mm_ucomige_ss(__m128 a, __m128 b)
381193326Sed{
382193326Sed  return __builtin_ia32_ucomige(a, b);
383193326Sed}
384193326Sed
385193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
386193326Sed_mm_ucomineq_ss(__m128 a, __m128 b)
387193326Sed{
388193326Sed  return __builtin_ia32_ucomineq(a, b);
389193326Sed}
390193326Sed
391193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
392193326Sed_mm_cvtss_si32(__m128 a)
393193326Sed{
394193326Sed  return __builtin_ia32_cvtss2si(a);
395193326Sed}
396193326Sed
397204643Srdivackystatic inline int __attribute__((__always_inline__, __nodebug__))
398204643Srdivacky_mm_cvt_ss2si(__m128 a)
399204643Srdivacky{
400204643Srdivacky  return _mm_cvtss_si32(a);
401204643Srdivacky}
402204643Srdivacky
403193576Sed#ifdef __x86_64__
404193576Sed
405193326Sedstatic inline long long __attribute__((__always_inline__, __nodebug__))
406193326Sed_mm_cvtss_si64(__m128 a)
407193326Sed{
408193326Sed  return __builtin_ia32_cvtss2si64(a);
409193326Sed}
410193326Sed
411193576Sed#endif
412193576Sed
413193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
414193326Sed_mm_cvtps_pi32(__m128 a)
415193326Sed{
416193326Sed  return (__m64)__builtin_ia32_cvtps2pi(a);
417193326Sed}
418193326Sed
419193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
420193326Sed_mm_cvttss_si32(__m128 a)
421193326Sed{
422193576Sed  return a[0];
423193326Sed}
424193326Sed
425204643Srdivackystatic inline int __attribute__((__always_inline__, __nodebug__))
426204643Srdivacky_mm_cvtt_ss2si(__m128 a)
427204643Srdivacky{
428204643Srdivacky  return _mm_cvttss_si32(a);
429204643Srdivacky}
430204643Srdivacky
431193326Sedstatic inline long long __attribute__((__always_inline__, __nodebug__))
432193326Sed_mm_cvttss_si64(__m128 a)
433193326Sed{
434193576Sed  return a[0];
435193326Sed}
436193326Sed
437193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
438193326Sed_mm_cvttps_pi32(__m128 a)
439193326Sed{
440193326Sed  return (__m64)__builtin_ia32_cvttps2pi(a);
441193326Sed}
442193326Sed
443193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
444193326Sed_mm_cvtsi32_ss(__m128 a, int b)
445193326Sed{
446193576Sed  a[0] = b;
447193576Sed  return a;
448193326Sed}
449193326Sed
450193326Sed#ifdef __x86_64__
451193326Sed
452193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
453193326Sed_mm_cvtsi64_ss(__m128 a, long long b)
454193326Sed{
455193576Sed  a[0] = b;
456193576Sed  return a;
457193326Sed}
458193326Sed
459193326Sed#endif
460193326Sed
461193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
462193326Sed_mm_cvtpi32_ps(__m128 a, __m64 b)
463193326Sed{
464193326Sed  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
465193326Sed}
466193326Sed
467193326Sedstatic inline float __attribute__((__always_inline__, __nodebug__))
468193326Sed_mm_cvtss_f32(__m128 a)
469193326Sed{
470193326Sed  return a[0];
471193326Sed}
472193326Sed
473193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
474203955Srdivacky_mm_loadh_pi(__m128 a, const __m64 *p)
475193326Sed{
476193631Sed  __m128 b;
477193631Sed  b[0] = *(float*)p;
478193631Sed  b[1] = *((float*)p+1);
479193631Sed  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
480193326Sed}
481193326Sed
482193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
483203955Srdivacky_mm_loadl_pi(__m128 a, const __m64 *p)
484193326Sed{
485193576Sed  __m128 b;
486193576Sed  b[0] = *(float*)p;
487193576Sed  b[1] = *((float*)p+1);
488193631Sed  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
489193326Sed}
490193326Sed
491193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
492203955Srdivacky_mm_load_ss(const float *p)
493193326Sed{
494193326Sed  return (__m128){ *p, 0, 0, 0 };
495193326Sed}
496193326Sed
497193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
498203955Srdivacky_mm_load1_ps(const float *p)
499193326Sed{
500193326Sed  return (__m128){ *p, *p, *p, *p };
501193326Sed}
502193326Sed
/* Alternate spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
504193326Sed
505193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
506203955Srdivacky_mm_load_ps(const float *p)
507193326Sed{
508193326Sed  return *(__m128*)p;
509193326Sed}
510193326Sed
511193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
512203955Srdivacky_mm_loadu_ps(const float *p)
513193326Sed{
514193326Sed  return __builtin_ia32_loadups(p);
515193326Sed}
516193326Sed
517193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
518203955Srdivacky_mm_loadr_ps(const float *p)
519193326Sed{
520193326Sed  __m128 a = _mm_load_ps(p);
521193326Sed  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
522193326Sed}
523193326Sed
524193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
525193326Sed_mm_set_ss(float w)
526193326Sed{
527193326Sed  return (__m128){ w, 0, 0, 0 };
528193326Sed}
529193326Sed
530193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
531193326Sed_mm_set1_ps(float w)
532193326Sed{
533193326Sed  return (__m128){ w, w, w, w };
534193326Sed}
535193326Sed
536193326Sed// Microsoft specific.
537193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
538193326Sed_mm_set_ps1(float w)
539193326Sed{
540193326Sed    return _mm_set1_ps(w);
541193326Sed}
542193326Sed
543193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
544193326Sed_mm_set_ps(float z, float y, float x, float w)
545193326Sed{
546193326Sed  return (__m128){ w, x, y, z };
547193326Sed}
548193326Sed
549193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
550193326Sed_mm_setr_ps(float z, float y, float x, float w)
551193326Sed{
552193326Sed  return (__m128){ z, y, x, w };
553193326Sed}
554193326Sed
555193326Sedstatic inline __m128 __attribute__((__always_inline__))
556193326Sed_mm_setzero_ps(void)
557193326Sed{
558193326Sed  return (__m128){ 0, 0, 0, 0 };
559193326Sed}
560193326Sed
561193326Sedstatic inline void __attribute__((__always_inline__))
562193326Sed_mm_storeh_pi(__m64 *p, __m128 a)
563193326Sed{
564193326Sed  __builtin_ia32_storehps((__v2si *)p, a);
565193326Sed}
566193326Sed
567193326Sedstatic inline void __attribute__((__always_inline__))
568193326Sed_mm_storel_pi(__m64 *p, __m128 a)
569193326Sed{
570193326Sed  __builtin_ia32_storelps((__v2si *)p, a);
571193326Sed}
572193326Sed
573193326Sedstatic inline void __attribute__((__always_inline__))
574193326Sed_mm_store_ss(float *p, __m128 a)
575193326Sed{
576193326Sed  *p = a[0];
577193326Sed}
578193326Sed
579193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__))
580193326Sed_mm_storeu_ps(float *p, __m128 a)
581193326Sed{
582193326Sed  __builtin_ia32_storeups(p, a);
583193326Sed}
584193326Sed
585193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__))
586193326Sed_mm_store1_ps(float *p, __m128 a)
587193326Sed{
588193326Sed  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
589193326Sed  _mm_storeu_ps(p, a);
590193326Sed}
591193326Sed
592193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__))
593193326Sed_mm_store_ps(float *p, __m128 a)
594193326Sed{
595193326Sed  *(__m128 *)p = a;
596193326Sed}
597193326Sed
598193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__))
599193326Sed_mm_storer_ps(float *p, __m128 a)
600193326Sed{
601193326Sed  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
602193326Sed  _mm_store_ps(p, a);
603193326Sed}
604193326Sed
/* Prefetch hints. The value is passed straight through as the "locality"
   argument of __builtin_prefetch, where 3 means "keep in all cache levels"
   and 0 means "no temporal locality", so T0 must be the highest value. */
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/* Read prefetch of address a with locality hint sel. Both arguments are
   parenthesized so expressions such as (p + 1) are cast as a whole, and the
   cast keeps const so const-qualified pointers are accepted cleanly. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), 0, (sel)))
614193326Sed
615193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__))
616193326Sed_mm_stream_pi(__m64 *p, __m64 a)
617193326Sed{
618193326Sed  __builtin_ia32_movntq(p, a);
619193326Sed}
620193326Sed
621193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__))
622193326Sed_mm_stream_ps(float *p, __m128 a)
623193326Sed{
624193326Sed  __builtin_ia32_movntps(p, a);
625193326Sed}
626193326Sed
/* Issues the sfence builtin; takes no arguments and returns nothing. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  (void)(__builtin_ia32_sfence(), 0);
}
632193326Sed
633193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
634193326Sed_mm_extract_pi16(__m64 a, int n)
635193326Sed{
636193326Sed  __v4hi b = (__v4hi)a;
637193576Sed  return (unsigned short)b[n & 3];
638193326Sed}
639193326Sed
640193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
641193326Sed_mm_insert_pi16(__m64 a, int d, int n)
642193326Sed{
643193576Sed   __v4hi b = (__v4hi)a;
644193576Sed   b[n & 3] = d;
645193576Sed   return (__m64)b;
646193326Sed}
647193326Sed
648193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
649193326Sed_mm_max_pi16(__m64 a, __m64 b)
650193326Sed{
651193326Sed  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
652193326Sed}
653193326Sed
654193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
655193326Sed_mm_max_pu8(__m64 a, __m64 b)
656193326Sed{
657193326Sed  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
658193326Sed}
659193326Sed
660193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
661193326Sed_mm_min_pi16(__m64 a, __m64 b)
662193326Sed{
663193326Sed  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
664193326Sed}
665193326Sed
666193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
667193326Sed_mm_min_pu8(__m64 a, __m64 b)
668193326Sed{
669193326Sed  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
670193326Sed}
671193326Sed
672193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
673193326Sed_mm_movemask_pi8(__m64 a)
674193326Sed{
675193326Sed  return __builtin_ia32_pmovmskb((__v8qi)a);
676193326Sed}
677193326Sed
678193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
679193326Sed_mm_mulhi_pu16(__m64 a, __m64 b)
680193326Sed{
681193326Sed  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
682193326Sed}
683193326Sed
/* Shuffles the four 16-bit lanes of (a); the four lane indices are the
   successive 2-bit fields of n (bits 1:0, 3:2, 5:4 and 7:6). This is a macro
   because the indices must be compile-time constants. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_shufflevector( \
      (__v4hi)(a), (__v4hi) {0}, \
      (n) & 0x3, \
      ((n) & 0xc) >> 2, \
      ((n) & 0x30) >> 4, \
      ((n) & 0xc0) >> 6))
688193326Sed
689193326Sedstatic inline void __attribute__((__always_inline__, __nodebug__))
690193326Sed_mm_maskmove_si64(__m64 d, __m64 n, char *p)
691193326Sed{
692193326Sed  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
693193326Sed}
694193326Sed
695193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
696193326Sed_mm_avg_pu8(__m64 a, __m64 b)
697193326Sed{
698193326Sed  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
699193326Sed}
700193326Sed
701193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
702193326Sed_mm_avg_pu16(__m64 a, __m64 b)
703193326Sed{
704193326Sed  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
705193326Sed}
706193326Sed
707193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
708193326Sed_mm_sad_pu8(__m64 a, __m64 b)
709193326Sed{
710193326Sed  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
711193326Sed}
712193326Sed
/* Returns the value produced by the stmxcsr builtin (the control/status
   word that the _MM_GET_* macros below mask). */
static inline unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  unsigned int csr = __builtin_ia32_stmxcsr();
  return csr;
}
718193326Sed
/* Passes i to the ldmxcsr builtin (the control/status word that the
   _MM_SET_* macros below rewrite). */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  unsigned int csr = i;
  __builtin_ia32_ldmxcsr(csr);
}
724193326Sed
/* Shuffle of a and b: result elements 0 and 1 are picked from a by mask
   bits 1:0 and 3:2; elements 2 and 3 are picked from b by mask bits 5:4 and
   7:6 (hence the "+ 4" to index into the second operand). This is a macro
   because the indices must be compile-time constants. */
#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector(a, b, \
                                 (mask) & 0x3, \
                                 ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))
729193326Sed
730193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
731193326Sed_mm_unpackhi_ps(__m128 a, __m128 b)
732193326Sed{
733193326Sed  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
734193326Sed}
735193326Sed
736193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
737193326Sed_mm_unpacklo_ps(__m128 a, __m128 b)
738193326Sed{
739193326Sed  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
740193326Sed}
741193326Sed
742193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
743193326Sed_mm_move_ss(__m128 a, __m128 b)
744193326Sed{
745193326Sed  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
746193326Sed}
747193326Sed
748193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
749193326Sed_mm_movehl_ps(__m128 a, __m128 b)
750193326Sed{
751193326Sed  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
752193326Sed}
753193326Sed
754193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
755193326Sed_mm_movelh_ps(__m128 a, __m128 b)
756193326Sed{
757193326Sed  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
758193326Sed}
759193326Sed
760193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
761193326Sed_mm_cvtpi16_ps(__m64 a)
762193326Sed{
763193326Sed  __m64 b, c;
764193326Sed  __m128 r;
765193326Sed
766193326Sed  b = _mm_setzero_si64();
767193326Sed  b = _mm_cmpgt_pi16(b, a);
768193326Sed  c = _mm_unpackhi_pi16(a, b);
769193326Sed  r = _mm_setzero_ps();
770193326Sed  r = _mm_cvtpi32_ps(r, c);
771193326Sed  r = _mm_movelh_ps(r, r);
772193326Sed  c = _mm_unpacklo_pi16(a, b);
773193326Sed  r = _mm_cvtpi32_ps(r, c);
774193326Sed
775193326Sed  return r;
776193326Sed}
777193326Sed
778193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
779193326Sed_mm_cvtpu16_ps(__m64 a)
780193326Sed{
781193326Sed  __m64 b, c;
782193326Sed  __m128 r;
783193326Sed
784193326Sed  b = _mm_setzero_si64();
785193326Sed  c = _mm_unpackhi_pi16(a, b);
786193326Sed  r = _mm_setzero_ps();
787193326Sed  r = _mm_cvtpi32_ps(r, c);
788193326Sed  r = _mm_movelh_ps(r, r);
789193326Sed  c = _mm_unpacklo_pi16(a, b);
790193326Sed  r = _mm_cvtpi32_ps(r, c);
791193326Sed
792193326Sed  return r;
793193326Sed}
794193326Sed
795193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
796193326Sed_mm_cvtpi8_ps(__m64 a)
797193326Sed{
798193326Sed  __m64 b;
799193326Sed
800193326Sed  b = _mm_setzero_si64();
801193326Sed  b = _mm_cmpgt_pi8(b, a);
802193326Sed  b = _mm_unpacklo_pi8(a, b);
803193326Sed
804193326Sed  return _mm_cvtpi16_ps(b);
805193326Sed}
806193326Sed
807193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
808193326Sed_mm_cvtpu8_ps(__m64 a)
809193326Sed{
810193326Sed  __m64 b;
811193326Sed
812193326Sed  b = _mm_setzero_si64();
813193326Sed  b = _mm_unpacklo_pi8(a, b);
814193326Sed
815193326Sed  return _mm_cvtpi16_ps(b);
816193326Sed}
817193326Sed
818193326Sedstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
819193326Sed_mm_cvtpi32x2_ps(__m64 a, __m64 b)
820193326Sed{
821193326Sed  __m128 c;
822193326Sed
823193326Sed  c = _mm_setzero_ps();
824193326Sed  c = _mm_cvtpi32_ps(c, b);
825193326Sed  c = _mm_movelh_ps(c, c);
826193326Sed
827193326Sed  return _mm_cvtpi32_ps(c, a);
828193326Sed}
829193326Sed
830193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
831193326Sed_mm_cvtps_pi16(__m128 a)
832193326Sed{
833193326Sed  __m64 b, c;
834193326Sed
835193326Sed  b = _mm_cvtps_pi32(a);
836193326Sed  a = _mm_movehl_ps(a, a);
837193326Sed  c = _mm_cvtps_pi32(a);
838193326Sed
839193326Sed  return _mm_packs_pi16(b, c);
840193326Sed}
841193326Sed
842193326Sedstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
843193326Sed_mm_cvtps_pi8(__m128 a)
844193326Sed{
845193326Sed  __m64 b, c;
846193326Sed
847193326Sed  b = _mm_cvtps_pi16(a);
848193326Sed  c = _mm_setzero_si64();
849193326Sed
850193326Sed  return _mm_packs_pi16(b, c);
851193326Sed}
852193326Sed
853193326Sedstatic inline int __attribute__((__always_inline__, __nodebug__))
854193326Sed_mm_movemask_ps(__m128 a)
855193326Sed{
856193326Sed  return __builtin_ia32_movmskps(a);
857193326Sed}
858193326Sed
/* Packs four 2-bit selectors into the 8-bit immediate used by the shuffle
   macros; w lands in bits 1:0 and z in bits 7:6. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* Exception state flags (bits 0..5 of the control/status word). */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* Exception mask bits (bits 7..12). */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* Rounding mode field (bits 13..14). */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

/* Flush-to-zero bit (bit 15). OFF must be 0 so that
   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) actually clears the bit. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
886193326Sed
/* Readers: return _mm_getcsr() masked down to one field. */
#define _MM_GET_EXCEPTION_MASK() \
  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() \
  (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() \
  (_mm_getcsr() & _MM_ROUND_MASK)

/* Writers: read the current word, clear one field, OR in (x) and write the
   result back with _mm_setcsr. */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
896193326Sed
/* In-place transpose of the 4x4 float matrix whose rows are row0..row3.
   The rows are first interleaved pairwise (unpacklo/unpackhi), then the
   64-bit halves are recombined (movelh/movehl). The arguments must be
   assignable lvalues; each is evaluated more than once. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3; \
  __m128 tmp2; \
  __m128 tmp1; \
  __m128 tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
909193326Sed
910194179Sed/* Ugly hack for backwards-compatibility (compatible with gcc) */
911194179Sed#ifdef __SSE2__
912193326Sed#include <emmintrin.h>
913194179Sed#endif
914193326Sed
915193326Sed#endif /* __SSE__ */
916193326Sed
917193326Sed#endif /* __XMMINTRIN_H */
918