/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

// This header should only be included in a hosted environment as it depends on
// a standard library to provide allocation routines.
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

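/* Editor's note: an illustrative sketch only, assuming a hosted build where
 * <mm_malloc.h> provides _mm_malloc/_mm_free for obtaining 16-byte-aligned
 * storage suitable for the aligned __m128 loads and stores declared below;
 * nothing in this snippet is defined by this header itself.
 *
 *   float *buf = _mm_malloc(64 * sizeof(float), 16);   // 16-byte aligned
 *   if (buf) {
 *     _mm_store_ps(buf, _mm_set1_ps(1.0f));            // aligned store is safe
 *     _mm_free(buf);
 *   }
 */
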
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  /* The compare runs with the operands swapped, so move its low element back
     into a to preserve the upper three elements of a, as the _ss form requires. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 1),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  /* See _mm_cmpgt_ss: preserve the upper three elements of a. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 2),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  /* See _mm_cmpgt_ss: preserve the upper three elements of a. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 5),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  /* See _mm_cmpgt_ss: preserve the upper three elements of a. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 6),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 a)
{
  return _mm_cvtps_pi32(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 a)
{
  return _mm_cvttps_pi32(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 a, int b)
{
  return _mm_cvtsi32_ss(a, b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 a, __m64 b)
{
  return _mm_cvtpi32_ps(a, b);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(__m128*)p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  return __builtin_ia32_loadups(p);
}

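/* Editor's note: a minimal usage sketch of the two load forms above.
 * _mm_load_ps dereferences p as an aligned __m128, so p must be 16-byte
 * aligned; _mm_loadu_ps goes through the unaligned-load builtin and accepts
 * any address.
 *
 *   float tmp[5] = { 1, 2, 3, 4, 5 };
 *   __m128 v0 = _mm_loadu_ps(tmp + 1);   // unaligned address is fine here
 *   __m128 v1 = _mm_load_ps(tmp);        // only valid if tmp is 16-byte aligned
 */
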
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
  return _mm_set1_ps(w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}

static __inline__ __m128 __attribute__((__always_inline__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

static __inline__ void __attribute__((__always_inline__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_store_ss(float *p, __m128 a)
{
  *p = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *p, __m128 a)
{
  _mm_store1_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}

#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, sel))

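/* Editor's note: an illustrative sketch of the macro above. Because "sel" is
 * substituted directly into __builtin_prefetch, it must be one of the
 * _MM_HINT_* constants; the helper function below is hypothetical.
 *
 *   static void warm(const float *data) {
 *     _mm_prefetch(data, _MM_HINT_T0);        // pull into all cache levels
 *     _mm_prefetch(data + 16, _MM_HINT_NTA);  // next line, non-temporal hint
 *   }
 */
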
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  __v4hi b = (__v4hi)a;
  b[n & 3] = d;
  return (__m64)b;
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(a), (n)))

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b),                \
                                 (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))

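/* Editor's note: an illustrative sketch of the macro above. The low two bits
 * of "mask" pick an element of a for lane 0, the next two bits pick from a for
 * lane 1, and the upper four bits pick from b for lanes 2 and 3; the
 * _MM_SHUFFLE(z, y, x, w) helper defined further down builds such a mask.
 *
 *   __m128 lo = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);   // lanes 0..3 hold 0,1,2,3
 *   __m128 hi = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);   // lanes 0..3 hold 4,5,6,7
 *   __m128 r  = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1, 0, 3, 2));
 *   // r is { lo[2], lo[3], hi[0], hi[1] } == { 2, 3, 4, 5 }
 */
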
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);
  c = _mm_cvtps_pi32(a);

  /* b and c hold 32-bit results, so pack with the 32-to-16-bit form. */
  return _mm_packs_pi32(b, c);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

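/* Editor's note: a small sketch of how the packed compares and movemask above
 * compose; each mask bit is the sign bit of one lane, so a full match of all
 * four lanes yields 0xf. The helper function is hypothetical.
 *
 *   static int all_positive(__m128 v) {
 *     __m128 m = _mm_cmpgt_ps(v, _mm_setzero_ps());  // all-ones per true lane
 *     return _mm_movemask_ps(m) == 0xf;              // every lane compared true
 *   }
 */
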
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

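/* Editor's note: an illustrative use of the MXCSR helpers above. A sketch that
 * saves the control/status register, enables flush-to-zero around a
 * denormal-heavy computation, and then restores the previous state:
 *
 *   unsigned int saved = _mm_getcsr();
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 *   // ... computation that may produce denormals ...
 *   _mm_setcsr(saved);
 */
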
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)

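/* Editor's note: a brief sketch of the transpose macro above, which swaps the
 * row/column roles of a 4x4 matrix held in four __m128 rows, in place:
 *
 *   __m128 r0 = _mm_set_ps( 3,  2,  1,  0);
 *   __m128 r1 = _mm_set_ps( 7,  6,  5,  4);
 *   __m128 r2 = _mm_set_ps(11, 10,  9,  8);
 *   __m128 r3 = _mm_set_ps(15, 14, 13, 12);
 *   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
 *   // r0 is now { 0, 4, 8, 12 }, r1 is { 1, 5, 9, 13 }, and so on.
 */
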
/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */