xmmintrin.h revision 266674
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23193326Sed
24193326Sed#ifndef __XMMINTRIN_H
25193326Sed#define __XMMINTRIN_H
26193326Sed
27193326Sed#ifndef __SSE__
28193326Sed#error "SSE instruction set not enabled"
29193326Sed#else
30193326Sed
31193326Sed#include <mmintrin.h>
32193326Sed
/* 16-byte vector types built on the compiler's vector_size extension.
   __m128 is the type used in the intrinsic signatures; __v4sf and __v4si are
   the float- and int-element views used for casts inside the header. */
typedef float __m128 __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
36193326Sed
// <mm_malloc.h> should only be included in a hosted environment, as it depends
// on a standard library to provide allocation routines.
39218893Sdim#if __STDC_HOSTED__
40193326Sed#include <mm_malloc.h>
41218893Sdim#endif
42193326Sed
43206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
44249423Sdim_mm_add_ss(__m128 __a, __m128 __b)
45193326Sed{
46249423Sdim  __a[0] += __b[0];
47249423Sdim  return __a;
48193326Sed}
49193326Sed
50206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
51249423Sdim_mm_add_ps(__m128 __a, __m128 __b)
52193326Sed{
53249423Sdim  return __a + __b;
54193326Sed}
55193326Sed
56206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
57249423Sdim_mm_sub_ss(__m128 __a, __m128 __b)
58193326Sed{
59249423Sdim  __a[0] -= __b[0];
60249423Sdim  return __a;
61193326Sed}
62193326Sed
63206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
64249423Sdim_mm_sub_ps(__m128 __a, __m128 __b)
65193326Sed{
66249423Sdim  return __a - __b;
67193326Sed}
68193326Sed
69206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
70249423Sdim_mm_mul_ss(__m128 __a, __m128 __b)
71193326Sed{
72249423Sdim  __a[0] *= __b[0];
73249423Sdim  return __a;
74193326Sed}
75193326Sed
76206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
77249423Sdim_mm_mul_ps(__m128 __a, __m128 __b)
78193326Sed{
79249423Sdim  return __a * __b;
80193326Sed}
81193326Sed
82206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
83249423Sdim_mm_div_ss(__m128 __a, __m128 __b)
84193326Sed{
85249423Sdim  __a[0] /= __b[0];
86249423Sdim  return __a;
87193326Sed}
88193326Sed
89206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
90249423Sdim_mm_div_ps(__m128 __a, __m128 __b)
91193326Sed{
92249423Sdim  return __a / __b;
93193326Sed}
94193326Sed
95206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
96249423Sdim_mm_sqrt_ss(__m128 __a)
97193326Sed{
98249423Sdim  __m128 __c = __builtin_ia32_sqrtss(__a);
99249423Sdim  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
100193326Sed}
101193326Sed
102206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
103249423Sdim_mm_sqrt_ps(__m128 __a)
104193326Sed{
105249423Sdim  return __builtin_ia32_sqrtps(__a);
106193326Sed}
107193326Sed
108206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
109249423Sdim_mm_rcp_ss(__m128 __a)
110193326Sed{
111249423Sdim  __m128 __c = __builtin_ia32_rcpss(__a);
112249423Sdim  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
113193326Sed}
114193326Sed
115206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
116249423Sdim_mm_rcp_ps(__m128 __a)
117193326Sed{
118249423Sdim  return __builtin_ia32_rcpps(__a);
119193326Sed}
120193326Sed
121206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
122249423Sdim_mm_rsqrt_ss(__m128 __a)
123193326Sed{
124249423Sdim  __m128 __c = __builtin_ia32_rsqrtss(__a);
125249423Sdim  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
126193326Sed}
127193326Sed
128206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
129249423Sdim_mm_rsqrt_ps(__m128 __a)
130193326Sed{
131249423Sdim  return __builtin_ia32_rsqrtps(__a);
132193326Sed}
133193326Sed
134206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
135249423Sdim_mm_min_ss(__m128 __a, __m128 __b)
136193326Sed{
137249423Sdim  return __builtin_ia32_minss(__a, __b);
138193326Sed}
139193326Sed
140206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
141249423Sdim_mm_min_ps(__m128 __a, __m128 __b)
142193326Sed{
143249423Sdim  return __builtin_ia32_minps(__a, __b);
144193326Sed}
145193326Sed
146206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
147249423Sdim_mm_max_ss(__m128 __a, __m128 __b)
148193326Sed{
149249423Sdim  return __builtin_ia32_maxss(__a, __b);
150193326Sed}
151193326Sed
152206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
153249423Sdim_mm_max_ps(__m128 __a, __m128 __b)
154193326Sed{
155249423Sdim  return __builtin_ia32_maxps(__a, __b);
156193326Sed}
157193326Sed
158206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
159249423Sdim_mm_and_ps(__m128 __a, __m128 __b)
160193326Sed{
161249423Sdim  return (__m128)((__v4si)__a & (__v4si)__b);
162193326Sed}
163193326Sed
164206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
165249423Sdim_mm_andnot_ps(__m128 __a, __m128 __b)
166193326Sed{
167249423Sdim  return (__m128)(~(__v4si)__a & (__v4si)__b);
168193326Sed}
169193326Sed
170206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
171249423Sdim_mm_or_ps(__m128 __a, __m128 __b)
172193326Sed{
173249423Sdim  return (__m128)((__v4si)__a | (__v4si)__b);
174193326Sed}
175193326Sed
176206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
177249423Sdim_mm_xor_ps(__m128 __a, __m128 __b)
178193326Sed{
179249423Sdim  return (__m128)((__v4si)__a ^ (__v4si)__b);
180193326Sed}
181193326Sed
182206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
183249423Sdim_mm_cmpeq_ss(__m128 __a, __m128 __b)
184193326Sed{
185249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 0);
186193326Sed}
187193326Sed
188206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
189249423Sdim_mm_cmpeq_ps(__m128 __a, __m128 __b)
190193326Sed{
191249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 0);
192193326Sed}
193193326Sed
194206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
195249423Sdim_mm_cmplt_ss(__m128 __a, __m128 __b)
196193326Sed{
197249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 1);
198193326Sed}
199193326Sed
200206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
201249423Sdim_mm_cmplt_ps(__m128 __a, __m128 __b)
202193326Sed{
203249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 1);
204193326Sed}
205193326Sed
206206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
207249423Sdim_mm_cmple_ss(__m128 __a, __m128 __b)
208193326Sed{
209249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 2);
210193326Sed}
211193326Sed
212206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
213249423Sdim_mm_cmple_ps(__m128 __a, __m128 __b)
214193326Sed{
215249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 2);
216193326Sed}
217193326Sed
218206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
219249423Sdim_mm_cmpgt_ss(__m128 __a, __m128 __b)
220193326Sed{
221261991Sdim  return (__m128)__builtin_shufflevector(__a,
222261991Sdim                                         __builtin_ia32_cmpss(__b, __a, 1),
223261991Sdim                                         4, 1, 2, 3);
224193326Sed}
225193326Sed
226206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
227249423Sdim_mm_cmpgt_ps(__m128 __a, __m128 __b)
228193326Sed{
229249423Sdim  return (__m128)__builtin_ia32_cmpps(__b, __a, 1);
230193326Sed}
231193326Sed
232206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
233249423Sdim_mm_cmpge_ss(__m128 __a, __m128 __b)
234193326Sed{
235261991Sdim  return (__m128)__builtin_shufflevector(__a,
236261991Sdim                                         __builtin_ia32_cmpss(__b, __a, 2),
237261991Sdim                                         4, 1, 2, 3);
238193326Sed}
239193326Sed
240206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
241249423Sdim_mm_cmpge_ps(__m128 __a, __m128 __b)
242193326Sed{
243249423Sdim  return (__m128)__builtin_ia32_cmpps(__b, __a, 2);
244193326Sed}
245193326Sed
246206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
247249423Sdim_mm_cmpneq_ss(__m128 __a, __m128 __b)
248193326Sed{
249249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 4);
250193326Sed}
251193326Sed
252206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
253249423Sdim_mm_cmpneq_ps(__m128 __a, __m128 __b)
254193326Sed{
255249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 4);
256193326Sed}
257193326Sed
258206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
259249423Sdim_mm_cmpnlt_ss(__m128 __a, __m128 __b)
260193326Sed{
261249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 5);
262193326Sed}
263193326Sed
264206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
265249423Sdim_mm_cmpnlt_ps(__m128 __a, __m128 __b)
266193326Sed{
267249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 5);
268193326Sed}
269193326Sed
270206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
271249423Sdim_mm_cmpnle_ss(__m128 __a, __m128 __b)
272193326Sed{
273249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 6);
274193326Sed}
275193326Sed
276206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
277249423Sdim_mm_cmpnle_ps(__m128 __a, __m128 __b)
278193326Sed{
279249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 6);
280193326Sed}
281193326Sed
282206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
283249423Sdim_mm_cmpngt_ss(__m128 __a, __m128 __b)
284193326Sed{
285261991Sdim  return (__m128)__builtin_shufflevector(__a,
286261991Sdim                                         __builtin_ia32_cmpss(__b, __a, 5),
287261991Sdim                                         4, 1, 2, 3);
288193326Sed}
289193326Sed
290206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
291249423Sdim_mm_cmpngt_ps(__m128 __a, __m128 __b)
292193326Sed{
293249423Sdim  return (__m128)__builtin_ia32_cmpps(__b, __a, 5);
294193326Sed}
295193326Sed
296206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
297249423Sdim_mm_cmpnge_ss(__m128 __a, __m128 __b)
298193326Sed{
299261991Sdim  return (__m128)__builtin_shufflevector(__a,
300261991Sdim                                         __builtin_ia32_cmpss(__b, __a, 6),
301261991Sdim                                         4, 1, 2, 3);
302193326Sed}
303193326Sed
304206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
305249423Sdim_mm_cmpnge_ps(__m128 __a, __m128 __b)
306193326Sed{
307249423Sdim  return (__m128)__builtin_ia32_cmpps(__b, __a, 6);
308193326Sed}
309193326Sed
310206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
311249423Sdim_mm_cmpord_ss(__m128 __a, __m128 __b)
312193326Sed{
313249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 7);
314193326Sed}
315193326Sed
316206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
317249423Sdim_mm_cmpord_ps(__m128 __a, __m128 __b)
318193326Sed{
319249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 7);
320193326Sed}
321193326Sed
322206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
323249423Sdim_mm_cmpunord_ss(__m128 __a, __m128 __b)
324193326Sed{
325249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 3);
326193326Sed}
327193326Sed
328206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
329249423Sdim_mm_cmpunord_ps(__m128 __a, __m128 __b)
330193326Sed{
331249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 3);
332193326Sed}
333193326Sed
334206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
335249423Sdim_mm_comieq_ss(__m128 __a, __m128 __b)
336193326Sed{
337249423Sdim  return __builtin_ia32_comieq(__a, __b);
338193326Sed}
339193326Sed
340206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
341249423Sdim_mm_comilt_ss(__m128 __a, __m128 __b)
342193326Sed{
343249423Sdim  return __builtin_ia32_comilt(__a, __b);
344193326Sed}
345193326Sed
346206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
347249423Sdim_mm_comile_ss(__m128 __a, __m128 __b)
348193326Sed{
349249423Sdim  return __builtin_ia32_comile(__a, __b);
350193326Sed}
351193326Sed
352206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
353249423Sdim_mm_comigt_ss(__m128 __a, __m128 __b)
354193326Sed{
355249423Sdim  return __builtin_ia32_comigt(__a, __b);
356193326Sed}
357193326Sed
358206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
359249423Sdim_mm_comige_ss(__m128 __a, __m128 __b)
360193326Sed{
361249423Sdim  return __builtin_ia32_comige(__a, __b);
362193326Sed}
363193326Sed
364206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
365249423Sdim_mm_comineq_ss(__m128 __a, __m128 __b)
366193326Sed{
367249423Sdim  return __builtin_ia32_comineq(__a, __b);
368193326Sed}
369193326Sed
370206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
371249423Sdim_mm_ucomieq_ss(__m128 __a, __m128 __b)
372193326Sed{
373249423Sdim  return __builtin_ia32_ucomieq(__a, __b);
374193326Sed}
375193326Sed
376206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
377249423Sdim_mm_ucomilt_ss(__m128 __a, __m128 __b)
378193326Sed{
379249423Sdim  return __builtin_ia32_ucomilt(__a, __b);
380193326Sed}
381193326Sed
382206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
383249423Sdim_mm_ucomile_ss(__m128 __a, __m128 __b)
384193326Sed{
385249423Sdim  return __builtin_ia32_ucomile(__a, __b);
386193326Sed}
387193326Sed
388206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
389249423Sdim_mm_ucomigt_ss(__m128 __a, __m128 __b)
390193326Sed{
391249423Sdim  return __builtin_ia32_ucomigt(__a, __b);
392193326Sed}
393193326Sed
394206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
395249423Sdim_mm_ucomige_ss(__m128 __a, __m128 __b)
396193326Sed{
397249423Sdim  return __builtin_ia32_ucomige(__a, __b);
398193326Sed}
399193326Sed
400206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
401249423Sdim_mm_ucomineq_ss(__m128 __a, __m128 __b)
402193326Sed{
403249423Sdim  return __builtin_ia32_ucomineq(__a, __b);
404193326Sed}
405193326Sed
406206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
407249423Sdim_mm_cvtss_si32(__m128 __a)
408193326Sed{
409249423Sdim  return __builtin_ia32_cvtss2si(__a);
410193326Sed}
411193326Sed
412206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
413249423Sdim_mm_cvt_ss2si(__m128 __a)
414204643Srdivacky{
415249423Sdim  return _mm_cvtss_si32(__a);
416204643Srdivacky}
417204643Srdivacky
418193576Sed#ifdef __x86_64__
419193576Sed
420206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__))
421249423Sdim_mm_cvtss_si64(__m128 __a)
422193326Sed{
423249423Sdim  return __builtin_ia32_cvtss2si64(__a);
424193326Sed}
425193326Sed
426193576Sed#endif
427193576Sed
428206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
429249423Sdim_mm_cvtps_pi32(__m128 __a)
430193326Sed{
431249423Sdim  return (__m64)__builtin_ia32_cvtps2pi(__a);
432193326Sed}
433193326Sed
434212904Sdimstatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
435249423Sdim_mm_cvt_ps2pi(__m128 __a)
436212904Sdim{
437249423Sdim  return _mm_cvtps_pi32(__a);
438212904Sdim}
439212904Sdim
440206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
441249423Sdim_mm_cvttss_si32(__m128 __a)
442193326Sed{
443249423Sdim  return __a[0];
444193326Sed}
445193326Sed
446206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
447249423Sdim_mm_cvtt_ss2si(__m128 __a)
448204643Srdivacky{
449249423Sdim  return _mm_cvttss_si32(__a);
450204643Srdivacky}
451204643Srdivacky
452206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__))
453249423Sdim_mm_cvttss_si64(__m128 __a)
454193326Sed{
455249423Sdim  return __a[0];
456193326Sed}
457193326Sed
458206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
459249423Sdim_mm_cvttps_pi32(__m128 __a)
460193326Sed{
461249423Sdim  return (__m64)__builtin_ia32_cvttps2pi(__a);
462193326Sed}
463193326Sed
464212904Sdimstatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
465249423Sdim_mm_cvtt_ps2pi(__m128 __a)
466212904Sdim{
467249423Sdim  return _mm_cvttps_pi32(__a);
468212904Sdim}
469212904Sdim
470206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
471249423Sdim_mm_cvtsi32_ss(__m128 __a, int __b)
472193326Sed{
473249423Sdim  __a[0] = __b;
474249423Sdim  return __a;
475193326Sed}
476193326Sed
477212904Sdimstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
478249423Sdim_mm_cvt_si2ss(__m128 __a, int __b)
479212904Sdim{
480249423Sdim  return _mm_cvtsi32_ss(__a, __b);
481212904Sdim}
482212904Sdim
483193326Sed#ifdef __x86_64__
484193326Sed
485206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
486249423Sdim_mm_cvtsi64_ss(__m128 __a, long long __b)
487193326Sed{
488249423Sdim  __a[0] = __b;
489249423Sdim  return __a;
490193326Sed}
491193326Sed
492193326Sed#endif
493193326Sed
494206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
495249423Sdim_mm_cvtpi32_ps(__m128 __a, __m64 __b)
496193326Sed{
497249423Sdim  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
498193326Sed}
499193326Sed
500212904Sdimstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
501249423Sdim_mm_cvt_pi2ps(__m128 __a, __m64 __b)
502212904Sdim{
503249423Sdim  return _mm_cvtpi32_ps(__a, __b);
504212904Sdim}
505212904Sdim
506206084Srdivackystatic __inline__ float __attribute__((__always_inline__, __nodebug__))
507249423Sdim_mm_cvtss_f32(__m128 __a)
508193326Sed{
509249423Sdim  return __a[0];
510193326Sed}
511193326Sed
512206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
513249423Sdim_mm_loadh_pi(__m128 __a, const __m64 *__p)
514193326Sed{
515226633Sdim  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
516226633Sdim  struct __mm_loadh_pi_struct {
517249423Sdim    __mm_loadh_pi_v2f32 __u;
518226633Sdim  } __attribute__((__packed__, __may_alias__));
519249423Sdim  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
520249423Sdim  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
521249423Sdim  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
522193326Sed}
523193326Sed
524206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
525249423Sdim_mm_loadl_pi(__m128 __a, const __m64 *__p)
526193326Sed{
527226633Sdim  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
528226633Sdim  struct __mm_loadl_pi_struct {
529249423Sdim    __mm_loadl_pi_v2f32 __u;
530226633Sdim  } __attribute__((__packed__, __may_alias__));
531249423Sdim  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
532249423Sdim  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
533249423Sdim  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
534193326Sed}
535193326Sed
536206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
537249423Sdim_mm_load_ss(const float *__p)
538193326Sed{
539226633Sdim  struct __mm_load_ss_struct {
540249423Sdim    float __u;
541226633Sdim  } __attribute__((__packed__, __may_alias__));
542249423Sdim  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
543249423Sdim  return (__m128){ __u, 0, 0, 0 };
544193326Sed}
545193326Sed
546206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
547249423Sdim_mm_load1_ps(const float *__p)
548193326Sed{
549226633Sdim  struct __mm_load1_ps_struct {
550249423Sdim    float __u;
551226633Sdim  } __attribute__((__packed__, __may_alias__));
552249423Sdim  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
553249423Sdim  return (__m128){ __u, __u, __u, __u };
554193326Sed}
555193326Sed
/* Alternate spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
557193326Sed
558206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
559249423Sdim_mm_load_ps(const float *__p)
560193326Sed{
561249423Sdim  return *(__m128*)__p;
562193326Sed}
563193326Sed
564206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
565249423Sdim_mm_loadu_ps(const float *__p)
566193326Sed{
567223017Sdim  struct __loadu_ps {
568249423Sdim    __m128 __v;
569226633Sdim  } __attribute__((__packed__, __may_alias__));
570249423Sdim  return ((struct __loadu_ps*)__p)->__v;
571193326Sed}
572193326Sed
573206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
574249423Sdim_mm_loadr_ps(const float *__p)
575193326Sed{
576249423Sdim  __m128 __a = _mm_load_ps(__p);
577249423Sdim  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
578193326Sed}
579193326Sed
580206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
581249423Sdim_mm_set_ss(float __w)
582193326Sed{
583249423Sdim  return (__m128){ __w, 0, 0, 0 };
584193326Sed}
585193326Sed
586206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
587249423Sdim_mm_set1_ps(float __w)
588193326Sed{
589249423Sdim  return (__m128){ __w, __w, __w, __w };
590193326Sed}
591193326Sed
592193326Sed// Microsoft specific.
593206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
594249423Sdim_mm_set_ps1(float __w)
595193326Sed{
596249423Sdim    return _mm_set1_ps(__w);
597193326Sed}
598193326Sed
599206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
600249423Sdim_mm_set_ps(float __z, float __y, float __x, float __w)
601193326Sed{
602249423Sdim  return (__m128){ __w, __x, __y, __z };
603193326Sed}
604193326Sed
605206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
606249423Sdim_mm_setr_ps(float __z, float __y, float __x, float __w)
607193326Sed{
608249423Sdim  return (__m128){ __z, __y, __x, __w };
609193326Sed}
610193326Sed
611206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__))
612193326Sed_mm_setzero_ps(void)
613193326Sed{
614193326Sed  return (__m128){ 0, 0, 0, 0 };
615193326Sed}
616193326Sed
617206084Srdivackystatic __inline__ void __attribute__((__always_inline__))
618249423Sdim_mm_storeh_pi(__m64 *__p, __m128 __a)
619193326Sed{
620249423Sdim  __builtin_ia32_storehps((__v2si *)__p, __a);
621193326Sed}
622193326Sed
623206084Srdivackystatic __inline__ void __attribute__((__always_inline__))
624249423Sdim_mm_storel_pi(__m64 *__p, __m128 __a)
625193326Sed{
626249423Sdim  __builtin_ia32_storelps((__v2si *)__p, __a);
627193326Sed}
628193326Sed
629206084Srdivackystatic __inline__ void __attribute__((__always_inline__))
630249423Sdim_mm_store_ss(float *__p, __m128 __a)
631193326Sed{
632226633Sdim  struct __mm_store_ss_struct {
633249423Sdim    float __u;
634226633Sdim  } __attribute__((__packed__, __may_alias__));
635249423Sdim  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
636193326Sed}
637193326Sed
638206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
639249423Sdim_mm_storeu_ps(float *__p, __m128 __a)
640193326Sed{
641249423Sdim  __builtin_ia32_storeups(__p, __a);
642193326Sed}
643193326Sed
644206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
645249423Sdim_mm_store1_ps(float *__p, __m128 __a)
646193326Sed{
647249423Sdim  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
648249423Sdim  _mm_storeu_ps(__p, __a);
649193326Sed}
650193326Sed
651206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
652249423Sdim_mm_store_ps1(float *__p, __m128 __a)
653212904Sdim{
654249423Sdim    return _mm_store1_ps(__p, __a);
655212904Sdim}
656212904Sdim
657212904Sdimstatic __inline__ void __attribute__((__always_inline__, __nodebug__))
658249423Sdim_mm_store_ps(float *__p, __m128 __a)
659193326Sed{
660249423Sdim  *(__m128 *)__p = __a;
661193326Sed}
662193326Sed
663206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
664249423Sdim_mm_storer_ps(float *__p, __m128 __a)
665193326Sed{
666249423Sdim  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
667249423Sdim  _mm_store_ps(__p, __a);
668193326Sed}
669193326Sed
/* Hint selectors for _mm_prefetch; the value is passed through as the third
   argument of __builtin_prefetch. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2 1
#define _MM_HINT_T1 2
#define _MM_HINT_T0 3
674193326Sed
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/* Issues __builtin_prefetch on address (a) with rw = 0 and locality (sel).
   The address is cast to const void * so that a pointer to const data keeps
   its qualifier instead of having it cast away. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), 0, (sel)))
679193326Sed
680206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
681249423Sdim_mm_stream_pi(__m64 *__p, __m64 __a)
682193326Sed{
683249423Sdim  __builtin_ia32_movntq(__p, __a);
684193326Sed}
685193326Sed
686206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
687249423Sdim_mm_stream_ps(float *__p, __m128 __a)
688193326Sed{
689249423Sdim  __builtin_ia32_movntps(__p, __a);
690193326Sed}
691193326Sed
/* Invokes __builtin_ia32_sfence (presumably a store fence); takes no
   arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}
697193326Sed
698206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
699249423Sdim_mm_extract_pi16(__m64 __a, int __n)
700193326Sed{
701249423Sdim  __v4hi __b = (__v4hi)__a;
702249423Sdim  return (unsigned short)__b[__n & 3];
703193326Sed}
704193326Sed
705206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
706249423Sdim_mm_insert_pi16(__m64 __a, int __d, int __n)
707193326Sed{
708249423Sdim   __v4hi __b = (__v4hi)__a;
709249423Sdim   __b[__n & 3] = __d;
710249423Sdim   return (__m64)__b;
711193326Sed}
712193326Sed
713206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
714249423Sdim_mm_max_pi16(__m64 __a, __m64 __b)
715193326Sed{
716249423Sdim  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
717193326Sed}
718193326Sed
719206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
720249423Sdim_mm_max_pu8(__m64 __a, __m64 __b)
721193326Sed{
722249423Sdim  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
723193326Sed}
724193326Sed
725206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
726249423Sdim_mm_min_pi16(__m64 __a, __m64 __b)
727193326Sed{
728249423Sdim  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
729193326Sed}
730193326Sed
731206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
732249423Sdim_mm_min_pu8(__m64 __a, __m64 __b)
733193326Sed{
734249423Sdim  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
735193326Sed}
736193326Sed
737206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
738249423Sdim_mm_movemask_pi8(__m64 __a)
739193326Sed{
740249423Sdim  return __builtin_ia32_pmovmskb((__v8qi)__a);
741193326Sed}
742193326Sed
743206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
744249423Sdim_mm_mulhi_pu16(__m64 __a, __m64 __b)
745193326Sed{
746249423Sdim  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
747193326Sed}
748193326Sed
/* Applies __builtin_ia32_pshufw to (a), viewed as __v4hi, with selector (n)
   and yields the result as __m64.  (a) is used directly rather than copied
   into a macro-local named __a: such a local shadowed a caller variable of
   the same name, so _mm_shuffle_pi16(__a, n) expanded to the
   self-initialization `__m64 __a = (__a);`. */
#define _mm_shuffle_pi16(a, n) __extension__ ({ \
  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
752193326Sed
753206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
754249423Sdim_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
755193326Sed{
756249423Sdim  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
757193326Sed}
758193326Sed
759206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
760249423Sdim_mm_avg_pu8(__m64 __a, __m64 __b)
761193326Sed{
762249423Sdim  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
763193326Sed}
764193326Sed
765206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
766249423Sdim_mm_avg_pu16(__m64 __a, __m64 __b)
767193326Sed{
768249423Sdim  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
769193326Sed}
770193326Sed
771206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
772249423Sdim_mm_sad_pu8(__m64 __a, __m64 __b)
773193326Sed{
774249423Sdim  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
775193326Sed}
776193326Sed
/* Returns the value produced by __builtin_ia32_stmxcsr() (presumably the
   current contents of the MXCSR control/status register). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  unsigned int __csr = __builtin_ia32_stmxcsr();
  return __csr;
}
782193326Sed
/* Passes __i to __builtin_ia32_ldmxcsr (presumably loading it into the MXCSR
   control/status register). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int __i)
{
  __builtin_ia32_ldmxcsr(__i);
}
788193326Sed
/* Builds a __m128 whose elements 0 and 1 are selected from (a) and whose
   elements 2 and 3 are selected from (b).  Each 2-bit field of (mask),
   lowest field first, picks the source element:
     result[0] = a[mask & 3]          result[1] = a[(mask >> 2) & 3]
     result[2] = b[(mask >> 4) & 3]   result[3] = b[(mask >> 6) & 3]
   (mask) must be an integer constant expression.
   (a) and (b) are used directly rather than copied into macro-locals named
   __a/__b: such locals shadowed caller variables of the same name (the
   parameter names used throughout this header), so
   _mm_shuffle_ps(__a, __b, m) expanded to the self-initialization
   `__m128 __a = (__a);`. */
#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                  (((mask) & 0x30) >> 4) + 4, \
                                  (((mask) & 0xc0) >> 6) + 4); })
796193326Sed
797206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
798249423Sdim_mm_unpackhi_ps(__m128 __a, __m128 __b)
799193326Sed{
800249423Sdim  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
801193326Sed}
802193326Sed
803206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
804249423Sdim_mm_unpacklo_ps(__m128 __a, __m128 __b)
805193326Sed{
806249423Sdim  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
807193326Sed}
808193326Sed
809206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
810249423Sdim_mm_move_ss(__m128 __a, __m128 __b)
811193326Sed{
812249423Sdim  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
813193326Sed}
814193326Sed
815206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
816249423Sdim_mm_movehl_ps(__m128 __a, __m128 __b)
817193326Sed{
818249423Sdim  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
819193326Sed}
820193326Sed
821206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
822249423Sdim_mm_movelh_ps(__m128 __a, __m128 __b)
823193326Sed{
824249423Sdim  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
825193326Sed}
826193326Sed
827206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
828249423Sdim_mm_cvtpi16_ps(__m64 __a)
829193326Sed{
830249423Sdim  __m64 __b, __c;
831249423Sdim  __m128 __r;
832193326Sed
833249423Sdim  __b = _mm_setzero_si64();
834249423Sdim  __b = _mm_cmpgt_pi16(__b, __a);
835249423Sdim  __c = _mm_unpackhi_pi16(__a, __b);
836249423Sdim  __r = _mm_setzero_ps();
837249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
838249423Sdim  __r = _mm_movelh_ps(__r, __r);
839249423Sdim  __c = _mm_unpacklo_pi16(__a, __b);
840249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
841193326Sed
842249423Sdim  return __r;
843193326Sed}
844193326Sed
845206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
846249423Sdim_mm_cvtpu16_ps(__m64 __a)
847193326Sed{
848249423Sdim  __m64 __b, __c;
849249423Sdim  __m128 __r;
850193326Sed
851249423Sdim  __b = _mm_setzero_si64();
852249423Sdim  __c = _mm_unpackhi_pi16(__a, __b);
853249423Sdim  __r = _mm_setzero_ps();
854249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
855249423Sdim  __r = _mm_movelh_ps(__r, __r);
856249423Sdim  __c = _mm_unpacklo_pi16(__a, __b);
857249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
858193326Sed
859249423Sdim  return __r;
860193326Sed}
861193326Sed
862206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
863249423Sdim_mm_cvtpi8_ps(__m64 __a)
864193326Sed{
865249423Sdim  __m64 __b;
866193326Sed
867249423Sdim  __b = _mm_setzero_si64();
868249423Sdim  __b = _mm_cmpgt_pi8(__b, __a);
869249423Sdim  __b = _mm_unpacklo_pi8(__a, __b);
870193326Sed
871249423Sdim  return _mm_cvtpi16_ps(__b);
872193326Sed}
873193326Sed
874206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
875249423Sdim_mm_cvtpu8_ps(__m64 __a)
876193326Sed{
877249423Sdim  __m64 __b;
878193326Sed
879249423Sdim  __b = _mm_setzero_si64();
880249423Sdim  __b = _mm_unpacklo_pi8(__a, __b);
881193326Sed
882249423Sdim  return _mm_cvtpi16_ps(__b);
883193326Sed}
884193326Sed
885206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
886249423Sdim_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
887193326Sed{
888249423Sdim  __m128 __c;
889193326Sed
890249423Sdim  __c = _mm_setzero_ps();
891249423Sdim  __c = _mm_cvtpi32_ps(__c, __b);
892249423Sdim  __c = _mm_movelh_ps(__c, __c);
893193326Sed
894249423Sdim  return _mm_cvtpi32_ps(__c, __a);
895193326Sed}
896193326Sed
897206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
898249423Sdim_mm_cvtps_pi16(__m128 __a)
899193326Sed{
900249423Sdim  __m64 __b, __c;
901193326Sed
902249423Sdim  __b = _mm_cvtps_pi32(__a);
903249423Sdim  __a = _mm_movehl_ps(__a, __a);
904249423Sdim  __c = _mm_cvtps_pi32(__a);
905193326Sed
906266674Sdim  return _mm_packs_pi32(__b, __c);
907193326Sed}
908193326Sed
909206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
910249423Sdim_mm_cvtps_pi8(__m128 __a)
911193326Sed{
912249423Sdim  __m64 __b, __c;
913193326Sed
914249423Sdim  __b = _mm_cvtps_pi16(__a);
915249423Sdim  __c = _mm_setzero_si64();
916193326Sed
917249423Sdim  return _mm_packs_pi16(__b, __c);
918193326Sed}
919193326Sed
920206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
921249423Sdim_mm_movemask_ps(__m128 __a)
922193326Sed{
923249423Sdim  return __builtin_ia32_movmskps(__a);
924193326Sed}
925193326Sed
/*
 * Packs four lane selectors into one shuffle immediate: w occupies bits 0-1,
 * x bits 2-3, y bits 4-5 and z bits 6-7. Arguments are not range-checked.
 */
#define _MM_SHUFFLE(z, y, x, w) \
  ((w) | ((x) << 2) | ((y) << 4) | ((z) << 6))
927193326Sed
/* Exception status flags: one bit each, bits 0-5 of the control/status word. */
#define _MM_EXCEPT_INVALID    (1 << 0)
#define _MM_EXCEPT_DENORM     (1 << 1)
#define _MM_EXCEPT_DIV_ZERO   (1 << 2)
#define _MM_EXCEPT_OVERFLOW   (1 << 3)
#define _MM_EXCEPT_UNDERFLOW  (1 << 4)
#define _MM_EXCEPT_INEXACT    (1 << 5)
/* All six exception status flags together. */
#define _MM_EXCEPT_MASK \
  (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | _MM_EXCEPT_DIV_ZERO | \
   _MM_EXCEPT_OVERFLOW | _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT)
935193326Sed
/* Exception mask bits: one bit each, bits 7-12 of the control/status word. */
#define _MM_MASK_INVALID      (1 << 7)
#define _MM_MASK_DENORM       (1 << 8)
#define _MM_MASK_DIV_ZERO     (1 << 9)
#define _MM_MASK_OVERFLOW     (1 << 10)
#define _MM_MASK_UNDERFLOW    (1 << 11)
#define _MM_MASK_INEXACT      (1 << 12)
/* All six exception mask bits together. */
#define _MM_MASK_MASK \
  (_MM_MASK_INVALID | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO | \
   _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT)
943193326Sed
/* Rounding mode: a two-bit field at bits 13-14 of the control/status word. */
#define _MM_ROUND_NEAREST     (0 << 13)
#define _MM_ROUND_DOWN        (1 << 13)
#define _MM_ROUND_UP          (2 << 13)
#define _MM_ROUND_TOWARD_ZERO (3 << 13)
/* Covers the whole rounding-mode field. */
#define _MM_ROUND_MASK        (3 << 13)
949193326Sed
/* Flush-to-zero control: a single bit, bit 15 of the control/status word. */
#define _MM_FLUSH_ZERO_MASK   (1 << 15)
#define _MM_FLUSH_ZERO_ON     (1 << 15)
#define _MM_FLUSH_ZERO_OFF    (0 << 15)
953193326Sed
/*
 * Field readers: each one fetches the control/status word with _mm_getcsr()
 * and keeps only the bits selected by the matching *_MASK constant. The
 * result is left in place (not shifted down), so it compares directly
 * against the corresponding _MM_* constants.
 */
#define _MM_GET_EXCEPTION_MASK()  (_MM_MASK_MASK & _mm_getcsr())
#define _MM_GET_EXCEPTION_STATE() (_MM_EXCEPT_MASK & _mm_getcsr())
#define _MM_GET_FLUSH_ZERO_MODE() (_MM_FLUSH_ZERO_MASK & _mm_getcsr())
#define _MM_GET_ROUNDING_MODE()   (_MM_ROUND_MASK & _mm_getcsr())
958193326Sed
/*
 * Field writers: each one reads the control/status word with _mm_getcsr(),
 * clears the bits selected by the matching *_MASK constant, ORs in `x` and
 * writes the result back with _mm_setcsr(). `x` is not masked, so bits of
 * `x` outside the field are written as well.
 */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_MASK_MASK)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_EXCEPT_MASK)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_ROUND_MASK)))
963193326Sed
/*
 * Transposes, in place, the 4x4 matrix whose rows are the four __m128
 * lvalues row0..row3: after the macro, rowN holds what was column N.
 *
 * The rows are first interleaved pairwise (0 with 1, 2 with 3), then the
 * matching halves of the interleaved vectors are recombined.
 *
 * The temporaries use reserved (double-underscore) names so they cannot
 * shadow caller variables passed as arguments; ordinary names such as
 * `tmp0` would silently read the macro's own uninitialised locals when the
 * caller's rows happen to carry the same names. Each row argument is
 * expanded more than once, so the arguments must be side-effect free.
 */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
976193326Sed
/*
 * Aliases for compatibility: each _m_<instruction> name expands to the
 * corresponding _mm_* intrinsic name.
 */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/*
 * NOTE(review): `_m_` -> `_mm_` names no intrinsic and looks like a leftover
 * placeholder. It was defined twice; the redundant copy is removed and one
 * definition is kept so the set of defined macros is unchanged. Confirm
 * whether it can be dropped entirely.
 */
#define _m_ _mm_
993212904Sdim
994194179Sed/* Ugly hack for backwards-compatibility (compatible with gcc) */
995194179Sed#ifdef __SSE2__
996193326Sed#include <emmintrin.h>
997194179Sed#endif
998193326Sed
999193326Sed#endif /* __SSE__ */
1000193326Sed
1001193326Sed#endif /* __XMMINTRIN_H */
1002