1193326Sed/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2193326Sed *
3193326Sed * Permission is hereby granted, free of charge, to any person obtaining a copy
4193326Sed * of this software and associated documentation files (the "Software"), to deal
5193326Sed * in the Software without restriction, including without limitation the rights
6193326Sed * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7193326Sed * copies of the Software, and to permit persons to whom the Software is
8193326Sed * furnished to do so, subject to the following conditions:
9193326Sed *
10193326Sed * The above copyright notice and this permission notice shall be included in
11193326Sed * all copies or substantial portions of the Software.
12193326Sed *
13193326Sed * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14193326Sed * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15193326Sed * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16193326Sed * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17193326Sed * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18193326Sed * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19193326Sed * THE SOFTWARE.
20193326Sed *
21193326Sed *===-----------------------------------------------------------------------===
22193326Sed */
23193326Sed
24193326Sed#ifndef __XMMINTRIN_H
25193326Sed#define __XMMINTRIN_H
26193326Sed
27193326Sed#ifndef __SSE__
28193326Sed#error "SSE instruction set not enabled"
29193326Sed#else
30193326Sed
31193326Sed#include <mmintrin.h>
32193326Sed
/* 128-bit vector types: __m128 is the public four-float vector; __v4sf and
   __v4si are the float and int lane views used with the builtins below. */
typedef float __m128 __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
36193326Sed
// mm_malloc.h is only included in a hosted environment, because it depends on
// the standard library to provide its allocation routines.
39218893Sdim#if __STDC_HOSTED__
40193326Sed#include <mm_malloc.h>
41218893Sdim#endif
42193326Sed
43206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
44249423Sdim_mm_add_ss(__m128 __a, __m128 __b)
45193326Sed{
46249423Sdim  __a[0] += __b[0];
47249423Sdim  return __a;
48193326Sed}
49193326Sed
50206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
51249423Sdim_mm_add_ps(__m128 __a, __m128 __b)
52193326Sed{
53249423Sdim  return __a + __b;
54193326Sed}
55193326Sed
56206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
57249423Sdim_mm_sub_ss(__m128 __a, __m128 __b)
58193326Sed{
59249423Sdim  __a[0] -= __b[0];
60249423Sdim  return __a;
61193326Sed}
62193326Sed
63206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
64249423Sdim_mm_sub_ps(__m128 __a, __m128 __b)
65193326Sed{
66249423Sdim  return __a - __b;
67193326Sed}
68193326Sed
69206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
70249423Sdim_mm_mul_ss(__m128 __a, __m128 __b)
71193326Sed{
72249423Sdim  __a[0] *= __b[0];
73249423Sdim  return __a;
74193326Sed}
75193326Sed
76206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
77249423Sdim_mm_mul_ps(__m128 __a, __m128 __b)
78193326Sed{
79249423Sdim  return __a * __b;
80193326Sed}
81193326Sed
82206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
83249423Sdim_mm_div_ss(__m128 __a, __m128 __b)
84193326Sed{
85249423Sdim  __a[0] /= __b[0];
86249423Sdim  return __a;
87193326Sed}
88193326Sed
89206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
90249423Sdim_mm_div_ps(__m128 __a, __m128 __b)
91193326Sed{
92249423Sdim  return __a / __b;
93193326Sed}
94193326Sed
95206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
96249423Sdim_mm_sqrt_ss(__m128 __a)
97193326Sed{
98249423Sdim  __m128 __c = __builtin_ia32_sqrtss(__a);
99249423Sdim  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
100193326Sed}
101193326Sed
102206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
103249423Sdim_mm_sqrt_ps(__m128 __a)
104193326Sed{
105249423Sdim  return __builtin_ia32_sqrtps(__a);
106193326Sed}
107193326Sed
108206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
109249423Sdim_mm_rcp_ss(__m128 __a)
110193326Sed{
111249423Sdim  __m128 __c = __builtin_ia32_rcpss(__a);
112249423Sdim  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
113193326Sed}
114193326Sed
115206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
116249423Sdim_mm_rcp_ps(__m128 __a)
117193326Sed{
118249423Sdim  return __builtin_ia32_rcpps(__a);
119193326Sed}
120193326Sed
121206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
122249423Sdim_mm_rsqrt_ss(__m128 __a)
123193326Sed{
124249423Sdim  __m128 __c = __builtin_ia32_rsqrtss(__a);
125249423Sdim  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
126193326Sed}
127193326Sed
128206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
129249423Sdim_mm_rsqrt_ps(__m128 __a)
130193326Sed{
131249423Sdim  return __builtin_ia32_rsqrtps(__a);
132193326Sed}
133193326Sed
134206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
135249423Sdim_mm_min_ss(__m128 __a, __m128 __b)
136193326Sed{
137249423Sdim  return __builtin_ia32_minss(__a, __b);
138193326Sed}
139193326Sed
140206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
141249423Sdim_mm_min_ps(__m128 __a, __m128 __b)
142193326Sed{
143249423Sdim  return __builtin_ia32_minps(__a, __b);
144193326Sed}
145193326Sed
146206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
147249423Sdim_mm_max_ss(__m128 __a, __m128 __b)
148193326Sed{
149249423Sdim  return __builtin_ia32_maxss(__a, __b);
150193326Sed}
151193326Sed
152206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
153249423Sdim_mm_max_ps(__m128 __a, __m128 __b)
154193326Sed{
155249423Sdim  return __builtin_ia32_maxps(__a, __b);
156193326Sed}
157193326Sed
158206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
159249423Sdim_mm_and_ps(__m128 __a, __m128 __b)
160193326Sed{
161249423Sdim  return (__m128)((__v4si)__a & (__v4si)__b);
162193326Sed}
163193326Sed
164206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
165249423Sdim_mm_andnot_ps(__m128 __a, __m128 __b)
166193326Sed{
167249423Sdim  return (__m128)(~(__v4si)__a & (__v4si)__b);
168193326Sed}
169193326Sed
170206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
171249423Sdim_mm_or_ps(__m128 __a, __m128 __b)
172193326Sed{
173249423Sdim  return (__m128)((__v4si)__a | (__v4si)__b);
174193326Sed}
175193326Sed
176206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
177249423Sdim_mm_xor_ps(__m128 __a, __m128 __b)
178193326Sed{
179249423Sdim  return (__m128)((__v4si)__a ^ (__v4si)__b);
180193326Sed}
181193326Sed
182206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
183249423Sdim_mm_cmpeq_ss(__m128 __a, __m128 __b)
184193326Sed{
185249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 0);
186193326Sed}
187193326Sed
188206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
189249423Sdim_mm_cmpeq_ps(__m128 __a, __m128 __b)
190193326Sed{
191249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 0);
192193326Sed}
193193326Sed
194206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
195249423Sdim_mm_cmplt_ss(__m128 __a, __m128 __b)
196193326Sed{
197249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 1);
198193326Sed}
199193326Sed
200206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
201249423Sdim_mm_cmplt_ps(__m128 __a, __m128 __b)
202193326Sed{
203249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 1);
204193326Sed}
205193326Sed
206206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
207249423Sdim_mm_cmple_ss(__m128 __a, __m128 __b)
208193326Sed{
209249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 2);
210193326Sed}
211193326Sed
212206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
213249423Sdim_mm_cmple_ps(__m128 __a, __m128 __b)
214193326Sed{
215249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 2);
216193326Sed}
217193326Sed
218206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
219249423Sdim_mm_cmpgt_ss(__m128 __a, __m128 __b)
220193326Sed{
221249423Sdim  return (__m128)__builtin_ia32_cmpss(__b, __a, 1);
222193326Sed}
223193326Sed
224206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
225249423Sdim_mm_cmpgt_ps(__m128 __a, __m128 __b)
226193326Sed{
227249423Sdim  return (__m128)__builtin_ia32_cmpps(__b, __a, 1);
228193326Sed}
229193326Sed
230206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
231249423Sdim_mm_cmpge_ss(__m128 __a, __m128 __b)
232193326Sed{
233249423Sdim  return (__m128)__builtin_ia32_cmpss(__b, __a, 2);
234193326Sed}
235193326Sed
236206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
237249423Sdim_mm_cmpge_ps(__m128 __a, __m128 __b)
238193326Sed{
239249423Sdim  return (__m128)__builtin_ia32_cmpps(__b, __a, 2);
240193326Sed}
241193326Sed
242206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
243249423Sdim_mm_cmpneq_ss(__m128 __a, __m128 __b)
244193326Sed{
245249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 4);
246193326Sed}
247193326Sed
248206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
249249423Sdim_mm_cmpneq_ps(__m128 __a, __m128 __b)
250193326Sed{
251249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 4);
252193326Sed}
253193326Sed
254206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
255249423Sdim_mm_cmpnlt_ss(__m128 __a, __m128 __b)
256193326Sed{
257249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 5);
258193326Sed}
259193326Sed
260206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
261249423Sdim_mm_cmpnlt_ps(__m128 __a, __m128 __b)
262193326Sed{
263249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 5);
264193326Sed}
265193326Sed
266206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
267249423Sdim_mm_cmpnle_ss(__m128 __a, __m128 __b)
268193326Sed{
269249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 6);
270193326Sed}
271193326Sed
272206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
273249423Sdim_mm_cmpnle_ps(__m128 __a, __m128 __b)
274193326Sed{
275249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 6);
276193326Sed}
277193326Sed
278206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
279249423Sdim_mm_cmpngt_ss(__m128 __a, __m128 __b)
280193326Sed{
281249423Sdim  return (__m128)__builtin_ia32_cmpss(__b, __a, 5);
282193326Sed}
283193326Sed
284206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
285249423Sdim_mm_cmpngt_ps(__m128 __a, __m128 __b)
286193326Sed{
287249423Sdim  return (__m128)__builtin_ia32_cmpps(__b, __a, 5);
288193326Sed}
289193326Sed
290206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
291249423Sdim_mm_cmpnge_ss(__m128 __a, __m128 __b)
292193326Sed{
293249423Sdim  return (__m128)__builtin_ia32_cmpss(__b, __a, 6);
294193326Sed}
295193326Sed
296206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
297249423Sdim_mm_cmpnge_ps(__m128 __a, __m128 __b)
298193326Sed{
299249423Sdim  return (__m128)__builtin_ia32_cmpps(__b, __a, 6);
300193326Sed}
301193326Sed
302206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
303249423Sdim_mm_cmpord_ss(__m128 __a, __m128 __b)
304193326Sed{
305249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 7);
306193326Sed}
307193326Sed
308206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
309249423Sdim_mm_cmpord_ps(__m128 __a, __m128 __b)
310193326Sed{
311249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 7);
312193326Sed}
313193326Sed
314206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
315249423Sdim_mm_cmpunord_ss(__m128 __a, __m128 __b)
316193326Sed{
317249423Sdim  return (__m128)__builtin_ia32_cmpss(__a, __b, 3);
318193326Sed}
319193326Sed
320206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
321249423Sdim_mm_cmpunord_ps(__m128 __a, __m128 __b)
322193326Sed{
323249423Sdim  return (__m128)__builtin_ia32_cmpps(__a, __b, 3);
324193326Sed}
325193326Sed
326206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
327249423Sdim_mm_comieq_ss(__m128 __a, __m128 __b)
328193326Sed{
329249423Sdim  return __builtin_ia32_comieq(__a, __b);
330193326Sed}
331193326Sed
332206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
333249423Sdim_mm_comilt_ss(__m128 __a, __m128 __b)
334193326Sed{
335249423Sdim  return __builtin_ia32_comilt(__a, __b);
336193326Sed}
337193326Sed
338206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
339249423Sdim_mm_comile_ss(__m128 __a, __m128 __b)
340193326Sed{
341249423Sdim  return __builtin_ia32_comile(__a, __b);
342193326Sed}
343193326Sed
344206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
345249423Sdim_mm_comigt_ss(__m128 __a, __m128 __b)
346193326Sed{
347249423Sdim  return __builtin_ia32_comigt(__a, __b);
348193326Sed}
349193326Sed
350206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
351249423Sdim_mm_comige_ss(__m128 __a, __m128 __b)
352193326Sed{
353249423Sdim  return __builtin_ia32_comige(__a, __b);
354193326Sed}
355193326Sed
356206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
357249423Sdim_mm_comineq_ss(__m128 __a, __m128 __b)
358193326Sed{
359249423Sdim  return __builtin_ia32_comineq(__a, __b);
360193326Sed}
361193326Sed
362206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
363249423Sdim_mm_ucomieq_ss(__m128 __a, __m128 __b)
364193326Sed{
365249423Sdim  return __builtin_ia32_ucomieq(__a, __b);
366193326Sed}
367193326Sed
368206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
369249423Sdim_mm_ucomilt_ss(__m128 __a, __m128 __b)
370193326Sed{
371249423Sdim  return __builtin_ia32_ucomilt(__a, __b);
372193326Sed}
373193326Sed
374206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
375249423Sdim_mm_ucomile_ss(__m128 __a, __m128 __b)
376193326Sed{
377249423Sdim  return __builtin_ia32_ucomile(__a, __b);
378193326Sed}
379193326Sed
380206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
381249423Sdim_mm_ucomigt_ss(__m128 __a, __m128 __b)
382193326Sed{
383249423Sdim  return __builtin_ia32_ucomigt(__a, __b);
384193326Sed}
385193326Sed
386206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
387249423Sdim_mm_ucomige_ss(__m128 __a, __m128 __b)
388193326Sed{
389249423Sdim  return __builtin_ia32_ucomige(__a, __b);
390193326Sed}
391193326Sed
392206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
393249423Sdim_mm_ucomineq_ss(__m128 __a, __m128 __b)
394193326Sed{
395249423Sdim  return __builtin_ia32_ucomineq(__a, __b);
396193326Sed}
397193326Sed
398206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
399249423Sdim_mm_cvtss_si32(__m128 __a)
400193326Sed{
401249423Sdim  return __builtin_ia32_cvtss2si(__a);
402193326Sed}
403193326Sed
404206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
405249423Sdim_mm_cvt_ss2si(__m128 __a)
406204643Srdivacky{
407249423Sdim  return _mm_cvtss_si32(__a);
408204643Srdivacky}
409204643Srdivacky
410193576Sed#ifdef __x86_64__
411193576Sed
412206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__))
413249423Sdim_mm_cvtss_si64(__m128 __a)
414193326Sed{
415249423Sdim  return __builtin_ia32_cvtss2si64(__a);
416193326Sed}
417193326Sed
418193576Sed#endif
419193576Sed
420206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
421249423Sdim_mm_cvtps_pi32(__m128 __a)
422193326Sed{
423249423Sdim  return (__m64)__builtin_ia32_cvtps2pi(__a);
424193326Sed}
425193326Sed
426212904Sdimstatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
427249423Sdim_mm_cvt_ps2pi(__m128 __a)
428212904Sdim{
429249423Sdim  return _mm_cvtps_pi32(__a);
430212904Sdim}
431212904Sdim
432206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
433249423Sdim_mm_cvttss_si32(__m128 __a)
434193326Sed{
435249423Sdim  return __a[0];
436193326Sed}
437193326Sed
438206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
439249423Sdim_mm_cvtt_ss2si(__m128 __a)
440204643Srdivacky{
441249423Sdim  return _mm_cvttss_si32(__a);
442204643Srdivacky}
443204643Srdivacky
444206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__))
445249423Sdim_mm_cvttss_si64(__m128 __a)
446193326Sed{
447249423Sdim  return __a[0];
448193326Sed}
449193326Sed
450206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
451249423Sdim_mm_cvttps_pi32(__m128 __a)
452193326Sed{
453249423Sdim  return (__m64)__builtin_ia32_cvttps2pi(__a);
454193326Sed}
455193326Sed
456212904Sdimstatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
457249423Sdim_mm_cvtt_ps2pi(__m128 __a)
458212904Sdim{
459249423Sdim  return _mm_cvttps_pi32(__a);
460212904Sdim}
461212904Sdim
462206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
463249423Sdim_mm_cvtsi32_ss(__m128 __a, int __b)
464193326Sed{
465249423Sdim  __a[0] = __b;
466249423Sdim  return __a;
467193326Sed}
468193326Sed
469212904Sdimstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
470249423Sdim_mm_cvt_si2ss(__m128 __a, int __b)
471212904Sdim{
472249423Sdim  return _mm_cvtsi32_ss(__a, __b);
473212904Sdim}
474212904Sdim
475193326Sed#ifdef __x86_64__
476193326Sed
477206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
478249423Sdim_mm_cvtsi64_ss(__m128 __a, long long __b)
479193326Sed{
480249423Sdim  __a[0] = __b;
481249423Sdim  return __a;
482193326Sed}
483193326Sed
484193326Sed#endif
485193326Sed
486206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
487249423Sdim_mm_cvtpi32_ps(__m128 __a, __m64 __b)
488193326Sed{
489249423Sdim  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
490193326Sed}
491193326Sed
492212904Sdimstatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
493249423Sdim_mm_cvt_pi2ps(__m128 __a, __m64 __b)
494212904Sdim{
495249423Sdim  return _mm_cvtpi32_ps(__a, __b);
496212904Sdim}
497212904Sdim
498206084Srdivackystatic __inline__ float __attribute__((__always_inline__, __nodebug__))
499249423Sdim_mm_cvtss_f32(__m128 __a)
500193326Sed{
501249423Sdim  return __a[0];
502193326Sed}
503193326Sed
504206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
505249423Sdim_mm_loadh_pi(__m128 __a, const __m64 *__p)
506193326Sed{
507226633Sdim  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
508226633Sdim  struct __mm_loadh_pi_struct {
509249423Sdim    __mm_loadh_pi_v2f32 __u;
510226633Sdim  } __attribute__((__packed__, __may_alias__));
511249423Sdim  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
512249423Sdim  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
513249423Sdim  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
514193326Sed}
515193326Sed
516206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
517249423Sdim_mm_loadl_pi(__m128 __a, const __m64 *__p)
518193326Sed{
519226633Sdim  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
520226633Sdim  struct __mm_loadl_pi_struct {
521249423Sdim    __mm_loadl_pi_v2f32 __u;
522226633Sdim  } __attribute__((__packed__, __may_alias__));
523249423Sdim  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
524249423Sdim  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
525249423Sdim  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
526193326Sed}
527193326Sed
528206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
529249423Sdim_mm_load_ss(const float *__p)
530193326Sed{
531226633Sdim  struct __mm_load_ss_struct {
532249423Sdim    float __u;
533226633Sdim  } __attribute__((__packed__, __may_alias__));
534249423Sdim  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
535249423Sdim  return (__m128){ __u, 0, 0, 0 };
536193326Sed}
537193326Sed
538206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
539249423Sdim_mm_load1_ps(const float *__p)
540193326Sed{
541226633Sdim  struct __mm_load1_ps_struct {
542249423Sdim    float __u;
543226633Sdim  } __attribute__((__packed__, __may_alias__));
544249423Sdim  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
545249423Sdim  return (__m128){ __u, __u, __u, __u };
546193326Sed}
547193326Sed
/* Alternate spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
549193326Sed
550206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
551249423Sdim_mm_load_ps(const float *__p)
552193326Sed{
553249423Sdim  return *(__m128*)__p;
554193326Sed}
555193326Sed
556206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
557249423Sdim_mm_loadu_ps(const float *__p)
558193326Sed{
559223017Sdim  struct __loadu_ps {
560249423Sdim    __m128 __v;
561226633Sdim  } __attribute__((__packed__, __may_alias__));
562249423Sdim  return ((struct __loadu_ps*)__p)->__v;
563193326Sed}
564193326Sed
565206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
566249423Sdim_mm_loadr_ps(const float *__p)
567193326Sed{
568249423Sdim  __m128 __a = _mm_load_ps(__p);
569249423Sdim  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
570193326Sed}
571193326Sed
572206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
573249423Sdim_mm_set_ss(float __w)
574193326Sed{
575249423Sdim  return (__m128){ __w, 0, 0, 0 };
576193326Sed}
577193326Sed
578206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
579249423Sdim_mm_set1_ps(float __w)
580193326Sed{
581249423Sdim  return (__m128){ __w, __w, __w, __w };
582193326Sed}
583193326Sed
584193326Sed// Microsoft specific.
585206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
586249423Sdim_mm_set_ps1(float __w)
587193326Sed{
588249423Sdim    return _mm_set1_ps(__w);
589193326Sed}
590193326Sed
591206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
592249423Sdim_mm_set_ps(float __z, float __y, float __x, float __w)
593193326Sed{
594249423Sdim  return (__m128){ __w, __x, __y, __z };
595193326Sed}
596193326Sed
597206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
598249423Sdim_mm_setr_ps(float __z, float __y, float __x, float __w)
599193326Sed{
600249423Sdim  return (__m128){ __z, __y, __x, __w };
601193326Sed}
602193326Sed
603206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__))
604193326Sed_mm_setzero_ps(void)
605193326Sed{
606193326Sed  return (__m128){ 0, 0, 0, 0 };
607193326Sed}
608193326Sed
609206084Srdivackystatic __inline__ void __attribute__((__always_inline__))
610249423Sdim_mm_storeh_pi(__m64 *__p, __m128 __a)
611193326Sed{
612249423Sdim  __builtin_ia32_storehps((__v2si *)__p, __a);
613193326Sed}
614193326Sed
615206084Srdivackystatic __inline__ void __attribute__((__always_inline__))
616249423Sdim_mm_storel_pi(__m64 *__p, __m128 __a)
617193326Sed{
618249423Sdim  __builtin_ia32_storelps((__v2si *)__p, __a);
619193326Sed}
620193326Sed
621206084Srdivackystatic __inline__ void __attribute__((__always_inline__))
622249423Sdim_mm_store_ss(float *__p, __m128 __a)
623193326Sed{
624226633Sdim  struct __mm_store_ss_struct {
625249423Sdim    float __u;
626226633Sdim  } __attribute__((__packed__, __may_alias__));
627249423Sdim  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
628193326Sed}
629193326Sed
630206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
631249423Sdim_mm_storeu_ps(float *__p, __m128 __a)
632193326Sed{
633249423Sdim  __builtin_ia32_storeups(__p, __a);
634193326Sed}
635193326Sed
636206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
637249423Sdim_mm_store1_ps(float *__p, __m128 __a)
638193326Sed{
639249423Sdim  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
640249423Sdim  _mm_storeu_ps(__p, __a);
641193326Sed}
642193326Sed
643206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
644249423Sdim_mm_store_ps1(float *__p, __m128 __a)
645212904Sdim{
646249423Sdim    return _mm_store1_ps(__p, __a);
647212904Sdim}
648212904Sdim
649212904Sdimstatic __inline__ void __attribute__((__always_inline__, __nodebug__))
650249423Sdim_mm_store_ps(float *__p, __m128 __a)
651193326Sed{
652249423Sdim  *(__m128 *)__p = __a;
653193326Sed}
654193326Sed
655206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
656249423Sdim_mm_storer_ps(float *__p, __m128 __a)
657193326Sed{
658249423Sdim  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
659249423Sdim  _mm_store_ps(__p, __a);
660193326Sed}
661193326Sed
/* Hint values for the "sel" argument of _mm_prefetch. They are passed through
   unchanged as the third argument of __builtin_prefetch. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2 1
#define _MM_HINT_T1 2
#define _MM_HINT_T0 3
666193326Sed
/* Prefetches the memory at address a for reading, with hint sel.

   FIXME: This has to be a macro rather than an inline function because "sel"
   must be a constant integer, and Sema doesn't do any form of constant
   propagation yet.

   The address is cast to const void *, the parameter type of
   __builtin_prefetch, so pointers to const data are accepted without
   discarding qualifiers. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), 0, (sel)))
671193326Sed
672206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
673249423Sdim_mm_stream_pi(__m64 *__p, __m64 __a)
674193326Sed{
675249423Sdim  __builtin_ia32_movntq(__p, __a);
676193326Sed}
677193326Sed
678206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
679249423Sdim_mm_stream_ps(float *__p, __m128 __a)
680193326Sed{
681249423Sdim  __builtin_ia32_movntps(__p, __a);
682193326Sed}
683193326Sed
/* Issues a store fence through the sfence builtin. Takes no arguments and
   returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  (void)0; /* no local state; the builtin call is the whole operation */
  __builtin_ia32_sfence();
}
689193326Sed
690206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
691249423Sdim_mm_extract_pi16(__m64 __a, int __n)
692193326Sed{
693249423Sdim  __v4hi __b = (__v4hi)__a;
694249423Sdim  return (unsigned short)__b[__n & 3];
695193326Sed}
696193326Sed
697206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
698249423Sdim_mm_insert_pi16(__m64 __a, int __d, int __n)
699193326Sed{
700249423Sdim   __v4hi __b = (__v4hi)__a;
701249423Sdim   __b[__n & 3] = __d;
702249423Sdim   return (__m64)__b;
703193326Sed}
704193326Sed
705206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
706249423Sdim_mm_max_pi16(__m64 __a, __m64 __b)
707193326Sed{
708249423Sdim  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
709193326Sed}
710193326Sed
711206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
712249423Sdim_mm_max_pu8(__m64 __a, __m64 __b)
713193326Sed{
714249423Sdim  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
715193326Sed}
716193326Sed
717206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
718249423Sdim_mm_min_pi16(__m64 __a, __m64 __b)
719193326Sed{
720249423Sdim  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
721193326Sed}
722193326Sed
723206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
724249423Sdim_mm_min_pu8(__m64 __a, __m64 __b)
725193326Sed{
726249423Sdim  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
727193326Sed}
728193326Sed
729206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
730249423Sdim_mm_movemask_pi8(__m64 __a)
731193326Sed{
732249423Sdim  return __builtin_ia32_pmovmskb((__v8qi)__a);
733193326Sed}
734193326Sed
735206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
736249423Sdim_mm_mulhi_pu16(__m64 __a, __m64 __b)
737193326Sed{
738249423Sdim  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
739193326Sed}
740193326Sed
/* Shuffles the four 16-bit lanes of a with the pshufw builtin; n is passed
   straight through as the builtin's selector operand.

   The argument is cast in place instead of being copied into a local named
   __a: with a local, an argument expression that itself mentions __a would
   initialise the local from itself. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
744193326Sed
745206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
746249423Sdim_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
747193326Sed{
748249423Sdim  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
749193326Sed}
750193326Sed
751206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
752249423Sdim_mm_avg_pu8(__m64 __a, __m64 __b)
753193326Sed{
754249423Sdim  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
755193326Sed}
756193326Sed
757206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
758249423Sdim_mm_avg_pu16(__m64 __a, __m64 __b)
759193326Sed{
760249423Sdim  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
761193326Sed}
762193326Sed
763206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
764249423Sdim_mm_sad_pu8(__m64 __a, __m64 __b)
765193326Sed{
766249423Sdim  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
767193326Sed}
768193326Sed
/* Returns the value produced by the stmxcsr builtin (the MXCSR
   control/status register). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  unsigned int __csr = __builtin_ia32_stmxcsr();
  return __csr;
}
774193326Sed
/* Loads __i into the MXCSR control/status register through the ldmxcsr
   builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int __i)
{
  unsigned int __csr = __i;
  __builtin_ia32_ldmxcsr(__csr);
}
780193326Sed
/* Builds a vector from a and b under an 8-bit constant mask: result lanes 0
   and 1 are the lanes of a selected by mask bits 1:0 and 3:2; result lanes 2
   and 3 are the lanes of b selected by mask bits 5:4 and 7:6 (the "+ 4" maps
   those indices onto the second shuffle operand).

   The arguments are cast in place instead of being copied into locals named
   __a and __b: with locals, an argument expression that itself mentions __a
   or __b would initialise the local from itself. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                   (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                   (((mask) & 0x30) >> 4) + 4, \
                                   (((mask) & 0xc0) >> 6) + 4))
788193326Sed
789206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
790249423Sdim_mm_unpackhi_ps(__m128 __a, __m128 __b)
791193326Sed{
792249423Sdim  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
793193326Sed}
794193326Sed
795206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
796249423Sdim_mm_unpacklo_ps(__m128 __a, __m128 __b)
797193326Sed{
798249423Sdim  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
799193326Sed}
800193326Sed
801206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
802249423Sdim_mm_move_ss(__m128 __a, __m128 __b)
803193326Sed{
804249423Sdim  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
805193326Sed}
806193326Sed
807206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
808249423Sdim_mm_movehl_ps(__m128 __a, __m128 __b)
809193326Sed{
810249423Sdim  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
811193326Sed}
812193326Sed
813206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
814249423Sdim_mm_movelh_ps(__m128 __a, __m128 __b)
815193326Sed{
816249423Sdim  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
817193326Sed}
818193326Sed
819206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
820249423Sdim_mm_cvtpi16_ps(__m64 __a)
821193326Sed{
822249423Sdim  __m64 __b, __c;
823249423Sdim  __m128 __r;
824193326Sed
825249423Sdim  __b = _mm_setzero_si64();
826249423Sdim  __b = _mm_cmpgt_pi16(__b, __a);
827249423Sdim  __c = _mm_unpackhi_pi16(__a, __b);
828249423Sdim  __r = _mm_setzero_ps();
829249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
830249423Sdim  __r = _mm_movelh_ps(__r, __r);
831249423Sdim  __c = _mm_unpacklo_pi16(__a, __b);
832249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
833193326Sed
834249423Sdim  return __r;
835193326Sed}
836193326Sed
837206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
838249423Sdim_mm_cvtpu16_ps(__m64 __a)
839193326Sed{
840249423Sdim  __m64 __b, __c;
841249423Sdim  __m128 __r;
842193326Sed
843249423Sdim  __b = _mm_setzero_si64();
844249423Sdim  __c = _mm_unpackhi_pi16(__a, __b);
845249423Sdim  __r = _mm_setzero_ps();
846249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
847249423Sdim  __r = _mm_movelh_ps(__r, __r);
848249423Sdim  __c = _mm_unpacklo_pi16(__a, __b);
849249423Sdim  __r = _mm_cvtpi32_ps(__r, __c);
850193326Sed
851249423Sdim  return __r;
852193326Sed}
853193326Sed
854206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
855249423Sdim_mm_cvtpi8_ps(__m64 __a)
856193326Sed{
857249423Sdim  __m64 __b;
858193326Sed
859249423Sdim  __b = _mm_setzero_si64();
860249423Sdim  __b = _mm_cmpgt_pi8(__b, __a);
861249423Sdim  __b = _mm_unpacklo_pi8(__a, __b);
862193326Sed
863249423Sdim  return _mm_cvtpi16_ps(__b);
864193326Sed}
865193326Sed
866206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
867249423Sdim_mm_cvtpu8_ps(__m64 __a)
868193326Sed{
869249423Sdim  __m64 __b;
870193326Sed
871249423Sdim  __b = _mm_setzero_si64();
872249423Sdim  __b = _mm_unpacklo_pi8(__a, __b);
873193326Sed
874249423Sdim  return _mm_cvtpi16_ps(__b);
875193326Sed}
876193326Sed
877206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
878249423Sdim_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
879193326Sed{
880249423Sdim  __m128 __c;
881193326Sed
882249423Sdim  __c = _mm_setzero_ps();
883249423Sdim  __c = _mm_cvtpi32_ps(__c, __b);
884249423Sdim  __c = _mm_movelh_ps(__c, __c);
885193326Sed
886249423Sdim  return _mm_cvtpi32_ps(__c, __a);
887193326Sed}
888193326Sed
889206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
890249423Sdim_mm_cvtps_pi16(__m128 __a)
891193326Sed{
892249423Sdim  __m64 __b, __c;
893193326Sed
894249423Sdim  __b = _mm_cvtps_pi32(__a);
895249423Sdim  __a = _mm_movehl_ps(__a, __a);
896249423Sdim  __c = _mm_cvtps_pi32(__a);
897193326Sed
898249423Sdim  return _mm_packs_pi16(__b, __c);
899193326Sed}
900193326Sed
901206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
902249423Sdim_mm_cvtps_pi8(__m128 __a)
903193326Sed{
904249423Sdim  __m64 __b, __c;
905193326Sed
906249423Sdim  __b = _mm_cvtps_pi16(__a);
907249423Sdim  __c = _mm_setzero_si64();
908193326Sed
909249423Sdim  return _mm_packs_pi16(__b, __c);
910193326Sed}
911193326Sed
912206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
913249423Sdim_mm_movemask_ps(__m128 __a)
914193326Sed{
915249423Sdim  return __builtin_ia32_movmskps(__a);
916193326Sed}
917193326Sed
/*
 * Packs four 2-bit selectors into one immediate:
 *   z -> bits 7:6, y -> bits 5:4, x -> bits 3:2, w -> bits 1:0.
 * The arguments are not range-checked or masked.
 */
#define _MM_SHUFFLE(z, y, x, w) \
  (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
919193326Sed
/* Exception-state flags: bits 0-5 of the value handled by _mm_getcsr/_mm_setcsr. */
#define _MM_EXCEPT_INVALID    (0x0001)  /* bit 0 */
#define _MM_EXCEPT_DENORM     (0x0002)  /* bit 1 */
#define _MM_EXCEPT_DIV_ZERO   (0x0004)  /* bit 2 */
#define _MM_EXCEPT_OVERFLOW   (0x0008)  /* bit 3 */
#define _MM_EXCEPT_UNDERFLOW  (0x0010)  /* bit 4 */
#define _MM_EXCEPT_INEXACT    (0x0020)  /* bit 5 */
/* All six state flags together (0x003f). */
#define _MM_EXCEPT_MASK       (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | \
                               _MM_EXCEPT_DIV_ZERO | _MM_EXCEPT_OVERFLOW | \
                               _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT)
927193326Sed
/* Exception-mask bits: bits 7-12 of the value handled by _mm_getcsr/_mm_setcsr. */
#define _MM_MASK_INVALID      (0x0080)  /* bit 7  */
#define _MM_MASK_DENORM       (0x0100)  /* bit 8  */
#define _MM_MASK_DIV_ZERO     (0x0200)  /* bit 9  */
#define _MM_MASK_OVERFLOW     (0x0400)  /* bit 10 */
#define _MM_MASK_UNDERFLOW    (0x0800)  /* bit 11 */
#define _MM_MASK_INEXACT      (0x1000)  /* bit 12 */
/* All six mask bits together (0x1f80). */
#define _MM_MASK_MASK         (_MM_MASK_INVALID | _MM_MASK_DENORM | \
                               _MM_MASK_DIV_ZERO | _MM_MASK_OVERFLOW | \
                               _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT)
935193326Sed
/* Rounding-mode field: bits 13-14 of the value handled by _mm_getcsr/_mm_setcsr. */
#define _MM_ROUND_NEAREST     (0x0000)  /* field value 0 */
#define _MM_ROUND_DOWN        (0x2000)  /* field value 1 */
#define _MM_ROUND_UP          (0x4000)  /* field value 2 */
#define _MM_ROUND_TOWARD_ZERO (0x6000)  /* field value 3 */
/* Both field bits (0x6000); same value as _MM_ROUND_TOWARD_ZERO. */
#define _MM_ROUND_MASK        (_MM_ROUND_DOWN | _MM_ROUND_UP)
941193326Sed
/* Flush-to-zero control: bit 15 of the value handled by _mm_getcsr/_mm_setcsr. */
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
/* The field is a single bit, so its mask equals the ON value. */
#define _MM_FLUSH_ZERO_MASK   (_MM_FLUSH_ZERO_ON)
945193326Sed
/*
 * Field readers: each one calls _mm_getcsr() and isolates a single field
 * with the matching *_MASK constant.  The result is left in place (not
 * shifted down), so it compares directly against the field's constants.
 */
#define _MM_GET_EXCEPTION_MASK() \
  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() \
  (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() \
  (_mm_getcsr() & _MM_ROUND_MASK)
950193326Sed
/*
 * Field writers: read-modify-write of the control/status register.  The
 * current value from _mm_getcsr() has the target field cleared, (x) is OR-ed
 * in, and the result is stored with _mm_setcsr().
 *
 * (x) is not masked: any bits it carries outside the target field are
 * written as well, so pass only that field's own constants.
 */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
955193326Sed
956193326Sed#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
957193326Seddo { \
958193326Sed  __m128 tmp3, tmp2, tmp1, tmp0; \
959193326Sed  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
960193326Sed  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
961193326Sed  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
962193326Sed  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
963193326Sed  (row0) = _mm_movelh_ps(tmp0, tmp2); \
964193326Sed  (row1) = _mm_movehl_ps(tmp2, tmp0); \
965193326Sed  (row2) = _mm_movelh_ps(tmp1, tmp3); \
966203955Srdivacky  (row3) = _mm_movehl_ps(tmp3, tmp1); \
967193326Sed} while (0)
968193326Sed
/*
 * Aliases for compatibility: each _m_* name below expands to the _mm_*
 * intrinsic name on its right.
 */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Bare-prefix alias; defined once, retained so existing uses keep expanding. */
#define _m_ _mm_
985212904Sdim
986249423Sdim#if !__has_feature(modules)
987194179Sed/* Ugly hack for backwards-compatibility (compatible with gcc) */
988194179Sed#ifdef __SSE2__
989193326Sed#include <emmintrin.h>
990194179Sed#endif
991249423Sdim#endif
992193326Sed
993193326Sed#endif /* __SSE__ */
994193326Sed
995193326Sed#endif /* __XMMINTRIN_H */
996