/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

// This header should only be included in a hosted environment as it depends on
// a standard library to provide allocation routines.
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif
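
/* Illustrative usage sketch (identifiers such as "buf" are examples only):
 * in a hosted build, mm_malloc.h supplies _mm_malloc()/_mm_free(), which can
 * provide the 16-byte aligned storage the aligned load/store intrinsics
 * below expect:
 *
 *   float *buf = (float *)_mm_malloc(64 * sizeof(float), 16);
 *   if (buf) {
 *     _mm_store_ps(buf, _mm_set1_ps(1.0f));   // aligned store is valid here
 *     _mm_free(buf);
 *   }
 */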

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}

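/* The comparison intrinsics below wrap the cmpss/cmpps builtins; the third
 * argument selects the predicate: 0 = eq, 1 = lt, 2 = le, 3 = unord, 4 = neq,
 * 5 = nlt, 6 = nle, 7 = ord.  SSE provides no gt/ge predicates, so the
 * "gt"/"ge" and "ngt"/"nge" variants swap the operands and reuse lt/le. */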
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}

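/* The comparison predicates below test the low single-precision elements and
 * return 0 or 1.  The comi forms (COMISS) raise an invalid-operation exception
 * on QNaN operands; the ucomi forms (UCOMISS) signal only on SNaN. */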
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

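/* Float-to-integer conversions: the _mm_cvt* forms round according to the
 * current MXCSR rounding mode (round-to-nearest by default), while the
 * _mm_cvtt* forms always truncate toward zero. */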
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 a)
{
  return _mm_cvtps_pi32(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 a)
{
  return _mm_cvttps_pi32(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 a, int b)
{
  return _mm_cvtsi32_ss(a, b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 a, __m64 b)
{
  return _mm_cvtpi32_ps(a, b);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 b = ((struct __mm_loadh_pi_struct*)p)->u;
  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
  return __builtin_shufflevector(a, bb, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 b = ((struct __mm_loadl_pi_struct*)p)->u;
  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
  return __builtin_shufflevector(a, bb, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  struct __mm_load_ss_struct {
    float u;
  } __attribute__((__packed__, __may_alias__));
  float u = ((struct __mm_load_ss_struct*)p)->u;
  return (__m128){ u, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  struct __mm_load1_ps_struct {
    float u;
  } __attribute__((__packed__, __may_alias__));
  float u = ((struct __mm_load1_ps_struct*)p)->u;
  return (__m128){ u, u, u, u };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(__m128*)p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  struct __loadu_ps {
    __m128 v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_ps*)p)->v;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}
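
/* _mm_load_ps and _mm_loadr_ps require p to be 16-byte aligned; _mm_loadu_ps
 * tolerates any alignment.  Illustrative use (identifiers are examples only):
 *
 *   float __attribute__((__aligned__(16))) in[4] = { 1, 2, 3, 4 };
 *   __m128 v = _mm_load_ps(in);    // aligned load
 *   __m128 u = _mm_loadu_ps(in);   // also fine, alignment not required
 */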

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
  return _mm_set1_ps(w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}
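
/* Note the argument order above: _mm_set_ps(z, y, x, w) places w in element 0
 * (the low lane) and z in element 3, whereas _mm_setr_ps stores its arguments
 * in memory order, so _mm_set_ps(4, 3, 2, 1) and _mm_setr_ps(1, 2, 3, 4)
 * build the same vector { 1, 2, 3, 4 }. */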

static __inline__ __m128 __attribute__((__always_inline__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

static __inline__ void __attribute__((__always_inline__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_store_ss(float *p, __m128 a)
{
  struct __mm_store_ss_struct {
    float u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)p)->u = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *p, __m128 a)
{
  _mm_store1_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}

#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
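
/* Illustrative use (identifiers are examples only): prefetch data a few
 * iterations ahead of where it will be consumed, e.g.
 *
 *   _mm_prefetch((const char *)&array[i + 16], _MM_HINT_T0);
 *
 * The hint selects how deep into the cache hierarchy the line is brought;
 * _MM_HINT_NTA requests a non-temporal prefetch that minimizes cache
 * pollution. */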

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  __v4hi b = (__v4hi)a;
  b[n & 3] = d;
  return (__m64)b;
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) __extension__ ({ \
  __m64 __a = (a); \
  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                  (((mask) & 0x30) >> 4) + 4, \
                                  (((mask) & 0xc0) >> 6) + 4); })
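
/* In the mask, bits [1:0] and [3:2] select the two low result elements from
 * a, and bits [5:4] and [7:6] select the two high result elements from b.
 * Illustrative use with the _MM_SHUFFLE helper defined further below
 * (identifiers are examples only):
 *
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 *   // r = { a[0], a[1], b[2], b[3] }
 */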

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);
  c = _mm_cvtps_pi32(a);

  return _mm_packs_pi32(b, c);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
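
/* Illustrative use (identifiers are examples only): switch the SSE unit to
 * truncation and flush denormals to zero, restoring the old control state
 * afterwards:
 *
 *   unsigned int saved = _mm_getcsr();
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 *   ...                                 // work under the new settings
 *   _mm_setcsr(saved);                  // restore the previous MXCSR value
 */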

#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
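
/* Illustrative use (identifiers are examples only): transpose a 4x4 matrix of
 * floats held in four row vectors, in place:
 *
 *   __m128 r0 = _mm_loadu_ps(&m[0]),  r1 = _mm_loadu_ps(&m[4]);
 *   __m128 r2 = _mm_loadu_ps(&m[8]),  r3 = _mm_loadu_ps(&m[12]);
 *   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // rows now hold the columns of m
 */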

/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */