xmmintrin.h revision 193326
1225394Sjchandra/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2233563Sjchandra *
3233563Sjchandra * Permission is hereby granted, free of charge, to any person obtaining a copy
4225394Sjchandra * of this software and associated documentation files (the "Software"), to deal
5225394Sjchandra * in the Software without restriction, including without limitation the rights
6225394Sjchandra * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7225394Sjchandra * copies of the Software, and to permit persons to whom the Software is
8233563Sjchandra * furnished to do so, subject to the following conditions:
9225394Sjchandra *
10225394Sjchandra * The above copyright notice and this permission notice shall be included in
11225394Sjchandra * all copies or substantial portions of the Software.
12233563Sjchandra *
13233563Sjchandra * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14233563Sjchandra * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15233563Sjchandra * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16233563Sjchandra * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17233563Sjchandra * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18233563Sjchandra * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19233563Sjchandra * THE SOFTWARE.
20233563Sjchandra *
21233563Sjchandra *===-----------------------------------------------------------------------===
22233563Sjchandra */
23233563Sjchandra
24233563Sjchandra#ifndef __XMMINTRIN_H
25233563Sjchandra#define __XMMINTRIN_H
26233563Sjchandra
27233563Sjchandra#ifndef __SSE__
28233563Sjchandra#error "SSE instruction set not enabled"
29225394Sjchandra#else
30225394Sjchandra
31225394Sjchandra#include <mmintrin.h>
32225394Sjchandra
/* 16-byte vectors of four floats: __v4sf is the internal spelling,
   __m128 is the public SSE register type used by every intrinsic below. */
typedef float __v4sf __attribute__((__vector_size__(4 * sizeof(float))));
typedef float __m128 __attribute__((__vector_size__(4 * sizeof(float))));
35225394Sjchandra
36225394Sjchandra#include <mm_malloc.h>
37225394Sjchandra
38225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
39225394Sjchandra_mm_add_ss(__m128 a, __m128 b)
40225394Sjchandra{
41233563Sjchandra  return __builtin_ia32_addss(a, b);
42225394Sjchandra}
43225394Sjchandra
44225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
45225394Sjchandra_mm_add_ps(__m128 a, __m128 b)
46225394Sjchandra{
47225394Sjchandra  return a + b;
48225394Sjchandra}
49233563Sjchandra
50233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
51225394Sjchandra_mm_sub_ss(__m128 a, __m128 b)
52225394Sjchandra{
53225394Sjchandra  return __builtin_ia32_subss(a, b);
54225394Sjchandra}
55279345Sjchandra
56279345Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
57279345Sjchandra_mm_sub_ps(__m128 a, __m128 b)
58225394Sjchandra{
59225394Sjchandra  return a - b;
60225394Sjchandra}
61225394Sjchandra
62225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
63225394Sjchandra_mm_mul_ss(__m128 a, __m128 b)
64225394Sjchandra{
65225394Sjchandra  return __builtin_ia32_mulss(a, b);
66225394Sjchandra}
67225394Sjchandra
68233536Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
69233556Sjchandra_mm_mul_ps(__m128 a, __m128 b)
70225394Sjchandra{
71225394Sjchandra  return a * b;
72225394Sjchandra}
73225394Sjchandra
74225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
75233563Sjchandra_mm_div_ss(__m128 a, __m128 b)
76225394Sjchandra{
77233563Sjchandra  return __builtin_ia32_divss(a, b);
78233563Sjchandra}
79233563Sjchandra
80233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
81233563Sjchandra_mm_div_ps(__m128 a, __m128 b)
82233563Sjchandra{
83233563Sjchandra  return a / b;
84233563Sjchandra}
85233563Sjchandra
86233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
87225394Sjchandra_mm_sqrt_ss(__m128 a)
88225394Sjchandra{
89233563Sjchandra  return __builtin_ia32_sqrtss(a);
90233563Sjchandra}
91233563Sjchandra
92233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
93233563Sjchandra_mm_sqrt_ps(__m128 a)
94233563Sjchandra{
95233563Sjchandra  return __builtin_ia32_sqrtps(a);
96233563Sjchandra}
97233563Sjchandra
98233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
99233563Sjchandra_mm_rcp_ss(__m128 a)
100233563Sjchandra{
101233563Sjchandra  return __builtin_ia32_rcpss(a);
102233563Sjchandra}
103233563Sjchandra
104233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
105233563Sjchandra_mm_rcp_ps(__m128 a)
106233563Sjchandra{
107233563Sjchandra  return __builtin_ia32_rcpps(a);
108233563Sjchandra}
109233563Sjchandra
110233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
111233563Sjchandra_mm_rsqrt_ss(__m128 a)
112233563Sjchandra{
113233563Sjchandra  return __builtin_ia32_rsqrtss(a);
114233563Sjchandra}
115233564Sjchandra
116233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
117233563Sjchandra_mm_rsqrt_ps(__m128 a)
118233564Sjchandra{
119233564Sjchandra  return __builtin_ia32_rsqrtps(a);
120233564Sjchandra}
121233564Sjchandra
122233564Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
123233564Sjchandra_mm_min_ss(__m128 a, __m128 b)
124233564Sjchandra{
125233564Sjchandra  return __builtin_ia32_minss(a, b);
126233564Sjchandra}
127233564Sjchandra
128233564Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
129233564Sjchandra_mm_min_ps(__m128 a, __m128 b)
130233564Sjchandra{
131233563Sjchandra  return __builtin_ia32_minps(a, b);
132233563Sjchandra}
133233563Sjchandra
134233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
135233563Sjchandra_mm_max_ss(__m128 a, __m128 b)
136233563Sjchandra{
137233563Sjchandra  return __builtin_ia32_maxss(a, b);
138233563Sjchandra}
139233563Sjchandra
140233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
141233563Sjchandra_mm_max_ps(__m128 a, __m128 b)
142233563Sjchandra{
143233563Sjchandra  return __builtin_ia32_maxps(a, b);
144233563Sjchandra}
145233563Sjchandra
146233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
147233563Sjchandra_mm_and_ps(__m128 a, __m128 b)
148233563Sjchandra{
149233563Sjchandra  return __builtin_ia32_andps(a, b);
150233563Sjchandra}
151233563Sjchandra
152233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
153233563Sjchandra_mm_andnot_ps(__m128 a, __m128 b)
154233563Sjchandra{
155233563Sjchandra  return __builtin_ia32_andnps(a, b);
156233563Sjchandra}
157233563Sjchandra
158233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
159233563Sjchandra_mm_or_ps(__m128 a, __m128 b)
160233563Sjchandra{
161233563Sjchandra  return __builtin_ia32_orps(a, b);
162233563Sjchandra}
163233563Sjchandra
164233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
165233563Sjchandra_mm_xor_ps(__m128 a, __m128 b)
166233563Sjchandra{
167233563Sjchandra  return __builtin_ia32_xorps(a, b);
168233563Sjchandra}
169279306Sjchandra
170233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
171233563Sjchandra_mm_cmpeq_ss(__m128 a, __m128 b)
172233563Sjchandra{
173233563Sjchandra  return (__m128)__builtin_ia32_cmpss(a, b, 0);
174233563Sjchandra}
175233563Sjchandra
176233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
177233563Sjchandra_mm_cmpeq_ps(__m128 a, __m128 b)
178233563Sjchandra{
179233563Sjchandra  return (__m128)__builtin_ia32_cmpps(a, b, 0);
180279306Sjchandra}
181233563Sjchandra
182279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
183279306Sjchandra_mm_cmplt_ss(__m128 a, __m128 b)
184279306Sjchandra{
185279306Sjchandra  return (__m128)__builtin_ia32_cmpss(a, b, 1);
186279306Sjchandra}
187279306Sjchandra
188279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
189279306Sjchandra_mm_cmplt_ps(__m128 a, __m128 b)
190279306Sjchandra{
191279306Sjchandra  return (__m128)__builtin_ia32_cmpps(a, b, 1);
192279306Sjchandra}
193279306Sjchandra
194279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
195279306Sjchandra_mm_cmple_ss(__m128 a, __m128 b)
196279306Sjchandra{
197279306Sjchandra  return (__m128)__builtin_ia32_cmpss(a, b, 2);
198279306Sjchandra}
199279306Sjchandra
200279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
201279306Sjchandra_mm_cmple_ps(__m128 a, __m128 b)
202279306Sjchandra{
203279306Sjchandra  return (__m128)__builtin_ia32_cmpps(a, b, 2);
204279306Sjchandra}
205279306Sjchandra
206279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
207233563Sjchandra_mm_cmpgt_ss(__m128 a, __m128 b)
208233563Sjchandra{
209233563Sjchandra  return (__m128)__builtin_ia32_cmpss(b, a, 1);
210233563Sjchandra}
211233563Sjchandra
212233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
213233563Sjchandra_mm_cmpgt_ps(__m128 a, __m128 b)
214233563Sjchandra{
215233564Sjchandra  return (__m128)__builtin_ia32_cmpps(b, a, 1);
216279306Sjchandra}
217279306Sjchandra
218279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
219279306Sjchandra_mm_cmpge_ss(__m128 a, __m128 b)
220279306Sjchandra{
221233564Sjchandra  return (__m128)__builtin_ia32_cmpss(b, a, 2);
222233563Sjchandra}
223233563Sjchandra
224233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
225233563Sjchandra_mm_cmpge_ps(__m128 a, __m128 b)
226233563Sjchandra{
227233563Sjchandra  return (__m128)__builtin_ia32_cmpps(b, a, 2);
228233563Sjchandra}
229233563Sjchandra
230233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
231233563Sjchandra_mm_cmpneq_ss(__m128 a, __m128 b)
232233563Sjchandra{
233233563Sjchandra  return (__m128)__builtin_ia32_cmpss(a, b, 4);
234233563Sjchandra}
235233563Sjchandra
236233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
237233563Sjchandra_mm_cmpneq_ps(__m128 a, __m128 b)
238233563Sjchandra{
239233563Sjchandra  return (__m128)__builtin_ia32_cmpps(a, b, 4);
240233563Sjchandra}
241233563Sjchandra
242233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
243233563Sjchandra_mm_cmpnlt_ss(__m128 a, __m128 b)
244233563Sjchandra{
245233563Sjchandra  return (__m128)__builtin_ia32_cmpss(a, b, 5);
246233563Sjchandra}
247233563Sjchandra
248233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
249233563Sjchandra_mm_cmpnlt_ps(__m128 a, __m128 b)
250233563Sjchandra{
251233563Sjchandra  return (__m128)__builtin_ia32_cmpps(a, b, 5);
252233563Sjchandra}
253233563Sjchandra
254233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
255233563Sjchandra_mm_cmpnle_ss(__m128 a, __m128 b)
256233563Sjchandra{
257233563Sjchandra  return (__m128)__builtin_ia32_cmpss(a, b, 6);
258233563Sjchandra}
259233563Sjchandra
260233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
261233563Sjchandra_mm_cmpnle_ps(__m128 a, __m128 b)
262233563Sjchandra{
263233563Sjchandra  return (__m128)__builtin_ia32_cmpps(a, b, 6);
264233563Sjchandra}
265233563Sjchandra
266233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
267233563Sjchandra_mm_cmpngt_ss(__m128 a, __m128 b)
268233563Sjchandra{
269233563Sjchandra  return (__m128)__builtin_ia32_cmpss(b, a, 5);
270233563Sjchandra}
271233563Sjchandra
272233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
273233563Sjchandra_mm_cmpngt_ps(__m128 a, __m128 b)
274233563Sjchandra{
275233563Sjchandra  return (__m128)__builtin_ia32_cmpps(b, a, 5);
276233563Sjchandra}
277233563Sjchandra
278233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
279233563Sjchandra_mm_cmpnge_ss(__m128 a, __m128 b)
280233563Sjchandra{
281233563Sjchandra  return (__m128)__builtin_ia32_cmpss(b, a, 6);
282233563Sjchandra}
283233570Sjchandra
284233570Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
285233563Sjchandra_mm_cmpnge_ps(__m128 a, __m128 b)
286233563Sjchandra{
287225394Sjchandra  return (__m128)__builtin_ia32_cmpps(b, a, 6);
288225394Sjchandra}
289225394Sjchandra
290233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
291225394Sjchandra_mm_cmpord_ss(__m128 a, __m128 b)
292225394Sjchandra{
293225394Sjchandra  return (__m128)__builtin_ia32_cmpss(a, b, 7);
294225394Sjchandra}
295225394Sjchandra
296225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
297225394Sjchandra_mm_cmpord_ps(__m128 a, __m128 b)
298225394Sjchandra{
299225394Sjchandra  return (__m128)__builtin_ia32_cmpps(a, b, 7);
300225394Sjchandra}
301225394Sjchandra
302225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
303225394Sjchandra_mm_cmpunord_ss(__m128 a, __m128 b)
304225394Sjchandra{
305233536Sjchandra  return (__m128)__builtin_ia32_cmpss(a, b, 3);
306225394Sjchandra}
307225394Sjchandra
308225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
309225394Sjchandra_mm_cmpunord_ps(__m128 a, __m128 b)
310225394Sjchandra{
311225394Sjchandra  return (__m128)__builtin_ia32_cmpps(a, b, 3);
312225394Sjchandra}
313233536Sjchandra
314225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
315225394Sjchandra_mm_comieq_ss(__m128 a, __m128 b)
316233536Sjchandra{
317233536Sjchandra  return __builtin_ia32_comieq(a, b);
318233536Sjchandra}
319233536Sjchandra
320225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
321225394Sjchandra_mm_comilt_ss(__m128 a, __m128 b)
322225394Sjchandra{
323225394Sjchandra  return __builtin_ia32_comilt(a, b);
324225394Sjchandra}
325233563Sjchandra
326225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
327225394Sjchandra_mm_comile_ss(__m128 a, __m128 b)
328225394Sjchandra{
329225394Sjchandra  return __builtin_ia32_comile(a, b);
330225394Sjchandra}
331225394Sjchandra
332225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
333279345Sjchandra_mm_comigt_ss(__m128 a, __m128 b)
334279345Sjchandra{
335279345Sjchandra  return __builtin_ia32_comigt(a, b);
336279345Sjchandra}
337279345Sjchandra
338225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
339225394Sjchandra_mm_comige_ss(__m128 a, __m128 b)
340225394Sjchandra{
341225394Sjchandra  return __builtin_ia32_comige(a, b);
342225394Sjchandra}
343225394Sjchandra
344225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
345225394Sjchandra_mm_comineq_ss(__m128 a, __m128 b)
346225394Sjchandra{
347225394Sjchandra  return __builtin_ia32_comineq(a, b);
348225394Sjchandra}
349225394Sjchandra
350225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
351225394Sjchandra_mm_ucomieq_ss(__m128 a, __m128 b)
352225394Sjchandra{
353225394Sjchandra  return __builtin_ia32_ucomieq(a, b);
354225394Sjchandra}
355225394Sjchandra
356225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
357225394Sjchandra_mm_ucomilt_ss(__m128 a, __m128 b)
358225394Sjchandra{
359225394Sjchandra  return __builtin_ia32_ucomilt(a, b);
360225394Sjchandra}
361225394Sjchandra
362225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
363225394Sjchandra_mm_ucomile_ss(__m128 a, __m128 b)
364225394Sjchandra{
365225394Sjchandra  return __builtin_ia32_ucomile(a, b);
366225394Sjchandra}
367225394Sjchandra
368225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
369225394Sjchandra_mm_ucomigt_ss(__m128 a, __m128 b)
370225394Sjchandra{
371225394Sjchandra  return __builtin_ia32_ucomigt(a, b);
372225394Sjchandra}
373225394Sjchandra
374225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
375225394Sjchandra_mm_ucomige_ss(__m128 a, __m128 b)
376225394Sjchandra{
377225394Sjchandra  return __builtin_ia32_ucomige(a, b);
378225394Sjchandra}
379225394Sjchandra
380225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
381225394Sjchandra_mm_ucomineq_ss(__m128 a, __m128 b)
382225394Sjchandra{
383225394Sjchandra  return __builtin_ia32_ucomineq(a, b);
384225394Sjchandra}
385225394Sjchandra
386225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
387225394Sjchandra_mm_cvtss_si32(__m128 a)
388225394Sjchandra{
389233563Sjchandra  return __builtin_ia32_cvtss2si(a);
390233563Sjchandra}
391233563Sjchandra
392225394Sjchandrastatic inline long long __attribute__((__always_inline__, __nodebug__))
393233563Sjchandra_mm_cvtss_si64(__m128 a)
394233563Sjchandra{
395225394Sjchandra  return __builtin_ia32_cvtss2si64(a);
396233563Sjchandra}
397225394Sjchandra
398225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
399225394Sjchandra_mm_cvtps_pi32(__m128 a)
400225394Sjchandra{
401225394Sjchandra  return (__m64)__builtin_ia32_cvtps2pi(a);
402225394Sjchandra}
403225394Sjchandra
404225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
405225394Sjchandra_mm_cvttss_si32(__m128 a)
406225394Sjchandra{
407225394Sjchandra  return __builtin_ia32_cvttss2si(a);
408225394Sjchandra}
409225394Sjchandra
410225394Sjchandrastatic inline long long __attribute__((__always_inline__, __nodebug__))
411225394Sjchandra_mm_cvttss_si64(__m128 a)
412225394Sjchandra{
413225394Sjchandra  return __builtin_ia32_cvttss2si64(a);
414225394Sjchandra}
415225394Sjchandra
416225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
417225394Sjchandra_mm_cvttps_pi32(__m128 a)
418225394Sjchandra{
419225394Sjchandra  return (__m64)__builtin_ia32_cvttps2pi(a);
420225394Sjchandra}
421225394Sjchandra
422225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
423225394Sjchandra_mm_cvtsi32_ss(__m128 a, int b)
424225394Sjchandra{
425225394Sjchandra  return __builtin_ia32_cvtsi2ss(a, b);
426225394Sjchandra}
427225394Sjchandra
428225394Sjchandra#ifdef __x86_64__
429225394Sjchandra
430225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
431233563Sjchandra_mm_cvtsi64_ss(__m128 a, long long b)
432233563Sjchandra{
433233563Sjchandra  return __builtin_ia32_cvtsi642ss(a, b);
434233563Sjchandra}
435233563Sjchandra
436225394Sjchandra#endif
437225394Sjchandra
438225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
439233536Sjchandra_mm_cvtpi32_ps(__m128 a, __m64 b)
440245877Sjchandra{
441245877Sjchandra  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
442245877Sjchandra}
443233536Sjchandra
444233536Sjchandrastatic inline float __attribute__((__always_inline__, __nodebug__))
445233563Sjchandra_mm_cvtss_f32(__m128 a)
446233536Sjchandra{
447245877Sjchandra  return a[0];
448233536Sjchandra}
449233536Sjchandra
450233536Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
451233536Sjchandra_mm_loadh_pi(__m128 a, __m64 const *p)
452233536Sjchandra{
453233536Sjchandra  return __builtin_ia32_loadhps(a, (__v2si *)p);
454233536Sjchandra}
455233536Sjchandra
456233536Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
457233536Sjchandra_mm_loadl_pi(__m128 a, __m64 const *p)
458233536Sjchandra{
459233536Sjchandra  return __builtin_ia32_loadlps(a, (__v2si *)p);
460233536Sjchandra}
461233536Sjchandra
462238289Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
463233536Sjchandra_mm_load_ss(float *p)
464233536Sjchandra{
465233536Sjchandra  return (__m128){ *p, 0, 0, 0 };
466233536Sjchandra}
467233536Sjchandra
468238289Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
469245877Sjchandra_mm_load1_ps(float *p)
470233536Sjchandra{
471233536Sjchandra  return (__m128){ *p, *p, *p, *p };
472225394Sjchandra}
473225394Sjchandra
/* Alternate spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps((p))
475233536Sjchandra
476225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
477257338Snwhitehorn_mm_load_ps(float *p)
478257338Snwhitehorn{
479233536Sjchandra  return *(__m128*)p;
480233536Sjchandra}
481233536Sjchandra
482233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
483233536Sjchandra_mm_loadu_ps(float *p)
484225394Sjchandra{
485225394Sjchandra  return __builtin_ia32_loadups(p);
486225394Sjchandra}
487225394Sjchandra
488225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
489225394Sjchandra_mm_loadr_ps(float *p)
490225394Sjchandra{
491225394Sjchandra  __m128 a = _mm_load_ps(p);
492225394Sjchandra  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
493225394Sjchandra}
494225394Sjchandra
495225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
496225394Sjchandra_mm_set_ss(float w)
497225394Sjchandra{
498225394Sjchandra  return (__m128){ w, 0, 0, 0 };
499225394Sjchandra}
500225394Sjchandra
501225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
502225394Sjchandra_mm_set1_ps(float w)
503225394Sjchandra{
504225394Sjchandra  return (__m128){ w, w, w, w };
505225394Sjchandra}
506225394Sjchandra
507225394Sjchandra// Microsoft specific.
508225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
509225394Sjchandra_mm_set_ps1(float w)
510225394Sjchandra{
511225394Sjchandra    return _mm_set1_ps(w);
512225394Sjchandra}
513225394Sjchandra
514225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
515225394Sjchandra_mm_set_ps(float z, float y, float x, float w)
516225394Sjchandra{
517225394Sjchandra  return (__m128){ w, x, y, z };
518225394Sjchandra}
519225394Sjchandra
520225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
521225394Sjchandra_mm_setr_ps(float z, float y, float x, float w)
522225394Sjchandra{
523225394Sjchandra  return (__m128){ z, y, x, w };
524225394Sjchandra}
525225394Sjchandra
526225394Sjchandrastatic inline __m128 __attribute__((__always_inline__))
527225394Sjchandra_mm_setzero_ps(void)
528225394Sjchandra{
529225394Sjchandra  return (__m128){ 0, 0, 0, 0 };
530225394Sjchandra}
531225394Sjchandra
532225394Sjchandrastatic inline void __attribute__((__always_inline__))
533225394Sjchandra_mm_storeh_pi(__m64 *p, __m128 a)
534225394Sjchandra{
535225394Sjchandra  __builtin_ia32_storehps((__v2si *)p, a);
536225394Sjchandra}
537225394Sjchandra
538225394Sjchandrastatic inline void __attribute__((__always_inline__))
539225394Sjchandra_mm_storel_pi(__m64 *p, __m128 a)
540225394Sjchandra{
541225394Sjchandra  __builtin_ia32_storelps((__v2si *)p, a);
542225394Sjchandra}
543225394Sjchandra
544225394Sjchandrastatic inline void __attribute__((__always_inline__))
545225394Sjchandra_mm_store_ss(float *p, __m128 a)
546225394Sjchandra{
547225394Sjchandra  *p = a[0];
548279306Sjchandra}
549225394Sjchandra
550279306Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__))
551225394Sjchandra_mm_storeu_ps(float *p, __m128 a)
552225394Sjchandra{
553225394Sjchandra  __builtin_ia32_storeups(p, a);
554225394Sjchandra}
555279306Sjchandra
556279306Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__))
557279306Sjchandra_mm_store1_ps(float *p, __m128 a)
558279306Sjchandra{
559225394Sjchandra  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
560225394Sjchandra  _mm_storeu_ps(p, a);
561225394Sjchandra}
562279341Sjchandra
563225394Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__))
564225394Sjchandra_mm_store_ps(float *p, __m128 a)
565225394Sjchandra{
566225394Sjchandra  *(__m128 *)p = a;
567225394Sjchandra}
568225394Sjchandra
569225394Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__))
570227783Sjchandra_mm_storer_ps(float *p, __m128 a)
571225394Sjchandra{
572225394Sjchandra  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
573225394Sjchandra  _mm_store_ps(p, a);
574225394Sjchandra}
575225394Sjchandra
/* Prefetch hints for _mm_prefetch.
 *
 * The hint is forwarded unchanged as the 'locality' argument of
 * __builtin_prefetch, whose scale runs from 3 (highest temporal locality)
 * down to 0 (none).  T0 is the most temporal hint and NTA the least, so
 * the values descend T0 > T1 > T2 > NTA.
 */
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/* The arguments are parenthesized so that an expression such as `p + 1` is
   cast as a whole rather than having the cast bind to `p` alone. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
585225394Sjchandra
586225394Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__))
587225394Sjchandra_mm_stream_pi(__m64 *p, __m64 a)
588225394Sjchandra{
589225394Sjchandra  __builtin_ia32_movntq(p, a);
590225394Sjchandra}
591225394Sjchandra
592233563Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__))
593225394Sjchandra_mm_stream_ps(float *p, __m128 a)
594225394Sjchandra{
595225394Sjchandra  __builtin_ia32_movntps(p, a);
596225394Sjchandra}
597225394Sjchandra
/* Issues a store fence via the sfence builtin. */
static inline void
__attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
  return;
}
603225394Sjchandra
604225394Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
605225394Sjchandra_mm_extract_pi16(__m64 a, int n)
606225394Sjchandra{
607225394Sjchandra  /* FIXME:
608233563Sjchandra   * This should force n to be an immediate.
609233563Sjchandra   * This does not use the PEXTRW instruction. From looking at the LLVM source, the
610225394Sjchandra     instruction doesn't seem to be hooked up.
611233563Sjchandra   * The code could probably be made better :)
612225394Sjchandra   */
613225394Sjchandra  __v4hi b = (__v4hi)a;
614225394Sjchandra  return b[(n == 0) ? 0 : (n == 1 ? 1 : (n == 2 ? 2 : 3))];
615225394Sjchandra}
616225394Sjchandra
617225394Sjchandra/* FIXME: Implement this. We could add a __builtin_insertelement function that's similar to
618225394Sjchandra   the already existing __builtin_shufflevector.
619227783Sjchandra*/
620227783Sjchandra/*
621227783Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
622225394Sjchandra_mm_insert_pi16(__m64 a, int d, int n)
623225394Sjchandra{
624225394Sjchandra   return (__m64){ 0LL };
625225394Sjchandra}
626225394Sjchandra*/
627233536Sjchandra
628225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
629225394Sjchandra_mm_max_pi16(__m64 a, __m64 b)
630225394Sjchandra{
631225394Sjchandra  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
632225394Sjchandra}
633225394Sjchandra
634225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
635225394Sjchandra_mm_max_pu8(__m64 a, __m64 b)
636233536Sjchandra{
637225394Sjchandra  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
638225394Sjchandra}
639225394Sjchandra
640225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
641233536Sjchandra_mm_min_pi16(__m64 a, __m64 b)
642225394Sjchandra{
643225394Sjchandra  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
644225394Sjchandra}
645233536Sjchandra
646233536Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
647225394Sjchandra_mm_min_pu8(__m64 a, __m64 b)
648225394Sjchandra{
649225394Sjchandra  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
650225394Sjchandra}
651233536Sjchandra
652233536Sjchandrastatic inline int __attribute__((__always_inline__, __nodebug__))
653279306Sjchandra_mm_movemask_pi8(__m64 a)
654279306Sjchandra{
655225394Sjchandra  return __builtin_ia32_pmovmskb((__v8qi)a);
656279341Sjchandra}
657279341Sjchandra
658279341Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
659279341Sjchandra_mm_mulhi_pu16(__m64 a, __m64 b)
660279341Sjchandra{
661225394Sjchandra  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
662225394Sjchandra}
663225394Sjchandra
/* Shuffles the four 16-bit lanes of a according to n via the pshufw builtin.
   The arguments are parenthesized so the (__v4hi) cast applies to the whole
   argument expression rather than only its first operand. */
#define _mm_shuffle_pi16(a, n) ((__m64)__builtin_ia32_pshufw((__v4hi)(a), (n)))
665225394Sjchandra
666233563Sjchandrastatic inline void __attribute__((__always_inline__, __nodebug__))
667225394Sjchandra_mm_maskmove_si64(__m64 d, __m64 n, char *p)
668225394Sjchandra{
669225394Sjchandra  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
670225394Sjchandra}
671225394Sjchandra
672225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
673225394Sjchandra_mm_avg_pu8(__m64 a, __m64 b)
674225394Sjchandra{
675225394Sjchandra  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
676225394Sjchandra}
677233563Sjchandra
678225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
679225394Sjchandra_mm_avg_pu16(__m64 a, __m64 b)
680225394Sjchandra{
681225394Sjchandra  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
682233563Sjchandra}
683225394Sjchandra
684225394Sjchandrastatic inline __m64 __attribute__((__always_inline__, __nodebug__))
685233563Sjchandra_mm_sad_pu8(__m64 a, __m64 b)
686233563Sjchandra{
687233563Sjchandra  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
688233563Sjchandra}
689225394Sjchandra
/* Reads the control/status register value via the stmxcsr builtin. */
static inline unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  unsigned int csr = __builtin_ia32_stmxcsr();

  return csr;
}
695233563Sjchandra
/* Writes i to the control/status register via the ldmxcsr builtin. */
static inline void
__attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
  return;
}
701233563Sjchandra
/* Shuffles lanes of a and b according to mask via the shufps builtin. */
#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps((a), (b), (mask)))
703225394Sjchandra
704225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
705225394Sjchandra_mm_unpackhi_ps(__m128 a, __m128 b)
706233563Sjchandra{
707233563Sjchandra  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
708225394Sjchandra}
709225394Sjchandra
710225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
711225394Sjchandra_mm_unpacklo_ps(__m128 a, __m128 b)
712233563Sjchandra{
713233563Sjchandra  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
714233563Sjchandra}
715225394Sjchandra
716225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
717225394Sjchandra_mm_move_ss(__m128 a, __m128 b)
718225394Sjchandra{
719225394Sjchandra  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
720225394Sjchandra}
721225394Sjchandra
722225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
723225394Sjchandra_mm_movehl_ps(__m128 a, __m128 b)
724225394Sjchandra{
725225394Sjchandra  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
726233563Sjchandra}
727225394Sjchandra
728225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
729225394Sjchandra_mm_movelh_ps(__m128 a, __m128 b)
730225394Sjchandra{
731225394Sjchandra  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
732225394Sjchandra}
733225394Sjchandra
734233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
735225394Sjchandra_mm_cvtpi16_ps(__m64 a)
736225394Sjchandra{
737225394Sjchandra  __m64 b, c;
738225394Sjchandra  __m128 r;
739225394Sjchandra
740225394Sjchandra  b = _mm_setzero_si64();
741225394Sjchandra  b = _mm_cmpgt_pi16(b, a);
742233563Sjchandra  c = _mm_unpackhi_pi16(a, b);
743225394Sjchandra  r = _mm_setzero_ps();
744225394Sjchandra  r = _mm_cvtpi32_ps(r, c);
745225394Sjchandra  r = _mm_movelh_ps(r, r);
746225394Sjchandra  c = _mm_unpacklo_pi16(a, b);
747225394Sjchandra  r = _mm_cvtpi32_ps(r, c);
748225394Sjchandra
749225394Sjchandra  return r;
750233563Sjchandra}
751225394Sjchandra
752279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
753225394Sjchandra_mm_cvtpu16_ps(__m64 a)
754225394Sjchandra{
755225394Sjchandra  __m64 b, c;
756225394Sjchandra  __m128 r;
757225394Sjchandra
758225394Sjchandra  b = _mm_setzero_si64();
759225394Sjchandra  c = _mm_unpackhi_pi16(a, b);
760227783Sjchandra  r = _mm_setzero_ps();
761227783Sjchandra  r = _mm_cvtpi32_ps(r, c);
762227783Sjchandra  r = _mm_movelh_ps(r, r);
763227783Sjchandra  c = _mm_unpacklo_pi16(a, b);
764227783Sjchandra  r = _mm_cvtpi32_ps(r, c);
765227783Sjchandra
766227783Sjchandra  return r;
767227783Sjchandra}
768227783Sjchandra
769279306Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
770279306Sjchandra_mm_cvtpi8_ps(__m64 a)
771279306Sjchandra{
772279306Sjchandra  __m64 b;
773227783Sjchandra
774227783Sjchandra  b = _mm_setzero_si64();
775279306Sjchandra  b = _mm_cmpgt_pi8(b, a);
776227783Sjchandra  b = _mm_unpacklo_pi8(a, b);
777225394Sjchandra
778225394Sjchandra  return _mm_cvtpi16_ps(b);
779225394Sjchandra}
780225394Sjchandra
781225394Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
782225394Sjchandra_mm_cvtpu8_ps(__m64 a)
783225394Sjchandra{
784225394Sjchandra  __m64 b;
785225394Sjchandra
786225394Sjchandra  b = _mm_setzero_si64();
787233563Sjchandra  b = _mm_unpacklo_pi8(a, b);
788233563Sjchandra
789233563Sjchandra  return _mm_cvtpi16_ps(b);
790233563Sjchandra}
791233563Sjchandra
792233563Sjchandrastatic inline __m128 __attribute__((__always_inline__, __nodebug__))
793225394Sjchandra_mm_cvtpi32x2_ps(__m64 a, __m64 b)
794225394Sjchandra{
795225394Sjchandra  __m128 c;
796225394Sjchandra
797225394Sjchandra  c = _mm_setzero_ps();
798233563Sjchandra  c = _mm_cvtpi32_ps(c, b);
799225394Sjchandra  c = _mm_movelh_ps(c, c);
800225394Sjchandra
801225394Sjchandra  return _mm_cvtpi32_ps(c, a);
802225394Sjchandra}
803225394Sjchandra
804227843Smariusstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
805225394Sjchandra_mm_cvtps_pi16(__m128 a)
806225394Sjchandra{
807225394Sjchandra  __m64 b, c;
808225394Sjchandra
809225394Sjchandra  b = _mm_cvtps_pi32(a);
810233563Sjchandra  a = _mm_movehl_ps(a, a);
811225394Sjchandra  c = _mm_cvtps_pi32(a);
812225394Sjchandra
813279345Sjchandra  return _mm_packs_pi16(b, c);
814279345Sjchandra}
815
816static inline __m64 __attribute__((__always_inline__, __nodebug__))
817_mm_cvtps_pi8(__m128 a)
818{
819  __m64 b, c;
820
821  b = _mm_cvtps_pi16(a);
822  c = _mm_setzero_si64();
823
824  return _mm_packs_pi16(b, c);
825}
826
827static inline int __attribute__((__always_inline__, __nodebug__))
828_mm_movemask_ps(__m128 a)
829{
830  return __builtin_ia32_movmskps(a);
831}
832
/*
 * Build an 8-bit shuffle immediate from four 2-bit selectors; z occupies the
 * top two bits and w the bottom two.
 */
#define _MM_SHUFFLE(z, y, x, w) \
  ((((z) << 6) | ((y) << 4)) | (((x) << 2) | (w)))
834
/* Exception state flags: bits 0-5 of the value read by _mm_getcsr(). */
#define _MM_EXCEPT_INVALID    (1 << 0)
#define _MM_EXCEPT_DENORM     (1 << 1)
#define _MM_EXCEPT_DIV_ZERO   (1 << 2)
#define _MM_EXCEPT_OVERFLOW   (1 << 3)
#define _MM_EXCEPT_UNDERFLOW  (1 << 4)
#define _MM_EXCEPT_INEXACT    (1 << 5)
/* All six exception state flags. */
#define _MM_EXCEPT_MASK \
  (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | _MM_EXCEPT_DIV_ZERO | \
   _MM_EXCEPT_OVERFLOW | _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT)
842
/* Exception mask bits: bits 7-12 of the value read by _mm_getcsr(). */
#define _MM_MASK_INVALID      (1 << 7)
#define _MM_MASK_DENORM       (1 << 8)
#define _MM_MASK_DIV_ZERO     (1 << 9)
#define _MM_MASK_OVERFLOW     (1 << 10)
#define _MM_MASK_UNDERFLOW    (1 << 11)
#define _MM_MASK_INEXACT      (1 << 12)
/* All six exception mask bits. */
#define _MM_MASK_MASK \
  (_MM_MASK_INVALID | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO | \
   _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT)
850
/* Rounding mode: a two-bit field at bits 13-14 of the _mm_getcsr() value. */
#define _MM_ROUND_NEAREST     (0 << 13)
#define _MM_ROUND_DOWN        (1 << 13)
#define _MM_ROUND_UP          (2 << 13)
#define _MM_ROUND_TOWARD_ZERO (3 << 13)
/* Both bits of the rounding mode field. */
#define _MM_ROUND_MASK        (3 << 13)
856
/*
 * Flush-to-zero mode: a single bit (bit 15) of the _mm_getcsr() value.
 * _MM_FLUSH_ZERO_OFF must be zero so that _MM_SET_FLUSH_ZERO_MODE() clears
 * the bit; giving it the same value as _MM_FLUSH_ZERO_ON would make
 * "off" enable the mode.
 */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
860
/* Read one field of the control/status word returned by _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK() \
  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() \
  (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() \
  (_mm_getcsr() & _MM_ROUND_MASK)
865
/*
 * Replace one field of the control/status word: read it with _mm_getcsr(),
 * clear the field's bits, OR in x, and write the result with _mm_setcsr().
 * x is not masked, so it should only contain bits belonging to the field.
 */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
870
/*
 * Transpose, in place, the 4x4 float matrix whose rows are row0..row3.
 * All four arguments must be modifiable __m128 lvalues; each is read and
 * then assigned.
 *
 * With row0 = a, row1 = b, row2 = c, row3 = d:
 *   tmp0 = { a0, b0, a1, b1 }    tmp2 = { c0, d0, c1, d1 }
 *   tmp1 = { a2, b2, a3, b3 }    tmp3 = { c2, d2, c3, d3 }
 * so the low halves of (tmp0, tmp2) and (tmp1, tmp3) give columns 0 and 2,
 * and the high halves give columns 1 and 3.  Column 3 therefore needs
 * _mm_movehl_ps(tmp3, tmp1) = { a3, b3, c3, d3 }.
 */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
883
/*
 * Pull in the SSE2 intrinsics for source compatibility, but only when SSE2 is
 * enabled.  NOTE(review): emmintrin.h presumably rejects non-SSE2 targets the
 * same way this header rejects non-SSE ones; an unconditional include would
 * then break SSE-only builds -- confirm against emmintrin.h.
 */
#ifdef __SSE2__
#include <emmintrin.h>
#endif
885
886#endif /* __SSE__ */
887
888#endif /* __XMMINTRIN_H */
889