1212904Sdim/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2193326Sed *
3193326Sed * Permission is hereby granted, free of charge, to any person obtaining a copy
4193326Sed * of this software and associated documentation files (the "Software"), to deal
5193326Sed * in the Software without restriction, including without limitation the rights
6193326Sed * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7193326Sed * copies of the Software, and to permit persons to whom the Software is
8193326Sed * furnished to do so, subject to the following conditions:
9193326Sed *
10193326Sed * The above copyright notice and this permission notice shall be included in
11193326Sed * all copies or substantial portions of the Software.
12193326Sed *
13193326Sed * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14193326Sed * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15193326Sed * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16193326Sed * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17193326Sed * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18193326Sed * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19193326Sed * THE SOFTWARE.
20193326Sed *
21193326Sed *===-----------------------------------------------------------------------===
22193326Sed */
23212904Sdim
24193326Sed#ifndef __EMMINTRIN_H
25193326Sed#define __EMMINTRIN_H
26193326Sed
27193326Sed#ifndef __SSE2__
28193326Sed#error "SSE2 instruction set not enabled"
29193326Sed#else
30193326Sed
31193326Sed#include <xmmintrin.h>
32193326Sed
33193326Sedtypedef double __m128d __attribute__((__vector_size__(16)));
34193326Sedtypedef long long __m128i __attribute__((__vector_size__(16)));
35193326Sed
36212904Sdim/* Type defines.  */
37212904Sdimtypedef double __v2df __attribute__ ((__vector_size__ (16)));
38212904Sdimtypedef long long __v2di __attribute__ ((__vector_size__ (16)));
39193326Sedtypedef short __v8hi __attribute__((__vector_size__(16)));
40193326Sedtypedef char __v16qi __attribute__((__vector_size__(16)));
41193326Sed
42206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43249423Sdim_mm_add_sd(__m128d __a, __m128d __b)
44193326Sed{
45249423Sdim  __a[0] += __b[0];
46249423Sdim  return __a;
47193326Sed}
48193326Sed
49206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50249423Sdim_mm_add_pd(__m128d __a, __m128d __b)
51193326Sed{
52249423Sdim  return __a + __b;
53193326Sed}
54193326Sed
55206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56249423Sdim_mm_sub_sd(__m128d __a, __m128d __b)
57193326Sed{
58249423Sdim  __a[0] -= __b[0];
59249423Sdim  return __a;
60193326Sed}
61193326Sed
62206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63249423Sdim_mm_sub_pd(__m128d __a, __m128d __b)
64193326Sed{
65249423Sdim  return __a - __b;
66193326Sed}
67193326Sed
68206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69249423Sdim_mm_mul_sd(__m128d __a, __m128d __b)
70193326Sed{
71249423Sdim  __a[0] *= __b[0];
72249423Sdim  return __a;
73193326Sed}
74193326Sed
75206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76249423Sdim_mm_mul_pd(__m128d __a, __m128d __b)
77193326Sed{
78249423Sdim  return __a * __b;
79193326Sed}
80193326Sed
81206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82249423Sdim_mm_div_sd(__m128d __a, __m128d __b)
83193326Sed{
84249423Sdim  __a[0] /= __b[0];
85249423Sdim  return __a;
86193326Sed}
87193326Sed
88206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89249423Sdim_mm_div_pd(__m128d __a, __m128d __b)
90193326Sed{
91249423Sdim  return __a / __b;
92193326Sed}
93193326Sed
94206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95249423Sdim_mm_sqrt_sd(__m128d __a, __m128d __b)
96193326Sed{
97249423Sdim  __m128d __c = __builtin_ia32_sqrtsd(__b);
98249423Sdim  return (__m128d) { __c[0], __a[1] };
99193326Sed}
100193326Sed
101206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102249423Sdim_mm_sqrt_pd(__m128d __a)
103193326Sed{
104249423Sdim  return __builtin_ia32_sqrtpd(__a);
105193326Sed}
106193326Sed
107206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108249423Sdim_mm_min_sd(__m128d __a, __m128d __b)
109193326Sed{
110249423Sdim  return __builtin_ia32_minsd(__a, __b);
111193326Sed}
112193326Sed
113206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114249423Sdim_mm_min_pd(__m128d __a, __m128d __b)
115193326Sed{
116249423Sdim  return __builtin_ia32_minpd(__a, __b);
117193326Sed}
118193326Sed
119206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120249423Sdim_mm_max_sd(__m128d __a, __m128d __b)
121193326Sed{
122249423Sdim  return __builtin_ia32_maxsd(__a, __b);
123193326Sed}
124193326Sed
125206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126249423Sdim_mm_max_pd(__m128d __a, __m128d __b)
127193326Sed{
128249423Sdim  return __builtin_ia32_maxpd(__a, __b);
129193326Sed}
130193326Sed
131206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132249423Sdim_mm_and_pd(__m128d __a, __m128d __b)
133193326Sed{
134249423Sdim  return (__m128d)((__v4si)__a & (__v4si)__b);
135193326Sed}
136193326Sed
137206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138249423Sdim_mm_andnot_pd(__m128d __a, __m128d __b)
139193326Sed{
140249423Sdim  return (__m128d)(~(__v4si)__a & (__v4si)__b);
141193326Sed}
142193326Sed
143206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144249423Sdim_mm_or_pd(__m128d __a, __m128d __b)
145193326Sed{
146249423Sdim  return (__m128d)((__v4si)__a | (__v4si)__b);
147193326Sed}
148193326Sed
149206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150249423Sdim_mm_xor_pd(__m128d __a, __m128d __b)
151193326Sed{
152249423Sdim  return (__m128d)((__v4si)__a ^ (__v4si)__b);
153193326Sed}
154193326Sed
155206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156249423Sdim_mm_cmpeq_pd(__m128d __a, __m128d __b)
157193326Sed{
158249423Sdim  return (__m128d)__builtin_ia32_cmppd(__a, __b, 0);
159193326Sed}
160193326Sed
161206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162249423Sdim_mm_cmplt_pd(__m128d __a, __m128d __b)
163193326Sed{
164249423Sdim  return (__m128d)__builtin_ia32_cmppd(__a, __b, 1);
165193326Sed}
166193326Sed
167206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168249423Sdim_mm_cmple_pd(__m128d __a, __m128d __b)
169193326Sed{
170249423Sdim  return (__m128d)__builtin_ia32_cmppd(__a, __b, 2);
171193326Sed}
172193326Sed
173206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174249423Sdim_mm_cmpgt_pd(__m128d __a, __m128d __b)
175193326Sed{
176249423Sdim  return (__m128d)__builtin_ia32_cmppd(__b, __a, 1);
177193326Sed}
178193326Sed
179206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180249423Sdim_mm_cmpge_pd(__m128d __a, __m128d __b)
181193326Sed{
182249423Sdim  return (__m128d)__builtin_ia32_cmppd(__b, __a, 2);
183193326Sed}
184193326Sed
185206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186249423Sdim_mm_cmpord_pd(__m128d __a, __m128d __b)
187193326Sed{
188249423Sdim  return (__m128d)__builtin_ia32_cmppd(__a, __b, 7);
189193326Sed}
190193326Sed
191206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192249423Sdim_mm_cmpunord_pd(__m128d __a, __m128d __b)
193193326Sed{
194249423Sdim  return (__m128d)__builtin_ia32_cmppd(__a, __b, 3);
195193326Sed}
196193326Sed
197206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198249423Sdim_mm_cmpneq_pd(__m128d __a, __m128d __b)
199193326Sed{
200249423Sdim  return (__m128d)__builtin_ia32_cmppd(__a, __b, 4);
201193326Sed}
202193326Sed
203206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204249423Sdim_mm_cmpnlt_pd(__m128d __a, __m128d __b)
205193326Sed{
206249423Sdim  return (__m128d)__builtin_ia32_cmppd(__a, __b, 5);
207193326Sed}
208193326Sed
209206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210249423Sdim_mm_cmpnle_pd(__m128d __a, __m128d __b)
211193326Sed{
212249423Sdim  return (__m128d)__builtin_ia32_cmppd(__a, __b, 6);
213193326Sed}
214193326Sed
215206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216249423Sdim_mm_cmpngt_pd(__m128d __a, __m128d __b)
217193326Sed{
218249423Sdim  return (__m128d)__builtin_ia32_cmppd(__b, __a, 5);
219193326Sed}
220193326Sed
221206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222249423Sdim_mm_cmpnge_pd(__m128d __a, __m128d __b)
223193326Sed{
224249423Sdim  return (__m128d)__builtin_ia32_cmppd(__b, __a, 6);
225193326Sed}
226193326Sed
227206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228249423Sdim_mm_cmpeq_sd(__m128d __a, __m128d __b)
229193326Sed{
230249423Sdim  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0);
231193326Sed}
232193326Sed
233206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234249423Sdim_mm_cmplt_sd(__m128d __a, __m128d __b)
235193326Sed{
236249423Sdim  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1);
237193326Sed}
238193326Sed
239206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240249423Sdim_mm_cmple_sd(__m128d __a, __m128d __b)
241193326Sed{
242249423Sdim  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2);
243193326Sed}
244193326Sed
245206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246249423Sdim_mm_cmpgt_sd(__m128d __a, __m128d __b)
247193326Sed{
248263508Sdim  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 1);
249263508Sdim  return (__m128d) { __c[0], __a[1] };
250193326Sed}
251193326Sed
252206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
253249423Sdim_mm_cmpge_sd(__m128d __a, __m128d __b)
254193326Sed{
255263508Sdim  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 2);
256263508Sdim  return (__m128d) { __c[0], __a[1] };
257193326Sed}
258193326Sed
259206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
260249423Sdim_mm_cmpord_sd(__m128d __a, __m128d __b)
261193326Sed{
262249423Sdim  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7);
263193326Sed}
264193326Sed
265206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
266249423Sdim_mm_cmpunord_sd(__m128d __a, __m128d __b)
267193326Sed{
268249423Sdim  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3);
269193326Sed}
270193326Sed
271206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
272249423Sdim_mm_cmpneq_sd(__m128d __a, __m128d __b)
273193326Sed{
274249423Sdim  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4);
275193326Sed}
276193326Sed
277206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
278249423Sdim_mm_cmpnlt_sd(__m128d __a, __m128d __b)
279193326Sed{
280249423Sdim  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5);
281193326Sed}
282193326Sed
283206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
284249423Sdim_mm_cmpnle_sd(__m128d __a, __m128d __b)
285193326Sed{
286249423Sdim  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6);
287193326Sed}
288193326Sed
289206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
290249423Sdim_mm_cmpngt_sd(__m128d __a, __m128d __b)
291193326Sed{
292263508Sdim  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 5);
293263508Sdim  return (__m128d) { __c[0], __a[1] };
294193326Sed}
295193326Sed
296206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
297249423Sdim_mm_cmpnge_sd(__m128d __a, __m128d __b)
298193326Sed{
299263508Sdim  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 6);
300263508Sdim  return (__m128d) { __c[0], __a[1] };
301193326Sed}
302193326Sed
303206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
304249423Sdim_mm_comieq_sd(__m128d __a, __m128d __b)
305193326Sed{
306249423Sdim  return __builtin_ia32_comisdeq(__a, __b);
307193326Sed}
308193326Sed
309206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
310249423Sdim_mm_comilt_sd(__m128d __a, __m128d __b)
311193326Sed{
312249423Sdim  return __builtin_ia32_comisdlt(__a, __b);
313193326Sed}
314193326Sed
315206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
316249423Sdim_mm_comile_sd(__m128d __a, __m128d __b)
317193326Sed{
318249423Sdim  return __builtin_ia32_comisdle(__a, __b);
319193326Sed}
320193326Sed
321206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
322249423Sdim_mm_comigt_sd(__m128d __a, __m128d __b)
323193326Sed{
324249423Sdim  return __builtin_ia32_comisdgt(__a, __b);
325193326Sed}
326193326Sed
327206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
328249423Sdim_mm_comige_sd(__m128d __a, __m128d __b)
329226633Sdim{
330249423Sdim  return __builtin_ia32_comisdge(__a, __b);
331226633Sdim}
332226633Sdim
333226633Sdimstatic __inline__ int __attribute__((__always_inline__, __nodebug__))
334249423Sdim_mm_comineq_sd(__m128d __a, __m128d __b)
335193326Sed{
336249423Sdim  return __builtin_ia32_comisdneq(__a, __b);
337193326Sed}
338193326Sed
339206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
340249423Sdim_mm_ucomieq_sd(__m128d __a, __m128d __b)
341193326Sed{
342249423Sdim  return __builtin_ia32_ucomisdeq(__a, __b);
343193326Sed}
344193326Sed
345206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
346249423Sdim_mm_ucomilt_sd(__m128d __a, __m128d __b)
347193326Sed{
348249423Sdim  return __builtin_ia32_ucomisdlt(__a, __b);
349193326Sed}
350193326Sed
351206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
352249423Sdim_mm_ucomile_sd(__m128d __a, __m128d __b)
353193326Sed{
354249423Sdim  return __builtin_ia32_ucomisdle(__a, __b);
355193326Sed}
356193326Sed
357206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
358249423Sdim_mm_ucomigt_sd(__m128d __a, __m128d __b)
359193326Sed{
360249423Sdim  return __builtin_ia32_ucomisdgt(__a, __b);
361193326Sed}
362193326Sed
363206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
364249423Sdim_mm_ucomige_sd(__m128d __a, __m128d __b)
365226633Sdim{
366249423Sdim  return __builtin_ia32_ucomisdge(__a, __b);
367226633Sdim}
368226633Sdim
369226633Sdimstatic __inline__ int __attribute__((__always_inline__, __nodebug__))
370249423Sdim_mm_ucomineq_sd(__m128d __a, __m128d __b)
371193326Sed{
372249423Sdim  return __builtin_ia32_ucomisdneq(__a, __b);
373193326Sed}
374193326Sed
375206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
376249423Sdim_mm_cvtpd_ps(__m128d __a)
377193326Sed{
378249423Sdim  return __builtin_ia32_cvtpd2ps(__a);
379193326Sed}
380193326Sed
381206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
382249423Sdim_mm_cvtps_pd(__m128 __a)
383193326Sed{
384249423Sdim  return __builtin_ia32_cvtps2pd(__a);
385193326Sed}
386193326Sed
387206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
388249423Sdim_mm_cvtepi32_pd(__m128i __a)
389193326Sed{
390249423Sdim  return __builtin_ia32_cvtdq2pd((__v4si)__a);
391193326Sed}
392193326Sed
393206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
394249423Sdim_mm_cvtpd_epi32(__m128d __a)
395193326Sed{
396249423Sdim  return __builtin_ia32_cvtpd2dq(__a);
397193326Sed}
398193326Sed
399206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
400249423Sdim_mm_cvtsd_si32(__m128d __a)
401193326Sed{
402249423Sdim  return __builtin_ia32_cvtsd2si(__a);
403193326Sed}
404193326Sed
405206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
406249423Sdim_mm_cvtsd_ss(__m128 __a, __m128d __b)
407193326Sed{
408249423Sdim  __a[0] = __b[0];
409249423Sdim  return __a;
410193326Sed}
411193326Sed
412206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
413249423Sdim_mm_cvtsi32_sd(__m128d __a, int __b)
414193326Sed{
415249423Sdim  __a[0] = __b;
416249423Sdim  return __a;
417193326Sed}
418193326Sed
419206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
420249423Sdim_mm_cvtss_sd(__m128d __a, __m128 __b)
421193326Sed{
422249423Sdim  __a[0] = __b[0];
423249423Sdim  return __a;
424193326Sed}
425193326Sed
426206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
427249423Sdim_mm_cvttpd_epi32(__m128d __a)
428193326Sed{
429249423Sdim  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
430193326Sed}
431193326Sed
432206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
433249423Sdim_mm_cvttsd_si32(__m128d __a)
434193326Sed{
435249423Sdim  return __a[0];
436193326Sed}
437193326Sed
438206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
439249423Sdim_mm_cvtpd_pi32(__m128d __a)
440193326Sed{
441249423Sdim  return (__m64)__builtin_ia32_cvtpd2pi(__a);
442193326Sed}
443193326Sed
444206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
445249423Sdim_mm_cvttpd_pi32(__m128d __a)
446193326Sed{
447249423Sdim  return (__m64)__builtin_ia32_cvttpd2pi(__a);
448193326Sed}
449193326Sed
450206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
451249423Sdim_mm_cvtpi32_pd(__m64 __a)
452193326Sed{
453249423Sdim  return __builtin_ia32_cvtpi2pd((__v2si)__a);
454193326Sed}
455193326Sed
456206084Srdivackystatic __inline__ double __attribute__((__always_inline__, __nodebug__))
457249423Sdim_mm_cvtsd_f64(__m128d __a)
458193326Sed{
459249423Sdim  return __a[0];
460193326Sed}
461193326Sed
462206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
463249423Sdim_mm_load_pd(double const *__dp)
464193326Sed{
465249423Sdim  return *(__m128d*)__dp;
466193326Sed}
467193326Sed
468206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
469249423Sdim_mm_load1_pd(double const *__dp)
470193326Sed{
471226633Sdim  struct __mm_load1_pd_struct {
472249423Sdim    double __u;
473226633Sdim  } __attribute__((__packed__, __may_alias__));
474249423Sdim  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
475249423Sdim  return (__m128d){ __u, __u };
476193326Sed}
477193326Sed
478193326Sed#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
479193326Sed
480206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
481249423Sdim_mm_loadr_pd(double const *__dp)
482193326Sed{
483249423Sdim  __m128d __u = *(__m128d*)__dp;
484249423Sdim  return __builtin_shufflevector(__u, __u, 1, 0);
485193326Sed}
486193326Sed
487206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
488249423Sdim_mm_loadu_pd(double const *__dp)
489193326Sed{
490223017Sdim  struct __loadu_pd {
491249423Sdim    __m128d __v;
492223017Sdim  } __attribute__((packed, may_alias));
493249423Sdim  return ((struct __loadu_pd*)__dp)->__v;
494193326Sed}
495193326Sed
496206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
497249423Sdim_mm_load_sd(double const *__dp)
498193326Sed{
499226633Sdim  struct __mm_load_sd_struct {
500249423Sdim    double __u;
501226633Sdim  } __attribute__((__packed__, __may_alias__));
502249423Sdim  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
503249423Sdim  return (__m128d){ __u, 0 };
504193326Sed}
505193326Sed
506206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
507249423Sdim_mm_loadh_pd(__m128d __a, double const *__dp)
508193326Sed{
509226633Sdim  struct __mm_loadh_pd_struct {
510249423Sdim    double __u;
511226633Sdim  } __attribute__((__packed__, __may_alias__));
512249423Sdim  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
513249423Sdim  return (__m128d){ __a[0], __u };
514193326Sed}
515193326Sed
516206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
517249423Sdim_mm_loadl_pd(__m128d __a, double const *__dp)
518193326Sed{
519226633Sdim  struct __mm_loadl_pd_struct {
520249423Sdim    double __u;
521226633Sdim  } __attribute__((__packed__, __may_alias__));
522249423Sdim  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
523249423Sdim  return (__m128d){ __u, __a[1] };
524193326Sed}
525193326Sed
526206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
527249423Sdim_mm_set_sd(double __w)
528193326Sed{
529249423Sdim  return (__m128d){ __w, 0 };
530193326Sed}
531193326Sed
532206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
533249423Sdim_mm_set1_pd(double __w)
534193326Sed{
535249423Sdim  return (__m128d){ __w, __w };
536193326Sed}
537193326Sed
538206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
539249423Sdim_mm_set_pd(double __w, double __x)
540193326Sed{
541249423Sdim  return (__m128d){ __x, __w };
542193326Sed}
543193326Sed
544206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
545249423Sdim_mm_setr_pd(double __w, double __x)
546193326Sed{
547249423Sdim  return (__m128d){ __w, __x };
548193326Sed}
549193326Sed
550206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
551193326Sed_mm_setzero_pd(void)
552193326Sed{
553193326Sed  return (__m128d){ 0, 0 };
554193326Sed}
555193326Sed
556206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
557249423Sdim_mm_move_sd(__m128d __a, __m128d __b)
558193326Sed{
559249423Sdim  return (__m128d){ __b[0], __a[1] };
560193326Sed}
561193326Sed
562206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
563249423Sdim_mm_store_sd(double *__dp, __m128d __a)
564193326Sed{
565226633Sdim  struct __mm_store_sd_struct {
566249423Sdim    double __u;
567226633Sdim  } __attribute__((__packed__, __may_alias__));
568249423Sdim  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
569193326Sed}
570193326Sed
571206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
572249423Sdim_mm_store1_pd(double *__dp, __m128d __a)
573193326Sed{
574226633Sdim  struct __mm_store1_pd_struct {
575249423Sdim    double __u[2];
576226633Sdim  } __attribute__((__packed__, __may_alias__));
577249423Sdim  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
578249423Sdim  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
579193326Sed}
580193326Sed
581206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
582249423Sdim_mm_store_pd(double *__dp, __m128d __a)
583193326Sed{
584249423Sdim  *(__m128d *)__dp = __a;
585193326Sed}
586193326Sed
587206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
588249423Sdim_mm_storeu_pd(double *__dp, __m128d __a)
589193326Sed{
590249423Sdim  __builtin_ia32_storeupd(__dp, __a);
591193326Sed}
592193326Sed
593206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
594249423Sdim_mm_storer_pd(double *__dp, __m128d __a)
595193326Sed{
596249423Sdim  __a = __builtin_shufflevector(__a, __a, 1, 0);
597249423Sdim  *(__m128d *)__dp = __a;
598193326Sed}
599193326Sed
600206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
601249423Sdim_mm_storeh_pd(double *__dp, __m128d __a)
602193326Sed{
603226633Sdim  struct __mm_storeh_pd_struct {
604249423Sdim    double __u;
605226633Sdim  } __attribute__((__packed__, __may_alias__));
606249423Sdim  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
607193326Sed}
608193326Sed
609206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
610249423Sdim_mm_storel_pd(double *__dp, __m128d __a)
611193326Sed{
612226633Sdim  struct __mm_storeh_pd_struct {
613249423Sdim    double __u;
614226633Sdim  } __attribute__((__packed__, __may_alias__));
615249423Sdim  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
616193326Sed}
617193326Sed
618206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
619249423Sdim_mm_add_epi8(__m128i __a, __m128i __b)
620193326Sed{
621249423Sdim  return (__m128i)((__v16qi)__a + (__v16qi)__b);
622193326Sed}
623193326Sed
624206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
625249423Sdim_mm_add_epi16(__m128i __a, __m128i __b)
626193326Sed{
627249423Sdim  return (__m128i)((__v8hi)__a + (__v8hi)__b);
628193326Sed}
629193326Sed
630206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
631249423Sdim_mm_add_epi32(__m128i __a, __m128i __b)
632193326Sed{
633249423Sdim  return (__m128i)((__v4si)__a + (__v4si)__b);
634193326Sed}
635193326Sed
636206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
637249423Sdim_mm_add_si64(__m64 __a, __m64 __b)
638193326Sed{
639249423Sdim  return __a + __b;
640193326Sed}
641193326Sed
642206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
643249423Sdim_mm_add_epi64(__m128i __a, __m128i __b)
644193326Sed{
645249423Sdim  return __a + __b;
646193326Sed}
647193326Sed
648206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
649249423Sdim_mm_adds_epi8(__m128i __a, __m128i __b)
650193326Sed{
651249423Sdim  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
652193326Sed}
653193326Sed
654206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
655249423Sdim_mm_adds_epi16(__m128i __a, __m128i __b)
656193326Sed{
657249423Sdim  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
658193326Sed}
659193326Sed
660206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
661249423Sdim_mm_adds_epu8(__m128i __a, __m128i __b)
662193326Sed{
663249423Sdim  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
664193326Sed}
665193326Sed
666206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
667249423Sdim_mm_adds_epu16(__m128i __a, __m128i __b)
668193326Sed{
669249423Sdim  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
670193326Sed}
671193326Sed
672206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
673249423Sdim_mm_avg_epu8(__m128i __a, __m128i __b)
674193326Sed{
675249423Sdim  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
676193326Sed}
677193326Sed
678206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
679249423Sdim_mm_avg_epu16(__m128i __a, __m128i __b)
680193326Sed{
681249423Sdim  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
682193326Sed}
683193326Sed
684206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
685249423Sdim_mm_madd_epi16(__m128i __a, __m128i __b)
686193326Sed{
687249423Sdim  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
688193326Sed}
689193326Sed
690206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
691249423Sdim_mm_max_epi16(__m128i __a, __m128i __b)
692193326Sed{
693249423Sdim  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
694193326Sed}
695193326Sed
696206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
697249423Sdim_mm_max_epu8(__m128i __a, __m128i __b)
698193326Sed{
699249423Sdim  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
700193326Sed}
701193326Sed
702206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
703249423Sdim_mm_min_epi16(__m128i __a, __m128i __b)
704193326Sed{
705249423Sdim  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
706193326Sed}
707193326Sed
708206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
709249423Sdim_mm_min_epu8(__m128i __a, __m128i __b)
710193326Sed{
711249423Sdim  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
712193326Sed}
713193326Sed
714206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
715249423Sdim_mm_mulhi_epi16(__m128i __a, __m128i __b)
716193326Sed{
717249423Sdim  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
718193326Sed}
719193326Sed
720206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
721249423Sdim_mm_mulhi_epu16(__m128i __a, __m128i __b)
722193326Sed{
723249423Sdim  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
724193326Sed}
725193326Sed
726206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
727249423Sdim_mm_mullo_epi16(__m128i __a, __m128i __b)
728193326Sed{
729249423Sdim  return (__m128i)((__v8hi)__a * (__v8hi)__b);
730193326Sed}
731193326Sed
732206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
733249423Sdim_mm_mul_su32(__m64 __a, __m64 __b)
734193326Sed{
735249423Sdim  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
736193326Sed}
737193326Sed
738206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
739249423Sdim_mm_mul_epu32(__m128i __a, __m128i __b)
740193326Sed{
741249423Sdim  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
742193326Sed}
743193326Sed
744206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
745249423Sdim_mm_sad_epu8(__m128i __a, __m128i __b)
746193326Sed{
747249423Sdim  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
748193326Sed}
749193326Sed
750206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
751249423Sdim_mm_sub_epi8(__m128i __a, __m128i __b)
752193326Sed{
753249423Sdim  return (__m128i)((__v16qi)__a - (__v16qi)__b);
754193326Sed}
755193326Sed
756206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
757249423Sdim_mm_sub_epi16(__m128i __a, __m128i __b)
758193326Sed{
759249423Sdim  return (__m128i)((__v8hi)__a - (__v8hi)__b);
760193326Sed}
761193326Sed
762206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
763249423Sdim_mm_sub_epi32(__m128i __a, __m128i __b)
764193326Sed{
765249423Sdim  return (__m128i)((__v4si)__a - (__v4si)__b);
766193326Sed}
767193326Sed
768206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
769249423Sdim_mm_sub_si64(__m64 __a, __m64 __b)
770193326Sed{
771249423Sdim  return __a - __b;
772193326Sed}
773193326Sed
774206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
775249423Sdim_mm_sub_epi64(__m128i __a, __m128i __b)
776193326Sed{
777249423Sdim  return __a - __b;
778193326Sed}
779193326Sed
780206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
781249423Sdim_mm_subs_epi8(__m128i __a, __m128i __b)
782193326Sed{
783249423Sdim  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
784193326Sed}
785193326Sed
786206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
787249423Sdim_mm_subs_epi16(__m128i __a, __m128i __b)
788193326Sed{
789249423Sdim  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
790193326Sed}
791193326Sed
792206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
793249423Sdim_mm_subs_epu8(__m128i __a, __m128i __b)
794193326Sed{
795249423Sdim  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
796193326Sed}
797193326Sed
798206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
799249423Sdim_mm_subs_epu16(__m128i __a, __m128i __b)
800193326Sed{
801249423Sdim  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
802193326Sed}
803193326Sed
804206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
805249423Sdim_mm_and_si128(__m128i __a, __m128i __b)
806193326Sed{
807249423Sdim  return __a & __b;
808193326Sed}
809193326Sed
810206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
811249423Sdim_mm_andnot_si128(__m128i __a, __m128i __b)
812193326Sed{
813249423Sdim  return ~__a & __b;
814193326Sed}
815193326Sed
816206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
817249423Sdim_mm_or_si128(__m128i __a, __m128i __b)
818193326Sed{
819249423Sdim  return __a | __b;
820193326Sed}
821193326Sed
822206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
823249423Sdim_mm_xor_si128(__m128i __a, __m128i __b)
824193326Sed{
825249423Sdim  return __a ^ __b;
826193326Sed}
827193326Sed
/* Shift the whole 128-bit value `a` left by `count` bytes.
   Written as a statement expression: `a` is evaluated once into a local
   (with -Wshadow silenced around that declaration only) and the builtin is
   handed count * 8. */
#define _mm_slli_si128(a, count) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_ia32_pslldqi128(__a, (count) * 8); })
833193326Sed
834206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835249423Sdim_mm_slli_epi16(__m128i __a, int __count)
836193326Sed{
837249423Sdim  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
838193326Sed}
839193326Sed
840206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841249423Sdim_mm_sll_epi16(__m128i __a, __m128i __count)
842193326Sed{
843249423Sdim  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
844193326Sed}
845193326Sed
846206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
847249423Sdim_mm_slli_epi32(__m128i __a, int __count)
848193326Sed{
849249423Sdim  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
850193326Sed}
851193326Sed
852206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
853249423Sdim_mm_sll_epi32(__m128i __a, __m128i __count)
854193326Sed{
855249423Sdim  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
856193326Sed}
857193326Sed
858206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
859249423Sdim_mm_slli_epi64(__m128i __a, int __count)
860193326Sed{
861249423Sdim  return __builtin_ia32_psllqi128(__a, __count);
862193326Sed}
863193326Sed
864206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
865249423Sdim_mm_sll_epi64(__m128i __a, __m128i __count)
866193326Sed{
867249423Sdim  return __builtin_ia32_psllq128(__a, __count);
868193326Sed}
869193326Sed
870206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
871249423Sdim_mm_srai_epi16(__m128i __a, int __count)
872193326Sed{
873249423Sdim  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
874193326Sed}
875193326Sed
876206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
877249423Sdim_mm_sra_epi16(__m128i __a, __m128i __count)
878193326Sed{
879249423Sdim  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
880193326Sed}
881193326Sed
882206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
883249423Sdim_mm_srai_epi32(__m128i __a, int __count)
884193326Sed{
885249423Sdim  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
886193326Sed}
887193326Sed
888206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
889249423Sdim_mm_sra_epi32(__m128i __a, __m128i __count)
890193326Sed{
891249423Sdim  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
892193326Sed}
893193326Sed
894193326Sed
/* Shift the whole 128-bit value `a` right by `count` bytes.
   Written as a statement expression: `a` is evaluated once into a local
   (with -Wshadow silenced around that declaration only) and the builtin is
   handed count * 8. */
#define _mm_srli_si128(a, count) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_ia32_psrldqi128(__a, (count) * 8); })
900218893Sdim
901206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
902249423Sdim_mm_srli_epi16(__m128i __a, int __count)
903193326Sed{
904249423Sdim  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
905193326Sed}
906193326Sed
907206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
908249423Sdim_mm_srl_epi16(__m128i __a, __m128i __count)
909193326Sed{
910249423Sdim  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
911193326Sed}
912193326Sed
913206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
914249423Sdim_mm_srli_epi32(__m128i __a, int __count)
915193326Sed{
916249423Sdim  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
917193326Sed}
918193326Sed
919206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
920249423Sdim_mm_srl_epi32(__m128i __a, __m128i __count)
921193326Sed{
922249423Sdim  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
923193326Sed}
924193326Sed
925206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
926249423Sdim_mm_srli_epi64(__m128i __a, int __count)
927193326Sed{
928249423Sdim  return __builtin_ia32_psrlqi128(__a, __count);
929193326Sed}
930193326Sed
931206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
932249423Sdim_mm_srl_epi64(__m128i __a, __m128i __count)
933193326Sed{
934249423Sdim  return __builtin_ia32_psrlq128(__a, __count);
935193326Sed}
936193326Sed
937206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
938249423Sdim_mm_cmpeq_epi8(__m128i __a, __m128i __b)
939193326Sed{
940249423Sdim  return (__m128i)((__v16qi)__a == (__v16qi)__b);
941193326Sed}
942193326Sed
943206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
944249423Sdim_mm_cmpeq_epi16(__m128i __a, __m128i __b)
945193326Sed{
946249423Sdim  return (__m128i)((__v8hi)__a == (__v8hi)__b);
947193326Sed}
948193326Sed
949206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
950249423Sdim_mm_cmpeq_epi32(__m128i __a, __m128i __b)
951193326Sed{
952249423Sdim  return (__m128i)((__v4si)__a == (__v4si)__b);
953193326Sed}
954193326Sed
955206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
956249423Sdim_mm_cmpgt_epi8(__m128i __a, __m128i __b)
957193326Sed{
958234353Sdim  /* This function always performs a signed comparison, but __v16qi is a char
959234353Sdim     which may be signed or unsigned. */
960234353Sdim  typedef signed char __v16qs __attribute__((__vector_size__(16)));
961249423Sdim  return (__m128i)((__v16qs)__a > (__v16qs)__b);
962193326Sed}
963193326Sed
964206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
965249423Sdim_mm_cmpgt_epi16(__m128i __a, __m128i __b)
966193326Sed{
967249423Sdim  return (__m128i)((__v8hi)__a > (__v8hi)__b);
968193326Sed}
969193326Sed
970206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
971249423Sdim_mm_cmpgt_epi32(__m128i __a, __m128i __b)
972193326Sed{
973249423Sdim  return (__m128i)((__v4si)__a > (__v4si)__b);
974193326Sed}
975193326Sed
976206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
977249423Sdim_mm_cmplt_epi8(__m128i __a, __m128i __b)
978193326Sed{
979249423Sdim  return _mm_cmpgt_epi8(__b, __a);
980193326Sed}
981193326Sed
982206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
983249423Sdim_mm_cmplt_epi16(__m128i __a, __m128i __b)
984193326Sed{
985249423Sdim  return _mm_cmpgt_epi16(__b, __a);
986193326Sed}
987193326Sed
988206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
989249423Sdim_mm_cmplt_epi32(__m128i __a, __m128i __b)
990193326Sed{
991249423Sdim  return _mm_cmpgt_epi32(__b, __a);
992193326Sed}
993193326Sed
994193326Sed#ifdef __x86_64__
995206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
996249423Sdim_mm_cvtsi64_sd(__m128d __a, long long __b)
997193326Sed{
998249423Sdim  __a[0] = __b;
999249423Sdim  return __a;
1000193326Sed}
1001193326Sed
1002206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__))
1003249423Sdim_mm_cvtsd_si64(__m128d __a)
1004193326Sed{
1005249423Sdim  return __builtin_ia32_cvtsd2si64(__a);
1006193326Sed}
1007193326Sed
1008206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__))
1009249423Sdim_mm_cvttsd_si64(__m128d __a)
1010193326Sed{
1011249423Sdim  return __a[0];
1012193326Sed}
1013193326Sed#endif
1014193326Sed
1015206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1016249423Sdim_mm_cvtepi32_ps(__m128i __a)
1017193326Sed{
1018249423Sdim  return __builtin_ia32_cvtdq2ps((__v4si)__a);
1019193326Sed}
1020193326Sed
1021206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1022249423Sdim_mm_cvtps_epi32(__m128 __a)
1023193326Sed{
1024249423Sdim  return (__m128i)__builtin_ia32_cvtps2dq(__a);
1025193326Sed}
1026193326Sed
1027206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1028249423Sdim_mm_cvttps_epi32(__m128 __a)
1029193326Sed{
1030249423Sdim  return (__m128i)__builtin_ia32_cvttps2dq(__a);
1031193326Sed}
1032193326Sed
1033206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1034249423Sdim_mm_cvtsi32_si128(int __a)
1035193326Sed{
1036249423Sdim  return (__m128i)(__v4si){ __a, 0, 0, 0 };
1037193326Sed}
1038193326Sed
1039193326Sed#ifdef __x86_64__
1040206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1041249423Sdim_mm_cvtsi64_si128(long long __a)
1042193326Sed{
1043249423Sdim  return (__m128i){ __a, 0 };
1044193326Sed}
1045193326Sed#endif
1046193326Sed
1047206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
1048249423Sdim_mm_cvtsi128_si32(__m128i __a)
1049193326Sed{
1050249423Sdim  __v4si __b = (__v4si)__a;
1051249423Sdim  return __b[0];
1052193326Sed}
1053193326Sed
1054193326Sed#ifdef __x86_64__
1055206084Srdivackystatic __inline__ long long __attribute__((__always_inline__, __nodebug__))
1056249423Sdim_mm_cvtsi128_si64(__m128i __a)
1057193326Sed{
1058249423Sdim  return __a[0];
1059193326Sed}
1060193326Sed#endif
1061193326Sed
1062206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1063249423Sdim_mm_load_si128(__m128i const *__p)
1064193326Sed{
1065249423Sdim  return *__p;
1066193326Sed}
1067193326Sed
1068206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1069249423Sdim_mm_loadu_si128(__m128i const *__p)
1070193326Sed{
1071223017Sdim  struct __loadu_si128 {
1072249423Sdim    __m128i __v;
1073223017Sdim  } __attribute__((packed, may_alias));
1074249423Sdim  return ((struct __loadu_si128*)__p)->__v;
1075193326Sed}
1076193326Sed
1077206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1078249423Sdim_mm_loadl_epi64(__m128i const *__p)
1079193326Sed{
1080226633Sdim  struct __mm_loadl_epi64_struct {
1081249423Sdim    long long __u;
1082226633Sdim  } __attribute__((__packed__, __may_alias__));
1083249423Sdim  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1084193326Sed}
1085193326Sed
1086206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1087198092Srdivacky_mm_set_epi64x(long long q1, long long q0)
1088198092Srdivacky{
1089198092Srdivacky  return (__m128i){ q0, q1 };
1090198092Srdivacky}
1091198092Srdivacky
1092206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1093193326Sed_mm_set_epi64(__m64 q1, __m64 q0)
1094193326Sed{
1095193326Sed  return (__m128i){ (long long)q0, (long long)q1 };
1096193326Sed}
1097193326Sed
1098206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1099193326Sed_mm_set_epi32(int i3, int i2, int i1, int i0)
1100193326Sed{
1101193326Sed  return (__m128i)(__v4si){ i0, i1, i2, i3};
1102193326Sed}
1103193326Sed
1104206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1105193326Sed_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1106193326Sed{
1107193326Sed  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1108193326Sed}
1109193326Sed
1110206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1111193326Sed_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1112193326Sed{
1113193326Sed  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1114193326Sed}
1115193326Sed
1116206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1117249423Sdim_mm_set1_epi64x(long long __q)
1118198092Srdivacky{
1119249423Sdim  return (__m128i){ __q, __q };
1120198092Srdivacky}
1121198092Srdivacky
1122206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1123249423Sdim_mm_set1_epi64(__m64 __q)
1124193326Sed{
1125249423Sdim  return (__m128i){ (long long)__q, (long long)__q };
1126193326Sed}
1127193326Sed
1128206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1129249423Sdim_mm_set1_epi32(int __i)
1130193326Sed{
1131249423Sdim  return (__m128i)(__v4si){ __i, __i, __i, __i };
1132193326Sed}
1133193326Sed
1134206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1135249423Sdim_mm_set1_epi16(short __w)
1136193326Sed{
1137249423Sdim  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1138193326Sed}
1139193326Sed
1140206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1141249423Sdim_mm_set1_epi8(char __b)
1142193326Sed{
1143249423Sdim  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1144193326Sed}
1145193326Sed
1146206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1147193326Sed_mm_setr_epi64(__m64 q0, __m64 q1)
1148193326Sed{
1149193326Sed  return (__m128i){ (long long)q0, (long long)q1 };
1150193326Sed}
1151193326Sed
1152206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1153193326Sed_mm_setr_epi32(int i0, int i1, int i2, int i3)
1154193326Sed{
1155193326Sed  return (__m128i)(__v4si){ i0, i1, i2, i3};
1156193326Sed}
1157193326Sed
1158206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1159193326Sed_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1160193326Sed{
1161193326Sed  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1162193326Sed}
1163193326Sed
1164206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1165193326Sed_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1166193326Sed{
1167193326Sed  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1168193326Sed}
1169193326Sed
1170206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1171193326Sed_mm_setzero_si128(void)
1172193326Sed{
1173193326Sed  return (__m128i){ 0LL, 0LL };
1174193326Sed}
1175193326Sed
1176206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
1177249423Sdim_mm_store_si128(__m128i *__p, __m128i __b)
1178193326Sed{
1179249423Sdim  *__p = __b;
1180193326Sed}
1181193326Sed
1182206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
1183249423Sdim_mm_storeu_si128(__m128i *__p, __m128i __b)
1184193326Sed{
1185249423Sdim  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
1186193326Sed}
1187193326Sed
1188206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
1189249423Sdim_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1190193326Sed{
1191249423Sdim  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
1192193326Sed}
1193193326Sed
1194206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
1195249423Sdim_mm_storel_epi64(__m128i *__p, __m128i __a)
1196193326Sed{
1197239462Sdim  struct __mm_storel_epi64_struct {
1198249423Sdim    long long __u;
1199239462Sdim  } __attribute__((__packed__, __may_alias__));
1200249423Sdim  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
1201193326Sed}
1202193326Sed
1203206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
1204249423Sdim_mm_stream_pd(double *__p, __m128d __a)
1205193326Sed{
1206249423Sdim  __builtin_ia32_movntpd(__p, __a);
1207193326Sed}
1208193326Sed
1209206084Srdivackystatic __inline__ void __attribute__((__always_inline__, __nodebug__))
1210249423Sdim_mm_stream_si128(__m128i *__p, __m128i __a)
1211193326Sed{
1212249423Sdim  __builtin_ia32_movntdq(__p, __a);
1213193326Sed}
1214193326Sed
/* Store the 32-bit integer __a to __p with the non-temporal store builtin
   (MOVNTI). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
1220193326Sed
1221263508Sdim#ifdef __x86_64__
/* Store the 64-bit integer __a to __p with the non-temporal store builtin
   (MOVNTI, 64-bit form). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  long long *__dst = __p;
  __builtin_ia32_movnti64(__dst, __a);
}
1227263508Sdim#endif
1228263508Sdim
/* Issue CLFLUSH for the cache line containing the address __p. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  void const *__line = __p;
  __builtin_ia32_clflush(__line);
}
1234193326Sed
/* Issue an LFENCE (load fence) instruction. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)0; /* no operands: the builtin below is the whole operation */
  __builtin_ia32_lfence();
}
1240193326Sed
/* Issue an MFENCE (full memory fence) instruction. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)0; /* no operands: the builtin below is the whole operation */
  __builtin_ia32_mfence();
}
1246193326Sed
1247206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1248249423Sdim_mm_packs_epi16(__m128i __a, __m128i __b)
1249193326Sed{
1250249423Sdim  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
1251193326Sed}
1252193326Sed
1253206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1254249423Sdim_mm_packs_epi32(__m128i __a, __m128i __b)
1255193326Sed{
1256249423Sdim  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
1257193326Sed}
1258193326Sed
1259206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1260249423Sdim_mm_packus_epi16(__m128i __a, __m128i __b)
1261193326Sed{
1262249423Sdim  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
1263193326Sed}
1264193326Sed
1265206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
1266249423Sdim_mm_extract_epi16(__m128i __a, int __imm)
1267193326Sed{
1268249423Sdim  __v8hi __b = (__v8hi)__a;
1269263508Sdim  return (unsigned short)__b[__imm & 7];
1270193326Sed}
1271193326Sed
1272206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1273249423Sdim_mm_insert_epi16(__m128i __a, int __b, int __imm)
1274193326Sed{
1275249423Sdim  __v8hi __c = (__v8hi)__a;
1276249423Sdim  __c[__imm & 7] = __b;
1277249423Sdim  return (__m128i)__c;
1278193326Sed}
1279193326Sed
1280206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
1281249423Sdim_mm_movemask_epi8(__m128i __a)
1282193326Sed{
1283249423Sdim  return __builtin_ia32_pmovmskb128((__v16qi)__a);
1284193326Sed}
1285193326Sed
/* Permute the four 32-bit lanes of `a`: result lane k is the lane of `a`
   selected by bits [2k+1:2k] of `imm`.
   `a` is evaluated once into a local (with -Wshadow silenced around that
   declaration only).  The second __builtin_shufflevector operand is a zero
   vector that is never selected, because every index is in the range 0..3. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
                                   (imm) & 0x3, \
                                   ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, \
                                   ((imm) & 0xc0) >> 6); })
1293221345Sdim
/* Permute the four low 16-bit lanes of `a`: result lane k (k = 0..3) is the
   low lane of `a` selected by bits [2k+1:2k] of `imm`; lanes 4..7 are copied
   through unchanged.
   `a` is evaluated once into a local (with -Wshadow silenced around that
   declaration only).  The zero vector passed as second shuffle operand is
   never selected, because every index is below 8. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   (imm) & 0x3, \
                                   ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, \
                                   ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })
1302221345Sdim
/* Permute the four high 16-bit lanes of `a`: lanes 0..3 are copied through
   unchanged, and result lane 4+k (k = 0..3) is high lane 4 + f of `a`, where
   f is the 2-bit field at bits [2k+1:2k] of `imm`.
   `a` is evaluated once into a local (with -Wshadow silenced around that
   declaration only).  The zero vector passed as second shuffle operand is
   never selected, because every index is below 8. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })
1313193326Sed
1314206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1315249423Sdim_mm_unpackhi_epi8(__m128i __a, __m128i __b)
1316193326Sed{
1317249423Sdim  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1318193326Sed}
1319193326Sed
1320206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1321249423Sdim_mm_unpackhi_epi16(__m128i __a, __m128i __b)
1322193326Sed{
1323249423Sdim  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1324193326Sed}
1325193326Sed
1326206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1327249423Sdim_mm_unpackhi_epi32(__m128i __a, __m128i __b)
1328193326Sed{
1329249423Sdim  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
1330193326Sed}
1331193326Sed
1332206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1333249423Sdim_mm_unpackhi_epi64(__m128i __a, __m128i __b)
1334193326Sed{
1335249423Sdim  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
1336193326Sed}
1337193326Sed
1338206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1339249423Sdim_mm_unpacklo_epi8(__m128i __a, __m128i __b)
1340193326Sed{
1341249423Sdim  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1342193326Sed}
1343193326Sed
1344206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1345249423Sdim_mm_unpacklo_epi16(__m128i __a, __m128i __b)
1346193326Sed{
1347249423Sdim  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1348193326Sed}
1349193326Sed
1350206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1351249423Sdim_mm_unpacklo_epi32(__m128i __a, __m128i __b)
1352193326Sed{
1353249423Sdim  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
1354193326Sed}
1355193326Sed
1356206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1357249423Sdim_mm_unpacklo_epi64(__m128i __a, __m128i __b)
1358193326Sed{
1359249423Sdim  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
1360193326Sed}
1361193326Sed
1362206084Srdivackystatic __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1363249423Sdim_mm_movepi64_pi64(__m128i __a)
1364193326Sed{
1365249423Sdim  return (__m64)__a[0];
1366193326Sed}
1367193326Sed
1368206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1369258747Sdim_mm_movpi64_epi64(__m64 __a)
1370193326Sed{
1371249423Sdim  return (__m128i){ (long long)__a, 0 };
1372193326Sed}
1373193326Sed
1374206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1375249423Sdim_mm_move_epi64(__m128i __a)
1376193326Sed{
1377249423Sdim  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
1378193326Sed}
1379193326Sed
1380206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1381249423Sdim_mm_unpackhi_pd(__m128d __a, __m128d __b)
1382193326Sed{
1383249423Sdim  return __builtin_shufflevector(__a, __b, 1, 2+1);
1384193326Sed}
1385193326Sed
1386206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1387249423Sdim_mm_unpacklo_pd(__m128d __a, __m128d __b)
1388193326Sed{
1389249423Sdim  return __builtin_shufflevector(__a, __b, 0, 2+0);
1390193326Sed}
1391193326Sed
1392206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__))
1393249423Sdim_mm_movemask_pd(__m128d __a)
1394193326Sed{
1395249423Sdim  return __builtin_ia32_movmskpd(__a);
1396193326Sed}
1397193326Sed
/* Select one double from each input: result lane 0 is lane (i & 1) of `a`,
   result lane 1 is lane ((i >> 1) & 1) of `b` (the "+ 2" moves the index into
   the second shuffle operand).
   `a` and `b` are each evaluated once into locals, with -Wshadow silenced
   around those declarations only. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128d __a = (a); \
  __m128d __b = (b); \
  _Pragma("clang diagnostic pop"); \
  __builtin_shufflevector(__a, __b, \
                          (i) & 1, \
                          (((i) & 2) >> 1) + 2); })
1404193326Sed
1405206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1406251662Sdim_mm_castpd_ps(__m128d __a)
1407193326Sed{
1408251662Sdim  return (__m128)__a;
1409193326Sed}
1410193326Sed
1411206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1412251662Sdim_mm_castpd_si128(__m128d __a)
1413193326Sed{
1414251662Sdim  return (__m128i)__a;
1415193326Sed}
1416193326Sed
1417206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1418251662Sdim_mm_castps_pd(__m128 __a)
1419193326Sed{
1420251662Sdim  return (__m128d)__a;
1421193326Sed}
1422193326Sed
1423206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1424251662Sdim_mm_castps_si128(__m128 __a)
1425193326Sed{
1426251662Sdim  return (__m128i)__a;
1427193326Sed}
1428193326Sed
1429206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1430251662Sdim_mm_castsi128_ps(__m128i __a)
1431193326Sed{
1432251662Sdim  return (__m128)__a;
1433193326Sed}
1434193326Sed
1435206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1436251662Sdim_mm_castsi128_pd(__m128i __a)
1437193326Sed{
1438251662Sdim  return (__m128d)__a;
1439193326Sed}
1440193326Sed
/* Emit a PAUSE instruction through a volatile inline-asm statement. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1446193326Sed
/* Build a 2-bit selector: x goes to bit 1, y goes to bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1448193326Sed
1449193326Sed#endif /* __SSE2__ */
1450193326Sed
1451193326Sed#endif /* __EMMINTRIN_H */
1452