emmintrin.h revision 208600
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
217683Spst *
317683Spst * Permission is hereby granted, free of charge, to any person obtaining a copy
417683Spst * of this software and associated documentation files (the "Software"), to deal
517683Spst * in the Software without restriction, including without limitation the rights
617683Spst * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
717683Spst * copies of the Software, and to permit persons to whom the Software is
817683Spst * furnished to do so, subject to the following conditions:
917683Spst *
1017683Spst * The above copyright notice and this permission notice shall be included in
1117683Spst * all copies or substantial portions of the Software.
1217683Spst *
1317683Spst * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1417683Spst * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1517683Spst * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1617683Spst * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1717683Spst * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1817683Spst * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1917683Spst * THE SOFTWARE.
2017683Spst *
21214518Srpaulo *===-----------------------------------------------------------------------===
2217683Spst */
2317683Spst
2475107Sfenner#ifndef __EMMINTRIN_H
2575107Sfenner#define __EMMINTRIN_H
2675107Sfenner
2775107Sfenner#ifndef __SSE2__
2875107Sfenner#error "SSE2 instruction set not enabled"
2975107Sfenner#else
3075107Sfenner
3175107Sfenner#include <xmmintrin.h>
3275107Sfenner
/* 16-byte vector types: __m128d has double elements, __m128i long long. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* 16-byte views with short and char elements, used for casts in this file. */
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
3875107Sfenner
3975107Sfennerstatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
4075107Sfenner_mm_add_sd(__m128d a, __m128d b)
4175107Sfenner{
4275107Sfenner  a[0] += b[0];
4375107Sfenner  return a;
4475107Sfenner}
4575107Sfenner
4675107Sfennerstatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
4775107Sfenner_mm_add_pd(__m128d a, __m128d b)
4817683Spst{
4917683Spst  return a + b;
5017683Spst}
5117683Spst
5217683Spststatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
5317683Spst_mm_sub_sd(__m128d a, __m128d b)
5417683Spst{
5517683Spst  a[0] -= b[0];
5617683Spst  return a;
5717683Spst}
5817683Spst
5917683Spststatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
6017683Spst_mm_sub_pd(__m128d a, __m128d b)
6117683Spst{
6217683Spst  return a - b;
6317683Spst}
6417683Spst
6517683Spststatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
6617683Spst_mm_mul_sd(__m128d a, __m128d b)
6717683Spst{
6817683Spst  a[0] *= b[0];
6917683Spst  return a;
7017683Spst}
7117683Spst
7217683Spststatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
7317683Spst_mm_mul_pd(__m128d a, __m128d b)
7417683Spst{
7517683Spst  return a * b;
7617683Spst}
7717683Spst
7817683Spststatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
7917683Spst_mm_div_sd(__m128d a, __m128d b)
8017683Spst{
8117683Spst  a[0] /= b[0];
8217683Spst  return a;
8317683Spst}
8417683Spst
8517683Spststatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
8617683Spst_mm_div_pd(__m128d a, __m128d b)
8717683Spst{
8817683Spst  return a / b;
8917683Spst}
9017683Spst
9117683Spststatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
9217683Spst_mm_sqrt_sd(__m128d a, __m128d b)
9317683Spst{
9417683Spst  __m128d c = __builtin_ia32_sqrtsd(b);
9517683Spst  return (__m128d) { c[0], a[1] };
9698530Sfenner}
9798530Sfenner
9898530Sfennerstatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
9998530Sfenner_mm_sqrt_pd(__m128d a)
10098530Sfenner{
10198530Sfenner  return __builtin_ia32_sqrtpd(a);
10256889Sfenner}
10375107Sfenner
10456889Sfennerstatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
105146768Ssam_mm_min_sd(__m128d a, __m128d b)
106146768Ssam{
107146768Ssam  return __builtin_ia32_minsd(a, b);
108146768Ssam}
109146768Ssam
110146768Ssamstatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
111162012Ssam_mm_min_pd(__m128d a, __m128d b)
112162012Ssam{
113162012Ssam  return __builtin_ia32_minpd(a, b);
114162012Ssam}
115162012Ssam
116162012Ssamstatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
11717683Spst_mm_max_sd(__m128d a, __m128d b)
11817683Spst{
11917683Spst  return __builtin_ia32_maxsd(a, b);
120235426Sdelphij}
121235426Sdelphij
122235426Sdelphijstatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
123_mm_max_pd(__m128d a, __m128d b)
124{
125  return __builtin_ia32_maxpd(a, b);
126}
127
128static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
129_mm_and_pd(__m128d a, __m128d b)
130{
131  return (__m128d)((__v4si)a & (__v4si)b);
132}
133
134static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
135_mm_andnot_pd(__m128d a, __m128d b)
136{
137  return (__m128d)(~(__v4si)a & (__v4si)b);
138}
139
140static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
141_mm_or_pd(__m128d a, __m128d b)
142{
143  return (__m128d)((__v4si)a | (__v4si)b);
144}
145
146static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
147_mm_xor_pd(__m128d a, __m128d b)
148{
149  return (__m128d)((__v4si)a ^ (__v4si)b);
150}
151
152static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
153_mm_cmpeq_pd(__m128d a, __m128d b)
154{
155  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
156}
157
158static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
159_mm_cmplt_pd(__m128d a, __m128d b)
160{
161  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
162}
163
164static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
165_mm_cmple_pd(__m128d a, __m128d b)
166{
167  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
168}
169
170static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
171_mm_cmpgt_pd(__m128d a, __m128d b)
172{
173  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
174}
175
176static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
177_mm_cmpge_pd(__m128d a, __m128d b)
178{
179  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
180}
181
182static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
183_mm_cmpord_pd(__m128d a, __m128d b)
184{
185  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
186}
187
188static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
189_mm_cmpunord_pd(__m128d a, __m128d b)
190{
191  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
192}
193
194static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
195_mm_cmpneq_pd(__m128d a, __m128d b)
196{
197  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
198}
199
200static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
201_mm_cmpnlt_pd(__m128d a, __m128d b)
202{
203  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
204}
205
206static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
207_mm_cmpnle_pd(__m128d a, __m128d b)
208{
209  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
210}
211
212static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
213_mm_cmpngt_pd(__m128d a, __m128d b)
214{
215  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
216}
217
218static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
219_mm_cmpnge_pd(__m128d a, __m128d b)
220{
221  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
222}
223
224static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
225_mm_cmpeq_sd(__m128d a, __m128d b)
226{
227  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
228}
229
230static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
231_mm_cmplt_sd(__m128d a, __m128d b)
232{
233  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
234}
235
236static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
237_mm_cmple_sd(__m128d a, __m128d b)
238{
239  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
240}
241
242static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
243_mm_cmpgt_sd(__m128d a, __m128d b)
244{
245  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
246}
247
248static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
249_mm_cmpge_sd(__m128d a, __m128d b)
250{
251  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
252}
253
254static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
255_mm_cmpord_sd(__m128d a, __m128d b)
256{
257  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
258}
259
260static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
261_mm_cmpunord_sd(__m128d a, __m128d b)
262{
263  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
264}
265
266static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
267_mm_cmpneq_sd(__m128d a, __m128d b)
268{
269  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
270}
271
272static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
273_mm_cmpnlt_sd(__m128d a, __m128d b)
274{
275  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
276}
277
278static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
279_mm_cmpnle_sd(__m128d a, __m128d b)
280{
281  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
282}
283
284static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
285_mm_cmpngt_sd(__m128d a, __m128d b)
286{
287  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
288}
289
290static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
291_mm_cmpnge_sd(__m128d a, __m128d b)
292{
293  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
294}
295
296static __inline__ int __attribute__((__always_inline__, __nodebug__))
297_mm_comieq_sd(__m128d a, __m128d b)
298{
299  return __builtin_ia32_comisdeq(a, b);
300}
301
302static __inline__ int __attribute__((__always_inline__, __nodebug__))
303_mm_comilt_sd(__m128d a, __m128d b)
304{
305  return __builtin_ia32_comisdlt(a, b);
306}
307
308static __inline__ int __attribute__((__always_inline__, __nodebug__))
309_mm_comile_sd(__m128d a, __m128d b)
310{
311  return __builtin_ia32_comisdle(a, b);
312}
313
314static __inline__ int __attribute__((__always_inline__, __nodebug__))
315_mm_comigt_sd(__m128d a, __m128d b)
316{
317  return __builtin_ia32_comisdgt(a, b);
318}
319
320static __inline__ int __attribute__((__always_inline__, __nodebug__))
321_mm_comineq_sd(__m128d a, __m128d b)
322{
323  return __builtin_ia32_comisdneq(a, b);
324}
325
326static __inline__ int __attribute__((__always_inline__, __nodebug__))
327_mm_ucomieq_sd(__m128d a, __m128d b)
328{
329  return __builtin_ia32_ucomisdeq(a, b);
330}
331
332static __inline__ int __attribute__((__always_inline__, __nodebug__))
333_mm_ucomilt_sd(__m128d a, __m128d b)
334{
335  return __builtin_ia32_ucomisdlt(a, b);
336}
337
338static __inline__ int __attribute__((__always_inline__, __nodebug__))
339_mm_ucomile_sd(__m128d a, __m128d b)
340{
341  return __builtin_ia32_ucomisdle(a, b);
342}
343
344static __inline__ int __attribute__((__always_inline__, __nodebug__))
345_mm_ucomigt_sd(__m128d a, __m128d b)
346{
347  return __builtin_ia32_ucomisdgt(a, b);
348}
349
350static __inline__ int __attribute__((__always_inline__, __nodebug__))
351_mm_ucomineq_sd(__m128d a, __m128d b)
352{
353  return __builtin_ia32_ucomisdneq(a, b);
354}
355
356static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
357_mm_cvtpd_ps(__m128d a)
358{
359  return __builtin_ia32_cvtpd2ps(a);
360}
361
362static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
363_mm_cvtps_pd(__m128 a)
364{
365  return __builtin_ia32_cvtps2pd(a);
366}
367
368static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
369_mm_cvtepi32_pd(__m128i a)
370{
371  return __builtin_ia32_cvtdq2pd((__v4si)a);
372}
373
374static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
375_mm_cvtpd_epi32(__m128d a)
376{
377  return __builtin_ia32_cvtpd2dq(a);
378}
379
380static __inline__ int __attribute__((__always_inline__, __nodebug__))
381_mm_cvtsd_si32(__m128d a)
382{
383  return __builtin_ia32_cvtsd2si(a);
384}
385
386static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
387_mm_cvtsd_ss(__m128 a, __m128d b)
388{
389  a[0] = b[0];
390  return a;
391}
392
393static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
394_mm_cvtsi32_sd(__m128d a, int b)
395{
396  a[0] = b;
397  return a;
398}
399
400static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
401_mm_cvtss_sd(__m128d a, __m128 b)
402{
403  a[0] = b[0];
404  return a;
405}
406
407static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
408_mm_cvttpd_epi32(__m128d a)
409{
410  return (__m128i)__builtin_ia32_cvttpd2dq(a);
411}
412
413static __inline__ int __attribute__((__always_inline__, __nodebug__))
414_mm_cvttsd_si32(__m128d a)
415{
416  return a[0];
417}
418
419static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
420_mm_cvtpd_pi32(__m128d a)
421{
422  return (__m64)__builtin_ia32_cvtpd2pi(a);
423}
424
425static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
426_mm_cvttpd_pi32(__m128d a)
427{
428  return (__m64)__builtin_ia32_cvttpd2pi(a);
429}
430
431static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
432_mm_cvtpi32_pd(__m64 a)
433{
434  return __builtin_ia32_cvtpi2pd((__v2si)a);
435}
436
437static __inline__ double __attribute__((__always_inline__, __nodebug__))
438_mm_cvtsd_f64(__m128d a)
439{
440  return a[0];
441}
442
443static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
444_mm_load_pd(double const *dp)
445{
446  return *(__m128d*)dp;
447}
448
449static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
450_mm_load1_pd(double const *dp)
451{
452  return (__m128d){ dp[0], dp[0] };
453}
454
/* _mm_load_pd1 is an alternate spelling that expands to _mm_load1_pd. */
#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
456
457static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
458_mm_loadr_pd(double const *dp)
459{
460  return (__m128d){ dp[1], dp[0] };
461}
462
463static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
464_mm_loadu_pd(double const *dp)
465{
466  return __builtin_ia32_loadupd(dp);
467}
468
469static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
470_mm_load_sd(double const *dp)
471{
472  return (__m128d){ *dp, 0.0 };
473}
474
475static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
476_mm_loadh_pd(__m128d a, double const *dp)
477{
478  return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
479}
480
481static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
482_mm_loadl_pd(__m128d a, double const *dp)
483{
484  return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
485}
486
487static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
488_mm_set_sd(double w)
489{
490  return (__m128d){ w, 0 };
491}
492
493static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
494_mm_set1_pd(double w)
495{
496  return (__m128d){ w, w };
497}
498
499static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
500_mm_set_pd(double w, double x)
501{
502  return (__m128d){ x, w };
503}
504
505static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
506_mm_setr_pd(double w, double x)
507{
508  return (__m128d){ w, x };
509}
510
511static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
512_mm_setzero_pd(void)
513{
514  return (__m128d){ 0, 0 };
515}
516
517static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
518_mm_move_sd(__m128d a, __m128d b)
519{
520  return (__m128d){ b[0], a[1] };
521}
522
523static __inline__ void __attribute__((__always_inline__, __nodebug__))
524_mm_store_sd(double *dp, __m128d a)
525{
526  dp[0] = a[0];
527}
528
529static __inline__ void __attribute__((__always_inline__, __nodebug__))
530_mm_store1_pd(double *dp, __m128d a)
531{
532  dp[0] = a[0];
533  dp[1] = a[0];
534}
535
536static __inline__ void __attribute__((__always_inline__, __nodebug__))
537_mm_store_pd(double *dp, __m128d a)
538{
539  *(__m128d *)dp = a;
540}
541
542static __inline__ void __attribute__((__always_inline__, __nodebug__))
543_mm_storeu_pd(double *dp, __m128d a)
544{
545  __builtin_ia32_storeupd(dp, a);
546}
547
548static __inline__ void __attribute__((__always_inline__, __nodebug__))
549_mm_storer_pd(double *dp, __m128d a)
550{
551  dp[0] = a[1];
552  dp[1] = a[0];
553}
554
555static __inline__ void __attribute__((__always_inline__, __nodebug__))
556_mm_storeh_pd(double *dp, __m128d a)
557{
558  dp[0] = a[1];
559}
560
561static __inline__ void __attribute__((__always_inline__, __nodebug__))
562_mm_storel_pd(double *dp, __m128d a)
563{
564  dp[0] = a[0];
565}
566
567static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
568_mm_add_epi8(__m128i a, __m128i b)
569{
570  return (__m128i)((__v16qi)a + (__v16qi)b);
571}
572
573static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
574_mm_add_epi16(__m128i a, __m128i b)
575{
576  return (__m128i)((__v8hi)a + (__v8hi)b);
577}
578
579static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
580_mm_add_epi32(__m128i a, __m128i b)
581{
582  return (__m128i)((__v4si)a + (__v4si)b);
583}
584
585static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
586_mm_add_si64(__m64 a, __m64 b)
587{
588  return a + b;
589}
590
591static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
592_mm_add_epi64(__m128i a, __m128i b)
593{
594  return a + b;
595}
596
597static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
598_mm_adds_epi8(__m128i a, __m128i b)
599{
600  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
601}
602
603static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
604_mm_adds_epi16(__m128i a, __m128i b)
605{
606  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
607}
608
609static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
610_mm_adds_epu8(__m128i a, __m128i b)
611{
612  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
613}
614
615static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
616_mm_adds_epu16(__m128i a, __m128i b)
617{
618  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
619}
620
621static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
622_mm_avg_epu8(__m128i a, __m128i b)
623{
624  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
625}
626
627static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
628_mm_avg_epu16(__m128i a, __m128i b)
629{
630  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
631}
632
633static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
634_mm_madd_epi16(__m128i a, __m128i b)
635{
636  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
637}
638
639static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
640_mm_max_epi16(__m128i a, __m128i b)
641{
642  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
643}
644
645static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
646_mm_max_epu8(__m128i a, __m128i b)
647{
648  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
649}
650
651static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
652_mm_min_epi16(__m128i a, __m128i b)
653{
654  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
655}
656
657static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
658_mm_min_epu8(__m128i a, __m128i b)
659{
660  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
661}
662
663static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
664_mm_mulhi_epi16(__m128i a, __m128i b)
665{
666  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
667}
668
669static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
670_mm_mulhi_epu16(__m128i a, __m128i b)
671{
672  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
673}
674
675static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
676_mm_mullo_epi16(__m128i a, __m128i b)
677{
678  return (__m128i)((__v8hi)a * (__v8hi)b);
679}
680
681static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
682_mm_mul_su32(__m64 a, __m64 b)
683{
684  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
685}
686
687static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
688_mm_mul_epu32(__m128i a, __m128i b)
689{
690  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
691}
692
693static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
694_mm_sad_epu8(__m128i a, __m128i b)
695{
696  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
697}
698
699static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
700_mm_sub_epi8(__m128i a, __m128i b)
701{
702  return (__m128i)((__v16qi)a - (__v16qi)b);
703}
704
705static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
706_mm_sub_epi16(__m128i a, __m128i b)
707{
708  return (__m128i)((__v8hi)a - (__v8hi)b);
709}
710
711static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
712_mm_sub_epi32(__m128i a, __m128i b)
713{
714  return (__m128i)((__v4si)a - (__v4si)b);
715}
716
717static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
718_mm_sub_si64(__m64 a, __m64 b)
719{
720  return a - b;
721}
722
723static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
724_mm_sub_epi64(__m128i a, __m128i b)
725{
726  return a - b;
727}
728
729static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
730_mm_subs_epi8(__m128i a, __m128i b)
731{
732  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
733}
734
735static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
736_mm_subs_epi16(__m128i a, __m128i b)
737{
738  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
739}
740
741static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
742_mm_subs_epu8(__m128i a, __m128i b)
743{
744  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
745}
746
747static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
748_mm_subs_epu16(__m128i a, __m128i b)
749{
750  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
751}
752
753static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
754_mm_and_si128(__m128i a, __m128i b)
755{
756  return a & b;
757}
758
759static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
760_mm_andnot_si128(__m128i a, __m128i b)
761{
762  return ~a & b;
763}
764
765static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
766_mm_or_si128(__m128i a, __m128i b)
767{
768  return a | b;
769}
770
771static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
772_mm_xor_si128(__m128i a, __m128i b)
773{
774  return a ^ b;
775}
776
777static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
778_mm_slli_si128(__m128i a, int imm)
779{
780  return __builtin_ia32_pslldqi128(a, imm * 8);
781}
782
783static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
784_mm_slli_epi16(__m128i a, int count)
785{
786  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
787}
788
789static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
790_mm_sll_epi16(__m128i a, __m128i count)
791{
792  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
793}
794
795static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
796_mm_slli_epi32(__m128i a, int count)
797{
798  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
799}
800
801static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
802_mm_sll_epi32(__m128i a, __m128i count)
803{
804  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
805}
806
807static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
808_mm_slli_epi64(__m128i a, int count)
809{
810  return __builtin_ia32_psllqi128(a, count);
811}
812
813static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
814_mm_sll_epi64(__m128i a, __m128i count)
815{
816  return __builtin_ia32_psllq128(a, count);
817}
818
819static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
820_mm_srai_epi16(__m128i a, int count)
821{
822  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
823}
824
825static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
826_mm_sra_epi16(__m128i a, __m128i count)
827{
828  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
829}
830
831static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
832_mm_srai_epi32(__m128i a, int count)
833{
834  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
835}
836
837static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
838_mm_sra_epi32(__m128i a, __m128i count)
839{
840  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
841}
842
843static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
844_mm_srli_si128(__m128i a, int imm)
845{
846  return __builtin_ia32_psrldqi128(a, imm * 8);
847}
848
849static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
850_mm_srli_epi16(__m128i a, int count)
851{
852  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
853}
854
855static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
856_mm_srl_epi16(__m128i a, __m128i count)
857{
858  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
859}
860
861static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
862_mm_srli_epi32(__m128i a, int count)
863{
864  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
865}
866
867static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
868_mm_srl_epi32(__m128i a, __m128i count)
869{
870  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
871}
872
873static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
874_mm_srli_epi64(__m128i a, int count)
875{
876  return __builtin_ia32_psrlqi128(a, count);
877}
878
879static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
880_mm_srl_epi64(__m128i a, __m128i count)
881{
882  return __builtin_ia32_psrlq128(a, count);
883}
884
885static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
886_mm_cmpeq_epi8(__m128i a, __m128i b)
887{
888  return (__m128i)((__v16qi)a == (__v16qi)b);
889}
890
891static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
892_mm_cmpeq_epi16(__m128i a, __m128i b)
893{
894  return (__m128i)((__v8hi)a == (__v8hi)b);
895}
896
897static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
898_mm_cmpeq_epi32(__m128i a, __m128i b)
899{
900  return (__m128i)((__v4si)a == (__v4si)b);
901}
902
903static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
904_mm_cmpgt_epi8(__m128i a, __m128i b)
905{
906  return (__m128i)((__v16qi)a > (__v16qi)b);
907}
908
909static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
910_mm_cmpgt_epi16(__m128i a, __m128i b)
911{
912  return (__m128i)((__v8hi)a > (__v8hi)b);
913}
914
915static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
916_mm_cmpgt_epi32(__m128i a, __m128i b)
917{
918  return (__m128i)((__v4si)a > (__v4si)b);
919}
920
921static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
922_mm_cmplt_epi8(__m128i a, __m128i b)
923{
924  return _mm_cmpgt_epi8(b,a);
925}
926
927static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
928_mm_cmplt_epi16(__m128i a, __m128i b)
929{
930  return _mm_cmpgt_epi16(b,a);
931}
932
933static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
934_mm_cmplt_epi32(__m128i a, __m128i b)
935{
936  return _mm_cmpgt_epi32(b,a);
937}
938
939#ifdef __x86_64__
940static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
941_mm_cvtsi64_sd(__m128d a, long long b)
942{
943  a[0] = b;
944  return a;
945}
946
947static __inline__ long long __attribute__((__always_inline__, __nodebug__))
948_mm_cvtsd_si64(__m128d a)
949{
950  return __builtin_ia32_cvtsd2si64(a);
951}
952
953static __inline__ long long __attribute__((__always_inline__, __nodebug__))
954_mm_cvttsd_si64(__m128d a)
955{
956  return a[0];
957}
958#endif
959
960static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
961_mm_cvtepi32_ps(__m128i a)
962{
963  return __builtin_ia32_cvtdq2ps((__v4si)a);
964}
965
966static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
967_mm_cvtps_epi32(__m128 a)
968{
969  return (__m128i)__builtin_ia32_cvtps2dq(a);
970}
971
972static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
973_mm_cvttps_epi32(__m128 a)
974{
975  return (__m128i)__builtin_ia32_cvttps2dq(a);
976}
977
978static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
979_mm_cvtsi32_si128(int a)
980{
981  return (__m128i)(__v4si){ a, 0, 0, 0 };
982}
983
984#ifdef __x86_64__
985static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
986_mm_cvtsi64_si128(long long a)
987{
988  return (__m128i){ a, 0 };
989}
990#endif
991
992static __inline__ int __attribute__((__always_inline__, __nodebug__))
993_mm_cvtsi128_si32(__m128i a)
994{
995  __v4si b = (__v4si)a;
996  return b[0];
997}
998
999#ifdef __x86_64__
1000static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1001_mm_cvtsi128_si64(__m128i a)
1002{
1003  return a[0];
1004}
1005#endif
1006
1007static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1008_mm_load_si128(__m128i const *p)
1009{
1010  return *p;
1011}
1012
1013static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1014_mm_loadu_si128(__m128i const *p)
1015{
1016  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
1017}
1018
1019static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1020_mm_loadl_epi64(__m128i const *p)
1021{
1022  return (__m128i) { *(long long*)p, 0};
1023}
1024
1025static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1026_mm_set_epi64x(long long q1, long long q0)
1027{
1028  return (__m128i){ q0, q1 };
1029}
1030
1031static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1032_mm_set_epi64(__m64 q1, __m64 q0)
1033{
1034  return (__m128i){ (long long)q0, (long long)q1 };
1035}
1036
1037static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1038_mm_set_epi32(int i3, int i2, int i1, int i0)
1039{
1040  return (__m128i)(__v4si){ i0, i1, i2, i3};
1041}
1042
1043static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1044_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1045{
1046  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1047}
1048
1049static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1050_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1051{
1052  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1053}
1054
1055static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1056_mm_set1_epi64x(long long q)
1057{
1058  return (__m128i){ q, q };
1059}
1060
1061static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1062_mm_set1_epi64(__m64 q)
1063{
1064  return (__m128i){ (long long)q, (long long)q };
1065}
1066
1067static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1068_mm_set1_epi32(int i)
1069{
1070  return (__m128i)(__v4si){ i, i, i, i };
1071}
1072
1073static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1074_mm_set1_epi16(short w)
1075{
1076  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1077}
1078
1079static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1080_mm_set1_epi8(char b)
1081{
1082  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1083}
1084
1085static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1086_mm_setr_epi64(__m64 q0, __m64 q1)
1087{
1088  return (__m128i){ (long long)q0, (long long)q1 };
1089}
1090
1091static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1092_mm_setr_epi32(int i0, int i1, int i2, int i3)
1093{
1094  return (__m128i)(__v4si){ i0, i1, i2, i3};
1095}
1096
1097static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1098_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1099{
1100  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1101}
1102
1103static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1104_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1105{
1106  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1107}
1108
1109static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1110_mm_setzero_si128(void)
1111{
1112  return (__m128i){ 0LL, 0LL };
1113}
1114
1115static __inline__ void __attribute__((__always_inline__, __nodebug__))
1116_mm_store_si128(__m128i *p, __m128i b)
1117{
1118  *p = b;
1119}
1120
1121static __inline__ void __attribute__((__always_inline__, __nodebug__))
1122_mm_storeu_si128(__m128i *p, __m128i b)
1123{
1124  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1125}
1126
1127static __inline__ void __attribute__((__always_inline__, __nodebug__))
1128_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1129{
1130  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1131}
1132
1133static __inline__ void __attribute__((__always_inline__, __nodebug__))
1134_mm_storel_epi64(__m128i *p, __m128i a)
1135{
1136  __builtin_ia32_storelv4si((__v2si *)p, a);
1137}
1138
1139static __inline__ void __attribute__((__always_inline__, __nodebug__))
1140_mm_stream_pd(double *p, __m128d a)
1141{
1142  __builtin_ia32_movntpd(p, a);
1143}
1144
1145static __inline__ void __attribute__((__always_inline__, __nodebug__))
1146_mm_stream_si128(__m128i *p, __m128i a)
1147{
1148  __builtin_ia32_movntdq(p, a);
1149}
1150
/* Writes the 32-bit integer a to *p using the movnti builtin (MOVNTI,
 * the non-temporal scalar store). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *dst = p;
  const int value = a;
  __builtin_ia32_movnti(dst, value);
}
1156
/* Issues the clflush builtin (CLFLUSH) for the address p.  Nothing is read
 * from or written to *p by this function itself. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *line = p;
  __builtin_ia32_clflush(line);
}
1162
/* Issues the lfence builtin (the LFENCE instruction).  Takes no arguments
 * and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)0; /* no operands: the builtin is the whole operation */
  __builtin_ia32_lfence();
}
1168
/* Issues the mfence builtin (the MFENCE instruction).  Takes no arguments
 * and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)0; /* no operands: the builtin is the whole operation */
  __builtin_ia32_mfence();
}
1174
1175static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1176_mm_packs_epi16(__m128i a, __m128i b)
1177{
1178  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1179}
1180
1181static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1182_mm_packs_epi32(__m128i a, __m128i b)
1183{
1184  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1185}
1186
1187static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1188_mm_packus_epi16(__m128i a, __m128i b)
1189{
1190  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1191}
1192
1193static __inline__ int __attribute__((__always_inline__, __nodebug__))
1194_mm_extract_epi16(__m128i a, int imm)
1195{
1196  __v8hi b = (__v8hi)a;
1197  return b[imm];
1198}
1199
1200static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1201_mm_insert_epi16(__m128i a, int b, int imm)
1202{
1203  __v8hi c = (__v8hi)a;
1204  c[imm & 7] = b;
1205  return (__m128i)c;
1206}
1207
1208static __inline__ int __attribute__((__always_inline__, __nodebug__))
1209_mm_movemask_epi8(__m128i a)
1210{
1211  return __builtin_ia32_pmovmskb128((__v16qi)a);
1212}
1213
1214#define _mm_shuffle_epi32(a, imm) \
1215  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) {0}, \
1216                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
1217                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6))
1218#define _mm_shufflelo_epi16(a, imm) \
1219  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, \
1220                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
1221                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
1222                                    4, 5, 6, 7))
1223#define _mm_shufflehi_epi16(a, imm) \
1224  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, 0, 1, 2, 3, \
1225                                    4 + ((imm) & 0x3), 4 + ((imm) & 0xc) >> 2, \
1226                                    4 + ((imm) & 0x30) >> 4, \
1227                                    4 + ((imm) & 0xc0) >> 6))
1228
1229static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1230_mm_unpackhi_epi8(__m128i a, __m128i b)
1231{
1232  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1233}
1234
1235static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1236_mm_unpackhi_epi16(__m128i a, __m128i b)
1237{
1238  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1239}
1240
1241static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1242_mm_unpackhi_epi32(__m128i a, __m128i b)
1243{
1244  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1245}
1246
1247static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1248_mm_unpackhi_epi64(__m128i a, __m128i b)
1249{
1250  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1251}
1252
1253static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1254_mm_unpacklo_epi8(__m128i a, __m128i b)
1255{
1256  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1257}
1258
1259static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1260_mm_unpacklo_epi16(__m128i a, __m128i b)
1261{
1262  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1263}
1264
1265static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1266_mm_unpacklo_epi32(__m128i a, __m128i b)
1267{
1268  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1269}
1270
1271static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1272_mm_unpacklo_epi64(__m128i a, __m128i b)
1273{
1274  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1275}
1276
1277static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1278_mm_movepi64_pi64(__m128i a)
1279{
1280  return (__m64)a[0];
1281}
1282
1283static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1284_mm_movpi64_pi64(__m64 a)
1285{
1286  return (__m128i){ (long long)a, 0 };
1287}
1288
1289static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1290_mm_move_epi64(__m128i a)
1291{
1292  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1293}
1294
1295static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1296_mm_unpackhi_pd(__m128d a, __m128d b)
1297{
1298  return __builtin_shufflevector(a, b, 1, 2+1);
1299}
1300
1301static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1302_mm_unpacklo_pd(__m128d a, __m128d b)
1303{
1304  return __builtin_shufflevector(a, b, 0, 2+0);
1305}
1306
1307static __inline__ int __attribute__((__always_inline__, __nodebug__))
1308_mm_movemask_pd(__m128d a)
1309{
1310  return __builtin_ia32_movmskpd(a);
1311}
1312
1313#define _mm_shuffle_pd(a, b, i) \
1314  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, \
1315                                                       (((i) & 2) >> 1) + 2))
1316
1317static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1318_mm_castpd_ps(__m128d in)
1319{
1320  return (__m128)in;
1321}
1322
1323static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1324_mm_castpd_si128(__m128d in)
1325{
1326  return (__m128i)in;
1327}
1328
1329static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1330_mm_castps_pd(__m128 in)
1331{
1332  return (__m128d)in;
1333}
1334
1335static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1336_mm_castps_si128(__m128 in)
1337{
1338  return (__m128i)in;
1339}
1340
1341static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1342_mm_castsi128_ps(__m128i in)
1343{
1344  return (__m128)in;
1345}
1346
1347static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1348_mm_castsi128_pd(__m128i in)
1349{
1350  return (__m128d)in;
1351}
1352
/* Emits a single "pause" instruction through inline assembly.  The asm is
 * volatile so the compiler keeps it even though it has no outputs. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1358
/* Builds a 2-bit selector with x in bit 1 and y in bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1360
1361#endif /* __SSE2__ */
1362
1363#endif /* __EMMINTRIN_H */
1364