emmintrin.h revision 122180
1/* Copyright (C) 2003 Free Software Foundation, Inc. 2 3 This file is part of GNU CC. 4 5 GNU CC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 GNU CC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with GNU CC; see the file COPYING. If not, write to 17 the Free Software Foundation, 59 Temple Place - Suite 330, 18 Boston, MA 02111-1307, USA. */ 19 20/* As a special exception, if you include this header file into source 21 files compiled by GCC, this header file does not by itself cause 22 the resulting executable to be covered by the GNU General Public 23 License. This exception does not however invalidate any other 24 reasons why the executable file might be covered by the GNU General 25 Public License. */ 26 27/* Implemented from the specification included in the Intel C++ Compiler 28 User Guide and Reference, version 8.0. */ 29 30#ifndef _EMMINTRIN_H_INCLUDED 31#define _EMMINTRIN_H_INCLUDED 32 33#ifdef __SSE2__ 34#include <xmmintrin.h> 35 36/* SSE2 */ 37typedef int __v2df __attribute__ ((mode (V2DF))); 38typedef int __v2di __attribute__ ((mode (V2DI))); 39typedef int __v4si __attribute__ ((mode (V4SI))); 40typedef int __v8hi __attribute__ ((mode (V8HI))); 41typedef int __v16qi __attribute__ ((mode (V16QI))); 42 43/* Create a selector for use with the SHUFPD instruction. */ 44#define _MM_SHUFFLE2(fp1,fp0) \ 45 (((fp1) << 1) | (fp0)) 46 47#define __m128i __v2di 48#define __m128d __v2df 49 50/* Create a vector with element 0 as *P and the rest zero. */ 51static __inline __m128d 52_mm_load_sd (double const *__P) 53{ 54 return (__m128d) __builtin_ia32_loadsd (__P); 55} 56 57/* Create a vector with all two elements equal to *P. */ 58static __inline __m128d 59_mm_load1_pd (double const *__P) 60{ 61 __v2df __tmp = __builtin_ia32_loadsd (__P); 62 return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0)); 63} 64 65static __inline __m128d 66_mm_load_pd1 (double const *__P) 67{ 68 return _mm_load1_pd (__P); 69} 70 71/* Load two DPFP values from P. The address must be 16-byte aligned. */ 72static __inline __m128d 73_mm_load_pd (double const *__P) 74{ 75 return (__m128d) __builtin_ia32_loadapd (__P); 76} 77 78/* Load two DPFP values from P. The address need not be 16-byte aligned. */ 79static __inline __m128d 80_mm_loadu_pd (double const *__P) 81{ 82 return (__m128d) __builtin_ia32_loadupd (__P); 83} 84 85/* Load two DPFP values in reverse order. The address must be aligned. */ 86static __inline __m128d 87_mm_loadr_pd (double const *__P) 88{ 89 __v2df __tmp = __builtin_ia32_loadapd (__P); 90 return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1)); 91} 92 93/* Create a vector with element 0 as F and the rest zero. */ 94static __inline __m128d 95_mm_set_sd (double __F) 96{ 97 return (__m128d) __builtin_ia32_loadsd (&__F); 98} 99 100/* Create a vector with all two elements equal to F. */ 101static __inline __m128d 102_mm_set1_pd (double __F) 103{ 104 __v2df __tmp = __builtin_ia32_loadsd (&__F); 105 return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0)); 106} 107 108static __inline __m128d 109_mm_set_pd1 (double __F) 110{ 111 return _mm_set1_pd (__F); 112} 113 114/* Create the vector [Z Y]. */ 115static __inline __m128d 116_mm_set_pd (double __Z, double __Y) 117{ 118 union { 119 double __a[2]; 120 __m128d __v; 121 } __u; 122 123 __u.__a[0] = __Y; 124 __u.__a[1] = __Z; 125 126 return __u.__v; 127} 128 129/* Create the vector [Y Z]. */ 130static __inline __m128d 131_mm_setr_pd (double __Z, double __Y) 132{ 133 return _mm_set_pd (__Y, __Z); 134} 135 136/* Create a vector of zeros. */ 137static __inline __m128d 138_mm_setzero_pd (void) 139{ 140 return (__m128d) __builtin_ia32_setzeropd (); 141} 142 143/* Stores the lower DPFP value. */ 144static __inline void 145_mm_store_sd (double *__P, __m128d __A) 146{ 147 __builtin_ia32_storesd (__P, (__v2df)__A); 148} 149 150/* Store the lower DPFP value acrosd two words. */ 151static __inline void 152_mm_store1_pd (double *__P, __m128d __A) 153{ 154 __v2df __va = (__v2df)__A; 155 __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,0)); 156 __builtin_ia32_storeapd (__P, __tmp); 157} 158 159static __inline void 160_mm_store_pd1 (double *__P, __m128d __A) 161{ 162 _mm_store1_pd (__P, __A); 163} 164 165/* Store two DPFP values. The address must be 16-byte aligned. */ 166static __inline void 167_mm_store_pd (double *__P, __m128d __A) 168{ 169 __builtin_ia32_storeapd (__P, (__v2df)__A); 170} 171 172/* Store two DPFP values. The address need not be 16-byte aligned. */ 173static __inline void 174_mm_storeu_pd (double *__P, __m128d __A) 175{ 176 __builtin_ia32_storeupd (__P, (__v2df)__A); 177} 178 179/* Store two DPFP values in reverse order. The address must be aligned. */ 180static __inline void 181_mm_storer_pd (double *__P, __m128d __A) 182{ 183 __v2df __va = (__v2df)__A; 184 __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,1)); 185 __builtin_ia32_storeapd (__P, __tmp); 186} 187 188/* Sets the low DPFP value of A from the low value of B. */ 189static __inline __m128d 190_mm_move_sd (__m128d __A, __m128d __B) 191{ 192 return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); 193} 194 195 196static __inline __m128d 197_mm_add_pd (__m128d __A, __m128d __B) 198{ 199 return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B); 200} 201 202static __inline __m128d 203_mm_add_sd (__m128d __A, __m128d __B) 204{ 205 return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B); 206} 207 208static __inline __m128d 209_mm_sub_pd (__m128d __A, __m128d __B) 210{ 211 return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B); 212} 213 214static __inline __m128d 215_mm_sub_sd (__m128d __A, __m128d __B) 216{ 217 return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B); 218} 219 220static __inline __m128d 221_mm_mul_pd (__m128d __A, __m128d __B) 222{ 223 return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B); 224} 225 226static __inline __m128d 227_mm_mul_sd (__m128d __A, __m128d __B) 228{ 229 return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B); 230} 231 232static __inline __m128d 233_mm_div_pd (__m128d __A, __m128d __B) 234{ 235 return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B); 236} 237 238static __inline __m128d 239_mm_div_sd (__m128d __A, __m128d __B) 240{ 241 return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B); 242} 243 244static __inline __m128d 245_mm_sqrt_pd (__m128d __A) 246{ 247 return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A); 248} 249 250/* Return pair {sqrt (A[0), B[1]}. */ 251static __inline __m128d 252_mm_sqrt_sd (__m128d __A, __m128d __B) 253{ 254 __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); 255 return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp); 256} 257 258static __inline __m128d 259_mm_min_pd (__m128d __A, __m128d __B) 260{ 261 return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B); 262} 263 264static __inline __m128d 265_mm_min_sd (__m128d __A, __m128d __B) 266{ 267 return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B); 268} 269 270static __inline __m128d 271_mm_max_pd (__m128d __A, __m128d __B) 272{ 273 return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B); 274} 275 276static __inline __m128d 277_mm_max_sd (__m128d __A, __m128d __B) 278{ 279 return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B); 280} 281 282static __inline __m128d 283_mm_and_pd (__m128d __A, __m128d __B) 284{ 285 return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B); 286} 287 288static __inline __m128d 289_mm_andnot_pd (__m128d __A, __m128d __B) 290{ 291 return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B); 292} 293 294static __inline __m128d 295_mm_or_pd (__m128d __A, __m128d __B) 296{ 297 return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B); 298} 299 300static __inline __m128d 301_mm_xor_pd (__m128d __A, __m128d __B) 302{ 303 return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B); 304} 305 306static __inline __m128d 307_mm_cmpeq_pd (__m128d __A, __m128d __B) 308{ 309 return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B); 310} 311 312static __inline __m128d 313_mm_cmplt_pd (__m128d __A, __m128d __B) 314{ 315 return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B); 316} 317 318static __inline __m128d 319_mm_cmple_pd (__m128d __A, __m128d __B) 320{ 321 return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B); 322} 323 324static __inline __m128d 325_mm_cmpgt_pd (__m128d __A, __m128d __B) 326{ 327 return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B); 328} 329 330static __inline __m128d 331_mm_cmpge_pd (__m128d __A, __m128d __B) 332{ 333 return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B); 334} 335 336static __inline __m128d 337_mm_cmpneq_pd (__m128d __A, __m128d __B) 338{ 339 return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B); 340} 341 342static __inline __m128d 343_mm_cmpnlt_pd (__m128d __A, __m128d __B) 344{ 345 return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B); 346} 347 348static __inline __m128d 349_mm_cmpnle_pd (__m128d __A, __m128d __B) 350{ 351 return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B); 352} 353 354static __inline __m128d 355_mm_cmpngt_pd (__m128d __A, __m128d __B) 356{ 357 return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B); 358} 359 360static __inline __m128d 361_mm_cmpnge_pd (__m128d __A, __m128d __B) 362{ 363 return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B); 364} 365 366static __inline __m128d 367_mm_cmpord_pd (__m128d __A, __m128d __B) 368{ 369 return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B); 370} 371 372static __inline __m128d 373_mm_cmpunord_pd (__m128d __A, __m128d __B) 374{ 375 return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B); 376} 377 378static __inline __m128d 379_mm_cmpeq_sd (__m128d __A, __m128d __B) 380{ 381 return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B); 382} 383 384static __inline __m128d 385_mm_cmplt_sd (__m128d __A, __m128d __B) 386{ 387 return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B); 388} 389 390static __inline __m128d 391_mm_cmple_sd (__m128d __A, __m128d __B) 392{ 393 return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B); 394} 395 396static __inline __m128d 397_mm_cmpgt_sd (__m128d __A, __m128d __B) 398{ 399 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 400 (__v2df) 401 __builtin_ia32_cmpltsd ((__v2df) __B, 402 (__v2df) 403 __A)); 404} 405 406static __inline __m128d 407_mm_cmpge_sd (__m128d __A, __m128d __B) 408{ 409 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 410 (__v2df) 411 __builtin_ia32_cmplesd ((__v2df) __B, 412 (__v2df) 413 __A)); 414} 415 416static __inline __m128d 417_mm_cmpneq_sd (__m128d __A, __m128d __B) 418{ 419 return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B); 420} 421 422static __inline __m128d 423_mm_cmpnlt_sd (__m128d __A, __m128d __B) 424{ 425 return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B); 426} 427 428static __inline __m128d 429_mm_cmpnle_sd (__m128d __A, __m128d __B) 430{ 431 return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B); 432} 433 434static __inline __m128d 435_mm_cmpngt_sd (__m128d __A, __m128d __B) 436{ 437 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 438 (__v2df) 439 __builtin_ia32_cmpnltsd ((__v2df) __B, 440 (__v2df) 441 __A)); 442} 443 444static __inline __m128d 445_mm_cmpnge_sd (__m128d __A, __m128d __B) 446{ 447 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 448 (__v2df) 449 __builtin_ia32_cmpnlesd ((__v2df) __B, 450 (__v2df) 451 __A)); 452} 453 454static __inline __m128d 455_mm_cmpord_sd (__m128d __A, __m128d __B) 456{ 457 return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B); 458} 459 460static __inline __m128d 461_mm_cmpunord_sd (__m128d __A, __m128d __B) 462{ 463 return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B); 464} 465 466static __inline int 467_mm_comieq_sd (__m128d __A, __m128d __B) 468{ 469 return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B); 470} 471 472static __inline int 473_mm_comilt_sd (__m128d __A, __m128d __B) 474{ 475 return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B); 476} 477 478static __inline int 479_mm_comile_sd (__m128d __A, __m128d __B) 480{ 481 return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B); 482} 483 484static __inline int 485_mm_comigt_sd (__m128d __A, __m128d __B) 486{ 487 return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B); 488} 489 490static __inline int 491_mm_comige_sd (__m128d __A, __m128d __B) 492{ 493 return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B); 494} 495 496static __inline int 497_mm_comineq_sd (__m128d __A, __m128d __B) 498{ 499 return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B); 500} 501 502static __inline int 503_mm_ucomieq_sd (__m128d __A, __m128d __B) 504{ 505 return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B); 506} 507 508static __inline int 509_mm_ucomilt_sd (__m128d __A, __m128d __B) 510{ 511 return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B); 512} 513 514static __inline int 515_mm_ucomile_sd (__m128d __A, __m128d __B) 516{ 517 return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B); 518} 519 520static __inline int 521_mm_ucomigt_sd (__m128d __A, __m128d __B) 522{ 523 return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B); 524} 525 526static __inline int 527_mm_ucomige_sd (__m128d __A, __m128d __B) 528{ 529 return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B); 530} 531 532static __inline int 533_mm_ucomineq_sd (__m128d __A, __m128d __B) 534{ 535 return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B); 536} 537 538/* Create a vector with element 0 as *P and the rest zero. */ 539 540static __inline __m128i 541_mm_load_si128 (__m128i const *__P) 542{ 543 return (__m128i) __builtin_ia32_loaddqa ((char const *)__P); 544} 545 546static __inline __m128i 547_mm_loadu_si128 (__m128i const *__P) 548{ 549 return (__m128i) __builtin_ia32_loaddqu ((char const *)__P); 550} 551 552static __inline __m128i 553_mm_loadl_epi64 (__m128i const *__P) 554{ 555 return (__m128i) __builtin_ia32_movq2dq (*(unsigned long long *)__P); 556} 557 558static __inline void 559_mm_store_si128 (__m128i *__P, __m128i __B) 560{ 561 __builtin_ia32_storedqa ((char *)__P, (__v16qi)__B); 562} 563 564static __inline void 565_mm_storeu_si128 (__m128i *__P, __m128i __B) 566{ 567 __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B); 568} 569 570static __inline void 571_mm_storel_epi64 (__m128i *__P, __m128i __B) 572{ 573 *(long long *)__P = __builtin_ia32_movdq2q ((__v2di)__B); 574} 575 576static __inline __m64 577_mm_movepi64_pi64 (__m128i __B) 578{ 579 return (__m64) __builtin_ia32_movdq2q ((__v2di)__B); 580} 581 582static __inline __m128i 583_mm_move_epi64 (__m128i __A) 584{ 585 return (__m128i) __builtin_ia32_movq ((__v2di)__A); 586} 587 588/* Create a vector of zeros. */ 589static __inline __m128i 590_mm_setzero_si128 (void) 591{ 592 return (__m128i) __builtin_ia32_setzero128 (); 593} 594 595static __inline __m128i 596_mm_set_epi64 (__m64 __A, __m64 __B) 597{ 598 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); 599 __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B); 600 return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp2, __tmp); 601} 602 603/* Create the vector [Z Y X W]. */ 604static __inline __m128i 605_mm_set_epi32 (int __Z, int __Y, int __X, int __W) 606{ 607 union { 608 int __a[4]; 609 __m128i __v; 610 } __u; 611 612 __u.__a[0] = __W; 613 __u.__a[1] = __X; 614 __u.__a[2] = __Y; 615 __u.__a[3] = __Z; 616 617 return __u.__v; 618} 619 620#ifdef __x86_64__ 621/* Create the vector [Z Y]. */ 622static __inline __m128i 623_mm_set_epi64x (long long __Z, long long __Y) 624{ 625 union { 626 long __a[2]; 627 __m128i __v; 628 } __u; 629 630 __u.__a[0] = __Y; 631 __u.__a[1] = __Z; 632 633 return __u.__v; 634} 635#endif 636 637/* Create the vector [S T U V Z Y X W]. */ 638static __inline __m128i 639_mm_set_epi16 (short __Z, short __Y, short __X, short __W, 640 short __V, short __U, short __T, short __S) 641{ 642 union { 643 short __a[8]; 644 __m128i __v; 645 } __u; 646 647 __u.__a[0] = __S; 648 __u.__a[1] = __T; 649 __u.__a[2] = __U; 650 __u.__a[3] = __V; 651 __u.__a[4] = __W; 652 __u.__a[5] = __X; 653 __u.__a[6] = __Y; 654 __u.__a[7] = __Z; 655 656 return __u.__v; 657} 658 659/* Create the vector [S T U V Z Y X W]. */ 660static __inline __m128i 661_mm_set_epi8 (char __Z, char __Y, char __X, char __W, 662 char __V, char __U, char __T, char __S, 663 char __Z1, char __Y1, char __X1, char __W1, 664 char __V1, char __U1, char __T1, char __S1) 665{ 666 union { 667 char __a[16]; 668 __m128i __v; 669 } __u; 670 671 __u.__a[0] = __S1; 672 __u.__a[1] = __T1; 673 __u.__a[2] = __U1; 674 __u.__a[3] = __V1; 675 __u.__a[4] = __W1; 676 __u.__a[5] = __X1; 677 __u.__a[6] = __Y1; 678 __u.__a[7] = __Z1; 679 __u.__a[8] = __S; 680 __u.__a[9] = __T; 681 __u.__a[10] = __U; 682 __u.__a[11] = __V; 683 __u.__a[12] = __W; 684 __u.__a[13] = __X; 685 __u.__a[14] = __Y; 686 __u.__a[15] = __Z; 687 688 return __u.__v; 689} 690 691static __inline __m128i 692_mm_set1_epi64 (__m64 __A) 693{ 694 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); 695 return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp); 696} 697 698static __inline __m128i 699_mm_set1_epi32 (int __A) 700{ 701 __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__A); 702 return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0)); 703} 704 705#ifdef __x86_64__ 706static __inline __m128i 707_mm_set1_epi64x (long long __A) 708{ 709 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); 710 return (__m128i) __builtin_ia32_shufpd ((__v2df)__tmp, (__v2df)__tmp, _MM_SHUFFLE2 (0,0)); 711} 712#endif 713 714static __inline __m128i 715_mm_set1_epi16 (short __A) 716{ 717 int __Acopy = (unsigned short)__A; 718 __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy); 719 __tmp = (__v4si)__builtin_ia32_punpcklwd128 ((__v8hi)__tmp, (__v8hi)__tmp); 720 return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0)); 721} 722 723static __inline __m128i 724_mm_set1_epi8 (char __A) 725{ 726 int __Acopy = (unsigned char)__A; 727 __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy); 728 __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp); 729 __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp); 730 return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0)); 731} 732 733static __inline __m128i 734_mm_setr_epi64 (__m64 __A, __m64 __B) 735{ 736 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); 737 __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B); 738 return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp2); 739} 740 741/* Create the vector [Z Y X W]. */ 742static __inline __m128i 743_mm_setr_epi32 (int __W, int __X, int __Y, int __Z) 744{ 745 union { 746 int __a[4]; 747 __m128i __v; 748 } __u; 749 750 __u.__a[0] = __W; 751 __u.__a[1] = __X; 752 __u.__a[2] = __Y; 753 __u.__a[3] = __Z; 754 755 return __u.__v; 756} 757/* Create the vector [S T U V Z Y X W]. */ 758static __inline __m128i 759_mm_setr_epi16 (short __S, short __T, short __U, short __V, 760 short __W, short __X, short __Y, short __Z) 761{ 762 union { 763 short __a[8]; 764 __m128i __v; 765 } __u; 766 767 __u.__a[0] = __S; 768 __u.__a[1] = __T; 769 __u.__a[2] = __U; 770 __u.__a[3] = __V; 771 __u.__a[4] = __W; 772 __u.__a[5] = __X; 773 __u.__a[6] = __Y; 774 __u.__a[7] = __Z; 775 776 return __u.__v; 777} 778 779/* Create the vector [S T U V Z Y X W]. */ 780static __inline __m128i 781_mm_setr_epi8 (char __S1, char __T1, char __U1, char __V1, 782 char __W1, char __X1, char __Y1, char __Z1, 783 char __S, char __T, char __U, char __V, 784 char __W, char __X, char __Y, char __Z) 785{ 786 union { 787 char __a[16]; 788 __m128i __v; 789 } __u; 790 791 __u.__a[0] = __S1; 792 __u.__a[1] = __T1; 793 __u.__a[2] = __U1; 794 __u.__a[3] = __V1; 795 __u.__a[4] = __W1; 796 __u.__a[5] = __X1; 797 __u.__a[6] = __Y1; 798 __u.__a[7] = __Z1; 799 __u.__a[8] = __S; 800 __u.__a[9] = __T; 801 __u.__a[10] = __U; 802 __u.__a[11] = __V; 803 __u.__a[12] = __W; 804 __u.__a[13] = __X; 805 __u.__a[14] = __Y; 806 __u.__a[15] = __Z; 807 808 return __u.__v; 809} 810 811static __inline __m128d 812_mm_cvtepi32_pd (__m128i __A) 813{ 814 return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A); 815} 816 817static __inline __m128 818_mm_cvtepi32_ps (__m128i __A) 819{ 820 return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A); 821} 822 823static __inline __m128i 824_mm_cvtpd_epi32 (__m128d __A) 825{ 826 return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A); 827} 828 829static __inline __m64 830_mm_cvtpd_pi32 (__m128d __A) 831{ 832 return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A); 833} 834 835static __inline __m128 836_mm_cvtpd_ps (__m128d __A) 837{ 838 return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A); 839} 840 841static __inline __m128i 842_mm_cvttpd_epi32 (__m128d __A) 843{ 844 return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A); 845} 846 847static __inline __m64 848_mm_cvttpd_pi32 (__m128d __A) 849{ 850 return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A); 851} 852 853static __inline __m128d 854_mm_cvtpi32_pd (__m64 __A) 855{ 856 return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A); 857} 858 859static __inline __m128i 860_mm_cvtps_epi32 (__m128 __A) 861{ 862 return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A); 863} 864 865static __inline __m128i 866_mm_cvttps_epi32 (__m128 __A) 867{ 868 return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A); 869} 870 871static __inline __m128d 872_mm_cvtps_pd (__m128 __A) 873{ 874 return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A); 875} 876 877static __inline int 878_mm_cvtsd_si32 (__m128d __A) 879{ 880 return __builtin_ia32_cvtsd2si ((__v2df) __A); 881} 882 883#ifdef __x86_64__ 884static __inline long long 885_mm_cvtsd_si64x (__m128d __A) 886{ 887 return __builtin_ia32_cvtsd2si64 ((__v2df) __A); 888} 889#endif 890 891static __inline int 892_mm_cvttsd_si32 (__m128d __A) 893{ 894 return __builtin_ia32_cvttsd2si ((__v2df) __A); 895} 896 897#ifdef __x86_64__ 898static __inline long long 899_mm_cvttsd_si64x (__m128d __A) 900{ 901 return __builtin_ia32_cvttsd2si64 ((__v2df) __A); 902} 903#endif 904 905static __inline __m128 906_mm_cvtsd_ss (__m128 __A, __m128d __B) 907{ 908 return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B); 909} 910 911static __inline __m128d 912_mm_cvtsi32_sd (__m128d __A, int __B) 913{ 914 return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B); 915} 916 917#ifdef __x86_64__ 918static __inline __m128d 919_mm_cvtsi64x_sd (__m128d __A, long long __B) 920{ 921 return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); 922} 923#endif 924 925static __inline __m128d 926_mm_cvtss_sd (__m128d __A, __m128 __B) 927{ 928 return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B); 929} 930 931#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C))) 932 933static __inline __m128d 934_mm_unpackhi_pd (__m128d __A, __m128d __B) 935{ 936 return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B); 937} 938 939static __inline __m128d 940_mm_unpacklo_pd (__m128d __A, __m128d __B) 941{ 942 return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B); 943} 944 945static __inline __m128d 946_mm_loadh_pd (__m128d __A, double const *__B) 947{ 948 return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, (__v2si *)__B); 949} 950 951static __inline void 952_mm_storeh_pd (double *__A, __m128d __B) 953{ 954 __builtin_ia32_storehpd ((__v2si *)__A, (__v2df)__B); 955} 956 957static __inline __m128d 958_mm_loadl_pd (__m128d __A, double const *__B) 959{ 960 return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, (__v2si *)__B); 961} 962 963static __inline void 964_mm_storel_pd (double *__A, __m128d __B) 965{ 966 __builtin_ia32_storelpd ((__v2si *)__A, (__v2df)__B); 967} 968 969static __inline int 970_mm_movemask_pd (__m128d __A) 971{ 972 return __builtin_ia32_movmskpd ((__v2df)__A); 973} 974 975static __inline __m128i 976_mm_packs_epi16 (__m128i __A, __m128i __B) 977{ 978 return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B); 979} 980 981static __inline __m128i 982_mm_packs_epi32 (__m128i __A, __m128i __B) 983{ 984 return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B); 985} 986 987static __inline __m128i 988_mm_packus_epi16 (__m128i __A, __m128i __B) 989{ 990 return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B); 991} 992 993static __inline __m128i 994_mm_unpackhi_epi8 (__m128i __A, __m128i __B) 995{ 996 return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B); 997} 998 999static __inline __m128i 1000_mm_unpackhi_epi16 (__m128i __A, __m128i __B) 1001{ 1002 return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B); 1003} 1004 1005static __inline __m128i 1006_mm_unpackhi_epi32 (__m128i __A, __m128i __B) 1007{ 1008 return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B); 1009} 1010 1011static __inline __m128i 1012_mm_unpackhi_epi64 (__m128i __A, __m128i __B) 1013{ 1014 return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B); 1015} 1016 1017static __inline __m128i 1018_mm_unpacklo_epi8 (__m128i __A, __m128i __B) 1019{ 1020 return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B); 1021} 1022 1023static __inline __m128i 1024_mm_unpacklo_epi16 (__m128i __A, __m128i __B) 1025{ 1026 return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B); 1027} 1028 1029static __inline __m128i 1030_mm_unpacklo_epi32 (__m128i __A, __m128i __B) 1031{ 1032 return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B); 1033} 1034 1035static __inline __m128i 1036_mm_unpacklo_epi64 (__m128i __A, __m128i __B) 1037{ 1038 return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); 1039} 1040 1041static __inline __m128i 1042_mm_add_epi8 (__m128i __A, __m128i __B) 1043{ 1044 return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B); 1045} 1046 1047static __inline __m128i 1048_mm_add_epi16 (__m128i __A, __m128i __B) 1049{ 1050 return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B); 1051} 1052 1053static __inline __m128i 1054_mm_add_epi32 (__m128i __A, __m128i __B) 1055{ 1056 return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B); 1057} 1058 1059static __inline __m128i 1060_mm_add_epi64 (__m128i __A, __m128i __B) 1061{ 1062 return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B); 1063} 1064 1065static __inline __m128i 1066_mm_adds_epi8 (__m128i __A, __m128i __B) 1067{ 1068 return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); 1069} 1070 1071static __inline __m128i 1072_mm_adds_epi16 (__m128i __A, __m128i __B) 1073{ 1074 return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B); 1075} 1076 1077static __inline __m128i 1078_mm_adds_epu8 (__m128i __A, __m128i __B) 1079{ 1080 return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B); 1081} 1082 1083static __inline __m128i 1084_mm_adds_epu16 (__m128i __A, __m128i __B) 1085{ 1086 return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); 1087} 1088 1089static __inline __m128i 1090_mm_sub_epi8 (__m128i __A, __m128i __B) 1091{ 1092 return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B); 1093} 1094 1095static __inline __m128i 1096_mm_sub_epi16 (__m128i __A, __m128i __B) 1097{ 1098 return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B); 1099} 1100 1101static __inline __m128i 1102_mm_sub_epi32 (__m128i __A, __m128i __B) 1103{ 1104 return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B); 1105} 1106 1107static __inline __m128i 1108_mm_sub_epi64 (__m128i __A, __m128i __B) 1109{ 1110 return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B); 1111} 1112 1113static __inline __m128i 1114_mm_subs_epi8 (__m128i __A, __m128i __B) 1115{ 1116 return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); 1117} 1118 1119static __inline __m128i 1120_mm_subs_epi16 (__m128i __A, __m128i __B) 1121{ 1122 return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B); 1123} 1124 1125static __inline __m128i 1126_mm_subs_epu8 (__m128i __A, __m128i __B) 1127{ 1128 return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B); 1129} 1130 1131static __inline __m128i 1132_mm_subs_epu16 (__m128i __A, __m128i __B) 1133{ 1134 return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B); 1135} 1136 1137static __inline __m128i 1138_mm_madd_epi16 (__m128i __A, __m128i __B) 1139{ 1140 return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B); 1141} 1142 1143static __inline __m128i 1144_mm_mulhi_epi16 (__m128i __A, __m128i __B) 1145{ 1146 return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); 1147} 1148 1149static __inline __m128i 1150_mm_mullo_epi16 (__m128i __A, __m128i __B) 1151{ 1152 return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B); 1153} 1154 1155static __inline __m64 1156_mm_mul_su32 (__m64 __A, __m64 __B) 1157{ 1158 return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); 1159} 1160 1161static __inline __m128i 1162_mm_mul_epu32 (__m128i __A, __m128i __B) 1163{ 1164 return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B); 1165} 1166 1167static __inline __m128i 1168_mm_sll_epi16 (__m128i __A, __m128i __B) 1169{ 1170 return (__m128i)__builtin_ia32_psllw128 ((__v8hi)__A, (__v2di)__B); 1171} 1172 1173static __inline __m128i 1174_mm_sll_epi32 (__m128i __A, __m128i __B) 1175{ 1176 return (__m128i)__builtin_ia32_pslld128 ((__v4si)__A, (__v2di)__B); 1177} 1178 1179static __inline __m128i 1180_mm_sll_epi64 (__m128i __A, __m128i __B) 1181{ 1182 return (__m128i)__builtin_ia32_psllq128 ((__v2di)__A, (__v2di)__B); 1183} 1184 1185static __inline __m128i 1186_mm_sra_epi16 (__m128i __A, __m128i __B) 1187{ 1188 return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v2di)__B); 1189} 1190 1191static __inline __m128i 1192_mm_sra_epi32 (__m128i __A, __m128i __B) 1193{ 1194 return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v2di)__B); 1195} 1196 1197static __inline __m128i 1198_mm_srl_epi16 (__m128i __A, __m128i __B) 1199{ 1200 return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v2di)__B); 1201} 1202 1203static __inline __m128i 1204_mm_srl_epi32 (__m128i __A, __m128i __B) 1205{ 1206 return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v2di)__B); 1207} 1208 1209static __inline __m128i 1210_mm_srl_epi64 (__m128i __A, __m128i __B) 1211{ 1212 return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B); 1213} 1214 1215static __inline __m128i 1216_mm_slli_epi16 (__m128i __A, int __B) 1217{ 1218 return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); 1219} 1220 1221static __inline __m128i 1222_mm_slli_epi32 (__m128i __A, int __B) 1223{ 1224 return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); 1225} 1226 1227static __inline __m128i 1228_mm_slli_epi64 (__m128i __A, int __B) 1229{ 1230 return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B); 1231} 1232 1233static __inline __m128i 1234_mm_srai_epi16 (__m128i __A, int __B) 1235{ 1236 return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B); 1237} 1238 1239static __inline __m128i 1240_mm_srai_epi32 (__m128i __A, int __B) 1241{ 1242 return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); 1243} 1244 1245#if 0 1246static __m128i __attribute__((__always_inline__)) 1247_mm_srli_si128 (__m128i __A, const int __B) 1248{ 1249 return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B)) 1250} 1251 1252static __m128i __attribute__((__always_inline__)) 1253_mm_srli_si128 (__m128i __A, const int __B) 1254{ 1255 return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B)) 1256} 1257#endif 1258#define _mm_srli_si128(__A, __B) ((__m128i)__builtin_ia32_psrldqi128 (__A, __B)) 1259#define _mm_slli_si128(__A, __B) ((__m128i)__builtin_ia32_pslldqi128 (__A, __B)) 1260 1261static __inline __m128i 1262_mm_srli_epi16 (__m128i __A, int __B) 1263{ 1264 return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); 1265} 1266 1267static __inline __m128i 1268_mm_srli_epi32 (__m128i __A, int __B) 1269{ 1270 return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); 1271} 1272 1273static __inline __m128i 1274_mm_srli_epi64 (__m128i __A, int __B) 1275{ 1276 return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); 1277} 1278 1279static __inline __m128i 1280_mm_and_si128 (__m128i __A, __m128i __B) 1281{ 1282 return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B); 1283} 1284 1285static __inline __m128i 1286_mm_andnot_si128 (__m128i __A, __m128i __B) 1287{ 1288 return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B); 1289} 1290 1291static __inline __m128i 1292_mm_or_si128 (__m128i __A, __m128i __B) 1293{ 1294 return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B); 1295} 1296 1297static __inline __m128i 1298_mm_xor_si128 (__m128i __A, __m128i __B) 1299{ 1300 return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B); 1301} 1302 1303static __inline __m128i 1304_mm_cmpeq_epi8 (__m128i __A, __m128i __B) 1305{ 1306 return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B); 1307} 1308 1309static __inline __m128i 1310_mm_cmpeq_epi16 (__m128i __A, __m128i __B) 1311{ 1312 return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B); 1313} 1314 1315static __inline __m128i 1316_mm_cmpeq_epi32 (__m128i __A, __m128i __B) 1317{ 1318 return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B); 1319} 1320 1321static __inline __m128i 1322_mm_cmplt_epi8 (__m128i __A, __m128i __B) 1323{ 1324 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A); 1325} 1326 1327static __inline __m128i 1328_mm_cmplt_epi16 (__m128i __A, __m128i __B) 1329{ 1330 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A); 1331} 1332 1333static __inline __m128i 1334_mm_cmplt_epi32 (__m128i __A, __m128i __B) 1335{ 1336 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A); 1337} 1338 1339static __inline __m128i 1340_mm_cmpgt_epi8 (__m128i __A, __m128i __B) 1341{ 1342 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B); 1343} 1344 1345static __inline __m128i 1346_mm_cmpgt_epi16 (__m128i __A, __m128i __B) 1347{ 1348 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B); 1349} 1350 1351static __inline __m128i 1352_mm_cmpgt_epi32 (__m128i __A, __m128i __B) 1353{ 1354 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B); 1355} 1356 1357#define _mm_extract_epi16(__A, __B) __builtin_ia32_pextrw128 ((__v8hi)__A, __B) 1358 1359#define _mm_insert_epi16(__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)__A, __B, __C)) 1360 1361static __inline __m128i 1362_mm_max_epi16 (__m128i __A, __m128i __B) 1363{ 1364 return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B); 1365} 1366 1367static __inline __m128i 1368_mm_max_epu8 (__m128i __A, __m128i __B) 1369{ 1370 return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B); 1371} 1372 1373static __inline __m128i 1374_mm_min_epi16 (__m128i __A, __m128i __B) 1375{ 1376 return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B); 1377} 1378 1379static __inline __m128i 1380_mm_min_epu8 (__m128i __A, __m128i __B) 1381{ 1382 return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B); 1383} 1384 1385static __inline int 1386_mm_movemask_epi8 (__m128i __A) 1387{ 1388 return __builtin_ia32_pmovmskb128 ((__v16qi)__A); 1389} 1390 1391static __inline __m128i 1392_mm_mulhi_epu16 (__m128i __A, __m128i __B) 1393{ 1394 return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); 1395} 1396 1397#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __B)) 1398#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __B)) 1399#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)__A, __B)) 1400 1401static __inline void 1402_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) 1403{ 1404 __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C); 1405} 1406 1407static __inline __m128i 1408_mm_avg_epu8 (__m128i __A, __m128i __B) 1409{ 1410 return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B); 1411} 1412 1413static __inline __m128i 1414_mm_avg_epu16 (__m128i __A, __m128i __B) 1415{ 1416 return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B); 1417} 1418 1419static __inline __m128i 1420_mm_sad_epu8 (__m128i __A, __m128i __B) 1421{ 1422 return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B); 1423} 1424 1425static __inline void 1426_mm_stream_si32 (int *__A, int __B) 1427{ 1428 __builtin_ia32_movnti (__A, __B); 1429} 1430 1431static __inline void 1432_mm_stream_si128 (__m128i *__A, __m128i __B) 1433{ 1434 __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B); 1435} 1436 1437static __inline void 1438_mm_stream_pd (double *__A, __m128d __B) 1439{ 1440 __builtin_ia32_movntpd (__A, (__v2df)__B); 1441} 1442 1443static __inline __m128i 1444_mm_movpi64_epi64 (__m64 __A) 1445{ 1446 return (__m128i)__builtin_ia32_movq2dq ((unsigned long long)__A); 1447} 1448 1449static __inline void 1450_mm_clflush (void const *__A) 1451{ 1452 return __builtin_ia32_clflush (__A); 1453} 1454 1455static __inline void 1456_mm_lfence (void) 1457{ 1458 __builtin_ia32_lfence (); 1459} 1460 1461static __inline void 1462_mm_mfence (void) 1463{ 1464 __builtin_ia32_mfence (); 1465} 1466 1467static __inline __m128i 1468_mm_cvtsi32_si128 (int __A) 1469{ 1470 return (__m128i) __builtin_ia32_loadd (&__A); 1471} 1472 1473#ifdef __x86_64__ 1474static __inline __m128i 1475_mm_cvtsi64x_si128 (long long __A) 1476{ 1477 return (__m128i) __builtin_ia32_movq2dq (__A); 1478} 1479#endif 1480 1481static __inline int 1482_mm_cvtsi128_si32 (__m128i __A) 1483{ 1484 int __tmp; 1485 __builtin_ia32_stored (&__tmp, (__v4si)__A); 1486 return __tmp; 1487} 1488 1489#ifdef __x86_64__ 1490static __inline long long 1491_mm_cvtsi128_si64x (__m128i __A) 1492{ 1493 return __builtin_ia32_movdq2q ((__v2di)__A); 1494} 1495#endif 1496 1497#endif /* __SSE2__ */ 1498 1499#endif /* _EMMINTRIN_H_INCLUDED */ 1500