/* Copyright (C) 2003, 2004, 2005, 2007 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0.  */

#ifndef _EMMINTRIN_H_INCLUDED
#define _EMMINTRIN_H_INCLUDED

#ifdef __SSE2__
#include <xmmintrin.h>

/* SSE2 */
/* Internal element-typed views of a 128-bit register; used only to give
   the __builtin_ia32_* calls operands of the type they expect.  */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Create a selector for use with the SHUFPD instruction.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))

/* Create a vector with element 0 as F and the rest zero.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0 };
}

/* Create a vector with both elements equal to F.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

/* Alias for _mm_set1_pd, kept for Intel API compatibility.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create a vector of zeros.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_setzero_pd (void)
{
  return __extension__ (__m128d){ 0.0, 0.0 };
}

/* Sets the low DPFP value of A from the low value of B.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_load_pd (double const *__P)
{
  return *(__m128d *)__P;
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_loadu_pd (double const *__P)
{
  return __builtin_ia32_loadupd (__P);
}

/* Create a vector with all two elements equal to *P.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_load1_pd (double const *__P)
{
  return _mm_set1_pd (*__P);
}

/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

/* Alias for _mm_load1_pd, kept for Intel API compatibility.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_loadr_pd (double const *__P)
{
  __m128d __tmp = _mm_load_pd (__P);
  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_store_pd (double *__P, __m128d __A)
{
  *(__m128d *)__P = __A;
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  __builtin_ia32_storeupd (__P, __A);
}

/* Stores the lower DPFP value.  */
static __inline void __attribute__((__always_inline__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
}

/* Alias for _mm_store_sd, kept for Intel API compatibility.  */
static __inline void __attribute__((__always_inline__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
static __inline void __attribute__((__always_inline__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
}

/* Alias for _mm_store1_pd, kept for Intel API compatibility.  */
static __inline void __attribute__((__always_inline__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
}

/* Extract element 0 of A as a 32-bit integer.  */
static __inline int __attribute__((__always_inline__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
}

#ifdef __x86_64__
/* Extract element 0 of A as a 64-bit integer (x86-64 only).  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
}
#endif

/* Double-precision arithmetic.  The *_pd forms operate on both elements;
   the *_sd forms operate on element 0 only, copying the upper element
   from A.  */

static __inline __m128d __attribute__((__always_inline__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_sqrt_pd (__m128d __A)
{
  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}

/* Return pair {sqrt (A[0]), B[1]}.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  /* Merge B's low element into A first, then take the scalar sqrt of
     the merged low element.  */
  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise logical operations on the full 128-bit value.  */

static __inline __m128d __attribute__((__always_inline__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
}

/* Packed comparisons: each element of the result is an all-ones mask
   when the predicate holds for that element, all-zeros otherwise.  */

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
}

/* Scalar comparisons: compare element 0 only; the upper element of the
   result is copied from A.  */

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
}

/* There is no cmpgtsd builtin: swap the operands of cmplt, then merge
   the low result back over A's upper element with movsd.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
    (__v2df) __builtin_ia32_cmpltsd ((__v2df) __B, (__v2df) __A));
}

/* Likewise, cmpge is cmple with swapped operands.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
    (__v2df) __builtin_ia32_cmplesd ((__v2df) __B, (__v2df) __A));
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
}

/* ngt is nlt with swapped operands, merged back as above.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
    (__v2df) __builtin_ia32_cmpnltsd ((__v2df) __B, (__v2df) __A));
}

/* nge is nle with swapped operands, merged back as above.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
    (__v2df) __builtin_ia32_cmpnlesd ((__v2df) __B, (__v2df) __A));
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
}

/* Scalar comparisons returning an int (COMISD).  */

static __inline int __attribute__((__always_inline__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
}

/* Unordered variants (UCOMISD).  */

static __inline int __attribute__((__always_inline__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
}

/* Create a vector of Qi, where i is the element number.
   Note the argument order: highest element first, element 0 last.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi64 (__m64 __q1, __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
		short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Integer vector loads and stores.  */

/* Load a 128-bit integer vector.  The address must be 16-byte aligned.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

/* Load a 128-bit integer vector.  The address need not be aligned.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_loadu_si128 (__m128i const *__P)
{
  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
}

/* Load 64 bits from *P into element 0; element 1 is zero.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_loadl_epi64 (__m128i const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

/* Store a 128-bit integer vector.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  *__P = __B;
}

/* Store a 128-bit integer vector.  The address need not be aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storeu_si128 (__m128i *__P, __m128i __B)
{
  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
}

/* Store the low 64 bits of B to *P.  */
static __inline void __attribute__((__always_inline__))
_mm_storel_epi64 (__m128i *__P, __m128i __B)
{
  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}

/* Move the low 64 bits of B to an MMX register.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_movepi64_pi64 (__m128i __B)
{
  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}

/* Move an MMX value into element 0; element 1 is zero.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

/* Keep element 0 of A, zeroing element 1.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, _mm_movepi64_pi64 (__A));
}

/* Create a vector of zeros.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

/* Conversions between integer and floating-point vectors.  The cvtt*
   forms truncate toward zero; the cvt* forms use the current rounding
   mode.  */

static __inline __m128d __attribute__((__always_inline__))
_mm_cvtepi32_pd (__m128i __A)
{
  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvtepi32_ps (__m128i __A)
{
  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cvtpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvtpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpd_ps (__m128d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cvttpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvttpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cvtpi32_pd (__m64 __A)
{
  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cvtps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cvttps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cvtps_pd (__m128 __A)
{
  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
}

/* Convert the low double of A to a 32-bit integer using the current
   rounding mode.  */
static __inline int __attribute__((__always_inline__))
_mm_cvtsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* As above, but producing a 64-bit integer (x86-64 only).  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}
#endif

/* Convert the low double of A to a 32-bit integer, truncating toward
   zero.  */
static __inline int __attribute__((__always_inline__))
_mm_cvttsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* As above, but producing a 64-bit integer (x86-64 only).  */
static __inline long long __attribute__((__always_inline__))
_mm_cvttsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}
#endif

/* Convert the low double of B to single precision in element 0 of the
   result; the other elements are copied from A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
}

/* Convert B to double in element 0 of the result; element 1 is copied
   from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
}

#ifdef __x86_64__
/* As above, for a 64-bit integer B (x86-64 only).  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}
#endif

/* Convert the low float of B to double in element 0 of the result;
   element 1 is copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
}

/* Select elements from A and B according to the 2-bit immediate C;
   a macro because C must be a compile-time constant.  */
#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C)))

/* Interleave the upper elements of A and B.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
}

/* Interleave the lower elements of A and B.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
}

855static __inline __m128d __attribute__((__always_inline__)) 856_mm_loadh_pd (__m128d __A, double const *__B) 857{ 858 return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B); 859} 860 861static __inline __m128d __attribute__((__always_inline__)) 862_mm_loadl_pd (__m128d __A, double const *__B) 863{ 864 return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B); 865} 866 867static __inline int __attribute__((__always_inline__)) 868_mm_movemask_pd (__m128d __A) 869{ 870 return __builtin_ia32_movmskpd ((__v2df)__A); 871} 872 873static __inline __m128i __attribute__((__always_inline__)) 874_mm_packs_epi16 (__m128i __A, __m128i __B) 875{ 876 return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B); 877} 878 879static __inline __m128i __attribute__((__always_inline__)) 880_mm_packs_epi32 (__m128i __A, __m128i __B) 881{ 882 return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B); 883} 884 885static __inline __m128i __attribute__((__always_inline__)) 886_mm_packus_epi16 (__m128i __A, __m128i __B) 887{ 888 return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B); 889} 890 891static __inline __m128i __attribute__((__always_inline__)) 892_mm_unpackhi_epi8 (__m128i __A, __m128i __B) 893{ 894 return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B); 895} 896 897static __inline __m128i __attribute__((__always_inline__)) 898_mm_unpackhi_epi16 (__m128i __A, __m128i __B) 899{ 900 return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B); 901} 902 903static __inline __m128i __attribute__((__always_inline__)) 904_mm_unpackhi_epi32 (__m128i __A, __m128i __B) 905{ 906 return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B); 907} 908 909static __inline __m128i __attribute__((__always_inline__)) 910_mm_unpackhi_epi64 (__m128i __A, __m128i __B) 911{ 912 return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B); 913} 914 915static __inline __m128i __attribute__((__always_inline__)) 
916_mm_unpacklo_epi8 (__m128i __A, __m128i __B) 917{ 918 return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B); 919} 920 921static __inline __m128i __attribute__((__always_inline__)) 922_mm_unpacklo_epi16 (__m128i __A, __m128i __B) 923{ 924 return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B); 925} 926 927static __inline __m128i __attribute__((__always_inline__)) 928_mm_unpacklo_epi32 (__m128i __A, __m128i __B) 929{ 930 return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B); 931} 932 933static __inline __m128i __attribute__((__always_inline__)) 934_mm_unpacklo_epi64 (__m128i __A, __m128i __B) 935{ 936 return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); 937} 938 939static __inline __m128i __attribute__((__always_inline__)) 940_mm_add_epi8 (__m128i __A, __m128i __B) 941{ 942 return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B); 943} 944 945static __inline __m128i __attribute__((__always_inline__)) 946_mm_add_epi16 (__m128i __A, __m128i __B) 947{ 948 return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B); 949} 950 951static __inline __m128i __attribute__((__always_inline__)) 952_mm_add_epi32 (__m128i __A, __m128i __B) 953{ 954 return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B); 955} 956 957static __inline __m128i __attribute__((__always_inline__)) 958_mm_add_epi64 (__m128i __A, __m128i __B) 959{ 960 return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B); 961} 962 963static __inline __m128i __attribute__((__always_inline__)) 964_mm_adds_epi8 (__m128i __A, __m128i __B) 965{ 966 return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); 967} 968 969static __inline __m128i __attribute__((__always_inline__)) 970_mm_adds_epi16 (__m128i __A, __m128i __B) 971{ 972 return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B); 973} 974 975static __inline __m128i __attribute__((__always_inline__)) 976_mm_adds_epu8 (__m128i __A, 
__m128i __B) 977{ 978 return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B); 979} 980 981static __inline __m128i __attribute__((__always_inline__)) 982_mm_adds_epu16 (__m128i __A, __m128i __B) 983{ 984 return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); 985} 986 987static __inline __m128i __attribute__((__always_inline__)) 988_mm_sub_epi8 (__m128i __A, __m128i __B) 989{ 990 return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B); 991} 992 993static __inline __m128i __attribute__((__always_inline__)) 994_mm_sub_epi16 (__m128i __A, __m128i __B) 995{ 996 return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B); 997} 998 999static __inline __m128i __attribute__((__always_inline__)) 1000_mm_sub_epi32 (__m128i __A, __m128i __B) 1001{ 1002 return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B); 1003} 1004 1005static __inline __m128i __attribute__((__always_inline__)) 1006_mm_sub_epi64 (__m128i __A, __m128i __B) 1007{ 1008 return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B); 1009} 1010 1011static __inline __m128i __attribute__((__always_inline__)) 1012_mm_subs_epi8 (__m128i __A, __m128i __B) 1013{ 1014 return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); 1015} 1016 1017static __inline __m128i __attribute__((__always_inline__)) 1018_mm_subs_epi16 (__m128i __A, __m128i __B) 1019{ 1020 return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B); 1021} 1022 1023static __inline __m128i __attribute__((__always_inline__)) 1024_mm_subs_epu8 (__m128i __A, __m128i __B) 1025{ 1026 return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B); 1027} 1028 1029static __inline __m128i __attribute__((__always_inline__)) 1030_mm_subs_epu16 (__m128i __A, __m128i __B) 1031{ 1032 return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B); 1033} 1034 1035static __inline __m128i __attribute__((__always_inline__)) 1036_mm_madd_epi16 (__m128i __A, __m128i __B) 1037{ 
1038 return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B); 1039} 1040 1041static __inline __m128i __attribute__((__always_inline__)) 1042_mm_mulhi_epi16 (__m128i __A, __m128i __B) 1043{ 1044 return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); 1045} 1046 1047static __inline __m128i __attribute__((__always_inline__)) 1048_mm_mullo_epi16 (__m128i __A, __m128i __B) 1049{ 1050 return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B); 1051} 1052 1053static __inline __m64 __attribute__((__always_inline__)) 1054_mm_mul_su32 (__m64 __A, __m64 __B) 1055{ 1056 return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); 1057} 1058 1059static __inline __m128i __attribute__((__always_inline__)) 1060_mm_mul_epu32 (__m128i __A, __m128i __B) 1061{ 1062 return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B); 1063} 1064 1065#if 0 1066static __inline __m128i __attribute__((__always_inline__)) 1067_mm_slli_epi16 (__m128i __A, int __B) 1068{ 1069 return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); 1070} 1071 1072static __inline __m128i __attribute__((__always_inline__)) 1073_mm_slli_epi32 (__m128i __A, int __B) 1074{ 1075 return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); 1076} 1077 1078static __inline __m128i __attribute__((__always_inline__)) 1079_mm_slli_epi64 (__m128i __A, int __B) 1080{ 1081 return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B); 1082} 1083#else 1084#define _mm_slli_epi16(__A, __B) \ 1085 ((__m128i)__builtin_ia32_psllwi128 ((__v8hi)(__A), __B)) 1086#define _mm_slli_epi32(__A, __B) \ 1087 ((__m128i)__builtin_ia32_pslldi128 ((__v8hi)(__A), __B)) 1088#define _mm_slli_epi64(__A, __B) \ 1089 ((__m128i)__builtin_ia32_psllqi128 ((__v8hi)(__A), __B)) 1090#endif 1091 1092#if 0 1093static __inline __m128i __attribute__((__always_inline__)) 1094_mm_srai_epi16 (__m128i __A, int __B) 1095{ 1096 return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B); 1097} 1098 1099static __inline __m128i 
__attribute__((__always_inline__)) 1100_mm_srai_epi32 (__m128i __A, int __B) 1101{ 1102 return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); 1103} 1104#else 1105#define _mm_srai_epi16(__A, __B) \ 1106 ((__m128i)__builtin_ia32_psrawi128 ((__v8hi)(__A), __B)) 1107#define _mm_srai_epi32(__A, __B) \ 1108 ((__m128i)__builtin_ia32_psradi128 ((__v8hi)(__A), __B)) 1109#endif 1110 1111#if 0 1112static __m128i __attribute__((__always_inline__)) 1113_mm_srli_si128 (__m128i __A, int __B) 1114{ 1115 return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B * 8)); 1116} 1117 1118static __m128i __attribute__((__always_inline__)) 1119_mm_srli_si128 (__m128i __A, int __B) 1120{ 1121 return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B * 8)); 1122} 1123#else 1124#define _mm_srli_si128(__A, __B) \ 1125 ((__m128i)__builtin_ia32_psrldqi128 (__A, (__B) * 8)) 1126#define _mm_slli_si128(__A, __B) \ 1127 ((__m128i)__builtin_ia32_pslldqi128 (__A, (__B) * 8)) 1128#endif 1129 1130#if 0 1131static __inline __m128i __attribute__((__always_inline__)) 1132_mm_srli_epi16 (__m128i __A, int __B) 1133{ 1134 return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); 1135} 1136 1137static __inline __m128i __attribute__((__always_inline__)) 1138_mm_srli_epi32 (__m128i __A, int __B) 1139{ 1140 return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); 1141} 1142 1143static __inline __m128i __attribute__((__always_inline__)) 1144_mm_srli_epi64 (__m128i __A, int __B) 1145{ 1146 return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); 1147} 1148#else 1149#define _mm_srli_epi16(__A, __B) \ 1150 ((__m128i)__builtin_ia32_psrlwi128 ((__v8hi)(__A), __B)) 1151#define _mm_srli_epi32(__A, __B) \ 1152 ((__m128i)__builtin_ia32_psrldi128 ((__v4si)(__A), __B)) 1153#define _mm_srli_epi64(__A, __B) \ 1154 ((__m128i)__builtin_ia32_psrlqi128 ((__v4si)(__A), __B)) 1155#endif 1156 1157static __inline __m128i __attribute__((__always_inline__)) 1158_mm_sll_epi16 (__m128i __A, __m128i __B) 1159{ 1160 return 
(__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B); 1161} 1162 1163static __inline __m128i __attribute__((__always_inline__)) 1164_mm_sll_epi32 (__m128i __A, __m128i __B) 1165{ 1166 return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B); 1167} 1168 1169static __inline __m128i __attribute__((__always_inline__)) 1170_mm_sll_epi64 (__m128i __A, __m128i __B) 1171{ 1172 return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B); 1173} 1174 1175static __inline __m128i __attribute__((__always_inline__)) 1176_mm_sra_epi16 (__m128i __A, __m128i __B) 1177{ 1178 return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B); 1179} 1180 1181static __inline __m128i __attribute__((__always_inline__)) 1182_mm_sra_epi32 (__m128i __A, __m128i __B) 1183{ 1184 return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B); 1185} 1186 1187static __inline __m128i __attribute__((__always_inline__)) 1188_mm_srl_epi16 (__m128i __A, __m128i __B) 1189{ 1190 return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B); 1191} 1192 1193static __inline __m128i __attribute__((__always_inline__)) 1194_mm_srl_epi32 (__m128i __A, __m128i __B) 1195{ 1196 return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B); 1197} 1198 1199static __inline __m128i __attribute__((__always_inline__)) 1200_mm_srl_epi64 (__m128i __A, __m128i __B) 1201{ 1202 return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B); 1203} 1204 1205static __inline __m128i __attribute__((__always_inline__)) 1206_mm_and_si128 (__m128i __A, __m128i __B) 1207{ 1208 return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B); 1209} 1210 1211static __inline __m128i __attribute__((__always_inline__)) 1212_mm_andnot_si128 (__m128i __A, __m128i __B) 1213{ 1214 return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B); 1215} 1216 1217static __inline __m128i __attribute__((__always_inline__)) 1218_mm_or_si128 (__m128i __A, __m128i __B) 1219{ 1220 return 
(__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B); 1221} 1222 1223static __inline __m128i __attribute__((__always_inline__)) 1224_mm_xor_si128 (__m128i __A, __m128i __B) 1225{ 1226 return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B); 1227} 1228 1229static __inline __m128i __attribute__((__always_inline__)) 1230_mm_cmpeq_epi8 (__m128i __A, __m128i __B) 1231{ 1232 return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B); 1233} 1234 1235static __inline __m128i __attribute__((__always_inline__)) 1236_mm_cmpeq_epi16 (__m128i __A, __m128i __B) 1237{ 1238 return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B); 1239} 1240 1241static __inline __m128i __attribute__((__always_inline__)) 1242_mm_cmpeq_epi32 (__m128i __A, __m128i __B) 1243{ 1244 return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B); 1245} 1246 1247static __inline __m128i __attribute__((__always_inline__)) 1248_mm_cmplt_epi8 (__m128i __A, __m128i __B) 1249{ 1250 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A); 1251} 1252 1253static __inline __m128i __attribute__((__always_inline__)) 1254_mm_cmplt_epi16 (__m128i __A, __m128i __B) 1255{ 1256 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A); 1257} 1258 1259static __inline __m128i __attribute__((__always_inline__)) 1260_mm_cmplt_epi32 (__m128i __A, __m128i __B) 1261{ 1262 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A); 1263} 1264 1265static __inline __m128i __attribute__((__always_inline__)) 1266_mm_cmpgt_epi8 (__m128i __A, __m128i __B) 1267{ 1268 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B); 1269} 1270 1271static __inline __m128i __attribute__((__always_inline__)) 1272_mm_cmpgt_epi16 (__m128i __A, __m128i __B) 1273{ 1274 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B); 1275} 1276 1277static __inline __m128i __attribute__((__always_inline__)) 1278_mm_cmpgt_epi32 (__m128i __A, __m128i __B) 
1279{ 1280 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B); 1281} 1282 1283#if 0 1284static __inline int __attribute__((__always_inline__)) 1285_mm_extract_epi16 (__m128i const __A, int const __N) 1286{ 1287 return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N); 1288} 1289 1290static __inline __m128i __attribute__((__always_inline__)) 1291_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) 1292{ 1293 return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N); 1294} 1295#else 1296#define _mm_extract_epi16(A, N) \ 1297 ((int) __builtin_ia32_vec_ext_v8hi ((__v8hi)(A), (N))) 1298#define _mm_insert_epi16(A, D, N) \ 1299 ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(A), (D), (N))) 1300#endif 1301 1302static __inline __m128i __attribute__((__always_inline__)) 1303_mm_max_epi16 (__m128i __A, __m128i __B) 1304{ 1305 return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B); 1306} 1307 1308static __inline __m128i __attribute__((__always_inline__)) 1309_mm_max_epu8 (__m128i __A, __m128i __B) 1310{ 1311 return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B); 1312} 1313 1314static __inline __m128i __attribute__((__always_inline__)) 1315_mm_min_epi16 (__m128i __A, __m128i __B) 1316{ 1317 return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B); 1318} 1319 1320static __inline __m128i __attribute__((__always_inline__)) 1321_mm_min_epu8 (__m128i __A, __m128i __B) 1322{ 1323 return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B); 1324} 1325 1326static __inline int __attribute__((__always_inline__)) 1327_mm_movemask_epi8 (__m128i __A) 1328{ 1329 return __builtin_ia32_pmovmskb128 ((__v16qi)__A); 1330} 1331 1332static __inline __m128i __attribute__((__always_inline__)) 1333_mm_mulhi_epu16 (__m128i __A, __m128i __B) 1334{ 1335 return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); 1336} 1337 1338#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw 
((__v8hi)__A, __B)) 1339#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __B)) 1340#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)__A, __B)) 1341 1342static __inline void __attribute__((__always_inline__)) 1343_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) 1344{ 1345 __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C); 1346} 1347 1348static __inline __m128i __attribute__((__always_inline__)) 1349_mm_avg_epu8 (__m128i __A, __m128i __B) 1350{ 1351 return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B); 1352} 1353 1354static __inline __m128i __attribute__((__always_inline__)) 1355_mm_avg_epu16 (__m128i __A, __m128i __B) 1356{ 1357 return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B); 1358} 1359 1360static __inline __m128i __attribute__((__always_inline__)) 1361_mm_sad_epu8 (__m128i __A, __m128i __B) 1362{ 1363 return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B); 1364} 1365 1366static __inline void __attribute__((__always_inline__)) 1367_mm_stream_si32 (int *__A, int __B) 1368{ 1369 __builtin_ia32_movnti (__A, __B); 1370} 1371 1372static __inline void __attribute__((__always_inline__)) 1373_mm_stream_si128 (__m128i *__A, __m128i __B) 1374{ 1375 __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B); 1376} 1377 1378static __inline void __attribute__((__always_inline__)) 1379_mm_stream_pd (double *__A, __m128d __B) 1380{ 1381 __builtin_ia32_movntpd (__A, (__v2df)__B); 1382} 1383 1384static __inline void __attribute__((__always_inline__)) 1385_mm_clflush (void const *__A) 1386{ 1387 __builtin_ia32_clflush (__A); 1388} 1389 1390static __inline void __attribute__((__always_inline__)) 1391_mm_lfence (void) 1392{ 1393 __builtin_ia32_lfence (); 1394} 1395 1396static __inline void __attribute__((__always_inline__)) 1397_mm_mfence (void) 1398{ 1399 __builtin_ia32_mfence (); 1400} 1401 1402static __inline __m128i __attribute__((__always_inline__)) 
1403_mm_cvtsi32_si128 (int __A) 1404{ 1405 return _mm_set_epi32 (0, 0, 0, __A); 1406} 1407 1408#ifdef __x86_64__ 1409static __inline __m128i __attribute__((__always_inline__)) 1410_mm_cvtsi64x_si128 (long long __A) 1411{ 1412 return _mm_set_epi64x (0, __A); 1413} 1414#endif 1415 1416/* Casts between various SP, DP, INT vector types. Note that these do no 1417 conversion of values, they just change the type. */ 1418static __inline __m128 __attribute__((__always_inline__)) 1419_mm_castpd_ps(__m128d __A) 1420{ 1421 return (__m128) __A; 1422} 1423 1424static __inline __m128i __attribute__((__always_inline__)) 1425_mm_castpd_si128(__m128d __A) 1426{ 1427 return (__m128i) __A; 1428} 1429 1430static __inline __m128d __attribute__((__always_inline__)) 1431_mm_castps_pd(__m128 __A) 1432{ 1433 return (__m128d) __A; 1434} 1435 1436static __inline __m128i __attribute__((__always_inline__)) 1437_mm_castps_si128(__m128 __A) 1438{ 1439 return (__m128i) __A; 1440} 1441 1442static __inline __m128 __attribute__((__always_inline__)) 1443_mm_castsi128_ps(__m128i __A) 1444{ 1445 return (__m128) __A; 1446} 1447 1448static __inline __m128d __attribute__((__always_inline__)) 1449_mm_castsi128_pd(__m128i __A) 1450{ 1451 return (__m128d) __A; 1452} 1453 1454#endif /* __SSE2__ */ 1455 1456#endif /* _EMMINTRIN_H_INCLUDED */ 1457