/* emmintrin.h — GCC SSE2 intrinsics header (revision 259111).  */
1/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. 2 3 This file is part of GCC. 4 5 GCC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 GCC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with GCC; see the file COPYING. If not, write to 17 the Free Software Foundation, 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. */ 19 20/* As a special exception, if you include this header file into source 21 files compiled by GCC, this header file does not by itself cause 22 the resulting executable to be covered by the GNU General Public 23 License. This exception does not however invalidate any other 24 reasons why the executable file might be covered by the GNU General 25 Public License. */ 26 27/* Implemented from the specification included in the Intel C++ Compiler 28 User Guide and Reference, version 9.0. */ 29 30#ifndef _EMMINTRIN_H_INCLUDED 31#define _EMMINTRIN_H_INCLUDED 32 33#ifndef __SSE2__ 34# error "SSE2 instruction set not enabled" 35#else 36 37/* We need definitions from the SSE header files*/ 38#include <xmmintrin.h> 39 40/* SSE2 */ 41typedef double __v2df __attribute__ ((__vector_size__ (16))); 42typedef long long __v2di __attribute__ ((__vector_size__ (16))); 43typedef int __v4si __attribute__ ((__vector_size__ (16))); 44typedef short __v8hi __attribute__ ((__vector_size__ (16))); 45typedef char __v16qi __attribute__ ((__vector_size__ (16))); 46 47/* The Intel API is flexible enough that we must allow aliasing with other 48 vector types, and their scalar components. 
*/ 49typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); 50typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); 51 52/* Create a selector for use with the SHUFPD instruction. */ 53#define _MM_SHUFFLE2(fp1,fp0) \ 54 (((fp1) << 1) | (fp0)) 55 56/* Create a vector with element 0 as F and the rest zero. */ 57static __inline __m128d __attribute__((__always_inline__)) 58_mm_set_sd (double __F) 59{ 60 return __extension__ (__m128d){ __F, 0 }; 61} 62 63/* Create a vector with both elements equal to F. */ 64static __inline __m128d __attribute__((__always_inline__)) 65_mm_set1_pd (double __F) 66{ 67 return __extension__ (__m128d){ __F, __F }; 68} 69 70static __inline __m128d __attribute__((__always_inline__)) 71_mm_set_pd1 (double __F) 72{ 73 return _mm_set1_pd (__F); 74} 75 76/* Create a vector with the lower value X and upper value W. */ 77static __inline __m128d __attribute__((__always_inline__)) 78_mm_set_pd (double __W, double __X) 79{ 80 return __extension__ (__m128d){ __X, __W }; 81} 82 83/* Create a vector with the lower value W and upper value X. */ 84static __inline __m128d __attribute__((__always_inline__)) 85_mm_setr_pd (double __W, double __X) 86{ 87 return __extension__ (__m128d){ __W, __X }; 88} 89 90/* Create a vector of zeros. */ 91static __inline __m128d __attribute__((__always_inline__)) 92_mm_setzero_pd (void) 93{ 94 return __extension__ (__m128d){ 0.0, 0.0 }; 95} 96 97/* Sets the low DPFP value of A from the low value of B. */ 98static __inline __m128d __attribute__((__always_inline__)) 99_mm_move_sd (__m128d __A, __m128d __B) 100{ 101 return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); 102} 103 104/* Load two DPFP values from P. The address must be 16-byte aligned. */ 105static __inline __m128d __attribute__((__always_inline__)) 106_mm_load_pd (double const *__P) 107{ 108 return *(__m128d *)__P; 109} 110 111/* Load two DPFP values from P. The address need not be 16-byte aligned. 
*/ 112static __inline __m128d __attribute__((__always_inline__)) 113_mm_loadu_pd (double const *__P) 114{ 115 return __builtin_ia32_loadupd (__P); 116} 117 118/* Create a vector with all two elements equal to *P. */ 119static __inline __m128d __attribute__((__always_inline__)) 120_mm_load1_pd (double const *__P) 121{ 122 return _mm_set1_pd (*__P); 123} 124 125/* Create a vector with element 0 as *P and the rest zero. */ 126static __inline __m128d __attribute__((__always_inline__)) 127_mm_load_sd (double const *__P) 128{ 129 return _mm_set_sd (*__P); 130} 131 132static __inline __m128d __attribute__((__always_inline__)) 133_mm_load_pd1 (double const *__P) 134{ 135 return _mm_load1_pd (__P); 136} 137 138/* Load two DPFP values in reverse order. The address must be aligned. */ 139static __inline __m128d __attribute__((__always_inline__)) 140_mm_loadr_pd (double const *__P) 141{ 142 __m128d __tmp = _mm_load_pd (__P); 143 return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1)); 144} 145 146/* Store two DPFP values. The address must be 16-byte aligned. */ 147static __inline void __attribute__((__always_inline__)) 148_mm_store_pd (double *__P, __m128d __A) 149{ 150 *(__m128d *)__P = __A; 151} 152 153/* Store two DPFP values. The address need not be 16-byte aligned. */ 154static __inline void __attribute__((__always_inline__)) 155_mm_storeu_pd (double *__P, __m128d __A) 156{ 157 __builtin_ia32_storeupd (__P, __A); 158} 159 160/* Stores the lower DPFP value. */ 161static __inline void __attribute__((__always_inline__)) 162_mm_store_sd (double *__P, __m128d __A) 163{ 164 *__P = __builtin_ia32_vec_ext_v2df (__A, 0); 165} 166 167static __inline double __attribute__((__always_inline__)) 168_mm_cvtsd_f64 (__m128d __A) 169{ 170 return __builtin_ia32_vec_ext_v2df (__A, 0); 171} 172 173static __inline void __attribute__((__always_inline__)) 174_mm_storel_pd (double *__P, __m128d __A) 175{ 176 _mm_store_sd (__P, __A); 177} 178 179/* Stores the upper DPFP value. 
*/ 180static __inline void __attribute__((__always_inline__)) 181_mm_storeh_pd (double *__P, __m128d __A) 182{ 183 *__P = __builtin_ia32_vec_ext_v2df (__A, 1); 184} 185 186/* Store the lower DPFP value across two words. 187 The address must be 16-byte aligned. */ 188static __inline void __attribute__((__always_inline__)) 189_mm_store1_pd (double *__P, __m128d __A) 190{ 191 _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0))); 192} 193 194static __inline void __attribute__((__always_inline__)) 195_mm_store_pd1 (double *__P, __m128d __A) 196{ 197 _mm_store1_pd (__P, __A); 198} 199 200/* Store two DPFP values in reverse order. The address must be aligned. */ 201static __inline void __attribute__((__always_inline__)) 202_mm_storer_pd (double *__P, __m128d __A) 203{ 204 _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1))); 205} 206 207static __inline int __attribute__((__always_inline__)) 208_mm_cvtsi128_si32 (__m128i __A) 209{ 210 return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0); 211} 212 213#ifdef __x86_64__ 214/* Intel intrinsic. */ 215static __inline long long __attribute__((__always_inline__)) 216_mm_cvtsi128_si64 (__m128i __A) 217{ 218 return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0); 219} 220 221/* Microsoft intrinsic. 
*/ 222static __inline long long __attribute__((__always_inline__)) 223_mm_cvtsi128_si64x (__m128i __A) 224{ 225 return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0); 226} 227#endif 228 229static __inline __m128d __attribute__((__always_inline__)) 230_mm_add_pd (__m128d __A, __m128d __B) 231{ 232 return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B); 233} 234 235static __inline __m128d __attribute__((__always_inline__)) 236_mm_add_sd (__m128d __A, __m128d __B) 237{ 238 return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B); 239} 240 241static __inline __m128d __attribute__((__always_inline__)) 242_mm_sub_pd (__m128d __A, __m128d __B) 243{ 244 return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B); 245} 246 247static __inline __m128d __attribute__((__always_inline__)) 248_mm_sub_sd (__m128d __A, __m128d __B) 249{ 250 return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B); 251} 252 253static __inline __m128d __attribute__((__always_inline__)) 254_mm_mul_pd (__m128d __A, __m128d __B) 255{ 256 return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B); 257} 258 259static __inline __m128d __attribute__((__always_inline__)) 260_mm_mul_sd (__m128d __A, __m128d __B) 261{ 262 return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B); 263} 264 265static __inline __m128d __attribute__((__always_inline__)) 266_mm_div_pd (__m128d __A, __m128d __B) 267{ 268 return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B); 269} 270 271static __inline __m128d __attribute__((__always_inline__)) 272_mm_div_sd (__m128d __A, __m128d __B) 273{ 274 return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B); 275} 276 277static __inline __m128d __attribute__((__always_inline__)) 278_mm_sqrt_pd (__m128d __A) 279{ 280 return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A); 281} 282 283/* Return pair {sqrt (A[0), B[1]}. 
*/ 284static __inline __m128d __attribute__((__always_inline__)) 285_mm_sqrt_sd (__m128d __A, __m128d __B) 286{ 287 __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); 288 return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp); 289} 290 291static __inline __m128d __attribute__((__always_inline__)) 292_mm_min_pd (__m128d __A, __m128d __B) 293{ 294 return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B); 295} 296 297static __inline __m128d __attribute__((__always_inline__)) 298_mm_min_sd (__m128d __A, __m128d __B) 299{ 300 return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B); 301} 302 303static __inline __m128d __attribute__((__always_inline__)) 304_mm_max_pd (__m128d __A, __m128d __B) 305{ 306 return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B); 307} 308 309static __inline __m128d __attribute__((__always_inline__)) 310_mm_max_sd (__m128d __A, __m128d __B) 311{ 312 return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B); 313} 314 315static __inline __m128d __attribute__((__always_inline__)) 316_mm_and_pd (__m128d __A, __m128d __B) 317{ 318 return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B); 319} 320 321static __inline __m128d __attribute__((__always_inline__)) 322_mm_andnot_pd (__m128d __A, __m128d __B) 323{ 324 return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B); 325} 326 327static __inline __m128d __attribute__((__always_inline__)) 328_mm_or_pd (__m128d __A, __m128d __B) 329{ 330 return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B); 331} 332 333static __inline __m128d __attribute__((__always_inline__)) 334_mm_xor_pd (__m128d __A, __m128d __B) 335{ 336 return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B); 337} 338 339static __inline __m128d __attribute__((__always_inline__)) 340_mm_cmpeq_pd (__m128d __A, __m128d __B) 341{ 342 return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B); 343} 344 345static __inline __m128d __attribute__((__always_inline__)) 
346_mm_cmplt_pd (__m128d __A, __m128d __B) 347{ 348 return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B); 349} 350 351static __inline __m128d __attribute__((__always_inline__)) 352_mm_cmple_pd (__m128d __A, __m128d __B) 353{ 354 return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B); 355} 356 357static __inline __m128d __attribute__((__always_inline__)) 358_mm_cmpgt_pd (__m128d __A, __m128d __B) 359{ 360 return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B); 361} 362 363static __inline __m128d __attribute__((__always_inline__)) 364_mm_cmpge_pd (__m128d __A, __m128d __B) 365{ 366 return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B); 367} 368 369static __inline __m128d __attribute__((__always_inline__)) 370_mm_cmpneq_pd (__m128d __A, __m128d __B) 371{ 372 return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B); 373} 374 375static __inline __m128d __attribute__((__always_inline__)) 376_mm_cmpnlt_pd (__m128d __A, __m128d __B) 377{ 378 return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B); 379} 380 381static __inline __m128d __attribute__((__always_inline__)) 382_mm_cmpnle_pd (__m128d __A, __m128d __B) 383{ 384 return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B); 385} 386 387static __inline __m128d __attribute__((__always_inline__)) 388_mm_cmpngt_pd (__m128d __A, __m128d __B) 389{ 390 return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B); 391} 392 393static __inline __m128d __attribute__((__always_inline__)) 394_mm_cmpnge_pd (__m128d __A, __m128d __B) 395{ 396 return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B); 397} 398 399static __inline __m128d __attribute__((__always_inline__)) 400_mm_cmpord_pd (__m128d __A, __m128d __B) 401{ 402 return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B); 403} 404 405static __inline __m128d __attribute__((__always_inline__)) 406_mm_cmpunord_pd (__m128d __A, __m128d __B) 407{ 408 return 
(__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B); 409} 410 411static __inline __m128d __attribute__((__always_inline__)) 412_mm_cmpeq_sd (__m128d __A, __m128d __B) 413{ 414 return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B); 415} 416 417static __inline __m128d __attribute__((__always_inline__)) 418_mm_cmplt_sd (__m128d __A, __m128d __B) 419{ 420 return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B); 421} 422 423static __inline __m128d __attribute__((__always_inline__)) 424_mm_cmple_sd (__m128d __A, __m128d __B) 425{ 426 return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B); 427} 428 429static __inline __m128d __attribute__((__always_inline__)) 430_mm_cmpgt_sd (__m128d __A, __m128d __B) 431{ 432 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 433 (__v2df) 434 __builtin_ia32_cmpltsd ((__v2df) __B, 435 (__v2df) 436 __A)); 437} 438 439static __inline __m128d __attribute__((__always_inline__)) 440_mm_cmpge_sd (__m128d __A, __m128d __B) 441{ 442 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 443 (__v2df) 444 __builtin_ia32_cmplesd ((__v2df) __B, 445 (__v2df) 446 __A)); 447} 448 449static __inline __m128d __attribute__((__always_inline__)) 450_mm_cmpneq_sd (__m128d __A, __m128d __B) 451{ 452 return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B); 453} 454 455static __inline __m128d __attribute__((__always_inline__)) 456_mm_cmpnlt_sd (__m128d __A, __m128d __B) 457{ 458 return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B); 459} 460 461static __inline __m128d __attribute__((__always_inline__)) 462_mm_cmpnle_sd (__m128d __A, __m128d __B) 463{ 464 return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B); 465} 466 467static __inline __m128d __attribute__((__always_inline__)) 468_mm_cmpngt_sd (__m128d __A, __m128d __B) 469{ 470 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 471 (__v2df) 472 __builtin_ia32_cmpnltsd ((__v2df) __B, 473 (__v2df) 474 __A)); 475} 476 477static 
__inline __m128d __attribute__((__always_inline__)) 478_mm_cmpnge_sd (__m128d __A, __m128d __B) 479{ 480 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 481 (__v2df) 482 __builtin_ia32_cmpnlesd ((__v2df) __B, 483 (__v2df) 484 __A)); 485} 486 487static __inline __m128d __attribute__((__always_inline__)) 488_mm_cmpord_sd (__m128d __A, __m128d __B) 489{ 490 return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B); 491} 492 493static __inline __m128d __attribute__((__always_inline__)) 494_mm_cmpunord_sd (__m128d __A, __m128d __B) 495{ 496 return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B); 497} 498 499static __inline int __attribute__((__always_inline__)) 500_mm_comieq_sd (__m128d __A, __m128d __B) 501{ 502 return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B); 503} 504 505static __inline int __attribute__((__always_inline__)) 506_mm_comilt_sd (__m128d __A, __m128d __B) 507{ 508 return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B); 509} 510 511static __inline int __attribute__((__always_inline__)) 512_mm_comile_sd (__m128d __A, __m128d __B) 513{ 514 return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B); 515} 516 517static __inline int __attribute__((__always_inline__)) 518_mm_comigt_sd (__m128d __A, __m128d __B) 519{ 520 return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B); 521} 522 523static __inline int __attribute__((__always_inline__)) 524_mm_comige_sd (__m128d __A, __m128d __B) 525{ 526 return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B); 527} 528 529static __inline int __attribute__((__always_inline__)) 530_mm_comineq_sd (__m128d __A, __m128d __B) 531{ 532 return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B); 533} 534 535static __inline int __attribute__((__always_inline__)) 536_mm_ucomieq_sd (__m128d __A, __m128d __B) 537{ 538 return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B); 539} 540 541static __inline int __attribute__((__always_inline__)) 542_mm_ucomilt_sd (__m128d __A, __m128d __B) 543{ 
544 return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B); 545} 546 547static __inline int __attribute__((__always_inline__)) 548_mm_ucomile_sd (__m128d __A, __m128d __B) 549{ 550 return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B); 551} 552 553static __inline int __attribute__((__always_inline__)) 554_mm_ucomigt_sd (__m128d __A, __m128d __B) 555{ 556 return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B); 557} 558 559static __inline int __attribute__((__always_inline__)) 560_mm_ucomige_sd (__m128d __A, __m128d __B) 561{ 562 return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B); 563} 564 565static __inline int __attribute__((__always_inline__)) 566_mm_ucomineq_sd (__m128d __A, __m128d __B) 567{ 568 return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B); 569} 570 571/* Create a vector of Qi, where i is the element number. */ 572 573static __inline __m128i __attribute__((__always_inline__)) 574_mm_set_epi64x (long long __q1, long long __q0) 575{ 576 return __extension__ (__m128i)(__v2di){ __q0, __q1 }; 577} 578 579static __inline __m128i __attribute__((__always_inline__)) 580_mm_set_epi64 (__m64 __q1, __m64 __q0) 581{ 582 return _mm_set_epi64x ((long long)__q1, (long long)__q0); 583} 584 585static __inline __m128i __attribute__((__always_inline__)) 586_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) 587{ 588 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; 589} 590 591static __inline __m128i __attribute__((__always_inline__)) 592_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, 593 short __q3, short __q2, short __q1, short __q0) 594{ 595 return __extension__ (__m128i)(__v8hi){ 596 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; 597} 598 599static __inline __m128i __attribute__((__always_inline__)) 600_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, 601 char __q11, char __q10, char __q09, char __q08, 602 char __q07, char __q06, char __q05, char __q04, 603 char __q03, char __q02, char __q01, 
char __q00) 604{ 605 return __extension__ (__m128i)(__v16qi){ 606 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, 607 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 608 }; 609} 610 611/* Set all of the elements of the vector to A. */ 612 613static __inline __m128i __attribute__((__always_inline__)) 614_mm_set1_epi64x (long long __A) 615{ 616 return _mm_set_epi64x (__A, __A); 617} 618 619static __inline __m128i __attribute__((__always_inline__)) 620_mm_set1_epi64 (__m64 __A) 621{ 622 return _mm_set_epi64 (__A, __A); 623} 624 625static __inline __m128i __attribute__((__always_inline__)) 626_mm_set1_epi32 (int __A) 627{ 628 return _mm_set_epi32 (__A, __A, __A, __A); 629} 630 631static __inline __m128i __attribute__((__always_inline__)) 632_mm_set1_epi16 (short __A) 633{ 634 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); 635} 636 637static __inline __m128i __attribute__((__always_inline__)) 638_mm_set1_epi8 (char __A) 639{ 640 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, 641 __A, __A, __A, __A, __A, __A, __A, __A); 642} 643 644/* Create a vector of Qi, where i is the element number. 645 The parameter order is reversed from the _mm_set_epi* functions. 
*/ 646 647static __inline __m128i __attribute__((__always_inline__)) 648_mm_setr_epi64 (__m64 __q0, __m64 __q1) 649{ 650 return _mm_set_epi64 (__q1, __q0); 651} 652 653static __inline __m128i __attribute__((__always_inline__)) 654_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) 655{ 656 return _mm_set_epi32 (__q3, __q2, __q1, __q0); 657} 658 659static __inline __m128i __attribute__((__always_inline__)) 660_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, 661 short __q4, short __q5, short __q6, short __q7) 662{ 663 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); 664} 665 666static __inline __m128i __attribute__((__always_inline__)) 667_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, 668 char __q04, char __q05, char __q06, char __q07, 669 char __q08, char __q09, char __q10, char __q11, 670 char __q12, char __q13, char __q14, char __q15) 671{ 672 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, 673 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); 674} 675 676/* Create a vector with element 0 as *P and the rest zero. 
*/ 677 678static __inline __m128i __attribute__((__always_inline__)) 679_mm_load_si128 (__m128i const *__P) 680{ 681 return *__P; 682} 683 684static __inline __m128i __attribute__((__always_inline__)) 685_mm_loadu_si128 (__m128i const *__P) 686{ 687 return (__m128i) __builtin_ia32_loaddqu ((char const *)__P); 688} 689 690static __inline __m128i __attribute__((__always_inline__)) 691_mm_loadl_epi64 (__m128i const *__P) 692{ 693 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P); 694} 695 696static __inline void __attribute__((__always_inline__)) 697_mm_store_si128 (__m128i *__P, __m128i __B) 698{ 699 *__P = __B; 700} 701 702static __inline void __attribute__((__always_inline__)) 703_mm_storeu_si128 (__m128i *__P, __m128i __B) 704{ 705 __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B); 706} 707 708static __inline void __attribute__((__always_inline__)) 709_mm_storel_epi64 (__m128i *__P, __m128i __B) 710{ 711 *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0); 712} 713 714static __inline __m64 __attribute__((__always_inline__)) 715_mm_movepi64_pi64 (__m128i __B) 716{ 717 return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0); 718} 719 720static __inline __m128i __attribute__((__always_inline__)) 721_mm_movpi64_epi64 (__m64 __A) 722{ 723 return _mm_set_epi64 ((__m64)0LL, __A); 724} 725 726static __inline __m128i __attribute__((__always_inline__)) 727_mm_move_epi64 (__m128i __A) 728{ 729 return _mm_set_epi64 ((__m64)0LL, _mm_movepi64_pi64 (__A)); 730} 731 732/* Create a vector of zeros. 
*/ 733static __inline __m128i __attribute__((__always_inline__)) 734_mm_setzero_si128 (void) 735{ 736 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; 737} 738 739static __inline __m128d __attribute__((__always_inline__)) 740_mm_cvtepi32_pd (__m128i __A) 741{ 742 return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A); 743} 744 745static __inline __m128 __attribute__((__always_inline__)) 746_mm_cvtepi32_ps (__m128i __A) 747{ 748 return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A); 749} 750 751static __inline __m128i __attribute__((__always_inline__)) 752_mm_cvtpd_epi32 (__m128d __A) 753{ 754 return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A); 755} 756 757static __inline __m64 __attribute__((__always_inline__)) 758_mm_cvtpd_pi32 (__m128d __A) 759{ 760 return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A); 761} 762 763static __inline __m128 __attribute__((__always_inline__)) 764_mm_cvtpd_ps (__m128d __A) 765{ 766 return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A); 767} 768 769static __inline __m128i __attribute__((__always_inline__)) 770_mm_cvttpd_epi32 (__m128d __A) 771{ 772 return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A); 773} 774 775static __inline __m64 __attribute__((__always_inline__)) 776_mm_cvttpd_pi32 (__m128d __A) 777{ 778 return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A); 779} 780 781static __inline __m128d __attribute__((__always_inline__)) 782_mm_cvtpi32_pd (__m64 __A) 783{ 784 return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A); 785} 786 787static __inline __m128i __attribute__((__always_inline__)) 788_mm_cvtps_epi32 (__m128 __A) 789{ 790 return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A); 791} 792 793static __inline __m128i __attribute__((__always_inline__)) 794_mm_cvttps_epi32 (__m128 __A) 795{ 796 return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A); 797} 798 799static __inline __m128d __attribute__((__always_inline__)) 800_mm_cvtps_pd (__m128 __A) 801{ 802 return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A); 803} 
804 805static __inline int __attribute__((__always_inline__)) 806_mm_cvtsd_si32 (__m128d __A) 807{ 808 return __builtin_ia32_cvtsd2si ((__v2df) __A); 809} 810 811#ifdef __x86_64__ 812/* Intel intrinsic. */ 813static __inline long long __attribute__((__always_inline__)) 814_mm_cvtsd_si64 (__m128d __A) 815{ 816 return __builtin_ia32_cvtsd2si64 ((__v2df) __A); 817} 818 819/* Microsoft intrinsic. */ 820static __inline long long __attribute__((__always_inline__)) 821_mm_cvtsd_si64x (__m128d __A) 822{ 823 return __builtin_ia32_cvtsd2si64 ((__v2df) __A); 824} 825#endif 826 827static __inline int __attribute__((__always_inline__)) 828_mm_cvttsd_si32 (__m128d __A) 829{ 830 return __builtin_ia32_cvttsd2si ((__v2df) __A); 831} 832 833#ifdef __x86_64__ 834/* Intel intrinsic. */ 835static __inline long long __attribute__((__always_inline__)) 836_mm_cvttsd_si64 (__m128d __A) 837{ 838 return __builtin_ia32_cvttsd2si64 ((__v2df) __A); 839} 840 841/* Microsoft intrinsic. */ 842static __inline long long __attribute__((__always_inline__)) 843_mm_cvttsd_si64x (__m128d __A) 844{ 845 return __builtin_ia32_cvttsd2si64 ((__v2df) __A); 846} 847#endif 848 849static __inline __m128 __attribute__((__always_inline__)) 850_mm_cvtsd_ss (__m128 __A, __m128d __B) 851{ 852 return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B); 853} 854 855static __inline __m128d __attribute__((__always_inline__)) 856_mm_cvtsi32_sd (__m128d __A, int __B) 857{ 858 return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B); 859} 860 861#ifdef __x86_64__ 862/* Intel intrinsic. */ 863static __inline __m128d __attribute__((__always_inline__)) 864_mm_cvtsi64_sd (__m128d __A, long long __B) 865{ 866 return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); 867} 868 869/* Microsoft intrinsic. 
*/ 870static __inline __m128d __attribute__((__always_inline__)) 871_mm_cvtsi64x_sd (__m128d __A, long long __B) 872{ 873 return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); 874} 875#endif 876 877static __inline __m128d __attribute__((__always_inline__)) 878_mm_cvtss_sd (__m128d __A, __m128 __B) 879{ 880 return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B); 881} 882 883#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C))) 884 885static __inline __m128d __attribute__((__always_inline__)) 886_mm_unpackhi_pd (__m128d __A, __m128d __B) 887{ 888 return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B); 889} 890 891static __inline __m128d __attribute__((__always_inline__)) 892_mm_unpacklo_pd (__m128d __A, __m128d __B) 893{ 894 return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B); 895} 896 897static __inline __m128d __attribute__((__always_inline__)) 898_mm_loadh_pd (__m128d __A, double const *__B) 899{ 900 return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B); 901} 902 903static __inline __m128d __attribute__((__always_inline__)) 904_mm_loadl_pd (__m128d __A, double const *__B) 905{ 906 return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B); 907} 908 909static __inline int __attribute__((__always_inline__)) 910_mm_movemask_pd (__m128d __A) 911{ 912 return __builtin_ia32_movmskpd ((__v2df)__A); 913} 914 915static __inline __m128i __attribute__((__always_inline__)) 916_mm_packs_epi16 (__m128i __A, __m128i __B) 917{ 918 return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B); 919} 920 921static __inline __m128i __attribute__((__always_inline__)) 922_mm_packs_epi32 (__m128i __A, __m128i __B) 923{ 924 return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B); 925} 926 927static __inline __m128i __attribute__((__always_inline__)) 928_mm_packus_epi16 (__m128i __A, __m128i __B) 929{ 930 return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, 
(__v8hi)__B); 931} 932 933static __inline __m128i __attribute__((__always_inline__)) 934_mm_unpackhi_epi8 (__m128i __A, __m128i __B) 935{ 936 return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B); 937} 938 939static __inline __m128i __attribute__((__always_inline__)) 940_mm_unpackhi_epi16 (__m128i __A, __m128i __B) 941{ 942 return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B); 943} 944 945static __inline __m128i __attribute__((__always_inline__)) 946_mm_unpackhi_epi32 (__m128i __A, __m128i __B) 947{ 948 return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B); 949} 950 951static __inline __m128i __attribute__((__always_inline__)) 952_mm_unpackhi_epi64 (__m128i __A, __m128i __B) 953{ 954 return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B); 955} 956 957static __inline __m128i __attribute__((__always_inline__)) 958_mm_unpacklo_epi8 (__m128i __A, __m128i __B) 959{ 960 return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B); 961} 962 963static __inline __m128i __attribute__((__always_inline__)) 964_mm_unpacklo_epi16 (__m128i __A, __m128i __B) 965{ 966 return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B); 967} 968 969static __inline __m128i __attribute__((__always_inline__)) 970_mm_unpacklo_epi32 (__m128i __A, __m128i __B) 971{ 972 return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B); 973} 974 975static __inline __m128i __attribute__((__always_inline__)) 976_mm_unpacklo_epi64 (__m128i __A, __m128i __B) 977{ 978 return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); 979} 980 981static __inline __m128i __attribute__((__always_inline__)) 982_mm_add_epi8 (__m128i __A, __m128i __B) 983{ 984 return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B); 985} 986 987static __inline __m128i __attribute__((__always_inline__)) 988_mm_add_epi16 (__m128i __A, __m128i __B) 989{ 990 return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, 
(__v8hi)__B); 991} 992 993static __inline __m128i __attribute__((__always_inline__)) 994_mm_add_epi32 (__m128i __A, __m128i __B) 995{ 996 return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B); 997} 998 999static __inline __m128i __attribute__((__always_inline__)) 1000_mm_add_epi64 (__m128i __A, __m128i __B) 1001{ 1002 return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B); 1003} 1004 1005static __inline __m128i __attribute__((__always_inline__)) 1006_mm_adds_epi8 (__m128i __A, __m128i __B) 1007{ 1008 return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); 1009} 1010 1011static __inline __m128i __attribute__((__always_inline__)) 1012_mm_adds_epi16 (__m128i __A, __m128i __B) 1013{ 1014 return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B); 1015} 1016 1017static __inline __m128i __attribute__((__always_inline__)) 1018_mm_adds_epu8 (__m128i __A, __m128i __B) 1019{ 1020 return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B); 1021} 1022 1023static __inline __m128i __attribute__((__always_inline__)) 1024_mm_adds_epu16 (__m128i __A, __m128i __B) 1025{ 1026 return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); 1027} 1028 1029static __inline __m128i __attribute__((__always_inline__)) 1030_mm_sub_epi8 (__m128i __A, __m128i __B) 1031{ 1032 return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B); 1033} 1034 1035static __inline __m128i __attribute__((__always_inline__)) 1036_mm_sub_epi16 (__m128i __A, __m128i __B) 1037{ 1038 return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B); 1039} 1040 1041static __inline __m128i __attribute__((__always_inline__)) 1042_mm_sub_epi32 (__m128i __A, __m128i __B) 1043{ 1044 return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B); 1045} 1046 1047static __inline __m128i __attribute__((__always_inline__)) 1048_mm_sub_epi64 (__m128i __A, __m128i __B) 1049{ 1050 return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B); 
1051} 1052 1053static __inline __m128i __attribute__((__always_inline__)) 1054_mm_subs_epi8 (__m128i __A, __m128i __B) 1055{ 1056 return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); 1057} 1058 1059static __inline __m128i __attribute__((__always_inline__)) 1060_mm_subs_epi16 (__m128i __A, __m128i __B) 1061{ 1062 return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B); 1063} 1064 1065static __inline __m128i __attribute__((__always_inline__)) 1066_mm_subs_epu8 (__m128i __A, __m128i __B) 1067{ 1068 return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B); 1069} 1070 1071static __inline __m128i __attribute__((__always_inline__)) 1072_mm_subs_epu16 (__m128i __A, __m128i __B) 1073{ 1074 return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B); 1075} 1076 1077static __inline __m128i __attribute__((__always_inline__)) 1078_mm_madd_epi16 (__m128i __A, __m128i __B) 1079{ 1080 return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B); 1081} 1082 1083static __inline __m128i __attribute__((__always_inline__)) 1084_mm_mulhi_epi16 (__m128i __A, __m128i __B) 1085{ 1086 return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); 1087} 1088 1089static __inline __m128i __attribute__((__always_inline__)) 1090_mm_mullo_epi16 (__m128i __A, __m128i __B) 1091{ 1092 return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B); 1093} 1094 1095static __inline __m64 __attribute__((__always_inline__)) 1096_mm_mul_su32 (__m64 __A, __m64 __B) 1097{ 1098 return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); 1099} 1100 1101static __inline __m128i __attribute__((__always_inline__)) 1102_mm_mul_epu32 (__m128i __A, __m128i __B) 1103{ 1104 return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B); 1105} 1106 1107#if 0 1108static __inline __m128i __attribute__((__always_inline__)) 1109_mm_slli_epi16 (__m128i __A, int __B) 1110{ 1111 return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); 1112} 
1113 1114static __inline __m128i __attribute__((__always_inline__)) 1115_mm_slli_epi32 (__m128i __A, int __B) 1116{ 1117 return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); 1118} 1119 1120static __inline __m128i __attribute__((__always_inline__)) 1121_mm_slli_epi64 (__m128i __A, int __B) 1122{ 1123 return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B); 1124} 1125#else 1126#define _mm_slli_epi16(__A, __B) \ 1127 ((__m128i)__builtin_ia32_psllwi128 ((__v8hi)(__A), __B)) 1128#define _mm_slli_epi32(__A, __B) \ 1129 ((__m128i)__builtin_ia32_pslldi128 ((__v4si)(__A), __B)) 1130#define _mm_slli_epi64(__A, __B) \ 1131 ((__m128i)__builtin_ia32_psllqi128 ((__v2di)(__A), __B)) 1132#endif 1133 1134#if 0 1135static __inline __m128i __attribute__((__always_inline__)) 1136_mm_srai_epi16 (__m128i __A, int __B) 1137{ 1138 return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B); 1139} 1140 1141static __inline __m128i __attribute__((__always_inline__)) 1142_mm_srai_epi32 (__m128i __A, int __B) 1143{ 1144 return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); 1145} 1146#else 1147#define _mm_srai_epi16(__A, __B) \ 1148 ((__m128i)__builtin_ia32_psrawi128 ((__v8hi)(__A), __B)) 1149#define _mm_srai_epi32(__A, __B) \ 1150 ((__m128i)__builtin_ia32_psradi128 ((__v4si)(__A), __B)) 1151#endif 1152 1153#if 0 1154static __m128i __attribute__((__always_inline__)) 1155_mm_srli_si128 (__m128i __A, int __B) 1156{ 1157 return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B * 8)); 1158} 1159 1160static __m128i __attribute__((__always_inline__)) 1161_mm_srli_si128 (__m128i __A, int __B) 1162{ 1163 return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B * 8)); 1164} 1165#else 1166#define _mm_srli_si128(__A, __B) \ 1167 ((__m128i)__builtin_ia32_psrldqi128 (__A, (__B) * 8)) 1168#define _mm_slli_si128(__A, __B) \ 1169 ((__m128i)__builtin_ia32_pslldqi128 (__A, (__B) * 8)) 1170#endif 1171 1172#if 0 1173static __inline __m128i __attribute__((__always_inline__)) 1174_mm_srli_epi16 (__m128i 
__A, int __B) 1175{ 1176 return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); 1177} 1178 1179static __inline __m128i __attribute__((__always_inline__)) 1180_mm_srli_epi32 (__m128i __A, int __B) 1181{ 1182 return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); 1183} 1184 1185static __inline __m128i __attribute__((__always_inline__)) 1186_mm_srli_epi64 (__m128i __A, int __B) 1187{ 1188 return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); 1189} 1190#else 1191#define _mm_srli_epi16(__A, __B) \ 1192 ((__m128i)__builtin_ia32_psrlwi128 ((__v8hi)(__A), __B)) 1193#define _mm_srli_epi32(__A, __B) \ 1194 ((__m128i)__builtin_ia32_psrldi128 ((__v4si)(__A), __B)) 1195#define _mm_srli_epi64(__A, __B) \ 1196 ((__m128i)__builtin_ia32_psrlqi128 ((__v4si)(__A), __B)) 1197#endif 1198 1199static __inline __m128i __attribute__((__always_inline__)) 1200_mm_sll_epi16 (__m128i __A, __m128i __B) 1201{ 1202 return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B); 1203} 1204 1205static __inline __m128i __attribute__((__always_inline__)) 1206_mm_sll_epi32 (__m128i __A, __m128i __B) 1207{ 1208 return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B); 1209} 1210 1211static __inline __m128i __attribute__((__always_inline__)) 1212_mm_sll_epi64 (__m128i __A, __m128i __B) 1213{ 1214 return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B); 1215} 1216 1217static __inline __m128i __attribute__((__always_inline__)) 1218_mm_sra_epi16 (__m128i __A, __m128i __B) 1219{ 1220 return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B); 1221} 1222 1223static __inline __m128i __attribute__((__always_inline__)) 1224_mm_sra_epi32 (__m128i __A, __m128i __B) 1225{ 1226 return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B); 1227} 1228 1229static __inline __m128i __attribute__((__always_inline__)) 1230_mm_srl_epi16 (__m128i __A, __m128i __B) 1231{ 1232 return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B); 1233} 1234 1235static 
__inline __m128i __attribute__((__always_inline__)) 1236_mm_srl_epi32 (__m128i __A, __m128i __B) 1237{ 1238 return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B); 1239} 1240 1241static __inline __m128i __attribute__((__always_inline__)) 1242_mm_srl_epi64 (__m128i __A, __m128i __B) 1243{ 1244 return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B); 1245} 1246 1247static __inline __m128i __attribute__((__always_inline__)) 1248_mm_and_si128 (__m128i __A, __m128i __B) 1249{ 1250 return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B); 1251} 1252 1253static __inline __m128i __attribute__((__always_inline__)) 1254_mm_andnot_si128 (__m128i __A, __m128i __B) 1255{ 1256 return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B); 1257} 1258 1259static __inline __m128i __attribute__((__always_inline__)) 1260_mm_or_si128 (__m128i __A, __m128i __B) 1261{ 1262 return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B); 1263} 1264 1265static __inline __m128i __attribute__((__always_inline__)) 1266_mm_xor_si128 (__m128i __A, __m128i __B) 1267{ 1268 return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B); 1269} 1270 1271static __inline __m128i __attribute__((__always_inline__)) 1272_mm_cmpeq_epi8 (__m128i __A, __m128i __B) 1273{ 1274 return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B); 1275} 1276 1277static __inline __m128i __attribute__((__always_inline__)) 1278_mm_cmpeq_epi16 (__m128i __A, __m128i __B) 1279{ 1280 return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B); 1281} 1282 1283static __inline __m128i __attribute__((__always_inline__)) 1284_mm_cmpeq_epi32 (__m128i __A, __m128i __B) 1285{ 1286 return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B); 1287} 1288 1289static __inline __m128i __attribute__((__always_inline__)) 1290_mm_cmplt_epi8 (__m128i __A, __m128i __B) 1291{ 1292 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A); 1293} 1294 1295static 
__inline __m128i __attribute__((__always_inline__)) 1296_mm_cmplt_epi16 (__m128i __A, __m128i __B) 1297{ 1298 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A); 1299} 1300 1301static __inline __m128i __attribute__((__always_inline__)) 1302_mm_cmplt_epi32 (__m128i __A, __m128i __B) 1303{ 1304 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A); 1305} 1306 1307static __inline __m128i __attribute__((__always_inline__)) 1308_mm_cmpgt_epi8 (__m128i __A, __m128i __B) 1309{ 1310 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B); 1311} 1312 1313static __inline __m128i __attribute__((__always_inline__)) 1314_mm_cmpgt_epi16 (__m128i __A, __m128i __B) 1315{ 1316 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B); 1317} 1318 1319static __inline __m128i __attribute__((__always_inline__)) 1320_mm_cmpgt_epi32 (__m128i __A, __m128i __B) 1321{ 1322 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B); 1323} 1324 1325#if 0 1326static __inline int __attribute__((__always_inline__)) 1327_mm_extract_epi16 (__m128i const __A, int const __N) 1328{ 1329 return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N); 1330} 1331 1332static __inline __m128i __attribute__((__always_inline__)) 1333_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) 1334{ 1335 return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N); 1336} 1337#else 1338#define _mm_extract_epi16(A, N) \ 1339 ((int) __builtin_ia32_vec_ext_v8hi ((__v8hi)(A), (N))) 1340#define _mm_insert_epi16(A, D, N) \ 1341 ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(A), (D), (N))) 1342#endif 1343 1344static __inline __m128i __attribute__((__always_inline__)) 1345_mm_max_epi16 (__m128i __A, __m128i __B) 1346{ 1347 return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B); 1348} 1349 1350static __inline __m128i __attribute__((__always_inline__)) 1351_mm_max_epu8 (__m128i __A, __m128i __B) 1352{ 1353 return 
(__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B); 1354} 1355 1356static __inline __m128i __attribute__((__always_inline__)) 1357_mm_min_epi16 (__m128i __A, __m128i __B) 1358{ 1359 return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B); 1360} 1361 1362static __inline __m128i __attribute__((__always_inline__)) 1363_mm_min_epu8 (__m128i __A, __m128i __B) 1364{ 1365 return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B); 1366} 1367 1368static __inline int __attribute__((__always_inline__)) 1369_mm_movemask_epi8 (__m128i __A) 1370{ 1371 return __builtin_ia32_pmovmskb128 ((__v16qi)__A); 1372} 1373 1374static __inline __m128i __attribute__((__always_inline__)) 1375_mm_mulhi_epu16 (__m128i __A, __m128i __B) 1376{ 1377 return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); 1378} 1379 1380#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __B)) 1381#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __B)) 1382#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)__A, __B)) 1383 1384static __inline void __attribute__((__always_inline__)) 1385_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) 1386{ 1387 __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C); 1388} 1389 1390static __inline __m128i __attribute__((__always_inline__)) 1391_mm_avg_epu8 (__m128i __A, __m128i __B) 1392{ 1393 return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B); 1394} 1395 1396static __inline __m128i __attribute__((__always_inline__)) 1397_mm_avg_epu16 (__m128i __A, __m128i __B) 1398{ 1399 return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B); 1400} 1401 1402static __inline __m128i __attribute__((__always_inline__)) 1403_mm_sad_epu8 (__m128i __A, __m128i __B) 1404{ 1405 return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B); 1406} 1407 1408static __inline void __attribute__((__always_inline__)) 
1409_mm_stream_si32 (int *__A, int __B) 1410{ 1411 __builtin_ia32_movnti (__A, __B); 1412} 1413 1414static __inline void __attribute__((__always_inline__)) 1415_mm_stream_si128 (__m128i *__A, __m128i __B) 1416{ 1417 __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B); 1418} 1419 1420static __inline void __attribute__((__always_inline__)) 1421_mm_stream_pd (double *__A, __m128d __B) 1422{ 1423 __builtin_ia32_movntpd (__A, (__v2df)__B); 1424} 1425 1426static __inline void __attribute__((__always_inline__)) 1427_mm_clflush (void const *__A) 1428{ 1429 __builtin_ia32_clflush (__A); 1430} 1431 1432static __inline void __attribute__((__always_inline__)) 1433_mm_lfence (void) 1434{ 1435 __builtin_ia32_lfence (); 1436} 1437 1438static __inline void __attribute__((__always_inline__)) 1439_mm_mfence (void) 1440{ 1441 __builtin_ia32_mfence (); 1442} 1443 1444static __inline __m128i __attribute__((__always_inline__)) 1445_mm_cvtsi32_si128 (int __A) 1446{ 1447 return _mm_set_epi32 (0, 0, 0, __A); 1448} 1449 1450#ifdef __x86_64__ 1451/* Intel intrinsic. */ 1452static __inline __m128i __attribute__((__always_inline__)) 1453_mm_cvtsi64_si128 (long long __A) 1454{ 1455 return _mm_set_epi64x (0, __A); 1456} 1457 1458/* Microsoft intrinsic. */ 1459static __inline __m128i __attribute__((__always_inline__)) 1460_mm_cvtsi64x_si128 (long long __A) 1461{ 1462 return _mm_set_epi64x (0, __A); 1463} 1464#endif 1465 1466/* Casts between various SP, DP, INT vector types. Note that these do no 1467 conversion of values, they just change the type. 
*/ 1468static __inline __m128 __attribute__((__always_inline__)) 1469_mm_castpd_ps(__m128d __A) 1470{ 1471 return (__m128) __A; 1472} 1473 1474static __inline __m128i __attribute__((__always_inline__)) 1475_mm_castpd_si128(__m128d __A) 1476{ 1477 return (__m128i) __A; 1478} 1479 1480static __inline __m128d __attribute__((__always_inline__)) 1481_mm_castps_pd(__m128 __A) 1482{ 1483 return (__m128d) __A; 1484} 1485 1486static __inline __m128i __attribute__((__always_inline__)) 1487_mm_castps_si128(__m128 __A) 1488{ 1489 return (__m128i) __A; 1490} 1491 1492static __inline __m128 __attribute__((__always_inline__)) 1493_mm_castsi128_ps(__m128i __A) 1494{ 1495 return (__m128) __A; 1496} 1497 1498static __inline __m128d __attribute__((__always_inline__)) 1499_mm_castsi128_pd(__m128i __A) 1500{ 1501 return (__m128d) __A; 1502} 1503 1504#endif /* __SSE2__ */ 1505 1506#endif /* _EMMINTRIN_H_INCLUDED */ 1507