/* emmintrin.h revision 146895 */
1194955Strasz/* Copyright (C) 2003, 2004 Free Software Foundation, Inc. 2194955Strasz 3194955Strasz This file is part of GCC. 4194955Strasz 5194955Strasz GCC is free software; you can redistribute it and/or modify 6194955Strasz it under the terms of the GNU General Public License as published by 7194955Strasz the Free Software Foundation; either version 2, or (at your option) 8194955Strasz any later version. 9194955Strasz 10194955Strasz GCC is distributed in the hope that it will be useful, 11194955Strasz but WITHOUT ANY WARRANTY; without even the implied warranty of 12194955Strasz MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13194955Strasz GNU General Public License for more details. 14194955Strasz 15194955Strasz You should have received a copy of the GNU General Public License 16194955Strasz along with GCC; see the file COPYING. If not, write to 17194955Strasz the Free Software Foundation, 59 Temple Place - Suite 330, 18194955Strasz Boston, MA 02111-1307, USA. */ 19194955Strasz 20194955Strasz/* As a special exception, if you include this header file into source 21194955Strasz files compiled by GCC, this header file does not by itself cause 22194955Strasz the resulting executable to be covered by the GNU General Public 23194955Strasz License. This exception does not however invalidate any other 24194955Strasz reasons why the executable file might be covered by the GNU General 25194955Strasz Public License. */ 26194955Strasz 27194955Strasz/* Implemented from the specification included in the Intel C++ Compiler 28194955Strasz User Guide and Reference, version 8.0. 
*/ 29194955Strasz 30194955Strasz#ifndef _EMMINTRIN_H_INCLUDED 31194955Strasz#define _EMMINTRIN_H_INCLUDED 32194955Strasz 33194955Strasz#ifdef __SSE2__ 34194955Strasz#include <xmmintrin.h> 35194955Strasz 36194955Strasz/* SSE2 */ 37194955Strasztypedef double __v2df __attribute__ ((mode (V2DF))); 38194955Strasztypedef int __v2di __attribute__ ((mode (V2DI))); 39194955Strasztypedef int __v4si __attribute__ ((mode (V4SI))); 40194955Strasztypedef int __v8hi __attribute__ ((mode (V8HI))); 41194955Strasztypedef int __v16qi __attribute__ ((mode (V16QI))); 42194955Strasz 43194955Strasz/* Create a selector for use with the SHUFPD instruction. */ 44194955Strasz#define _MM_SHUFFLE2(fp1,fp0) \ 45194955Strasz (((fp1) << 1) | (fp0)) 46194955Strasz 47194955Strasz#define __m128i __v2di 48194955Strasz#define __m128d __v2df 49194955Strasz 50194955Strasz/* Create a vector with element 0 as *P and the rest zero. */ 51290893Sngiestatic __inline __m128d 52194955Strasz_mm_load_sd (double const *__P) 53194955Strasz{ 54194955Strasz return (__m128d) __builtin_ia32_loadsd (__P); 55194955Strasz} 56194955Strasz 57194955Strasz/* Create a vector with all two elements equal to *P. */ 58194955Straszstatic __inline __m128d 59194955Strasz_mm_load1_pd (double const *__P) 60194955Strasz{ 61194955Strasz __v2df __tmp = __builtin_ia32_loadsd (__P); 62194955Strasz return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0)); 63194955Strasz} 64194955Strasz 65194955Straszstatic __inline __m128d 66194955Strasz_mm_load_pd1 (double const *__P) 67194955Strasz{ 68194955Strasz return _mm_load1_pd (__P); 69194955Strasz} 70194955Strasz 71194955Strasz/* Load two DPFP values from P. The address must be 16-byte aligned. */ 72194955Straszstatic __inline __m128d 73194955Strasz_mm_load_pd (double const *__P) 74220465Strasz{ 75220465Strasz return (__m128d) __builtin_ia32_loadapd (__P); 76220465Strasz} 77220465Strasz 78194955Strasz/* Load two DPFP values from P. The address need not be 16-byte aligned. 
*/ 79194955Straszstatic __inline __m128d 80194955Strasz_mm_loadu_pd (double const *__P) 81194955Strasz{ 82194955Strasz return (__m128d) __builtin_ia32_loadupd (__P); 83194955Strasz} 84309485Sngie 85194955Strasz/* Load two DPFP values in reverse order. The address must be aligned. */ 86194955Straszstatic __inline __m128d 87194955Strasz_mm_loadr_pd (double const *__P) 88194955Strasz{ 89194955Strasz __v2df __tmp = __builtin_ia32_loadapd (__P); 90194955Strasz return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1)); 91194955Strasz} 92194955Strasz 93194955Strasz/* Create a vector with element 0 as F and the rest zero. */ 94194955Straszstatic __inline __m128d 95194955Strasz_mm_set_sd (double __F) 96194955Strasz{ 97194955Strasz return (__m128d) __builtin_ia32_loadsd (&__F); 98194955Strasz} 99194955Strasz 100194955Strasz/* Create a vector with all two elements equal to F. */ 101194955Straszstatic __inline __m128d 102194955Strasz_mm_set1_pd (double __F) 103194955Strasz{ 104194955Strasz __v2df __tmp = __builtin_ia32_loadsd (&__F); 105194955Strasz return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0)); 106194955Strasz} 107194955Strasz 108194955Straszstatic __inline __m128d 109194955Strasz_mm_set_pd1 (double __F) 110194955Strasz{ 111194955Strasz return _mm_set1_pd (__F); 112194955Strasz} 113194955Strasz 114194955Strasz/* Create the vector [Z Y]. */ 115194955Straszstatic __inline __m128d 116194955Strasz_mm_set_pd (double __Z, double __Y) 117194955Strasz{ 118194955Strasz return (__v2df) {__Y, __Z}; 119194955Strasz} 120194955Strasz 121194955Strasz/* Create the vector [Y Z]. */ 122194955Straszstatic __inline __m128d 123194955Strasz_mm_setr_pd (double __Z, double __Y) 124194955Strasz{ 125220465Strasz return _mm_set_pd (__Y, __Z); 126194955Strasz} 127194955Strasz 128194955Strasz/* Create a vector of zeros. 
*/ 129194955Straszstatic __inline __m128d 130194955Strasz_mm_setzero_pd (void) 131194955Strasz{ 132194955Strasz return (__m128d) __builtin_ia32_setzeropd (); 133194955Strasz} 134194955Strasz 135194955Strasz/* Stores the lower DPFP value. */ 136194955Straszstatic __inline void 137194955Strasz_mm_store_sd (double *__P, __m128d __A) 138194955Strasz{ 139194955Strasz __builtin_ia32_storesd (__P, (__v2df)__A); 140194955Strasz} 141194955Strasz 142194955Strasz/* Store the lower DPFP value across two words. */ 143194955Straszstatic __inline void 144194955Strasz_mm_store1_pd (double *__P, __m128d __A) 145194955Strasz{ 146194955Strasz __v2df __va = (__v2df)__A; 147194955Strasz __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,0)); 148194955Strasz __builtin_ia32_storeapd (__P, __tmp); 149194955Strasz} 150194955Strasz 151194955Straszstatic __inline void 152194955Strasz_mm_store_pd1 (double *__P, __m128d __A) 153194955Strasz{ 154194955Strasz _mm_store1_pd (__P, __A); 155194955Strasz} 156194955Strasz 157194955Strasz/* Store two DPFP values. The address must be 16-byte aligned. */ 158194955Straszstatic __inline void 159194955Strasz_mm_store_pd (double *__P, __m128d __A) 160194955Strasz{ 161194955Strasz __builtin_ia32_storeapd (__P, (__v2df)__A); 162194955Strasz} 163194955Strasz 164194955Strasz/* Store two DPFP values. The address need not be 16-byte aligned. */ 165194955Straszstatic __inline void 166194955Strasz_mm_storeu_pd (double *__P, __m128d __A) 167194955Strasz{ 168194955Strasz __builtin_ia32_storeupd (__P, (__v2df)__A); 169194955Strasz} 170194955Strasz 171194955Strasz/* Store two DPFP values in reverse order. The address must be aligned. 
*/ 172194955Straszstatic __inline void 173194955Strasz_mm_storer_pd (double *__P, __m128d __A) 174194955Strasz{ 175194955Strasz __v2df __va = (__v2df)__A; 176194955Strasz __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,1)); 177194955Strasz __builtin_ia32_storeapd (__P, __tmp); 178194955Strasz} 179194955Strasz 180194955Strasz/* Sets the low DPFP value of A from the low value of B. */ 181194955Straszstatic __inline __m128d 182194955Strasz_mm_move_sd (__m128d __A, __m128d __B) 183194955Strasz{ 184194955Strasz return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); 185194955Strasz} 186194955Strasz 187194955Strasz 188194955Straszstatic __inline __m128d 189194955Strasz_mm_add_pd (__m128d __A, __m128d __B) 190194955Strasz{ 191194955Strasz return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B); 192194955Strasz} 193194955Strasz 194194955Straszstatic __inline __m128d 195194955Strasz_mm_add_sd (__m128d __A, __m128d __B) 196194955Strasz{ 197194955Strasz return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B); 198194955Strasz} 199194955Strasz 200194955Straszstatic __inline __m128d 201194955Strasz_mm_sub_pd (__m128d __A, __m128d __B) 202194955Strasz{ 203194955Strasz return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B); 204194955Strasz} 205194955Strasz 206194955Straszstatic __inline __m128d 207194955Strasz_mm_sub_sd (__m128d __A, __m128d __B) 208194955Strasz{ 209194955Strasz return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B); 210194955Strasz} 211194955Strasz 212194955Straszstatic __inline __m128d 213194955Strasz_mm_mul_pd (__m128d __A, __m128d __B) 214194955Strasz{ 215194955Strasz return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B); 216194955Strasz} 217194955Strasz 218194955Straszstatic __inline __m128d 219194955Strasz_mm_mul_sd (__m128d __A, __m128d __B) 220194955Strasz{ 221194955Strasz return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B); 222194955Strasz} 223194955Strasz 
224194955Straszstatic __inline __m128d 225194955Strasz_mm_div_pd (__m128d __A, __m128d __B) 226194955Strasz{ 227194955Strasz return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B); 228194955Strasz} 229194955Strasz 230194955Straszstatic __inline __m128d 231194955Strasz_mm_div_sd (__m128d __A, __m128d __B) 232194955Strasz{ 233194955Strasz return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B); 234194955Strasz} 235194955Strasz 236194955Straszstatic __inline __m128d 237194955Strasz_mm_sqrt_pd (__m128d __A) 238194955Strasz{ 239194955Strasz return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A); 240194955Strasz} 241194955Strasz 242194955Strasz/* Return pair {sqrt (A[0), B[1]}. */ 243194955Straszstatic __inline __m128d 244194955Strasz_mm_sqrt_sd (__m128d __A, __m128d __B) 245194955Strasz{ 246194955Strasz __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); 247194955Strasz return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp); 248194955Strasz} 249194955Strasz 250194955Straszstatic __inline __m128d 251194955Strasz_mm_min_pd (__m128d __A, __m128d __B) 252194955Strasz{ 253194955Strasz return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B); 254194955Strasz} 255194955Strasz 256194955Straszstatic __inline __m128d 257194955Strasz_mm_min_sd (__m128d __A, __m128d __B) 258194955Strasz{ 259194955Strasz return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B); 260194955Strasz} 261194955Strasz 262static __inline __m128d 263_mm_max_pd (__m128d __A, __m128d __B) 264{ 265 return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B); 266} 267 268static __inline __m128d 269_mm_max_sd (__m128d __A, __m128d __B) 270{ 271 return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B); 272} 273 274static __inline __m128d 275_mm_and_pd (__m128d __A, __m128d __B) 276{ 277 return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B); 278} 279 280static __inline __m128d 281_mm_andnot_pd (__m128d __A, __m128d __B) 282{ 283 return 
(__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B); 284} 285 286static __inline __m128d 287_mm_or_pd (__m128d __A, __m128d __B) 288{ 289 return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B); 290} 291 292static __inline __m128d 293_mm_xor_pd (__m128d __A, __m128d __B) 294{ 295 return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B); 296} 297 298static __inline __m128d 299_mm_cmpeq_pd (__m128d __A, __m128d __B) 300{ 301 return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B); 302} 303 304static __inline __m128d 305_mm_cmplt_pd (__m128d __A, __m128d __B) 306{ 307 return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B); 308} 309 310static __inline __m128d 311_mm_cmple_pd (__m128d __A, __m128d __B) 312{ 313 return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B); 314} 315 316static __inline __m128d 317_mm_cmpgt_pd (__m128d __A, __m128d __B) 318{ 319 return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B); 320} 321 322static __inline __m128d 323_mm_cmpge_pd (__m128d __A, __m128d __B) 324{ 325 return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B); 326} 327 328static __inline __m128d 329_mm_cmpneq_pd (__m128d __A, __m128d __B) 330{ 331 return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B); 332} 333 334static __inline __m128d 335_mm_cmpnlt_pd (__m128d __A, __m128d __B) 336{ 337 return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B); 338} 339 340static __inline __m128d 341_mm_cmpnle_pd (__m128d __A, __m128d __B) 342{ 343 return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B); 344} 345 346static __inline __m128d 347_mm_cmpngt_pd (__m128d __A, __m128d __B) 348{ 349 return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B); 350} 351 352static __inline __m128d 353_mm_cmpnge_pd (__m128d __A, __m128d __B) 354{ 355 return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B); 356} 357 358static __inline __m128d 359_mm_cmpord_pd (__m128d __A, __m128d __B) 
360{ 361 return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B); 362} 363 364static __inline __m128d 365_mm_cmpunord_pd (__m128d __A, __m128d __B) 366{ 367 return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B); 368} 369 370static __inline __m128d 371_mm_cmpeq_sd (__m128d __A, __m128d __B) 372{ 373 return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B); 374} 375 376static __inline __m128d 377_mm_cmplt_sd (__m128d __A, __m128d __B) 378{ 379 return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B); 380} 381 382static __inline __m128d 383_mm_cmple_sd (__m128d __A, __m128d __B) 384{ 385 return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B); 386} 387 388static __inline __m128d 389_mm_cmpgt_sd (__m128d __A, __m128d __B) 390{ 391 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 392 (__v2df) 393 __builtin_ia32_cmpltsd ((__v2df) __B, 394 (__v2df) 395 __A)); 396} 397 398static __inline __m128d 399_mm_cmpge_sd (__m128d __A, __m128d __B) 400{ 401 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 402 (__v2df) 403 __builtin_ia32_cmplesd ((__v2df) __B, 404 (__v2df) 405 __A)); 406} 407 408static __inline __m128d 409_mm_cmpneq_sd (__m128d __A, __m128d __B) 410{ 411 return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B); 412} 413 414static __inline __m128d 415_mm_cmpnlt_sd (__m128d __A, __m128d __B) 416{ 417 return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B); 418} 419 420static __inline __m128d 421_mm_cmpnle_sd (__m128d __A, __m128d __B) 422{ 423 return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B); 424} 425 426static __inline __m128d 427_mm_cmpngt_sd (__m128d __A, __m128d __B) 428{ 429 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 430 (__v2df) 431 __builtin_ia32_cmpnltsd ((__v2df) __B, 432 (__v2df) 433 __A)); 434} 435 436static __inline __m128d 437_mm_cmpnge_sd (__m128d __A, __m128d __B) 438{ 439 return (__m128d) __builtin_ia32_movsd ((__v2df) __A, 440 (__v2df) 441 
__builtin_ia32_cmpnlesd ((__v2df) __B, 442 (__v2df) 443 __A)); 444} 445 446static __inline __m128d 447_mm_cmpord_sd (__m128d __A, __m128d __B) 448{ 449 return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B); 450} 451 452static __inline __m128d 453_mm_cmpunord_sd (__m128d __A, __m128d __B) 454{ 455 return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B); 456} 457 458static __inline int 459_mm_comieq_sd (__m128d __A, __m128d __B) 460{ 461 return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B); 462} 463 464static __inline int 465_mm_comilt_sd (__m128d __A, __m128d __B) 466{ 467 return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B); 468} 469 470static __inline int 471_mm_comile_sd (__m128d __A, __m128d __B) 472{ 473 return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B); 474} 475 476static __inline int 477_mm_comigt_sd (__m128d __A, __m128d __B) 478{ 479 return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B); 480} 481 482static __inline int 483_mm_comige_sd (__m128d __A, __m128d __B) 484{ 485 return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B); 486} 487 488static __inline int 489_mm_comineq_sd (__m128d __A, __m128d __B) 490{ 491 return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B); 492} 493 494static __inline int 495_mm_ucomieq_sd (__m128d __A, __m128d __B) 496{ 497 return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B); 498} 499 500static __inline int 501_mm_ucomilt_sd (__m128d __A, __m128d __B) 502{ 503 return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B); 504} 505 506static __inline int 507_mm_ucomile_sd (__m128d __A, __m128d __B) 508{ 509 return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B); 510} 511 512static __inline int 513_mm_ucomigt_sd (__m128d __A, __m128d __B) 514{ 515 return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B); 516} 517 518static __inline int 519_mm_ucomige_sd (__m128d __A, __m128d __B) 520{ 521 return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B); 522} 523 524static 
__inline int 525_mm_ucomineq_sd (__m128d __A, __m128d __B) 526{ 527 return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B); 528} 529 530/* Create a vector with element 0 as *P and the rest zero. */ 531 532static __inline __m128i 533_mm_load_si128 (__m128i const *__P) 534{ 535 return (__m128i) __builtin_ia32_loaddqa ((char const *)__P); 536} 537 538static __inline __m128i 539_mm_loadu_si128 (__m128i const *__P) 540{ 541 return (__m128i) __builtin_ia32_loaddqu ((char const *)__P); 542} 543 544static __inline __m128i 545_mm_loadl_epi64 (__m128i const *__P) 546{ 547 return (__m128i) __builtin_ia32_movq2dq (*(unsigned long long *)__P); 548} 549 550static __inline void 551_mm_store_si128 (__m128i *__P, __m128i __B) 552{ 553 __builtin_ia32_storedqa ((char *)__P, (__v16qi)__B); 554} 555 556static __inline void 557_mm_storeu_si128 (__m128i *__P, __m128i __B) 558{ 559 __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B); 560} 561 562static __inline void 563_mm_storel_epi64 (__m128i *__P, __m128i __B) 564{ 565 *(long long *)__P = __builtin_ia32_movdq2q ((__v2di)__B); 566} 567 568static __inline __m64 569_mm_movepi64_pi64 (__m128i __B) 570{ 571 return (__m64) __builtin_ia32_movdq2q ((__v2di)__B); 572} 573 574static __inline __m128i 575_mm_move_epi64 (__m128i __A) 576{ 577 return (__m128i) __builtin_ia32_movq ((__v2di)__A); 578} 579 580/* Create a vector of zeros. */ 581static __inline __m128i 582_mm_setzero_si128 (void) 583{ 584 return (__m128i) __builtin_ia32_setzero128 (); 585} 586 587static __inline __m128i 588_mm_set_epi64 (__m64 __A, __m64 __B) 589{ 590 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); 591 __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B); 592 return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp2, __tmp); 593} 594 595/* Create the vector [Z Y X W]. 
*/ 596static __inline __m128i 597_mm_set_epi32 (int __Z, int __Y, int __X, int __W) 598{ 599 union { 600 int __a[4]; 601 __m128i __v; 602 } __u; 603 604 __u.__a[0] = __W; 605 __u.__a[1] = __X; 606 __u.__a[2] = __Y; 607 __u.__a[3] = __Z; 608 609 return __u.__v; 610} 611 612#ifdef __x86_64__ 613/* Create the vector [Z Y]. */ 614static __inline __m128i 615_mm_set_epi64x (long long __Z, long long __Y) 616{ 617 union { 618 long __a[2]; 619 __m128i __v; 620 } __u; 621 622 __u.__a[0] = __Y; 623 __u.__a[1] = __Z; 624 625 return __u.__v; 626} 627#endif 628 629/* Create the vector [S T U V Z Y X W]. */ 630static __inline __m128i 631_mm_set_epi16 (short __Z, short __Y, short __X, short __W, 632 short __V, short __U, short __T, short __S) 633{ 634 union { 635 short __a[8]; 636 __m128i __v; 637 } __u; 638 639 __u.__a[0] = __S; 640 __u.__a[1] = __T; 641 __u.__a[2] = __U; 642 __u.__a[3] = __V; 643 __u.__a[4] = __W; 644 __u.__a[5] = __X; 645 __u.__a[6] = __Y; 646 __u.__a[7] = __Z; 647 648 return __u.__v; 649} 650 651/* Create the vector [S T U V Z Y X W]. 
*/ 652static __inline __m128i 653_mm_set_epi8 (char __Z, char __Y, char __X, char __W, 654 char __V, char __U, char __T, char __S, 655 char __Z1, char __Y1, char __X1, char __W1, 656 char __V1, char __U1, char __T1, char __S1) 657{ 658 union { 659 char __a[16]; 660 __m128i __v; 661 } __u; 662 663 __u.__a[0] = __S1; 664 __u.__a[1] = __T1; 665 __u.__a[2] = __U1; 666 __u.__a[3] = __V1; 667 __u.__a[4] = __W1; 668 __u.__a[5] = __X1; 669 __u.__a[6] = __Y1; 670 __u.__a[7] = __Z1; 671 __u.__a[8] = __S; 672 __u.__a[9] = __T; 673 __u.__a[10] = __U; 674 __u.__a[11] = __V; 675 __u.__a[12] = __W; 676 __u.__a[13] = __X; 677 __u.__a[14] = __Y; 678 __u.__a[15] = __Z; 679 680 return __u.__v; 681} 682 683static __inline __m128i 684_mm_set1_epi64 (__m64 __A) 685{ 686 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); 687 return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp); 688} 689 690static __inline __m128i 691_mm_set1_epi32 (int __A) 692{ 693 __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__A); 694 return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0)); 695} 696 697#ifdef __x86_64__ 698static __inline __m128i 699_mm_set1_epi64x (long long __A) 700{ 701 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); 702 return (__m128i) __builtin_ia32_shufpd ((__v2df)__tmp, (__v2df)__tmp, _MM_SHUFFLE2 (0,0)); 703} 704#endif 705 706static __inline __m128i 707_mm_set1_epi16 (short __A) 708{ 709 int __Acopy = (unsigned short)__A; 710 __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy); 711 __tmp = (__v4si)__builtin_ia32_punpcklwd128 ((__v8hi)__tmp, (__v8hi)__tmp); 712 return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0)); 713} 714 715static __inline __m128i 716_mm_set1_epi8 (char __A) 717{ 718 int __Acopy = (unsigned char)__A; 719 __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy); 720 __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp); 721 __tmp = 
(__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp); 722 return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0)); 723} 724 725static __inline __m128i 726_mm_setr_epi64 (__m64 __A, __m64 __B) 727{ 728 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); 729 __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B); 730 return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp2); 731} 732 733/* Create the vector [Z Y X W]. */ 734static __inline __m128i 735_mm_setr_epi32 (int __W, int __X, int __Y, int __Z) 736{ 737 union { 738 int __a[4]; 739 __m128i __v; 740 } __u; 741 742 __u.__a[0] = __W; 743 __u.__a[1] = __X; 744 __u.__a[2] = __Y; 745 __u.__a[3] = __Z; 746 747 return __u.__v; 748} 749/* Create the vector [S T U V Z Y X W]. */ 750static __inline __m128i 751_mm_setr_epi16 (short __S, short __T, short __U, short __V, 752 short __W, short __X, short __Y, short __Z) 753{ 754 union { 755 short __a[8]; 756 __m128i __v; 757 } __u; 758 759 __u.__a[0] = __S; 760 __u.__a[1] = __T; 761 __u.__a[2] = __U; 762 __u.__a[3] = __V; 763 __u.__a[4] = __W; 764 __u.__a[5] = __X; 765 __u.__a[6] = __Y; 766 __u.__a[7] = __Z; 767 768 return __u.__v; 769} 770 771/* Create the vector [S T U V Z Y X W]. 
*/ 772static __inline __m128i 773_mm_setr_epi8 (char __S1, char __T1, char __U1, char __V1, 774 char __W1, char __X1, char __Y1, char __Z1, 775 char __S, char __T, char __U, char __V, 776 char __W, char __X, char __Y, char __Z) 777{ 778 union { 779 char __a[16]; 780 __m128i __v; 781 } __u; 782 783 __u.__a[0] = __S1; 784 __u.__a[1] = __T1; 785 __u.__a[2] = __U1; 786 __u.__a[3] = __V1; 787 __u.__a[4] = __W1; 788 __u.__a[5] = __X1; 789 __u.__a[6] = __Y1; 790 __u.__a[7] = __Z1; 791 __u.__a[8] = __S; 792 __u.__a[9] = __T; 793 __u.__a[10] = __U; 794 __u.__a[11] = __V; 795 __u.__a[12] = __W; 796 __u.__a[13] = __X; 797 __u.__a[14] = __Y; 798 __u.__a[15] = __Z; 799 800 return __u.__v; 801} 802 803static __inline __m128d 804_mm_cvtepi32_pd (__m128i __A) 805{ 806 return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A); 807} 808 809static __inline __m128 810_mm_cvtepi32_ps (__m128i __A) 811{ 812 return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A); 813} 814 815static __inline __m128i 816_mm_cvtpd_epi32 (__m128d __A) 817{ 818 return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A); 819} 820 821static __inline __m64 822_mm_cvtpd_pi32 (__m128d __A) 823{ 824 return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A); 825} 826 827static __inline __m128 828_mm_cvtpd_ps (__m128d __A) 829{ 830 return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A); 831} 832 833static __inline __m128i 834_mm_cvttpd_epi32 (__m128d __A) 835{ 836 return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A); 837} 838 839static __inline __m64 840_mm_cvttpd_pi32 (__m128d __A) 841{ 842 return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A); 843} 844 845static __inline __m128d 846_mm_cvtpi32_pd (__m64 __A) 847{ 848 return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A); 849} 850 851static __inline __m128i 852_mm_cvtps_epi32 (__m128 __A) 853{ 854 return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A); 855} 856 857static __inline __m128i 858_mm_cvttps_epi32 (__m128 __A) 859{ 860 return (__m128i)__builtin_ia32_cvttps2dq 
((__v4sf) __A); 861} 862 863static __inline __m128d 864_mm_cvtps_pd (__m128 __A) 865{ 866 return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A); 867} 868 869static __inline int 870_mm_cvtsd_si32 (__m128d __A) 871{ 872 return __builtin_ia32_cvtsd2si ((__v2df) __A); 873} 874 875#ifdef __x86_64__ 876static __inline long long 877_mm_cvtsd_si64x (__m128d __A) 878{ 879 return __builtin_ia32_cvtsd2si64 ((__v2df) __A); 880} 881#endif 882 883static __inline int 884_mm_cvttsd_si32 (__m128d __A) 885{ 886 return __builtin_ia32_cvttsd2si ((__v2df) __A); 887} 888 889#ifdef __x86_64__ 890static __inline long long 891_mm_cvttsd_si64x (__m128d __A) 892{ 893 return __builtin_ia32_cvttsd2si64 ((__v2df) __A); 894} 895#endif 896 897static __inline __m128 898_mm_cvtsd_ss (__m128 __A, __m128d __B) 899{ 900 return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B); 901} 902 903static __inline __m128d 904_mm_cvtsi32_sd (__m128d __A, int __B) 905{ 906 return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B); 907} 908 909#ifdef __x86_64__ 910static __inline __m128d 911_mm_cvtsi64x_sd (__m128d __A, long long __B) 912{ 913 return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); 914} 915#endif 916 917static __inline __m128d 918_mm_cvtss_sd (__m128d __A, __m128 __B) 919{ 920 return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B); 921} 922 923#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C))) 924 925static __inline __m128d 926_mm_unpackhi_pd (__m128d __A, __m128d __B) 927{ 928 return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B); 929} 930 931static __inline __m128d 932_mm_unpacklo_pd (__m128d __A, __m128d __B) 933{ 934 return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B); 935} 936 937static __inline __m128d 938_mm_loadh_pd (__m128d __A, double const *__B) 939{ 940 return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, (__v2si *)__B); 941} 942 943static __inline void 944_mm_storeh_pd 
(double *__A, __m128d __B) 945{ 946 __builtin_ia32_storehpd ((__v2si *)__A, (__v2df)__B); 947} 948 949static __inline __m128d 950_mm_loadl_pd (__m128d __A, double const *__B) 951{ 952 return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, (__v2si *)__B); 953} 954 955static __inline void 956_mm_storel_pd (double *__A, __m128d __B) 957{ 958 __builtin_ia32_storelpd ((__v2si *)__A, (__v2df)__B); 959} 960 961static __inline int 962_mm_movemask_pd (__m128d __A) 963{ 964 return __builtin_ia32_movmskpd ((__v2df)__A); 965} 966 967static __inline __m128i 968_mm_packs_epi16 (__m128i __A, __m128i __B) 969{ 970 return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B); 971} 972 973static __inline __m128i 974_mm_packs_epi32 (__m128i __A, __m128i __B) 975{ 976 return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B); 977} 978 979static __inline __m128i 980_mm_packus_epi16 (__m128i __A, __m128i __B) 981{ 982 return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B); 983} 984 985static __inline __m128i 986_mm_unpackhi_epi8 (__m128i __A, __m128i __B) 987{ 988 return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B); 989} 990 991static __inline __m128i 992_mm_unpackhi_epi16 (__m128i __A, __m128i __B) 993{ 994 return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B); 995} 996 997static __inline __m128i 998_mm_unpackhi_epi32 (__m128i __A, __m128i __B) 999{ 1000 return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B); 1001} 1002 1003static __inline __m128i 1004_mm_unpackhi_epi64 (__m128i __A, __m128i __B) 1005{ 1006 return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B); 1007} 1008 1009static __inline __m128i 1010_mm_unpacklo_epi8 (__m128i __A, __m128i __B) 1011{ 1012 return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B); 1013} 1014 1015static __inline __m128i 1016_mm_unpacklo_epi16 (__m128i __A, __m128i __B) 1017{ 1018 return (__m128i)__builtin_ia32_punpcklwd128 
((__v8hi)__A, (__v8hi)__B); 1019} 1020 1021static __inline __m128i 1022_mm_unpacklo_epi32 (__m128i __A, __m128i __B) 1023{ 1024 return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B); 1025} 1026 1027static __inline __m128i 1028_mm_unpacklo_epi64 (__m128i __A, __m128i __B) 1029{ 1030 return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); 1031} 1032 1033static __inline __m128i 1034_mm_add_epi8 (__m128i __A, __m128i __B) 1035{ 1036 return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B); 1037} 1038 1039static __inline __m128i 1040_mm_add_epi16 (__m128i __A, __m128i __B) 1041{ 1042 return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B); 1043} 1044 1045static __inline __m128i 1046_mm_add_epi32 (__m128i __A, __m128i __B) 1047{ 1048 return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B); 1049} 1050 1051static __inline __m128i 1052_mm_add_epi64 (__m128i __A, __m128i __B) 1053{ 1054 return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B); 1055} 1056 1057static __inline __m128i 1058_mm_adds_epi8 (__m128i __A, __m128i __B) 1059{ 1060 return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); 1061} 1062 1063static __inline __m128i 1064_mm_adds_epi16 (__m128i __A, __m128i __B) 1065{ 1066 return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B); 1067} 1068 1069static __inline __m128i 1070_mm_adds_epu8 (__m128i __A, __m128i __B) 1071{ 1072 return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B); 1073} 1074 1075static __inline __m128i 1076_mm_adds_epu16 (__m128i __A, __m128i __B) 1077{ 1078 return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); 1079} 1080 1081static __inline __m128i 1082_mm_sub_epi8 (__m128i __A, __m128i __B) 1083{ 1084 return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B); 1085} 1086 1087static __inline __m128i 1088_mm_sub_epi16 (__m128i __A, __m128i __B) 1089{ 1090 return (__m128i)__builtin_ia32_psubw128 
((__v8hi)__A, (__v8hi)__B); 1091} 1092 1093static __inline __m128i 1094_mm_sub_epi32 (__m128i __A, __m128i __B) 1095{ 1096 return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B); 1097} 1098 1099static __inline __m128i 1100_mm_sub_epi64 (__m128i __A, __m128i __B) 1101{ 1102 return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B); 1103} 1104 1105static __inline __m128i 1106_mm_subs_epi8 (__m128i __A, __m128i __B) 1107{ 1108 return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); 1109} 1110 1111static __inline __m128i 1112_mm_subs_epi16 (__m128i __A, __m128i __B) 1113{ 1114 return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B); 1115} 1116 1117static __inline __m128i 1118_mm_subs_epu8 (__m128i __A, __m128i __B) 1119{ 1120 return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B); 1121} 1122 1123static __inline __m128i 1124_mm_subs_epu16 (__m128i __A, __m128i __B) 1125{ 1126 return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B); 1127} 1128 1129static __inline __m128i 1130_mm_madd_epi16 (__m128i __A, __m128i __B) 1131{ 1132 return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B); 1133} 1134 1135static __inline __m128i 1136_mm_mulhi_epi16 (__m128i __A, __m128i __B) 1137{ 1138 return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); 1139} 1140 1141static __inline __m128i 1142_mm_mullo_epi16 (__m128i __A, __m128i __B) 1143{ 1144 return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B); 1145} 1146 1147static __inline __m64 1148_mm_mul_su32 (__m64 __A, __m64 __B) 1149{ 1150 return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); 1151} 1152 1153static __inline __m128i 1154_mm_mul_epu32 (__m128i __A, __m128i __B) 1155{ 1156 return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B); 1157} 1158 1159static __inline __m128i 1160_mm_sll_epi16 (__m128i __A, __m128i __B) 1161{ 1162 return (__m128i)__builtin_ia32_psllw128 ((__v8hi)__A, (__v2di)__B); 
1163} 1164 1165static __inline __m128i 1166_mm_sll_epi32 (__m128i __A, __m128i __B) 1167{ 1168 return (__m128i)__builtin_ia32_pslld128 ((__v4si)__A, (__v2di)__B); 1169} 1170 1171static __inline __m128i 1172_mm_sll_epi64 (__m128i __A, __m128i __B) 1173{ 1174 return (__m128i)__builtin_ia32_psllq128 ((__v2di)__A, (__v2di)__B); 1175} 1176 1177static __inline __m128i 1178_mm_sra_epi16 (__m128i __A, __m128i __B) 1179{ 1180 return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v2di)__B); 1181} 1182 1183static __inline __m128i 1184_mm_sra_epi32 (__m128i __A, __m128i __B) 1185{ 1186 return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v2di)__B); 1187} 1188 1189static __inline __m128i 1190_mm_srl_epi16 (__m128i __A, __m128i __B) 1191{ 1192 return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v2di)__B); 1193} 1194 1195static __inline __m128i 1196_mm_srl_epi32 (__m128i __A, __m128i __B) 1197{ 1198 return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v2di)__B); 1199} 1200 1201static __inline __m128i 1202_mm_srl_epi64 (__m128i __A, __m128i __B) 1203{ 1204 return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B); 1205} 1206 1207static __inline __m128i 1208_mm_slli_epi16 (__m128i __A, int __B) 1209{ 1210 return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); 1211} 1212 1213static __inline __m128i 1214_mm_slli_epi32 (__m128i __A, int __B) 1215{ 1216 return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); 1217} 1218 1219static __inline __m128i 1220_mm_slli_epi64 (__m128i __A, int __B) 1221{ 1222 return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B); 1223} 1224 1225static __inline __m128i 1226_mm_srai_epi16 (__m128i __A, int __B) 1227{ 1228 return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B); 1229} 1230 1231static __inline __m128i 1232_mm_srai_epi32 (__m128i __A, int __B) 1233{ 1234 return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); 1235} 1236 1237#if 0 1238static __m128i __attribute__((__always_inline__)) 1239_mm_srli_si128 
(__m128i __A, const int __B) 1240{ 1241 return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B)) 1242} 1243 1244static __m128i __attribute__((__always_inline__)) 1245_mm_srli_si128 (__m128i __A, const int __B) 1246{ 1247 return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B)) 1248} 1249#endif 1250#define _mm_srli_si128(__A, __B) ((__m128i)__builtin_ia32_psrldqi128 (__A, __B)) 1251#define _mm_slli_si128(__A, __B) ((__m128i)__builtin_ia32_pslldqi128 (__A, __B)) 1252 1253static __inline __m128i 1254_mm_srli_epi16 (__m128i __A, int __B) 1255{ 1256 return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); 1257} 1258 1259static __inline __m128i 1260_mm_srli_epi32 (__m128i __A, int __B) 1261{ 1262 return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); 1263} 1264 1265static __inline __m128i 1266_mm_srli_epi64 (__m128i __A, int __B) 1267{ 1268 return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); 1269} 1270 1271static __inline __m128i 1272_mm_and_si128 (__m128i __A, __m128i __B) 1273{ 1274 return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B); 1275} 1276 1277static __inline __m128i 1278_mm_andnot_si128 (__m128i __A, __m128i __B) 1279{ 1280 return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B); 1281} 1282 1283static __inline __m128i 1284_mm_or_si128 (__m128i __A, __m128i __B) 1285{ 1286 return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B); 1287} 1288 1289static __inline __m128i 1290_mm_xor_si128 (__m128i __A, __m128i __B) 1291{ 1292 return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B); 1293} 1294 1295static __inline __m128i 1296_mm_cmpeq_epi8 (__m128i __A, __m128i __B) 1297{ 1298 return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B); 1299} 1300 1301static __inline __m128i 1302_mm_cmpeq_epi16 (__m128i __A, __m128i __B) 1303{ 1304 return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B); 1305} 1306 1307static __inline __m128i 1308_mm_cmpeq_epi32 (__m128i __A, __m128i __B) 1309{ 1310 
return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B); 1311} 1312 1313static __inline __m128i 1314_mm_cmplt_epi8 (__m128i __A, __m128i __B) 1315{ 1316 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A); 1317} 1318 1319static __inline __m128i 1320_mm_cmplt_epi16 (__m128i __A, __m128i __B) 1321{ 1322 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A); 1323} 1324 1325static __inline __m128i 1326_mm_cmplt_epi32 (__m128i __A, __m128i __B) 1327{ 1328 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A); 1329} 1330 1331static __inline __m128i 1332_mm_cmpgt_epi8 (__m128i __A, __m128i __B) 1333{ 1334 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B); 1335} 1336 1337static __inline __m128i 1338_mm_cmpgt_epi16 (__m128i __A, __m128i __B) 1339{ 1340 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B); 1341} 1342 1343static __inline __m128i 1344_mm_cmpgt_epi32 (__m128i __A, __m128i __B) 1345{ 1346 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B); 1347} 1348 1349#define _mm_extract_epi16(__A, __B) __builtin_ia32_pextrw128 ((__v8hi)__A, __B) 1350 1351#define _mm_insert_epi16(__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)__A, __B, __C)) 1352 1353static __inline __m128i 1354_mm_max_epi16 (__m128i __A, __m128i __B) 1355{ 1356 return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B); 1357} 1358 1359static __inline __m128i 1360_mm_max_epu8 (__m128i __A, __m128i __B) 1361{ 1362 return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B); 1363} 1364 1365static __inline __m128i 1366_mm_min_epi16 (__m128i __A, __m128i __B) 1367{ 1368 return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B); 1369} 1370 1371static __inline __m128i 1372_mm_min_epu8 (__m128i __A, __m128i __B) 1373{ 1374 return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B); 1375} 1376 1377static __inline int 1378_mm_movemask_epi8 (__m128i 
__A) 1379{ 1380 return __builtin_ia32_pmovmskb128 ((__v16qi)__A); 1381} 1382 1383static __inline __m128i 1384_mm_mulhi_epu16 (__m128i __A, __m128i __B) 1385{ 1386 return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); 1387} 1388 1389#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __B)) 1390#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __B)) 1391#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)__A, __B)) 1392 1393static __inline void 1394_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) 1395{ 1396 __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C); 1397} 1398 1399static __inline __m128i 1400_mm_avg_epu8 (__m128i __A, __m128i __B) 1401{ 1402 return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B); 1403} 1404 1405static __inline __m128i 1406_mm_avg_epu16 (__m128i __A, __m128i __B) 1407{ 1408 return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B); 1409} 1410 1411static __inline __m128i 1412_mm_sad_epu8 (__m128i __A, __m128i __B) 1413{ 1414 return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B); 1415} 1416 1417static __inline void 1418_mm_stream_si32 (int *__A, int __B) 1419{ 1420 __builtin_ia32_movnti (__A, __B); 1421} 1422 1423static __inline void 1424_mm_stream_si128 (__m128i *__A, __m128i __B) 1425{ 1426 __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B); 1427} 1428 1429static __inline void 1430_mm_stream_pd (double *__A, __m128d __B) 1431{ 1432 __builtin_ia32_movntpd (__A, (__v2df)__B); 1433} 1434 1435static __inline __m128i 1436_mm_movpi64_epi64 (__m64 __A) 1437{ 1438 return (__m128i)__builtin_ia32_movq2dq ((unsigned long long)__A); 1439} 1440 1441static __inline void 1442_mm_clflush (void const *__A) 1443{ 1444 return __builtin_ia32_clflush (__A); 1445} 1446 1447static __inline void 1448_mm_lfence (void) 1449{ 1450 __builtin_ia32_lfence (); 1451} 1452 1453static __inline void 
1454_mm_mfence (void) 1455{ 1456 __builtin_ia32_mfence (); 1457} 1458 1459static __inline __m128i 1460_mm_cvtsi32_si128 (int __A) 1461{ 1462 return (__m128i) __builtin_ia32_loadd (&__A); 1463} 1464 1465#ifdef __x86_64__ 1466static __inline __m128i 1467_mm_cvtsi64x_si128 (long long __A) 1468{ 1469 return (__m128i) __builtin_ia32_movq2dq (__A); 1470} 1471#endif 1472 1473static __inline int 1474_mm_cvtsi128_si32 (__m128i __A) 1475{ 1476 int __tmp; 1477 __builtin_ia32_stored (&__tmp, (__v4si)__A); 1478 return __tmp; 1479} 1480 1481#ifdef __x86_64__ 1482static __inline long long 1483_mm_cvtsi128_si64x (__m128i __A) 1484{ 1485 return __builtin_ia32_movdq2q ((__v2di)__A); 1486} 1487#endif 1488 1489#endif /* __SSE2__ */ 1490 1491#endif /* _EMMINTRIN_H_INCLUDED */ 1492