1/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10#ifndef __SMMINTRIN_H 11#define __SMMINTRIN_H 12 13#if !defined(__i386__) && !defined(__x86_64__) 14#error "This header is only meant to be used on x86 and x64 architecture" 15#endif 16 17#include <tmmintrin.h> 18 19/* Define the default attributes for the functions in this file. */ 20#define __DEFAULT_FN_ATTRS \ 21 __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \ 22 __min_vector_width__(128))) 23 24/* SSE4 Rounding macros. */ 25#define _MM_FROUND_TO_NEAREST_INT 0x00 26#define _MM_FROUND_TO_NEG_INF 0x01 27#define _MM_FROUND_TO_POS_INF 0x02 28#define _MM_FROUND_TO_ZERO 0x03 29#define _MM_FROUND_CUR_DIRECTION 0x04 30 31#define _MM_FROUND_RAISE_EXC 0x00 32#define _MM_FROUND_NO_EXC 0x08 33 34#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) 35#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) 36#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) 37#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) 38#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) 39#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) 40 41/// Rounds up each element of the 128-bit vector of [4 x float] to an 42/// integer and returns the rounded values in a 128-bit vector of 43/// [4 x float]. 44/// 45/// \headerfile <x86intrin.h> 46/// 47/// \code 48/// __m128 _mm_ceil_ps(__m128 X); 49/// \endcode 50/// 51/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction. 52/// 53/// \param X 54/// A 128-bit vector of [4 x float] values to be rounded up. 
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)

/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)

/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double].
///    The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M)                                                     \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float].
///    Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M)                                                  \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
                                  (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M)                                                     \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double].
///    The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M)                                                  \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                   (M)))

/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M)                                                \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
                                   (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M)                                                \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
                                  (int)(M)))

/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
///
/// \param __V1
///    A 128-bit vector of [2 x double].
426/// \param __V2 427/// A 128-bit vector of [2 x double]. 428/// \param __M 429/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the 430/// values are to be copied. The position of the mask bit corresponds to the 431/// most significant bit of a copied value. When a mask bit is 0, the 432/// corresponding 64-bit element in operand \a __V1 is copied to the same 433/// position in the result. When a mask bit is 1, the corresponding 64-bit 434/// element in operand \a __V2 is copied to the same position in the result. 435/// \returns A 128-bit vector of [2 x double] containing the copied values. 436static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1, 437 __m128d __V2, 438 __m128d __M) { 439 return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2, 440 (__v2df)__M); 441} 442 443/// Returns a 128-bit vector of [4 x float] where the values are 444/// selected from either the first or second operand as specified by the 445/// third operand, the control mask. 446/// 447/// \headerfile <x86intrin.h> 448/// 449/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. 450/// 451/// \param __V1 452/// A 128-bit vector of [4 x float]. 453/// \param __V2 454/// A 128-bit vector of [4 x float]. 455/// \param __M 456/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 457/// how the values are to be copied. The position of the mask bit corresponds 458/// to the most significant bit of a copied value. When a mask bit is 0, the 459/// corresponding 32-bit element in operand \a __V1 is copied to the same 460/// position in the result. When a mask bit is 1, the corresponding 32-bit 461/// element in operand \a __V2 is copied to the same position in the result. 462/// \returns A 128-bit vector of [4 x float] containing the copied values. 
463static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1, 464 __m128 __V2, 465 __m128 __M) { 466 return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2, 467 (__v4sf)__M); 468} 469 470/// Returns a 128-bit vector of [16 x i8] where the values are selected 471/// from either of the first or second operand as specified by the third 472/// operand, the control mask. 473/// 474/// \headerfile <x86intrin.h> 475/// 476/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. 477/// 478/// \param __V1 479/// A 128-bit vector of [16 x i8]. 480/// \param __V2 481/// A 128-bit vector of [16 x i8]. 482/// \param __M 483/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying 484/// how the values are to be copied. The position of the mask bit corresponds 485/// to the most significant bit of a copied value. When a mask bit is 0, the 486/// corresponding 8-bit element in operand \a __V1 is copied to the same 487/// position in the result. When a mask bit is 1, the corresponding 8-bit 488/// element in operand \a __V2 is copied to the same position in the result. 489/// \returns A 128-bit vector of [16 x i8] containing the copied values. 490static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1, 491 __m128i __V2, 492 __m128i __M) { 493 return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2, 494 (__v16qi)__M); 495} 496 497/// Returns a 128-bit vector of [8 x i16] where the values are selected 498/// from either of the first or second operand as specified by the third 499/// operand, the control mask. 500/// 501/// \headerfile <x86intrin.h> 502/// 503/// \code 504/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); 505/// \endcode 506/// 507/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. 508/// 509/// \param V1 510/// A 128-bit vector of [8 x i16]. 511/// \param V2 512/// A 128-bit vector of [8 x i16]. 
513/// \param M 514/// An immediate integer operand, with mask bits [7:0] specifying how the 515/// values are to be copied. The position of the mask bit corresponds to the 516/// index of a copied value. When a mask bit is 0, the corresponding 16-bit 517/// element in operand \a V1 is copied to the same position in the result. 518/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 519/// is copied to the same position in the result. 520/// \returns A 128-bit vector of [8 x i16] containing the copied values. 521#define _mm_blend_epi16(V1, V2, M) \ 522 ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \ 523 (__v8hi)(__m128i)(V2), (int)(M))) 524 525/* SSE4 Dword Multiply Instructions. */ 526/// Multiples corresponding elements of two 128-bit vectors of [4 x i32] 527/// and returns the lower 32 bits of the each product in a 128-bit vector of 528/// [4 x i32]. 529/// 530/// \headerfile <x86intrin.h> 531/// 532/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. 533/// 534/// \param __V1 535/// A 128-bit integer vector. 536/// \param __V2 537/// A 128-bit integer vector. 538/// \returns A 128-bit integer vector containing the products of both operands. 539static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1, 540 __m128i __V2) { 541 return (__m128i)((__v4su)__V1 * (__v4su)__V2); 542} 543 544/// Multiplies corresponding even-indexed elements of two 128-bit 545/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 546/// containing the products. 547/// 548/// \headerfile <x86intrin.h> 549/// 550/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. 551/// 552/// \param __V1 553/// A 128-bit vector of [4 x i32]. 554/// \param __V2 555/// A 128-bit vector of [4 x i32]. 556/// \returns A 128-bit vector of [2 x i64] containing the products of both 557/// operands. 
558static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1, 559 __m128i __V2) { 560 return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2); 561} 562 563/* SSE4 Floating Point Dot Product Instructions. */ 564/// Computes the dot product of the two 128-bit vectors of [4 x float] 565/// and returns it in the elements of the 128-bit result vector of 566/// [4 x float]. 567/// 568/// The immediate integer operand controls which input elements 569/// will contribute to the dot product, and where the final results are 570/// returned. 571/// 572/// \headerfile <x86intrin.h> 573/// 574/// \code 575/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); 576/// \endcode 577/// 578/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction. 579/// 580/// \param X 581/// A 128-bit vector of [4 x float]. 582/// \param Y 583/// A 128-bit vector of [4 x float]. 584/// \param M 585/// An immediate integer operand. Mask bits [7:4] determine which elements 586/// of the input vectors are used, with bit [4] corresponding to the lowest 587/// element and bit [7] corresponding to the highest element of each [4 x 588/// float] vector. If a bit is set, the corresponding elements from the two 589/// input vectors are used as an input for dot product; otherwise that input 590/// is treated as zero. Bits [3:0] determine which elements of the result 591/// will receive a copy of the final dot product, with bit [0] corresponding 592/// to the lowest element and bit [3] corresponding to the highest element of 593/// each [4 x float] subvector. If a bit is set, the dot product is returned 594/// in the corresponding element; otherwise that element is set to zero. 595/// \returns A 128-bit vector of [4 x float] containing the dot product. 
#define _mm_dp_ps(X, Y, M)                                                     \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))

/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [1:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [1] corresponding to the highest element of
///    each [2 x double] vector. If a bit is set, the dot product is returned in
///    the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M)                                                     \
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
                                (M)))

/* SSE4 Streaming Load Hint Instruction. */
/// Loads integer values from a 128-bit aligned memory location to a
///    128-bit integer vector.
637/// 638/// \headerfile <x86intrin.h> 639/// 640/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. 641/// 642/// \param __V 643/// A pointer to a 128-bit aligned memory location that contains the integer 644/// values. 645/// \returns A 128-bit integer vector containing the data stored at the 646/// specified memory location. 647static __inline__ __m128i __DEFAULT_FN_ATTRS 648_mm_stream_load_si128(__m128i const *__V) { 649 return (__m128i)__builtin_nontemporal_load((const __v2di *)__V); 650} 651 652/* SSE4 Packed Integer Min/Max Instructions. */ 653/// Compares the corresponding elements of two 128-bit vectors of 654/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 655/// of the two values. 656/// 657/// \headerfile <x86intrin.h> 658/// 659/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. 660/// 661/// \param __V1 662/// A 128-bit vector of [16 x i8]. 663/// \param __V2 664/// A 128-bit vector of [16 x i8] 665/// \returns A 128-bit vector of [16 x i8] containing the lesser values. 666static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1, 667 __m128i __V2) { 668 return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2); 669} 670 671/// Compares the corresponding elements of two 128-bit vectors of 672/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 673/// greater value of the two. 674/// 675/// \headerfile <x86intrin.h> 676/// 677/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. 678/// 679/// \param __V1 680/// A 128-bit vector of [16 x i8]. 681/// \param __V2 682/// A 128-bit vector of [16 x i8]. 683/// \returns A 128-bit vector of [16 x i8] containing the greater values. 
684static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1, 685 __m128i __V2) { 686 return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2); 687} 688 689/// Compares the corresponding elements of two 128-bit vectors of 690/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 691/// value of the two. 692/// 693/// \headerfile <x86intrin.h> 694/// 695/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. 696/// 697/// \param __V1 698/// A 128-bit vector of [8 x u16]. 699/// \param __V2 700/// A 128-bit vector of [8 x u16]. 701/// \returns A 128-bit vector of [8 x u16] containing the lesser values. 702static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1, 703 __m128i __V2) { 704 return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2); 705} 706 707/// Compares the corresponding elements of two 128-bit vectors of 708/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 709/// greater value of the two. 710/// 711/// \headerfile <x86intrin.h> 712/// 713/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. 714/// 715/// \param __V1 716/// A 128-bit vector of [8 x u16]. 717/// \param __V2 718/// A 128-bit vector of [8 x u16]. 719/// \returns A 128-bit vector of [8 x u16] containing the greater values. 720static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1, 721 __m128i __V2) { 722 return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2); 723} 724 725/// Compares the corresponding elements of two 128-bit vectors of 726/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 727/// value of the two. 728/// 729/// \headerfile <x86intrin.h> 730/// 731/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. 732/// 733/// \param __V1 734/// A 128-bit vector of [4 x i32]. 735/// \param __V2 736/// A 128-bit vector of [4 x i32]. 
737/// \returns A 128-bit vector of [4 x i32] containing the lesser values. 738static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1, 739 __m128i __V2) { 740 return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2); 741} 742 743/// Compares the corresponding elements of two 128-bit vectors of 744/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 745/// greater value of the two. 746/// 747/// \headerfile <x86intrin.h> 748/// 749/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. 750/// 751/// \param __V1 752/// A 128-bit vector of [4 x i32]. 753/// \param __V2 754/// A 128-bit vector of [4 x i32]. 755/// \returns A 128-bit vector of [4 x i32] containing the greater values. 756static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1, 757 __m128i __V2) { 758 return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2); 759} 760 761/// Compares the corresponding elements of two 128-bit vectors of 762/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 763/// value of the two. 764/// 765/// \headerfile <x86intrin.h> 766/// 767/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. 768/// 769/// \param __V1 770/// A 128-bit vector of [4 x u32]. 771/// \param __V2 772/// A 128-bit vector of [4 x u32]. 773/// \returns A 128-bit vector of [4 x u32] containing the lesser values. 774static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1, 775 __m128i __V2) { 776 return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2); 777} 778 779/// Compares the corresponding elements of two 128-bit vectors of 780/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 781/// greater value of the two. 782/// 783/// \headerfile <x86intrin.h> 784/// 785/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. 786/// 787/// \param __V1 788/// A 128-bit vector of [4 x u32]. 
789/// \param __V2 790/// A 128-bit vector of [4 x u32]. 791/// \returns A 128-bit vector of [4 x u32] containing the greater values. 792static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1, 793 __m128i __V2) { 794 return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2); 795} 796 797/* SSE4 Insertion and Extraction from XMM Register Instructions. */ 798/// Takes the first argument \a X and inserts an element from the second 799/// argument \a Y as selected by the third argument \a N. That result then 800/// has elements zeroed out also as selected by the third argument \a N. The 801/// resulting 128-bit vector of [4 x float] is then returned. 802/// 803/// \headerfile <x86intrin.h> 804/// 805/// \code 806/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 807/// \endcode 808/// 809/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. 810/// 811/// \param X 812/// A 128-bit vector source operand of [4 x float]. With the exception of 813/// those bits in the result copied from parameter \a Y and zeroed by bits 814/// [3:0] of \a N, all bits from this parameter are copied to the result. 815/// \param Y 816/// A 128-bit vector source operand of [4 x float]. One single-precision 817/// floating-point element from this source, as determined by the immediate 818/// parameter, is copied to the result. 819/// \param N 820/// Specifies which bits from operand \a Y will be copied, which bits in the 821/// result they will be copied to, and which bits in the result will be 822/// cleared. The following assignments are made: \n 823/// Bits [7:6] specify the bits to copy from operand \a Y: \n 824/// 00: Selects bits [31:0] from operand \a Y. \n 825/// 01: Selects bits [63:32] from operand \a Y. \n 826/// 10: Selects bits [95:64] from operand \a Y. \n 827/// 11: Selects bits [127:96] from operand \a Y. 
\n 828/// Bits [5:4] specify the bits in the result to which the selected bits 829/// from operand \a Y are copied: \n 830/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n 831/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n 832/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n 833/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n 834/// Bits[3:0]: If any of these bits are set, the corresponding result 835/// element is cleared. 836/// \returns A 128-bit vector of [4 x float] containing the copied 837/// single-precision floating point elements from the operands. 838#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) 839 840/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and 841/// returns it, using the immediate value parameter \a N as a selector. 842/// 843/// \headerfile <x86intrin.h> 844/// 845/// \code 846/// int _mm_extract_ps(__m128 X, const int N); 847/// \endcode 848/// 849/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c> 850/// instruction. 851/// 852/// \param X 853/// A 128-bit vector of [4 x float]. 854/// \param N 855/// An immediate value. Bits [1:0] determines which bits from the argument 856/// \a X are extracted and returned: \n 857/// 00: Bits [31:0] of parameter \a X are returned. \n 858/// 01: Bits [63:32] of parameter \a X are returned. \n 859/// 10: Bits [95:64] of parameter \a X are returned. \n 860/// 11: Bits [127:96] of parameter \a X are returned. 861/// \returns A 32-bit integer containing the extracted 32 bits of float data. 862#define _mm_extract_ps(X, N) \ 863 __builtin_bit_cast( \ 864 int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N))) 865 866/* Miscellaneous insert and extract macros. */ 867/* Extract a single-precision float from X at index N into D. 
*/ 868#define _MM_EXTRACT_FLOAT(D, X, N) \ 869 do { \ 870 (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \ 871 } while (0) 872 873/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create 874 an index suitable for _mm_insert_ps. */ 875#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z)) 876 877/* Extract a float from X at index N into the first index of the return. */ 878#define _MM_PICK_OUT_PS(X, N) \ 879 _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) 880 881/* Insert int into packed integer array at index. */ 882/// Constructs a 128-bit vector of [16 x i8] by first making a copy of 883/// the 128-bit integer vector parameter, and then inserting the lower 8 bits 884/// of an integer parameter \a I into an offset specified by the immediate 885/// value parameter \a N. 886/// 887/// \headerfile <x86intrin.h> 888/// 889/// \code 890/// __m128i _mm_insert_epi8(__m128i X, int I, const int N); 891/// \endcode 892/// 893/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction. 894/// 895/// \param X 896/// A 128-bit integer vector of [16 x i8]. This vector is copied to the 897/// result and then one of the sixteen elements in the result vector is 898/// replaced by the lower 8 bits of \a I. 899/// \param I 900/// An integer. The lower 8 bits of this operand are written to the result 901/// beginning at the offset specified by \a N. 902/// \param N 903/// An immediate value. Bits [3:0] specify the bit offset in the result at 904/// which the lower 8 bits of \a I are written. \n 905/// 0000: Bits [7:0] of the result are used for insertion. \n 906/// 0001: Bits [15:8] of the result are used for insertion. \n 907/// 0010: Bits [23:16] of the result are used for insertion. \n 908/// 0011: Bits [31:24] of the result are used for insertion. \n 909/// 0100: Bits [39:32] of the result are used for insertion. \n 910/// 0101: Bits [47:40] of the result are used for insertion. 
\n 911/// 0110: Bits [55:48] of the result are used for insertion. \n 912/// 0111: Bits [63:56] of the result are used for insertion. \n 913/// 1000: Bits [71:64] of the result are used for insertion. \n 914/// 1001: Bits [79:72] of the result are used for insertion. \n 915/// 1010: Bits [87:80] of the result are used for insertion. \n 916/// 1011: Bits [95:88] of the result are used for insertion. \n 917/// 1100: Bits [103:96] of the result are used for insertion. \n 918/// 1101: Bits [111:104] of the result are used for insertion. \n 919/// 1110: Bits [119:112] of the result are used for insertion. \n 920/// 1111: Bits [127:120] of the result are used for insertion. 921/// \returns A 128-bit integer vector containing the constructed values. 922#define _mm_insert_epi8(X, I, N) \ 923 ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I), \ 924 (int)(N))) 925 926/// Constructs a 128-bit vector of [4 x i32] by first making a copy of 927/// the 128-bit integer vector parameter, and then inserting the 32-bit 928/// integer parameter \a I at the offset specified by the immediate value 929/// parameter \a N. 930/// 931/// \headerfile <x86intrin.h> 932/// 933/// \code 934/// __m128i _mm_insert_epi32(__m128i X, int I, const int N); 935/// \endcode 936/// 937/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction. 938/// 939/// \param X 940/// A 128-bit integer vector of [4 x i32]. This vector is copied to the 941/// result and then one of the four elements in the result vector is 942/// replaced by \a I. 943/// \param I 944/// A 32-bit integer that is written to the result beginning at the offset 945/// specified by \a N. 946/// \param N 947/// An immediate value. Bits [1:0] specify the bit offset in the result at 948/// which the integer \a I is written. \n 949/// 00: Bits [31:0] of the result are used for insertion. \n 950/// 01: Bits [63:32] of the result are used for insertion. 
\n 951/// 10: Bits [95:64] of the result are used for insertion. \n 952/// 11: Bits [127:96] of the result are used for insertion. 953/// \returns A 128-bit integer vector containing the constructed values. 954#define _mm_insert_epi32(X, I, N) \ 955 ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I), \ 956 (int)(N))) 957 958#ifdef __x86_64__ 959/// Constructs a 128-bit vector of [2 x i64] by first making a copy of 960/// the 128-bit integer vector parameter, and then inserting the 64-bit 961/// integer parameter \a I, using the immediate value parameter \a N as an 962/// insertion location selector. 963/// 964/// \headerfile <x86intrin.h> 965/// 966/// \code 967/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N); 968/// \endcode 969/// 970/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction. 971/// 972/// \param X 973/// A 128-bit integer vector of [2 x i64]. This vector is copied to the 974/// result and then one of the two elements in the result vector is replaced 975/// by \a I. 976/// \param I 977/// A 64-bit integer that is written to the result beginning at the offset 978/// specified by \a N. 979/// \param N 980/// An immediate value. Bit [0] specifies the bit offset in the result at 981/// which the integer \a I is written. \n 982/// 0: Bits [63:0] of the result are used for insertion. \n 983/// 1: Bits [127:64] of the result are used for insertion. \n 984/// \returns A 128-bit integer vector containing the constructed values. 985#define _mm_insert_epi64(X, I, N) \ 986 ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I), \ 987 (int)(N))) 988#endif /* __x86_64__ */ 989 990/* Extract int from packed integer array at index. This returns the element 991 * as a zero extended value, so it is unsigned. 992 */ 993/// Extracts an 8-bit element from the 128-bit integer vector of 994/// [16 x i8], using the immediate value parameter \a N as a selector. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] specify which 8-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    0000: Bits [7:0] of the parameter \a X are extracted. \n
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
///    1111: Bits [127:120] of the parameter \a X are extracted.
/// \returns An unsigned integer, whose lower 8 bits are selected from the
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
#define _mm_extract_epi8(X, N)                                                 \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X),     \
                                                    (int)(N)))

/// Extracts a 32-bit element from the 128-bit integer vector of
///    [4 x i32], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] specify which 32-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    00: Bits [31:0] of the parameter \a X are extracted. \n
///    01: Bits [63:32] of the parameter \a X are extracted. \n
///    10: Bits [95:64] of the parameter \a X are extracted. \n
///    11: Bits [127:96] of the parameter \a X are extracted.
/// \returns An integer, whose lower 32 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi32(X, N)                                                \
  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))

/// Extracts a 64-bit element from the 128-bit integer vector of
///    [2 x i64], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
/// in 64-bit mode.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] specifies which 64-bit vector element from
///    the argument \a X to return. \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned. \n
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N)                                                \
  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))

/* SSE4 128-bit Packed Integer Comparisons. */
/// Tests whether the specified bits in a 128-bit integer vector are all
///    zeros.
1083/// 1084/// \headerfile <x86intrin.h> 1085/// 1086/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1087/// 1088/// \param __M 1089/// A 128-bit integer vector containing the bits to be tested. 1090/// \param __V 1091/// A 128-bit integer vector selecting which bits to test in operand \a __M. 1092/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1093static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, 1094 __m128i __V) { 1095 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 1096} 1097 1098/// Tests whether the specified bits in a 128-bit integer vector are all 1099/// ones. 1100/// 1101/// \headerfile <x86intrin.h> 1102/// 1103/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1104/// 1105/// \param __M 1106/// A 128-bit integer vector containing the bits to be tested. 1107/// \param __V 1108/// A 128-bit integer vector selecting which bits to test in operand \a __M. 1109/// \returns TRUE if the specified bits are all ones; FALSE otherwise. 1110static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, 1111 __m128i __V) { 1112 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 1113} 1114 1115/// Tests whether the specified bits in a 128-bit integer vector are 1116/// neither all zeros nor all ones. 1117/// 1118/// \headerfile <x86intrin.h> 1119/// 1120/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1121/// 1122/// \param __M 1123/// A 128-bit integer vector containing the bits to be tested. 1124/// \param __V 1125/// A 128-bit integer vector selecting which bits to test in operand \a __M. 1126/// \returns TRUE if the specified bits are neither all zeros nor all ones; 1127/// FALSE otherwise. 
1128static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, 1129 __m128i __V) { 1130 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 1131} 1132 1133/// Tests whether the specified bits in a 128-bit integer vector are all 1134/// ones. 1135/// 1136/// \headerfile <x86intrin.h> 1137/// 1138/// \code 1139/// int _mm_test_all_ones(__m128i V); 1140/// \endcode 1141/// 1142/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1143/// 1144/// \param V 1145/// A 128-bit integer vector containing the bits to be tested. 1146/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE 1147/// otherwise. 1148#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1)) 1149 1150/// Tests whether the specified bits in a 128-bit integer vector are 1151/// neither all zeros nor all ones. 1152/// 1153/// \headerfile <x86intrin.h> 1154/// 1155/// \code 1156/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); 1157/// \endcode 1158/// 1159/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1160/// 1161/// \param M 1162/// A 128-bit integer vector containing the bits to be tested. 1163/// \param V 1164/// A 128-bit integer vector selecting which bits to test in operand \a M. 1165/// \returns TRUE if the specified bits are neither all zeros nor all ones; 1166/// FALSE otherwise. 1167#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) 1168 1169/// Tests whether the specified bits in a 128-bit integer vector are all 1170/// zeros. 1171/// 1172/// \headerfile <x86intrin.h> 1173/// 1174/// \code 1175/// int _mm_test_all_zeros(__m128i M, __m128i V); 1176/// \endcode 1177/// 1178/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1179/// 1180/// \param M 1181/// A 128-bit integer vector containing the bits to be tested. 1182/// \param V 1183/// A 128-bit integer vector selecting which bits to test in operand \a M. 
1184/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1185#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V)) 1186 1187/* SSE4 64-bit Packed Integer Comparisons. */ 1188/// Compares each of the corresponding 64-bit values of the 128-bit 1189/// integer vectors for equality. 1190/// 1191/// \headerfile <x86intrin.h> 1192/// 1193/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction. 1194/// 1195/// \param __V1 1196/// A 128-bit integer vector. 1197/// \param __V2 1198/// A 128-bit integer vector. 1199/// \returns A 128-bit integer vector containing the comparison results. 1200static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1, 1201 __m128i __V2) { 1202 return (__m128i)((__v2di)__V1 == (__v2di)__V2); 1203} 1204 1205/* SSE4 Packed Integer Sign-Extension. */ 1206/// Sign-extends each of the lower eight 8-bit integer elements of a 1207/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1208/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1209/// are unused. 1210/// 1211/// \headerfile <x86intrin.h> 1212/// 1213/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. 1214/// 1215/// \param __V 1216/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 1217/// sign-extended to 16-bit values. 1218/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 1219static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { 1220 /* This function always performs a signed extension, but __v16qi is a char 1221 which may be signed or unsigned, so use __v16qs. 
*/ 1222 return (__m128i) __builtin_convertvector( 1223 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 1224 7), 1225 __v8hi); 1226} 1227 1228/// Sign-extends each of the lower four 8-bit integer elements of a 1229/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1230/// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1231/// vector are unused. 1232/// 1233/// \headerfile <x86intrin.h> 1234/// 1235/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. 1236/// 1237/// \param __V 1238/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1239/// sign-extended to 32-bit values. 1240/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1241static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) { 1242 /* This function always performs a signed extension, but __v16qi is a char 1243 which may be signed or unsigned, so use __v16qs. */ 1244 return (__m128i) __builtin_convertvector( 1245 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 1246} 1247 1248/// Sign-extends each of the lower two 8-bit integer elements of a 1249/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1250/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1251/// vector are unused. 1252/// 1253/// \headerfile <x86intrin.h> 1254/// 1255/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. 1256/// 1257/// \param __V 1258/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1259/// sign-extended to 64-bit values. 1260/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1261static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { 1262 /* This function always performs a signed extension, but __v16qi is a char 1263 which may be signed or unsigned, so use __v16qs. 
*/ 1264 return (__m128i) __builtin_convertvector( 1265 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 1266} 1267 1268/// Sign-extends each of the lower four 16-bit integer elements of a 1269/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1270/// a 128-bit vector of [4 x i32]. The upper four elements of the input 1271/// vector are unused. 1272/// 1273/// \headerfile <x86intrin.h> 1274/// 1275/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. 1276/// 1277/// \param __V 1278/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1279/// sign-extended to 32-bit values. 1280/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1281static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { 1282 return (__m128i) __builtin_convertvector( 1283 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 1284} 1285 1286/// Sign-extends each of the lower two 16-bit integer elements of a 1287/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1288/// a 128-bit vector of [2 x i64]. The upper six elements of the input 1289/// vector are unused. 1290/// 1291/// \headerfile <x86intrin.h> 1292/// 1293/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. 1294/// 1295/// \param __V 1296/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1297/// sign-extended to 64-bit values. 1298/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1299static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { 1300 return (__m128i) __builtin_convertvector( 1301 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 1302} 1303 1304/// Sign-extends each of the lower two 32-bit integer elements of a 1305/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1306/// a 128-bit vector of [2 x i64]. 
The upper two elements of the input vector 1307/// are unused. 1308/// 1309/// \headerfile <x86intrin.h> 1310/// 1311/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. 1312/// 1313/// \param __V 1314/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1315/// sign-extended to 64-bit values. 1316/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1317static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { 1318 return (__m128i) __builtin_convertvector( 1319 __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 1320} 1321 1322/* SSE4 Packed Integer Zero-Extension. */ 1323/// Zero-extends each of the lower eight 8-bit integer elements of a 1324/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1325/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1326/// are unused. 1327/// 1328/// \headerfile <x86intrin.h> 1329/// 1330/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. 1331/// 1332/// \param __V 1333/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 1334/// zero-extended to 16-bit values. 1335/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 1336static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) { 1337 return (__m128i) __builtin_convertvector( 1338 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 1339 7), 1340 __v8hi); 1341} 1342 1343/// Zero-extends each of the lower four 8-bit integer elements of a 1344/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1345/// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1346/// vector are unused. 1347/// 1348/// \headerfile <x86intrin.h> 1349/// 1350/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. 1351/// 1352/// \param __V 1353/// A 128-bit vector of [16 x i8]. 
The lower four 8-bit elements are 1354/// zero-extended to 32-bit values. 1355/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1356static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) { 1357 return (__m128i) __builtin_convertvector( 1358 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 1359} 1360 1361/// Zero-extends each of the lower two 8-bit integer elements of a 1362/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1363/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1364/// vector are unused. 1365/// 1366/// \headerfile <x86intrin.h> 1367/// 1368/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. 1369/// 1370/// \param __V 1371/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1372/// zero-extended to 64-bit values. 1373/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1374static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) { 1375 return (__m128i) __builtin_convertvector( 1376 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 1377} 1378 1379/// Zero-extends each of the lower four 16-bit integer elements of a 1380/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1381/// a 128-bit vector of [4 x i32]. The upper four elements of the input 1382/// vector are unused. 1383/// 1384/// \headerfile <x86intrin.h> 1385/// 1386/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. 1387/// 1388/// \param __V 1389/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1390/// zero-extended to 32-bit values. 1391/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 
1392static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) { 1393 return (__m128i) __builtin_convertvector( 1394 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 1395} 1396 1397/// Zero-extends each of the lower two 16-bit integer elements of a 1398/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1399/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 1400/// are unused. 1401/// 1402/// \headerfile <x86intrin.h> 1403/// 1404/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. 1405/// 1406/// \param __V 1407/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1408/// zero-extended to 64-bit values. 1409/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1410static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) { 1411 return (__m128i) __builtin_convertvector( 1412 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 1413} 1414 1415/// Zero-extends each of the lower two 32-bit integer elements of a 1416/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1417/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1418/// are unused. 1419/// 1420/// \headerfile <x86intrin.h> 1421/// 1422/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. 1423/// 1424/// \param __V 1425/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1426/// zero-extended to 64-bit values. 1427/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1428static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) { 1429 return (__m128i) __builtin_convertvector( 1430 __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 1431} 1432 1433/* SSE4 Pack with Unsigned Saturation. 
*/ 1434/// Converts 32-bit signed integers from both 128-bit integer vector 1435/// operands into 16-bit unsigned integers, and returns the packed result. 1436/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 1437/// 0x0000 are saturated to 0x0000. 1438/// 1439/// \headerfile <x86intrin.h> 1440/// 1441/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. 1442/// 1443/// \param __V1 1444/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1445/// signed integer and is converted to a 16-bit unsigned integer with 1446/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1447/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1448/// are written to the lower 64 bits of the result. 1449/// \param __V2 1450/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1451/// signed integer and is converted to a 16-bit unsigned integer with 1452/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1453/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1454/// are written to the higher 64 bits of the result. 1455/// \returns A 128-bit vector of [8 x i16] containing the converted values. 1456static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, 1457 __m128i __V2) { 1458 return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 1459} 1460 1461/* SSE4 Multiple Packed Sums of Absolute Difference. */ 1462/// Subtracts 8-bit unsigned integer values and computes the absolute 1463/// values of the differences to the corresponding bits in the destination. 1464/// Then sums of the absolute differences are returned according to the bit 1465/// fields in the immediate operand. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand specifying how the absolute differences are to
///    be calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M)                                              \
  ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X),                   \
                                      (__v16qi)(__m128i)(Y), (M)))

/// Finds the minimum unsigned 16-bit element in the input 128-bit
///    vector of [8 x u16] and returns it along with its index.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
/// instruction.
///
/// \param __V
///    A 128-bit vector of [8 x u16].
/// \returns A 128-bit value where bits [15:0] contain the minimum value found
///    in parameter \a __V, bits [18:16] contain the index of the minimum value
///    and the remaining bits are set to 0.
1514static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) { 1515 return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V); 1516} 1517 1518/* Handle the sse4.2 definitions here. */ 1519 1520/* These definitions are normally in nmmintrin.h, but gcc puts them in here 1521 so we'll do the same. */ 1522 1523#undef __DEFAULT_FN_ATTRS 1524#define __DEFAULT_FN_ATTRS \ 1525 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 1526 1527/* These specify the type of data that we're comparing. */ 1528#define _SIDD_UBYTE_OPS 0x00 1529#define _SIDD_UWORD_OPS 0x01 1530#define _SIDD_SBYTE_OPS 0x02 1531#define _SIDD_SWORD_OPS 0x03 1532 1533/* These specify the type of comparison operation. */ 1534#define _SIDD_CMP_EQUAL_ANY 0x00 1535#define _SIDD_CMP_RANGES 0x04 1536#define _SIDD_CMP_EQUAL_EACH 0x08 1537#define _SIDD_CMP_EQUAL_ORDERED 0x0c 1538 1539/* These macros specify the polarity of the operation. */ 1540#define _SIDD_POSITIVE_POLARITY 0x00 1541#define _SIDD_NEGATIVE_POLARITY 0x10 1542#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 1543#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 1544 1545/* These macros are used in _mm_cmpXstri() to specify the return. */ 1546#define _SIDD_LEAST_SIGNIFICANT 0x00 1547#define _SIDD_MOST_SIGNIFICANT 0x40 1548 1549/* These macros are used in _mm_cmpXstri() to specify the return. */ 1550#define _SIDD_BIT_MASK 0x00 1551#define _SIDD_UNIT_MASK 0x40 1552 1553/* SSE4.2 Packed Comparison Intrinsics. */ 1554/// Uses the immediate operand \a M to perform a comparison of string 1555/// data with implicitly defined lengths that is contained in source operands 1556/// \a A and \a B. Returns a 128-bit integer vector representing the result 1557/// mask of the comparison. 1558/// 1559/// \headerfile <x86intrin.h> 1560/// 1561/// \code 1562/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 1563/// \endcode 1564/// 1565/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> 1566/// instruction. 
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
2198/// 2199/// \headerfile <x86intrin.h> 2200/// 2201/// \code 2202/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M); 2203/// \endcode 2204/// 2205/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 2206/// instruction. 2207/// 2208/// \param A 2209/// A 128-bit integer vector containing one of the source operands to be 2210/// compared. 2211/// \param LA 2212/// An integer that specifies the length of the string in \a A. 2213/// \param B 2214/// A 128-bit integer vector containing one of the source operands to be 2215/// compared. 2216/// \param LB 2217/// An integer that specifies the length of the string in \a B. 2218/// \param M 2219/// An 8-bit immediate operand specifying whether the characters are bytes or 2220/// words and the type of comparison to perform. \n 2221/// Bits [1:0]: Determine source data format. \n 2222/// 00: 16 unsigned bytes \n 2223/// 01: 8 unsigned words \n 2224/// 10: 16 signed bytes \n 2225/// 11: 8 signed words \n 2226/// Bits [3:2]: Determine comparison type and aggregation method. \n 2227/// 00: Subset: Each character in \a B is compared for equality with all 2228/// the characters in \a A. \n 2229/// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2230/// basis is greater than or equal for even-indexed elements in \a A, 2231/// and less than or equal for odd-indexed elements in \a A. \n 2232/// 10: Match: Compare each pair of corresponding characters in \a A and 2233/// \a B for equality. \n 2234/// 11: Substring: Search \a B for substring matches of \a A. \n 2235/// Bits [5:4]: Determine whether to perform a one's complement in the bit 2236/// mask of the comparison results. \n 2237/// 00: No effect. \n 2238/// 01: Negate the bit mask. \n 2239/// 10: No effect. \n 2240/// 11: Negate the bit mask only for bits with an index less than or equal 2241/// to the size of \a A or \a B. 
\n 2242/// \returns Returns 1 if the length of the string in \a A is less than the 2243/// maximum, otherwise, returns 0. 2244#define _mm_cmpestrs(A, LA, B, LB, M) \ 2245 ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ 2246 (__v16qi)(__m128i)(B), (int)(LB), \ 2247 (int)(M))) 2248 2249/// Uses the immediate operand \a M to perform a comparison of string 2250/// data with explicitly defined lengths that is contained in source operands 2251/// \a A and \a B. Returns 1 if the length of the string in \a B is less than 2252/// the maximum, otherwise, returns 0. 2253/// 2254/// \headerfile <x86intrin.h> 2255/// 2256/// \code 2257/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M); 2258/// \endcode 2259/// 2260/// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction. 2261/// 2262/// \param A 2263/// A 128-bit integer vector containing one of the source operands to be 2264/// compared. 2265/// \param LA 2266/// An integer that specifies the length of the string in \a A. 2267/// \param B 2268/// A 128-bit integer vector containing one of the source operands to be 2269/// compared. 2270/// \param LB 2271/// An integer that specifies the length of the string in \a B. 2272/// \param M 2273/// An 8-bit immediate operand specifying whether the characters are bytes or 2274/// words and the type of comparison to perform. \n 2275/// Bits [1:0]: Determine source data format. \n 2276/// 00: 16 unsigned bytes \n 2277/// 01: 8 unsigned words \n 2278/// 10: 16 signed bytes \n 2279/// 11: 8 signed words \n 2280/// Bits [3:2]: Determine comparison type and aggregation method. \n 2281/// 00: Subset: Each character in \a B is compared for equality with all 2282/// the characters in \a A. \n 2283/// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2284/// basis is greater than or equal for even-indexed elements in \a A, 2285/// and less than or equal for odd-indexed elements in \a A. 
\n 2286/// 10: Match: Compare each pair of corresponding characters in \a A and 2287/// \a B for equality. \n 2288/// 11: Substring: Search \a B for substring matches of \a A. \n 2289/// Bits [5:4]: Determine whether to perform a one's complement on the bit 2290/// mask of the comparison results. \n 2291/// 00: No effect. \n 2292/// 01: Negate the bit mask. \n 2293/// 10: No effect. \n 2294/// 11: Negate the bit mask only for bits with an index less than or equal 2295/// to the size of \a A or \a B. 2296/// \returns Returns 1 if the length of the string in \a B is less than the 2297/// maximum, otherwise, returns 0. 2298#define _mm_cmpestrz(A, LA, B, LB, M) \ 2299 ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ 2300 (__v16qi)(__m128i)(B), (int)(LB), \ 2301 (int)(M))) 2302 2303/* SSE4.2 Compare Packed Data -- Greater Than. */ 2304/// Compares each of the corresponding 64-bit values of the 128-bit 2305/// integer vectors to determine if the values in the first operand are 2306/// greater than those in the second operand. 2307/// 2308/// \headerfile <x86intrin.h> 2309/// 2310/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction. 2311/// 2312/// \param __V1 2313/// A 128-bit integer vector. 2314/// \param __V2 2315/// A 128-bit integer vector. 2316/// \returns A 128-bit integer vector containing the comparison results. 2317static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1, 2318 __m128i __V2) { 2319 return (__m128i)((__v2di)__V1 > (__v2di)__V2); 2320} 2321 2322#undef __DEFAULT_FN_ATTRS 2323 2324#include <popcntintrin.h> 2325 2326#include <crc32intrin.h> 2327 2328#endif /* __SMMINTRIN_H */ 2329