/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __SMMINTRIN_H
#define __SMMINTRIN_H

#include <tmmintrin.h>

/* Define the default attributes for the functions in this file: every inline
 * function below is always inlined, carries no debug info, is compiled with
 * the "sse4.1" target feature enabled, and declares a minimum vector width of
 * 128 bits.
 */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))

/* SSE4 Rounding macros.
 *
 * These constants are OR-ed together to build the rounding-control operand M
 * of the _mm_round_* macros in this file. Bits [1:0] hold the rounding
 * direction, bit [2] selects the current MXCSR setting instead of bits [1:0],
 * and bit [3] controls whether the PE (precision exception) field is updated.
 */

/* Rounding direction (bits [1:0]) and rounding control source (bit [2]). */
#define _MM_FROUND_TO_NEAREST_INT    0x00
#define _MM_FROUND_TO_NEG_INF        0x01
#define _MM_FROUND_TO_POS_INF        0x02
#define _MM_FROUND_TO_ZERO           0x03
#define _MM_FROUND_CUR_DIRECTION     0x04

/* Precision exception policy (bit [3]). */
#define _MM_FROUND_RAISE_EXC         0x00
#define _MM_FROUND_NO_EXC            0x08

/* Ready-made combinations of a rounding direction and an exception policy. */
#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

/// Rounds up each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X)       (_mm_round_ps((X), _MM_FROUND_CEIL))

/* The _mm_ceil_* expansions are wrapped in parentheses so that an invocation
 * always behaves as a single operand, whatever operators surround it.
 */

/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X)       (_mm_round_pd((X), _MM_FROUND_CEIL))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y)    (_mm_round_ss((X), (Y), _MM_FROUND_CEIL))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y)    (_mm_round_sd((X), (Y), _MM_FROUND_CEIL))

/* The expansions above and below are wrapped in parentheses so that an
 * invocation always behaves as a single operand, whatever operators
 * surround it.
 */

/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X)      (_mm_round_ps((X), _MM_FROUND_FLOOR))

/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X)      (_mm_round_pd((X), _MM_FROUND_FLOOR))

/* The _mm_floor_* expansions are wrapped in parentheses so that an invocation
 * always behaves as a single operand, whatever operators surround it.
 */

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y)   (_mm_round_ss((X), (Y), _MM_FROUND_FLOOR))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double].
///    The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y)   (_mm_round_sd((X), (Y), _MM_FROUND_FLOOR))

/* The expansions above and below are wrapped in parentheses: a bare cast
 * expression binds more loosely than postfix operators, so without the outer
 * parentheses an operator applied to the invocation would bind to the builtin
 * call rather than to the cast result.
 */

/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float].
///    Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)))

/* The expansion above is wrapped in parentheses: a bare cast expression binds
 * more loosely than postfix operators, so without the outer parentheses an
 * operator applied to the invocation would bind to the builtin call rather
 * than to the cast result.
 */

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/* The expansion above is wrapped in parentheses: a bare cast expression binds
 * more loosely than postfix operators, so without the outer parentheses an
 * operator applied to the invocation would bind to the builtin call rather
 * than to the cast result.
 */

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double].
///    The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)))

/* The expansion above is wrapped in parentheses: a bare cast expression binds
 * more loosely than postfix operators, so without the outer parentheses an
 * operator applied to the invocation would bind to the builtin call rather
 * than to the cast result.
 */

/* SSE4 Packed Blending Intrinsics.  */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
                                   (__v2df)(__m128d)(V2), (int)(M)))

/* The _mm_blend_* expansions are wrapped in parentheses: a bare cast
 * expression binds more loosely than postfix operators, so without the outer
 * parentheses an operator applied to the invocation would bind to the builtin
 * call rather than to the cast result.
 */

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), \
                                  (__v4sf)(__m128)(V2), (int)(M)))

/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
///
/// \param __V1
///    A 128-bit vector of [2 x double].
420/// \param __V2 421/// A 128-bit vector of [2 x double]. 422/// \param __M 423/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the 424/// values are to be copied. The position of the mask bit corresponds to the 425/// most significant bit of a copied value. When a mask bit is 0, the 426/// corresponding 64-bit element in operand \a __V1 is copied to the same 427/// position in the result. When a mask bit is 1, the corresponding 64-bit 428/// element in operand \a __V2 is copied to the same position in the result. 429/// \returns A 128-bit vector of [2 x double] containing the copied values. 430static __inline__ __m128d __DEFAULT_FN_ATTRS 431_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) 432{ 433 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, 434 (__v2df)__M); 435} 436 437/// Returns a 128-bit vector of [4 x float] where the values are 438/// selected from either the first or second operand as specified by the 439/// third operand, the control mask. 440/// 441/// \headerfile <x86intrin.h> 442/// 443/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. 444/// 445/// \param __V1 446/// A 128-bit vector of [4 x float]. 447/// \param __V2 448/// A 128-bit vector of [4 x float]. 449/// \param __M 450/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 451/// how the values are to be copied. The position of the mask bit corresponds 452/// to the most significant bit of a copied value. When a mask bit is 0, the 453/// corresponding 32-bit element in operand \a __V1 is copied to the same 454/// position in the result. When a mask bit is 1, the corresponding 32-bit 455/// element in operand \a __V2 is copied to the same position in the result. 456/// \returns A 128-bit vector of [4 x float] containing the copied values. 
457static __inline__ __m128 __DEFAULT_FN_ATTRS 458_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) 459{ 460 return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, 461 (__v4sf)__M); 462} 463 464/// Returns a 128-bit vector of [16 x i8] where the values are selected 465/// from either of the first or second operand as specified by the third 466/// operand, the control mask. 467/// 468/// \headerfile <x86intrin.h> 469/// 470/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. 471/// 472/// \param __V1 473/// A 128-bit vector of [16 x i8]. 474/// \param __V2 475/// A 128-bit vector of [16 x i8]. 476/// \param __M 477/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying 478/// how the values are to be copied. The position of the mask bit corresponds 479/// to the most significant bit of a copied value. When a mask bit is 0, the 480/// corresponding 8-bit element in operand \a __V1 is copied to the same 481/// position in the result. When a mask bit is 1, the corresponding 8-bit 482/// element in operand \a __V2 is copied to the same position in the result. 483/// \returns A 128-bit vector of [16 x i8] containing the copied values. 484static __inline__ __m128i __DEFAULT_FN_ATTRS 485_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) 486{ 487 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, 488 (__v16qi)__M); 489} 490 491/// Returns a 128-bit vector of [8 x i16] where the values are selected 492/// from either of the first or second operand as specified by the third 493/// operand, the control mask. 494/// 495/// \headerfile <x86intrin.h> 496/// 497/// \code 498/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); 499/// \endcode 500/// 501/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. 502/// 503/// \param V1 504/// A 128-bit vector of [8 x i16]. 505/// \param V2 506/// A 128-bit vector of [8 x i16]. 
507/// \param M 508/// An immediate integer operand, with mask bits [7:0] specifying how the 509/// values are to be copied. The position of the mask bit corresponds to the 510/// index of a copied value. When a mask bit is 0, the corresponding 16-bit 511/// element in operand \a V1 is copied to the same position in the result. 512/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 513/// is copied to the same position in the result. 514/// \returns A 128-bit vector of [8 x i16] containing the copied values. 515#define _mm_blend_epi16(V1, V2, M) \ 516 (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ 517 (__v8hi)(__m128i)(V2), (int)(M)) 518 519/* SSE4 Dword Multiply Instructions. */ 520/// Multiples corresponding elements of two 128-bit vectors of [4 x i32] 521/// and returns the lower 32 bits of the each product in a 128-bit vector of 522/// [4 x i32]. 523/// 524/// \headerfile <x86intrin.h> 525/// 526/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. 527/// 528/// \param __V1 529/// A 128-bit integer vector. 530/// \param __V2 531/// A 128-bit integer vector. 532/// \returns A 128-bit integer vector containing the products of both operands. 533static __inline__ __m128i __DEFAULT_FN_ATTRS 534_mm_mullo_epi32 (__m128i __V1, __m128i __V2) 535{ 536 return (__m128i) ((__v4su)__V1 * (__v4su)__V2); 537} 538 539/// Multiplies corresponding even-indexed elements of two 128-bit 540/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 541/// containing the products. 542/// 543/// \headerfile <x86intrin.h> 544/// 545/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. 546/// 547/// \param __V1 548/// A 128-bit vector of [4 x i32]. 549/// \param __V2 550/// A 128-bit vector of [4 x i32]. 551/// \returns A 128-bit vector of [2 x i64] containing the products of both 552/// operands. 
553static __inline__ __m128i __DEFAULT_FN_ATTRS 554_mm_mul_epi32 (__m128i __V1, __m128i __V2) 555{ 556 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); 557} 558 559/* SSE4 Floating Point Dot Product Instructions. */ 560/// Computes the dot product of the two 128-bit vectors of [4 x float] 561/// and returns it in the elements of the 128-bit result vector of 562/// [4 x float]. 563/// 564/// The immediate integer operand controls which input elements 565/// will contribute to the dot product, and where the final results are 566/// returned. 567/// 568/// \headerfile <x86intrin.h> 569/// 570/// \code 571/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); 572/// \endcode 573/// 574/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction. 575/// 576/// \param X 577/// A 128-bit vector of [4 x float]. 578/// \param Y 579/// A 128-bit vector of [4 x float]. 580/// \param M 581/// An immediate integer operand. Mask bits [7:4] determine which elements 582/// of the input vectors are used, with bit [4] corresponding to the lowest 583/// element and bit [7] corresponding to the highest element of each [4 x 584/// float] vector. If a bit is set, the corresponding elements from the two 585/// input vectors are used as an input for dot product; otherwise that input 586/// is treated as zero. Bits [3:0] determine which elements of the result 587/// will receive a copy of the final dot product, with bit [0] corresponding 588/// to the lowest element and bit [3] corresponding to the highest element of 589/// each [4 x float] subvector. If a bit is set, the dot product is returned 590/// in the corresponding element; otherwise that element is set to zero. 591/// \returns A 128-bit vector of [4 x float] containing the dot product. 
#define _mm_dp_ps(X, Y, M) \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), \
                               (__v4sf)(__m128)(Y), (M)))

/* The _mm_dp_* expansions are wrapped in parentheses: a bare cast expression
 * binds more loosely than postfix operators, so without the outer parentheses
 * an operator applied to the invocation would bind to the builtin call rather
 * than to the cast result.
 */

/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [1:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [1] corresponding to the highest element of
///    each [2 x double] vector. If a bit is set, the dot product is returned in
///    the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M) \
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), \
                                (__v2df)(__m128d)(Y), (M)))

/* SSE4 Streaming Load Hint Instruction.  */
/// Loads integer values from a 128-bit aligned memory location to a
///    128-bit integer vector.
634/// 635/// \headerfile <x86intrin.h> 636/// 637/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. 638/// 639/// \param __V 640/// A pointer to a 128-bit aligned memory location that contains the integer 641/// values. 642/// \returns A 128-bit integer vector containing the data stored at the 643/// specified memory location. 644static __inline__ __m128i __DEFAULT_FN_ATTRS 645_mm_stream_load_si128 (__m128i const *__V) 646{ 647 return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); 648} 649 650/* SSE4 Packed Integer Min/Max Instructions. */ 651/// Compares the corresponding elements of two 128-bit vectors of 652/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 653/// of the two values. 654/// 655/// \headerfile <x86intrin.h> 656/// 657/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. 658/// 659/// \param __V1 660/// A 128-bit vector of [16 x i8]. 661/// \param __V2 662/// A 128-bit vector of [16 x i8] 663/// \returns A 128-bit vector of [16 x i8] containing the lesser values. 664static __inline__ __m128i __DEFAULT_FN_ATTRS 665_mm_min_epi8 (__m128i __V1, __m128i __V2) 666{ 667 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); 668} 669 670/// Compares the corresponding elements of two 128-bit vectors of 671/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 672/// greater value of the two. 673/// 674/// \headerfile <x86intrin.h> 675/// 676/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. 677/// 678/// \param __V1 679/// A 128-bit vector of [16 x i8]. 680/// \param __V2 681/// A 128-bit vector of [16 x i8]. 682/// \returns A 128-bit vector of [16 x i8] containing the greater values. 
683static __inline__ __m128i __DEFAULT_FN_ATTRS 684_mm_max_epi8 (__m128i __V1, __m128i __V2) 685{ 686 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); 687} 688 689/// Compares the corresponding elements of two 128-bit vectors of 690/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 691/// value of the two. 692/// 693/// \headerfile <x86intrin.h> 694/// 695/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. 696/// 697/// \param __V1 698/// A 128-bit vector of [8 x u16]. 699/// \param __V2 700/// A 128-bit vector of [8 x u16]. 701/// \returns A 128-bit vector of [8 x u16] containing the lesser values. 702static __inline__ __m128i __DEFAULT_FN_ATTRS 703_mm_min_epu16 (__m128i __V1, __m128i __V2) 704{ 705 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); 706} 707 708/// Compares the corresponding elements of two 128-bit vectors of 709/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 710/// greater value of the two. 711/// 712/// \headerfile <x86intrin.h> 713/// 714/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. 715/// 716/// \param __V1 717/// A 128-bit vector of [8 x u16]. 718/// \param __V2 719/// A 128-bit vector of [8 x u16]. 720/// \returns A 128-bit vector of [8 x u16] containing the greater values. 721static __inline__ __m128i __DEFAULT_FN_ATTRS 722_mm_max_epu16 (__m128i __V1, __m128i __V2) 723{ 724 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); 725} 726 727/// Compares the corresponding elements of two 128-bit vectors of 728/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 729/// value of the two. 730/// 731/// \headerfile <x86intrin.h> 732/// 733/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. 734/// 735/// \param __V1 736/// A 128-bit vector of [4 x i32]. 737/// \param __V2 738/// A 128-bit vector of [4 x i32]. 
739/// \returns A 128-bit vector of [4 x i32] containing the lesser values. 740static __inline__ __m128i __DEFAULT_FN_ATTRS 741_mm_min_epi32 (__m128i __V1, __m128i __V2) 742{ 743 return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); 744} 745 746/// Compares the corresponding elements of two 128-bit vectors of 747/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 748/// greater value of the two. 749/// 750/// \headerfile <x86intrin.h> 751/// 752/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. 753/// 754/// \param __V1 755/// A 128-bit vector of [4 x i32]. 756/// \param __V2 757/// A 128-bit vector of [4 x i32]. 758/// \returns A 128-bit vector of [4 x i32] containing the greater values. 759static __inline__ __m128i __DEFAULT_FN_ATTRS 760_mm_max_epi32 (__m128i __V1, __m128i __V2) 761{ 762 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); 763} 764 765/// Compares the corresponding elements of two 128-bit vectors of 766/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 767/// value of the two. 768/// 769/// \headerfile <x86intrin.h> 770/// 771/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. 772/// 773/// \param __V1 774/// A 128-bit vector of [4 x u32]. 775/// \param __V2 776/// A 128-bit vector of [4 x u32]. 777/// \returns A 128-bit vector of [4 x u32] containing the lesser values. 778static __inline__ __m128i __DEFAULT_FN_ATTRS 779_mm_min_epu32 (__m128i __V1, __m128i __V2) 780{ 781 return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); 782} 783 784/// Compares the corresponding elements of two 128-bit vectors of 785/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 786/// greater value of the two. 787/// 788/// \headerfile <x86intrin.h> 789/// 790/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. 791/// 792/// \param __V1 793/// A 128-bit vector of [4 x u32]. 
794/// \param __V2 795/// A 128-bit vector of [4 x u32]. 796/// \returns A 128-bit vector of [4 x u32] containing the greater values. 797static __inline__ __m128i __DEFAULT_FN_ATTRS 798_mm_max_epu32 (__m128i __V1, __m128i __V2) 799{ 800 return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); 801} 802 803/* SSE4 Insertion and Extraction from XMM Register Instructions. */ 804/// Takes the first argument \a X and inserts an element from the second 805/// argument \a Y as selected by the third argument \a N. That result then 806/// has elements zeroed out also as selected by the third argument \a N. The 807/// resulting 128-bit vector of [4 x float] is then returned. 808/// 809/// \headerfile <x86intrin.h> 810/// 811/// \code 812/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 813/// \endcode 814/// 815/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. 816/// 817/// \param X 818/// A 128-bit vector source operand of [4 x float]. With the exception of 819/// those bits in the result copied from parameter \a Y and zeroed by bits 820/// [3:0] of \a N, all bits from this parameter are copied to the result. 821/// \param Y 822/// A 128-bit vector source operand of [4 x float]. One single-precision 823/// floating-point element from this source, as determined by the immediate 824/// parameter, is copied to the result. 825/// \param N 826/// Specifies which bits from operand \a Y will be copied, which bits in the 827/// result they will be be copied to, and which bits in the result will be 828/// cleared. The following assignments are made: \n 829/// Bits [7:6] specify the bits to copy from operand \a Y: \n 830/// 00: Selects bits [31:0] from operand \a Y. \n 831/// 01: Selects bits [63:32] from operand \a Y. \n 832/// 10: Selects bits [95:64] from operand \a Y. \n 833/// 11: Selects bits [127:96] from operand \a Y. 
\n 834/// Bits [5:4] specify the bits in the result to which the selected bits 835/// from operand \a Y are copied: \n 836/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n 837/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n 838/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n 839/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n 840/// Bits[3:0]: If any of these bits are set, the corresponding result 841/// element is cleared. 842/// \returns A 128-bit vector of [4 x float] containing the copied 843/// single-precision floating point elements from the operands. 844#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) 845 846/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and 847/// returns it, using the immediate value parameter \a N as a selector. 848/// 849/// \headerfile <x86intrin.h> 850/// 851/// \code 852/// int _mm_extract_ps(__m128 X, const int N); 853/// \endcode 854/// 855/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c> 856/// instruction. 857/// 858/// \param X 859/// A 128-bit vector of [4 x float]. 860/// \param N 861/// An immediate value. Bits [1:0] determines which bits from the argument 862/// \a X are extracted and returned: \n 863/// 00: Bits [31:0] of parameter \a X are returned. \n 864/// 01: Bits [63:32] of parameter \a X are returned. \n 865/// 10: Bits [95:64] of parameter \a X are returned. \n 866/// 11: Bits [127:96] of parameter \a X are returned. 867/// \returns A 32-bit integer containing the extracted 32 bits of float data. 868#define _mm_extract_ps(X, N) (__extension__ \ 869 ({ union { int __i; float __f; } __t; \ 870 __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \ 871 __t.__i;})) 872 873/* Miscellaneous insert and extract macros. */ 874/* Extract a single-precision float from X at index N into D. 
*/ 875#define _MM_EXTRACT_FLOAT(D, X, N) \ 876 { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } 877 878/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create 879 an index suitable for _mm_insert_ps. */ 880#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z)) 881 882/* Extract a float from X at index N into the first index of the return. */ 883#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \ 884 _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) 885 886/* Insert int into packed integer array at index. */ 887/// Constructs a 128-bit vector of [16 x i8] by first making a copy of 888/// the 128-bit integer vector parameter, and then inserting the lower 8 bits 889/// of an integer parameter \a I into an offset specified by the immediate 890/// value parameter \a N. 891/// 892/// \headerfile <x86intrin.h> 893/// 894/// \code 895/// __m128i _mm_insert_epi8(__m128i X, int I, const int N); 896/// \endcode 897/// 898/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction. 899/// 900/// \param X 901/// A 128-bit integer vector of [16 x i8]. This vector is copied to the 902/// result and then one of the sixteen elements in the result vector is 903/// replaced by the lower 8 bits of \a I. 904/// \param I 905/// An integer. The lower 8 bits of this operand are written to the result 906/// beginning at the offset specified by \a N. 907/// \param N 908/// An immediate value. Bits [3:0] specify the bit offset in the result at 909/// which the lower 8 bits of \a I are written. \n 910/// 0000: Bits [7:0] of the result are used for insertion. \n 911/// 0001: Bits [15:8] of the result are used for insertion. \n 912/// 0010: Bits [23:16] of the result are used for insertion. \n 913/// 0011: Bits [31:24] of the result are used for insertion. \n 914/// 0100: Bits [39:32] of the result are used for insertion. \n 915/// 0101: Bits [47:40] of the result are used for insertion. 
\n 916/// 0110: Bits [55:48] of the result are used for insertion. \n 917/// 0111: Bits [63:56] of the result are used for insertion. \n 918/// 1000: Bits [71:64] of the result are used for insertion. \n 919/// 1001: Bits [79:72] of the result are used for insertion. \n 920/// 1010: Bits [87:80] of the result are used for insertion. \n 921/// 1011: Bits [95:88] of the result are used for insertion. \n 922/// 1100: Bits [103:96] of the result are used for insertion. \n 923/// 1101: Bits [111:104] of the result are used for insertion. \n 924/// 1110: Bits [119:112] of the result are used for insertion. \n 925/// 1111: Bits [127:120] of the result are used for insertion. 926/// \returns A 128-bit integer vector containing the constructed values. 927#define _mm_insert_epi8(X, I, N) \ 928 (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \ 929 (int)(I), (int)(N)) 930 931/// Constructs a 128-bit vector of [4 x i32] by first making a copy of 932/// the 128-bit integer vector parameter, and then inserting the 32-bit 933/// integer parameter \a I at the offset specified by the immediate value 934/// parameter \a N. 935/// 936/// \headerfile <x86intrin.h> 937/// 938/// \code 939/// __m128i _mm_insert_epi32(__m128i X, int I, const int N); 940/// \endcode 941/// 942/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction. 943/// 944/// \param X 945/// A 128-bit integer vector of [4 x i32]. This vector is copied to the 946/// result and then one of the four elements in the result vector is 947/// replaced by \a I. 948/// \param I 949/// A 32-bit integer that is written to the result beginning at the offset 950/// specified by \a N. 951/// \param N 952/// An immediate value. Bits [1:0] specify the bit offset in the result at 953/// which the integer \a I is written. \n 954/// 00: Bits [31:0] of the result are used for insertion. \n 955/// 01: Bits [63:32] of the result are used for insertion. 
\n 956/// 10: Bits [95:64] of the result are used for insertion. \n 957/// 11: Bits [127:96] of the result are used for insertion. 958/// \returns A 128-bit integer vector containing the constructed values. 959#define _mm_insert_epi32(X, I, N) \ 960 (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \ 961 (int)(I), (int)(N)) 962 963#ifdef __x86_64__ 964/// Constructs a 128-bit vector of [2 x i64] by first making a copy of 965/// the 128-bit integer vector parameter, and then inserting the 64-bit 966/// integer parameter \a I, using the immediate value parameter \a N as an 967/// insertion location selector. 968/// 969/// \headerfile <x86intrin.h> 970/// 971/// \code 972/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N); 973/// \endcode 974/// 975/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction. 976/// 977/// \param X 978/// A 128-bit integer vector of [2 x i64]. This vector is copied to the 979/// result and then one of the two elements in the result vector is replaced 980/// by \a I. 981/// \param I 982/// A 64-bit integer that is written to the result beginning at the offset 983/// specified by \a N. 984/// \param N 985/// An immediate value. Bit [0] specifies the bit offset in the result at 986/// which the integer \a I is written. \n 987/// 0: Bits [63:0] of the result are used for insertion. \n 988/// 1: Bits [127:64] of the result are used for insertion. \n 989/// \returns A 128-bit integer vector containing the constructed values. 990#define _mm_insert_epi64(X, I, N) \ 991 (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \ 992 (long long)(I), (int)(N)) 993#endif /* __x86_64__ */ 994 995/* Extract int from packed integer array at index. This returns the element 996 * as a zero extended value, so it is unsigned. 997 */ 998/// Extracts an 8-bit element from the 128-bit integer vector of 999/// [16 x i8], using the immediate value parameter \a N as a selector. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] specify which 8-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    0000: Bits [7:0] of parameter \a X are extracted. \n
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
///    1111: Bits [127:120] of the parameter \a X are extracted.
/// \returns  An unsigned integer, whose lower 8 bits are selected from the
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
/* The (unsigned char) cast is what zero-extends the element before it is
   widened to int. The whole expansion is parenthesized so the casts cannot be
   separated from the builtin call by operators at the use site.  */
#define _mm_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
                                                    (int)(N)))

/// Extracts a 32-bit element from the 128-bit integer vector of
///    [4 x i32], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] specify which 32-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    00: Bits [31:0] of the parameter \a X are extracted. \n
///    01: Bits [63:32] of the parameter \a X are extracted. \n
///    10: Bits [95:64] of the parameter \a X are extracted. \n
///    11: Bits [127:96] of the parameter \a X are extracted.
/// \returns  An integer holding the 32-bit element selected from the 128-bit
///    integer vector parameter.
/* Fully parenthesized so the (int) cast cannot be separated from the builtin
   call by operators at the use site.  */
#define _mm_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))

#ifdef __x86_64__
/// Extracts a 64-bit element from the 128-bit integer vector of
///    [2 x i64], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] specifies which 64-bit vector element from
///    the argument \a X to return. \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned.
/// \returns  A 64-bit integer.
#define _mm_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
#endif /* __x86_64__ */

/* SSE4 128-bit Packed Integer Comparisons.  */
/// Tests whether the specified bits in a 128-bit integer vector are all
///    zeros.
1089/// 1090/// \headerfile <x86intrin.h> 1091/// 1092/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1093/// 1094/// \param __M 1095/// A 128-bit integer vector containing the bits to be tested. 1096/// \param __V 1097/// A 128-bit integer vector selecting which bits to test in operand \a __M. 1098/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1099static __inline__ int __DEFAULT_FN_ATTRS 1100_mm_testz_si128(__m128i __M, __m128i __V) 1101{ 1102 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 1103} 1104 1105/// Tests whether the specified bits in a 128-bit integer vector are all 1106/// ones. 1107/// 1108/// \headerfile <x86intrin.h> 1109/// 1110/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1111/// 1112/// \param __M 1113/// A 128-bit integer vector containing the bits to be tested. 1114/// \param __V 1115/// A 128-bit integer vector selecting which bits to test in operand \a __M. 1116/// \returns TRUE if the specified bits are all ones; FALSE otherwise. 1117static __inline__ int __DEFAULT_FN_ATTRS 1118_mm_testc_si128(__m128i __M, __m128i __V) 1119{ 1120 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 1121} 1122 1123/// Tests whether the specified bits in a 128-bit integer vector are 1124/// neither all zeros nor all ones. 1125/// 1126/// \headerfile <x86intrin.h> 1127/// 1128/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1129/// 1130/// \param __M 1131/// A 128-bit integer vector containing the bits to be tested. 1132/// \param __V 1133/// A 128-bit integer vector selecting which bits to test in operand \a __M. 1134/// \returns TRUE if the specified bits are neither all zeros nor all ones; 1135/// FALSE otherwise. 
1136static __inline__ int __DEFAULT_FN_ATTRS 1137_mm_testnzc_si128(__m128i __M, __m128i __V) 1138{ 1139 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 1140} 1141 1142/// Tests whether the specified bits in a 128-bit integer vector are all 1143/// ones. 1144/// 1145/// \headerfile <x86intrin.h> 1146/// 1147/// \code 1148/// int _mm_test_all_ones(__m128i V); 1149/// \endcode 1150/// 1151/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1152/// 1153/// \param V 1154/// A 128-bit integer vector containing the bits to be tested. 1155/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE 1156/// otherwise. 1157#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) 1158 1159/// Tests whether the specified bits in a 128-bit integer vector are 1160/// neither all zeros nor all ones. 1161/// 1162/// \headerfile <x86intrin.h> 1163/// 1164/// \code 1165/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); 1166/// \endcode 1167/// 1168/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1169/// 1170/// \param M 1171/// A 128-bit integer vector containing the bits to be tested. 1172/// \param V 1173/// A 128-bit integer vector selecting which bits to test in operand \a M. 1174/// \returns TRUE if the specified bits are neither all zeros nor all ones; 1175/// FALSE otherwise. 1176#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) 1177 1178/// Tests whether the specified bits in a 128-bit integer vector are all 1179/// zeros. 1180/// 1181/// \headerfile <x86intrin.h> 1182/// 1183/// \code 1184/// int _mm_test_all_zeros(__m128i M, __m128i V); 1185/// \endcode 1186/// 1187/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1188/// 1189/// \param M 1190/// A 128-bit integer vector containing the bits to be tested. 1191/// \param V 1192/// A 128-bit integer vector selecting which bits to test in operand \a M. 
1193/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1194#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) 1195 1196/* SSE4 64-bit Packed Integer Comparisons. */ 1197/// Compares each of the corresponding 64-bit values of the 128-bit 1198/// integer vectors for equality. 1199/// 1200/// \headerfile <x86intrin.h> 1201/// 1202/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction. 1203/// 1204/// \param __V1 1205/// A 128-bit integer vector. 1206/// \param __V2 1207/// A 128-bit integer vector. 1208/// \returns A 128-bit integer vector containing the comparison results. 1209static __inline__ __m128i __DEFAULT_FN_ATTRS 1210_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) 1211{ 1212 return (__m128i)((__v2di)__V1 == (__v2di)__V2); 1213} 1214 1215/* SSE4 Packed Integer Sign-Extension. */ 1216/// Sign-extends each of the lower eight 8-bit integer elements of a 1217/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1218/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1219/// are unused. 1220/// 1221/// \headerfile <x86intrin.h> 1222/// 1223/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. 1224/// 1225/// \param __V 1226/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign- 1227/// extended to 16-bit values. 1228/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 1229static __inline__ __m128i __DEFAULT_FN_ATTRS 1230_mm_cvtepi8_epi16(__m128i __V) 1231{ 1232 /* This function always performs a signed extension, but __v16qi is a char 1233 which may be signed or unsigned, so use __v16qs. 
*/ 1234 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1235} 1236 1237/// Sign-extends each of the lower four 8-bit integer elements of a 1238/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1239/// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1240/// vector are unused. 1241/// 1242/// \headerfile <x86intrin.h> 1243/// 1244/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. 1245/// 1246/// \param __V 1247/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1248/// sign-extended to 32-bit values. 1249/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1250static __inline__ __m128i __DEFAULT_FN_ATTRS 1251_mm_cvtepi8_epi32(__m128i __V) 1252{ 1253 /* This function always performs a signed extension, but __v16qi is a char 1254 which may be signed or unsigned, so use __v16qs. */ 1255 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 1256} 1257 1258/// Sign-extends each of the lower two 8-bit integer elements of a 1259/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1260/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1261/// vector are unused. 1262/// 1263/// \headerfile <x86intrin.h> 1264/// 1265/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. 1266/// 1267/// \param __V 1268/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1269/// sign-extended to 64-bit values. 1270/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1271static __inline__ __m128i __DEFAULT_FN_ATTRS 1272_mm_cvtepi8_epi64(__m128i __V) 1273{ 1274 /* This function always performs a signed extension, but __v16qi is a char 1275 which may be signed or unsigned, so use __v16qs. 
*/ 1276 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 1277} 1278 1279/// Sign-extends each of the lower four 16-bit integer elements of a 1280/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1281/// a 128-bit vector of [4 x i32]. The upper four elements of the input 1282/// vector are unused. 1283/// 1284/// \headerfile <x86intrin.h> 1285/// 1286/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. 1287/// 1288/// \param __V 1289/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1290/// sign-extended to 32-bit values. 1291/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1292static __inline__ __m128i __DEFAULT_FN_ATTRS 1293_mm_cvtepi16_epi32(__m128i __V) 1294{ 1295 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 1296} 1297 1298/// Sign-extends each of the lower two 16-bit integer elements of a 1299/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1300/// a 128-bit vector of [2 x i64]. The upper six elements of the input 1301/// vector are unused. 1302/// 1303/// \headerfile <x86intrin.h> 1304/// 1305/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. 1306/// 1307/// \param __V 1308/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1309/// sign-extended to 64-bit values. 1310/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1311static __inline__ __m128i __DEFAULT_FN_ATTRS 1312_mm_cvtepi16_epi64(__m128i __V) 1313{ 1314 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 1315} 1316 1317/// Sign-extends each of the lower two 32-bit integer elements of a 1318/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1319/// a 128-bit vector of [2 x i64]. 
The upper two elements of the input vector 1320/// are unused. 1321/// 1322/// \headerfile <x86intrin.h> 1323/// 1324/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. 1325/// 1326/// \param __V 1327/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1328/// sign-extended to 64-bit values. 1329/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1330static __inline__ __m128i __DEFAULT_FN_ATTRS 1331_mm_cvtepi32_epi64(__m128i __V) 1332{ 1333 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 1334} 1335 1336/* SSE4 Packed Integer Zero-Extension. */ 1337/// Zero-extends each of the lower eight 8-bit integer elements of a 1338/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1339/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1340/// are unused. 1341/// 1342/// \headerfile <x86intrin.h> 1343/// 1344/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. 1345/// 1346/// \param __V 1347/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 1348/// zero-extended to 16-bit values. 1349/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 1350static __inline__ __m128i __DEFAULT_FN_ATTRS 1351_mm_cvtepu8_epi16(__m128i __V) 1352{ 1353 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1354} 1355 1356/// Zero-extends each of the lower four 8-bit integer elements of a 1357/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1358/// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1359/// vector are unused. 1360/// 1361/// \headerfile <x86intrin.h> 1362/// 1363/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. 1364/// 1365/// \param __V 1366/// A 128-bit vector of [16 x i8]. 
The lower four 8-bit elements are 1367/// zero-extended to 32-bit values. 1368/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1369static __inline__ __m128i __DEFAULT_FN_ATTRS 1370_mm_cvtepu8_epi32(__m128i __V) 1371{ 1372 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 1373} 1374 1375/// Zero-extends each of the lower two 8-bit integer elements of a 1376/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1377/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1378/// vector are unused. 1379/// 1380/// \headerfile <x86intrin.h> 1381/// 1382/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. 1383/// 1384/// \param __V 1385/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1386/// zero-extended to 64-bit values. 1387/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1388static __inline__ __m128i __DEFAULT_FN_ATTRS 1389_mm_cvtepu8_epi64(__m128i __V) 1390{ 1391 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 1392} 1393 1394/// Zero-extends each of the lower four 16-bit integer elements of a 1395/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1396/// a 128-bit vector of [4 x i32]. The upper four elements of the input 1397/// vector are unused. 1398/// 1399/// \headerfile <x86intrin.h> 1400/// 1401/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. 1402/// 1403/// \param __V 1404/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1405/// zero-extended to 32-bit values. 1406/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 
1407static __inline__ __m128i __DEFAULT_FN_ATTRS 1408_mm_cvtepu16_epi32(__m128i __V) 1409{ 1410 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 1411} 1412 1413/// Zero-extends each of the lower two 16-bit integer elements of a 1414/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1415/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 1416/// are unused. 1417/// 1418/// \headerfile <x86intrin.h> 1419/// 1420/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. 1421/// 1422/// \param __V 1423/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1424/// zero-extended to 64-bit values. 1425/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1426static __inline__ __m128i __DEFAULT_FN_ATTRS 1427_mm_cvtepu16_epi64(__m128i __V) 1428{ 1429 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 1430} 1431 1432/// Zero-extends each of the lower two 32-bit integer elements of a 1433/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1434/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1435/// are unused. 1436/// 1437/// \headerfile <x86intrin.h> 1438/// 1439/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. 1440/// 1441/// \param __V 1442/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1443/// zero-extended to 64-bit values. 1444/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1445static __inline__ __m128i __DEFAULT_FN_ATTRS 1446_mm_cvtepu32_epi64(__m128i __V) 1447{ 1448 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 1449} 1450 1451/* SSE4 Pack with Unsigned Saturation. 
*/ 1452/// Converts 32-bit signed integers from both 128-bit integer vector 1453/// operands into 16-bit unsigned integers, and returns the packed result. 1454/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 1455/// 0x0000 are saturated to 0x0000. 1456/// 1457/// \headerfile <x86intrin.h> 1458/// 1459/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. 1460/// 1461/// \param __V1 1462/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1463/// signed integer and is converted to a 16-bit unsigned integer with 1464/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1465/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1466/// are written to the lower 64 bits of the result. 1467/// \param __V2 1468/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1469/// signed integer and is converted to a 16-bit unsigned integer with 1470/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1471/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1472/// are written to the higher 64 bits of the result. 1473/// \returns A 128-bit vector of [8 x i16] containing the converted values. 1474static __inline__ __m128i __DEFAULT_FN_ATTRS 1475_mm_packus_epi32(__m128i __V1, __m128i __V2) 1476{ 1477 return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 1478} 1479 1480/* SSE4 Multiple Packed Sums of Absolute Difference. */ 1481/// Subtracts 8-bit unsigned integer values and computes the absolute 1482/// values of the differences to the corresponding bits in the destination. 1483/// Then sums of the absolute differences are returned according to the bit 1484/// fields in the immediate operand. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand specifying how the absolute differences are to
///    be calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
/* Fully parenthesized so the (__m128i) cast cannot be separated from the
   builtin call by operators at the use site.  */
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
                                       (__v16qi)(__m128i)(Y), (M)))

/// Finds the minimum unsigned 16-bit element in the input 128-bit
///    vector of [8 x u16] and returns it along with its index.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
/// instruction.
///
/// \param __V
///    A 128-bit vector of [8 x u16].
/// \returns A 128-bit value where bits [15:0] contain the minimum value found
///    in parameter \a __V, bits [18:16] contain the index of the minimum value
///    and the remaining bits are set to 0.
1533static __inline__ __m128i __DEFAULT_FN_ATTRS 1534_mm_minpos_epu16(__m128i __V) 1535{ 1536 return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); 1537} 1538 1539/* Handle the sse4.2 definitions here. */ 1540 1541/* These definitions are normally in nmmintrin.h, but gcc puts them in here 1542 so we'll do the same. */ 1543 1544#undef __DEFAULT_FN_ATTRS 1545#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 1546 1547/* These specify the type of data that we're comparing. */ 1548#define _SIDD_UBYTE_OPS 0x00 1549#define _SIDD_UWORD_OPS 0x01 1550#define _SIDD_SBYTE_OPS 0x02 1551#define _SIDD_SWORD_OPS 0x03 1552 1553/* These specify the type of comparison operation. */ 1554#define _SIDD_CMP_EQUAL_ANY 0x00 1555#define _SIDD_CMP_RANGES 0x04 1556#define _SIDD_CMP_EQUAL_EACH 0x08 1557#define _SIDD_CMP_EQUAL_ORDERED 0x0c 1558 1559/* These macros specify the polarity of the operation. */ 1560#define _SIDD_POSITIVE_POLARITY 0x00 1561#define _SIDD_NEGATIVE_POLARITY 0x10 1562#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 1563#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 1564 1565/* These macros are used in _mm_cmpXstri() to specify the return. */ 1566#define _SIDD_LEAST_SIGNIFICANT 0x00 1567#define _SIDD_MOST_SIGNIFICANT 0x40 1568 1569/* These macros are used in _mm_cmpXstri() to specify the return. */ 1570#define _SIDD_BIT_MASK 0x00 1571#define _SIDD_UNIT_MASK 0x40 1572 1573/* SSE4.2 Packed Comparison Intrinsics. */ 1574/// Uses the immediate operand \a M to perform a comparison of string 1575/// data with implicitly defined lengths that is contained in source operands 1576/// \a A and \a B. Returns a 128-bit integer vector representing the result 1577/// mask of the comparison. 1578/// 1579/// \headerfile <x86intrin.h> 1580/// 1581/// \code 1582/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 1583/// \endcode 1584/// 1585/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> 1586/// instruction. 
1587/// 1588/// \param A 1589/// A 128-bit integer vector containing one of the source operands to be 1590/// compared. 1591/// \param B 1592/// A 128-bit integer vector containing one of the source operands to be 1593/// compared. 1594/// \param M 1595/// An 8-bit immediate operand specifying whether the characters are bytes or 1596/// words, the type of comparison to perform, and the format of the return 1597/// value. \n 1598/// Bits [1:0]: Determine source data format. \n 1599/// 00: 16 unsigned bytes \n 1600/// 01: 8 unsigned words \n 1601/// 10: 16 signed bytes \n 1602/// 11: 8 signed words \n 1603/// Bits [3:2]: Determine comparison type and aggregation method. \n 1604/// 00: Subset: Each character in \a B is compared for equality with all 1605/// the characters in \a A. \n 1606/// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1607/// basis is greater than or equal for even-indexed elements in \a A, 1608/// and less than or equal for odd-indexed elements in \a A. \n 1609/// 10: Match: Compare each pair of corresponding characters in \a A and 1610/// \a B for equality. \n 1611/// 11: Substring: Search \a B for substring matches of \a A. \n 1612/// Bits [5:4]: Determine whether to perform a one's complement on the bit 1613/// mask of the comparison results. \n 1614/// 00: No effect. \n 1615/// 01: Negate the bit mask. \n 1616/// 10: No effect. \n 1617/// 11: Negate the bit mask only for bits with an index less than or equal 1618/// to the size of \a A or \a B. \n 1619/// Bit [6]: Determines whether the result is zero-extended or expanded to 16 1620/// bytes. \n 1621/// 0: The result is zero-extended to 16 bytes. \n 1622/// 1: The result is expanded to 16 bytes (this expansion is performed by 1623/// repeating each bit 8 or 16 times). 1624/// \returns Returns a 128-bit integer vector representing the result mask of 1625/// the comparison. 
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
2218/// 2219/// \headerfile <x86intrin.h> 2220/// 2221/// \code 2222/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M); 2223/// \endcode 2224/// 2225/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 2226/// instruction. 2227/// 2228/// \param A 2229/// A 128-bit integer vector containing one of the source operands to be 2230/// compared. 2231/// \param LA 2232/// An integer that specifies the length of the string in \a A. 2233/// \param B 2234/// A 128-bit integer vector containing one of the source operands to be 2235/// compared. 2236/// \param LB 2237/// An integer that specifies the length of the string in \a B. 2238/// \param M 2239/// An 8-bit immediate operand specifying whether the characters are bytes or 2240/// words and the type of comparison to perform. \n 2241/// Bits [1:0]: Determine source data format. \n 2242/// 00: 16 unsigned bytes \n 2243/// 01: 8 unsigned words \n 2244/// 10: 16 signed bytes \n 2245/// 11: 8 signed words \n 2246/// Bits [3:2]: Determine comparison type and aggregation method. \n 2247/// 00: Subset: Each character in \a B is compared for equality with all 2248/// the characters in \a A. \n 2249/// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2250/// basis is greater than or equal for even-indexed elements in \a A, 2251/// and less than or equal for odd-indexed elements in \a A. \n 2252/// 10: Match: Compare each pair of corresponding characters in \a A and 2253/// \a B for equality. \n 2254/// 11: Substring: Search \a B for substring matches of \a A. \n 2255/// Bits [5:4]: Determine whether to perform a one's complement in the bit 2256/// mask of the comparison results. \n 2257/// 00: No effect. \n 2258/// 01: Negate the bit mask. \n 2259/// 10: No effect. \n 2260/// 11: Negate the bit mask only for bits with an index less than or equal 2261/// to the size of \a A or \a B. 
\n 2262/// \returns Returns 1 if the length of the string in \a A is less than the 2263/// maximum, otherwise, returns 0. 2264#define _mm_cmpestrs(A, LA, B, LB, M) \ 2265 (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ 2266 (__v16qi)(__m128i)(B), (int)(LB), \ 2267 (int)(M)) 2268 2269/// Uses the immediate operand \a M to perform a comparison of string 2270/// data with explicitly defined lengths that is contained in source operands 2271/// \a A and \a B. Returns 1 if the length of the string in \a B is less than 2272/// the maximum, otherwise, returns 0. 2273/// 2274/// \headerfile <x86intrin.h> 2275/// 2276/// \code 2277/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M); 2278/// \endcode 2279/// 2280/// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction. 2281/// 2282/// \param A 2283/// A 128-bit integer vector containing one of the source operands to be 2284/// compared. 2285/// \param LA 2286/// An integer that specifies the length of the string in \a A. 2287/// \param B 2288/// A 128-bit integer vector containing one of the source operands to be 2289/// compared. 2290/// \param LB 2291/// An integer that specifies the length of the string in \a B. 2292/// \param M 2293/// An 8-bit immediate operand specifying whether the characters are bytes or 2294/// words and the type of comparison to perform. \n 2295/// Bits [1:0]: Determine source data format. \n 2296/// 00: 16 unsigned bytes \n 2297/// 01: 8 unsigned words \n 2298/// 10: 16 signed bytes \n 2299/// 11: 8 signed words \n 2300/// Bits [3:2]: Determine comparison type and aggregation method. \n 2301/// 00: Subset: Each character in \a B is compared for equality with all 2302/// the characters in \a A. \n 2303/// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2304/// basis is greater than or equal for even-indexed elements in \a A, 2305/// and less than or equal for odd-indexed elements in \a A. 
\n 2306/// 10: Match: Compare each pair of corresponding characters in \a A and 2307/// \a B for equality. \n 2308/// 11: Substring: Search \a B for substring matches of \a A. \n 2309/// Bits [5:4]: Determine whether to perform a one's complement on the bit 2310/// mask of the comparison results. \n 2311/// 00: No effect. \n 2312/// 01: Negate the bit mask. \n 2313/// 10: No effect. \n 2314/// 11: Negate the bit mask only for bits with an index less than or equal 2315/// to the size of \a A or \a B. 2316/// \returns Returns 1 if the length of the string in \a B is less than the 2317/// maximum, otherwise, returns 0. 2318#define _mm_cmpestrz(A, LA, B, LB, M) \ 2319 (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ 2320 (__v16qi)(__m128i)(B), (int)(LB), \ 2321 (int)(M)) 2322 2323/* SSE4.2 Compare Packed Data -- Greater Than. */ 2324/// Compares each of the corresponding 64-bit values of the 128-bit 2325/// integer vectors to determine if the values in the first operand are 2326/// greater than those in the second operand. 2327/// 2328/// \headerfile <x86intrin.h> 2329/// 2330/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction. 2331/// 2332/// \param __V1 2333/// A 128-bit integer vector. 2334/// \param __V2 2335/// A 128-bit integer vector. 2336/// \returns A 128-bit integer vector containing the comparison results. 2337static __inline__ __m128i __DEFAULT_FN_ATTRS 2338_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) 2339{ 2340 return (__m128i)((__v2di)__V1 > (__v2di)__V2); 2341} 2342 2343/* SSE4.2 Accumulate CRC32. */ 2344/// Adds the unsigned integer operand to the CRC-32C checksum of the 2345/// unsigned char operand. 2346/// 2347/// \headerfile <x86intrin.h> 2348/// 2349/// This intrinsic corresponds to the <c> CRC32B </c> instruction. 2350/// 2351/// \param __C 2352/// An unsigned integer operand to add to the CRC-32C checksum of operand 2353/// \a __D. 
2354/// \param __D 2355/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum. 2356/// \returns The result of adding operand \a __C to the CRC-32C checksum of 2357/// operand \a __D. 2358static __inline__ unsigned int __DEFAULT_FN_ATTRS 2359_mm_crc32_u8(unsigned int __C, unsigned char __D) 2360{ 2361 return __builtin_ia32_crc32qi(__C, __D); 2362} 2363 2364/// Adds the unsigned integer operand to the CRC-32C checksum of the 2365/// unsigned short operand. 2366/// 2367/// \headerfile <x86intrin.h> 2368/// 2369/// This intrinsic corresponds to the <c> CRC32W </c> instruction. 2370/// 2371/// \param __C 2372/// An unsigned integer operand to add to the CRC-32C checksum of operand 2373/// \a __D. 2374/// \param __D 2375/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum. 2376/// \returns The result of adding operand \a __C to the CRC-32C checksum of 2377/// operand \a __D. 2378static __inline__ unsigned int __DEFAULT_FN_ATTRS 2379_mm_crc32_u16(unsigned int __C, unsigned short __D) 2380{ 2381 return __builtin_ia32_crc32hi(__C, __D); 2382} 2383 2384/// Adds the first unsigned integer operand to the CRC-32C checksum of 2385/// the second unsigned integer operand. 2386/// 2387/// \headerfile <x86intrin.h> 2388/// 2389/// This intrinsic corresponds to the <c> CRC32L </c> instruction. 2390/// 2391/// \param __C 2392/// An unsigned integer operand to add to the CRC-32C checksum of operand 2393/// \a __D. 2394/// \param __D 2395/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum. 2396/// \returns The result of adding operand \a __C to the CRC-32C checksum of 2397/// operand \a __D. 2398static __inline__ unsigned int __DEFAULT_FN_ATTRS 2399_mm_crc32_u32(unsigned int __C, unsigned int __D) 2400{ 2401 return __builtin_ia32_crc32si(__C, __D); 2402} 2403 2404#ifdef __x86_64__ 2405/// Adds the unsigned integer operand to the CRC-32C checksum of the 2406/// unsigned 64-bit integer operand. 
2407/// 2408/// \headerfile <x86intrin.h> 2409/// 2410/// This intrinsic corresponds to the <c> CRC32Q </c> instruction. 2411/// 2412/// \param __C 2413/// An unsigned integer operand to add to the CRC-32C checksum of operand 2414/// \a __D. 2415/// \param __D 2416/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum. 2417/// \returns The result of adding operand \a __C to the CRC-32C checksum of 2418/// operand \a __D. 2419static __inline__ unsigned long long __DEFAULT_FN_ATTRS 2420_mm_crc32_u64(unsigned long long __C, unsigned long long __D) 2421{ 2422 return __builtin_ia32_crc32di(__C, __D); 2423} 2424#endif /* __x86_64__ */ 2425 2426#undef __DEFAULT_FN_ATTRS 2427 2428#include <popcntintrin.h> 2429 2430#endif /* __SMMINTRIN_H */ 2431