1/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10#ifndef __IMMINTRIN_H 11#error \ 12 "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead." 13#endif // __IMMINTRIN_H 14 15#ifndef __AVXVNNIINT16INTRIN_H 16#define __AVXVNNIINT16INTRIN_H 17 18/* Define the default attributes for the functions in this file. */ 19#define __DEFAULT_FN_ATTRS128 \ 20 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ 21 __min_vector_width__(128))) 22#define __DEFAULT_FN_ATTRS256 \ 23 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ 24 __min_vector_width__(256))) 25 26/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 27/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 28/// signed 16-bit results. Sum these 2 results with the corresponding 29/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 30/// 31/// \headerfile <immintrin.h> 32/// 33/// \code 34/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B) 35/// \endcode 36/// 37/// This intrinsic corresponds to the \c VPDPWSUD instruction. 38/// 39/// \param __W 40/// A 128-bit vector of [4 x int]. 41/// \param __A 42/// A 128-bit vector of [8 x short]. 43/// \param __B 44/// A 128-bit vector of [8 x unsigned short]. 45/// \returns 46/// A 128-bit vector of [4 x int]. 47/// 48/// \code{.operation} 49/// FOR j := 0 to 3 50/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 51/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 52/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 53/// ENDFOR 54/// dst[MAX:128] := 0 55/// \endcode 56static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W, 57 __m128i __A, 58 __m128i __B) { 59 return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A, 60 (__v4si)__B); 61} 62 63/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 64/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 65/// signed 16-bit results. Sum these 2 results with the corresponding 66/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 67/// 68/// \headerfile <immintrin.h> 69/// 70/// \code 71/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) 72/// \endcode 73/// 74/// This intrinsic corresponds to the \c VPDPWSUD instruction. 75/// 76/// \param __W 77/// A 256-bit vector of [8 x int]. 78/// \param __A 79/// A 256-bit vector of [16 x short]. 80/// \param __B 81/// A 256-bit vector of [16 x unsigned short]. 82/// \returns 83/// A 256-bit vector of [8 x int]. 84/// 85/// \code{.operation} 86/// FOR j := 0 to 7 87/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 88/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 89/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 90/// ENDFOR 91/// dst[MAX:256] := 0 92/// \endcode 93static __inline__ __m256i __DEFAULT_FN_ATTRS256 94_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) { 95 return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A, 96 (__v8si)__B); 97} 98 99/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 100/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 101/// signed 16-bit results. Sum these 2 results with the corresponding 102/// 32-bit integer in \a __W with signed saturation, and store the packed 103/// 32-bit results in \a dst. 104/// 105/// \headerfile <immintrin.h> 106/// 107/// \code 108/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B) 109/// \endcode 110/// 111/// This intrinsic corresponds to the \c VPDPWSUDS instruction. 112/// 113/// \param __W 114/// A 128-bit vector of [4 x int]. 115/// \param __A 116/// A 128-bit vector of [8 x short]. 117/// \param __B 118/// A 128-bit vector of [8 x unsigned short]. 119/// \returns 120/// A 128-bit vector of [4 x int]. 121/// 122/// \code{.operation} 123/// FOR j := 0 to 3 124/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 125/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 126/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 127/// ENDFOR 128/// dst[MAX:128] := 0 129/// \endcode 130static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W, 131 __m128i __A, 132 __m128i __B) { 133 return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A, 134 (__v4si)__B); 135} 136 137/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 138/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 139/// signed 16-bit results. Sum these 2 results with the corresponding 140/// 32-bit integer in \a __W with signed saturation, and store the packed 141/// 32-bit results in \a dst. 142/// 143/// \headerfile <immintrin.h> 144/// 145/// \code 146/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) 147/// \endcode 148/// 149/// This intrinsic corresponds to the \c VPDPWSUDS instruction. 150/// 151/// \param __W 152/// A 256-bit vector of [8 x int]. 153/// \param __A 154/// A 256-bit vector of [16 x short]. 155/// \param __B 156/// A 256-bit vector of [16 x unsigned short]. 157/// \returns 158/// A 256-bit vector of [8 x int]. 159/// 160/// \code{.operation} 161/// FOR j := 0 to 7 162/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 163/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 164/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 165/// ENDFOR 166/// dst[MAX:256] := 0 167/// \endcode 168static __inline__ __m256i __DEFAULT_FN_ATTRS256 169_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { 170 return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A, 171 (__v8si)__B); 172} 173 174/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 175/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate 176/// signed 16-bit results. Sum these 2 results with the corresponding 177/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 178/// 179/// \headerfile <immintrin.h> 180/// 181/// \code 182/// __m128i _mm_dpbusd_epi32(__m128i __W, __m128i __A, __m128i __B) 183/// \endcode 184/// 185/// This intrinsic corresponds to the \c VPDPWUSD instruction. 186/// 187/// \param __W 188/// A 128-bit vector of [4 x int]. 189/// \param __A 190/// A 128-bit vector of [8 x unsigned short]. 191/// \param __B 192/// A 128-bit vector of [8 x short]. 193/// \returns 194/// A 128-bit vector of [4 x int]. 195/// 196/// \code{.operation} 197/// FOR j := 0 to 3 198/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 199/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 200/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 201/// ENDFOR 202/// dst[MAX:128] := 0 203/// \endcode 204static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W, 205 __m128i __A, 206 __m128i __B) { 207 return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A, 208 (__v4si)__B); 209} 210 211/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 212/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate 213/// signed 16-bit results. Sum these 2 results with the corresponding 214/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 215/// 216/// \headerfile <immintrin.h> 217/// 218/// \code 219/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) 220/// \endcode 221/// 222/// This intrinsic corresponds to the \c VPDPWUSD instruction. 223/// 224/// \param __W 225/// A 256-bit vector of [8 x int]. 226/// \param __A 227/// A 256-bit vector of [16 x unsigned short]. 228/// \param __B 229/// A 256-bit vector of [16 x short]. 230/// \returns 231/// A 256-bit vector of [8 x int]. 232/// 233/// \code{.operation} 234/// FOR j := 0 to 7 235/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 236/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 237/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 238/// ENDFOR 239/// dst[MAX:256] := 0 240/// \endcode 241static __inline__ __m256i __DEFAULT_FN_ATTRS256 242_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) { 243 return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A, 244 (__v8si)__B); 245} 246 247/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 248/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate 249/// signed 16-bit results. Sum these 2 results with the corresponding 250/// 32-bit integer in \a __W with signed saturation, and store the packed 251/// 32-bit results in \a dst. 252/// 253/// \headerfile <immintrin.h> 254/// 255/// \code 256/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B) 257/// \endcode 258/// 259/// This intrinsic corresponds to the \c VPDPWSUDS instruction. 260/// 261/// \param __W 262/// A 128-bit vector of [4 x int]. 263/// \param __A 264/// A 128-bit vector of [8 x unsigned short]. 265/// \param __B 266/// A 128-bit vector of [8 x short]. 267/// \returns 268/// A 128-bit vector of [4 x int]. 269/// 270/// \code{.operation} 271/// FOR j := 0 to 3 272/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 273/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 274/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 275/// ENDFOR 276/// dst[MAX:128] := 0 277/// \endcode 278static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W, 279 __m128i __A, 280 __m128i __B) { 281 return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A, 282 (__v4si)__B); 283} 284 285/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 286/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate 287/// signed 16-bit results. Sum these 2 results with the corresponding 288/// 32-bit integer in \a __W with signed saturation, and store the packed 289/// 32-bit results in \a dst. 290/// 291/// \headerfile <immintrin.h> 292/// 293/// \code 294/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) 295/// \endcode 296/// 297/// This intrinsic corresponds to the \c VPDPWSUDS instruction. 298/// 299/// \param __W 300/// A 256-bit vector of [8 x int]. 301/// \param __A 302/// A 256-bit vector of [16 x unsigned short]. 303/// \param __B 304/// A 256-bit vector of [16 x short]. 305/// \returns 306/// A 256-bit vector of [8 x int]. 307/// 308/// \code{.operation} 309/// FOR j := 0 to 7 310/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 311/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 312/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 313/// ENDFOR 314/// dst[MAX:256] := 0 315/// \endcode 316static __inline__ __m256i __DEFAULT_FN_ATTRS256 317_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) { 318 return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A, 319 (__v8si)__B); 320} 321 322/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 323/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 324/// signed 16-bit results. Sum these 2 results with the corresponding 325/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 326/// 327/// \headerfile <immintrin.h> 328/// 329/// \code 330/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B) 331/// \endcode 332/// 333/// This intrinsic corresponds to the \c VPDPWUUD instruction. 334/// 335/// \param __W 336/// A 128-bit vector of [4 x unsigned int]. 337/// \param __A 338/// A 128-bit vector of [8 x unsigned short]. 339/// \param __B 340/// A 128-bit vector of [8 x unsigned short]. 341/// \returns 342/// A 128-bit vector of [4 x unsigned int]. 343/// 344/// \code{.operation} 345/// FOR j := 0 to 3 346/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 347/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 348/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 349/// ENDFOR 350/// dst[MAX:128] := 0 351/// \endcode 352static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W, 353 __m128i __A, 354 __m128i __B) { 355 return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A, 356 (__v4si)__B); 357} 358 359/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 360/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 361/// signed 16-bit results. Sum these 2 results with the corresponding 362/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 363/// 364/// \headerfile <immintrin.h> 365/// 366/// \code 367/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) 368/// \endcode 369/// 370/// This intrinsic corresponds to the \c VPDPWUUD instruction. 371/// 372/// \param __W 373/// A 256-bit vector of [8 x unsigned int]. 374/// \param __A 375/// A 256-bit vector of [16 x unsigned short]. 376/// \param __B 377/// A 256-bit vector of [16 x unsigned short]. 378/// \returns 379/// A 256-bit vector of [8 x unsigned int]. 380/// 381/// \code{.operation} 382/// FOR j := 0 to 7 383/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 384/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 385/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 386/// ENDFOR 387/// dst[MAX:256] := 0 388/// \endcode 389static __inline__ __m256i __DEFAULT_FN_ATTRS256 390_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) { 391 return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A, 392 (__v8si)__B); 393} 394 395/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 396/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 397/// signed 16-bit results. Sum these 2 results with the corresponding 398/// 32-bit integer in \a __W with signed saturation, and store the packed 399/// 32-bit results in \a dst. 400/// 401/// \headerfile <immintrin.h> 402/// 403/// \code 404/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B) 405/// \endcode 406/// 407/// This intrinsic corresponds to the \c VPDPWSUDS instruction. 408/// 409/// \param __W 410/// A 128-bit vector of [4 x unsigned int]. 411/// \param __A 412/// A 128-bit vector of [8 x unsigned short]. 413/// \param __B 414/// A 128-bit vector of [8 x unsigned short]. 415/// \returns 416/// A 128-bit vector of [4 x unsigned int]. 417/// 418/// \code{.operation} 419/// FOR j := 0 to 3 420/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 421/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 422/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 423/// ENDFOR 424/// dst[MAX:128] := 0 425/// \endcode 426static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W, 427 __m128i __A, 428 __m128i __B) { 429 return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A, 430 (__v4si)__B); 431} 432 433/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 434/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 435/// signed 16-bit results. Sum these 2 results with the corresponding 436/// 32-bit integer in \a __W with signed saturation, and store the packed 437/// 32-bit results in \a dst. 438/// 439/// \headerfile <immintrin.h> 440/// 441/// \code 442/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) 443/// \endcode 444/// 445/// This intrinsic corresponds to the \c VPDPWSUDS instruction. 446/// 447/// \param __W 448/// A 256-bit vector of [8 x unsigned int]. 449/// \param __A 450/// A 256-bit vector of [16 x unsigned short]. 451/// \param __B 452/// A 256-bit vector of [16 x unsigned short]. 453/// \returns 454/// A 256-bit vector of [8 x unsigned int]. 455/// 456/// \code{.operation} 457/// FOR j := 0 to 7 458/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 459/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 460/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 461/// ENDFOR 462/// dst[MAX:256] := 0 463/// \endcode 464static __inline__ __m256i __DEFAULT_FN_ATTRS256 465_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { 466 return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A, 467 (__v8si)__B); 468} 469 470#undef __DEFAULT_FN_ATTRS128 471#undef __DEFAULT_FN_ATTRS256 472 473#endif // __AVXVNNIINT16INTRIN_H 474