1/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== 2 * 3 * 4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 * See https://llvm.org/LICENSE.txt for license information. 6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 * 8 *===-----------------------------------------------------------------------=== 9 */ 10#ifndef __IMMINTRIN_H 11#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead." 12#endif 13 14#ifndef __AVX512VLVNNIINTRIN_H 15#define __AVX512VLVNNIINTRIN_H 16 17/* Define the default attributes for the functions in this file. */ 18#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128))) 19#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256))) 20 21/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 22/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 23/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 24/// in \a S, and store the packed 32-bit results in DST. 25/// 26/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 27/// 28/// \code{.operation} 29/// FOR j := 0 to 7 30/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 31/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 32/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 33/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 34/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 35/// ENDFOR 36/// DST[MAX:256] := 0 37/// \endcode 38#define _mm256_dpbusd_epi32(S, A, B) \ 39 ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 40 41/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 42/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 43/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 44/// in \a S using signed saturation, and store the packed 32-bit results in DST. 45/// 46/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 47/// 48/// \code{.operation} 49/// FOR j := 0 to 7 50/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 51/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 52/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 53/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 54/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 55/// ENDFOR 56/// DST[MAX:256] := 0 57/// \endcode 58#define _mm256_dpbusds_epi32(S, A, B) \ 59 ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 60 61/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 62/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 63/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, 64/// and store the packed 32-bit results in DST. 65/// 66/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 67/// 68/// \code{.operation} 69/// FOR j := 0 to 7 70/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 71/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 72/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 73/// ENDFOR 74/// DST[MAX:256] := 0 75/// \endcode 76#define _mm256_dpwssd_epi32(S, A, B) \ 77 ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 78 79/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 80/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 81/// results. Sum these 2 results with the corresponding 32-bit integer in \a S 82/// using signed saturation, and store the packed 32-bit results in DST. 83/// 84/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 85/// 86/// \code{.operation} 87/// FOR j := 0 to 7 88/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 89/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 90/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) 91/// ENDFOR 92/// DST[MAX:256] := 0 93/// \endcode 94#define _mm256_dpwssds_epi32(S, A, B) \ 95 ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 96 97/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 98/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 99/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 100/// in \a S, and store the packed 32-bit results in DST. 101/// 102/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 103/// 104/// \code{.operation} 105/// FOR j := 0 to 3 106/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 107/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 108/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 109/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 110/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 111/// ENDFOR 112/// DST[MAX:128] := 0 113/// \endcode 114#define _mm_dpbusd_epi32(S, A, B) \ 115 ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 116 117/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 118/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 119/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 120/// in \a S using signed saturation, and store the packed 32-bit results in DST. 121/// 122/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 123/// 124/// \code{.operation} 125/// FOR j := 0 to 3 126/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 127/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 128/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 129/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 130/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 131/// ENDFOR 132/// DST[MAX:128] := 0 133/// \endcode 134#define _mm_dpbusds_epi32(S, A, B) \ 135 ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 136 137/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 138/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 139/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, 140/// and store the packed 32-bit results in DST. 141/// 142/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 143/// 144/// \code{.operation} 145/// FOR j := 0 to 3 146/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 147/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 148/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 149/// ENDFOR 150/// DST[MAX:128] := 0 151/// \endcode 152#define _mm_dpwssd_epi32(S, A, B) \ 153 ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 154 155/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 156/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 157/// results. Sum these 2 results with the corresponding 32-bit integer in \a S 158/// using signed saturation, and store the packed 32-bit results in DST. 159/// 160/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 161/// 162/// \code{.operation} 163/// FOR j := 0 to 3 164/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 165/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 166/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) 167/// ENDFOR 168/// DST[MAX:128] := 0 169/// \endcode 170#define _mm_dpwssds_epi32(S, A, B) \ 171 ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 172 173static __inline__ __m256i __DEFAULT_FN_ATTRS256 174_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 175{ 176 return (__m256i)__builtin_ia32_selectd_256(__U, 177 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), 178 (__v8si)__S); 179} 180 181static __inline__ __m256i __DEFAULT_FN_ATTRS256 182_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 183{ 184 return (__m256i)__builtin_ia32_selectd_256(__U, 185 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), 186 (__v8si)_mm256_setzero_si256()); 187} 188 189static __inline__ __m256i __DEFAULT_FN_ATTRS256 190_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 191{ 192 return (__m256i)__builtin_ia32_selectd_256(__U, 193 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), 194 (__v8si)__S); 195} 196 197static __inline__ __m256i __DEFAULT_FN_ATTRS256 198_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 199{ 200 return (__m256i)__builtin_ia32_selectd_256(__U, 201 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), 202 (__v8si)_mm256_setzero_si256()); 203} 204 205static __inline__ __m256i __DEFAULT_FN_ATTRS256 206_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 207{ 208 return (__m256i)__builtin_ia32_selectd_256(__U, 209 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), 210 (__v8si)__S); 211} 212 213static __inline__ __m256i __DEFAULT_FN_ATTRS256 214_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 215{ 216 return (__m256i)__builtin_ia32_selectd_256(__U, 217 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), 218 (__v8si)_mm256_setzero_si256()); 219} 220 221static __inline__ __m256i __DEFAULT_FN_ATTRS256 222_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 223{ 224 return (__m256i)__builtin_ia32_selectd_256(__U, 225 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), 226 (__v8si)__S); 227} 228 229static __inline__ __m256i __DEFAULT_FN_ATTRS256 230_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 231{ 232 return (__m256i)__builtin_ia32_selectd_256(__U, 233 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), 234 (__v8si)_mm256_setzero_si256()); 235} 236 237static __inline__ __m128i __DEFAULT_FN_ATTRS128 238_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 239{ 240 return (__m128i)__builtin_ia32_selectd_128(__U, 241 (__v4si)_mm_dpbusd_epi32(__S, __A, __B), 242 (__v4si)__S); 243} 244 245static __inline__ __m128i __DEFAULT_FN_ATTRS128 246_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 247{ 248 return (__m128i)__builtin_ia32_selectd_128(__U, 249 (__v4si)_mm_dpbusd_epi32(__S, __A, __B), 250 (__v4si)_mm_setzero_si128()); 251} 252 253static __inline__ __m128i __DEFAULT_FN_ATTRS128 254_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 255{ 256 return (__m128i)__builtin_ia32_selectd_128(__U, 257 (__v4si)_mm_dpbusds_epi32(__S, __A, __B), 258 (__v4si)__S); 259} 260 261static __inline__ __m128i __DEFAULT_FN_ATTRS128 262_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 263{ 264 return (__m128i)__builtin_ia32_selectd_128(__U, 265 (__v4si)_mm_dpbusds_epi32(__S, __A, __B), 266 (__v4si)_mm_setzero_si128()); 267} 268 269static __inline__ __m128i __DEFAULT_FN_ATTRS128 270_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 271{ 272 return (__m128i)__builtin_ia32_selectd_128(__U, 273 (__v4si)_mm_dpwssd_epi32(__S, __A, __B), 274 (__v4si)__S); 275} 276 277static __inline__ __m128i __DEFAULT_FN_ATTRS128 278_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 279{ 280 return (__m128i)__builtin_ia32_selectd_128(__U, 281 (__v4si)_mm_dpwssd_epi32(__S, __A, __B), 282 (__v4si)_mm_setzero_si128()); 283} 284 285static __inline__ __m128i __DEFAULT_FN_ATTRS128 286_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 287{ 288 return (__m128i)__builtin_ia32_selectd_128(__U, 289 (__v4si)_mm_dpwssds_epi32(__S, __A, __B), 290 (__v4si)__S); 291} 292 293static __inline__ __m128i __DEFAULT_FN_ATTRS128 294_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 295{ 296 return (__m128i)__builtin_ia32_selectd_128(__U, 297 (__v4si)_mm_dpwssds_epi32(__S, __A, __B), 298 (__v4si)_mm_setzero_si128()); 299} 300 301#undef __DEFAULT_FN_ATTRS128 302#undef __DEFAULT_FN_ATTRS256 303 304#endif 305