1/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error                                                                         \
12    "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13#endif // __IMMINTRIN_H
14
15#ifndef __AVXVNNIINT16INTRIN_H
16#define __AVXVNNIINT16INTRIN_H
17
/* Define the default attributes for the functions in this file.
 *
 * Both macros expand to the same attribute list -- __always_inline__,
 * __nodebug__ and __target__("avxvnniint16") -- and differ only in the
 * __min_vector_width__ argument: 128 for the wrappers that take and return
 * __m128i, 256 for the wrappers that take and return __m256i. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
                 __min_vector_width__(256)))
25
26/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
27///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
28///    signed 16-bit results. Sum these 2 results with the corresponding
29///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
30///
31/// \headerfile <immintrin.h>
32///
33/// \code
34/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
35/// \endcode
36///
37/// This intrinsic corresponds to the \c VPDPWSUD instruction.
38///
39/// \param __W
40///    A 128-bit vector of [4 x int].
41/// \param __A
42///    A 128-bit vector of [8 x short].
43/// \param __B
44///    A 128-bit vector of [8 x unsigned short].
45/// \returns
46///    A 128-bit vector of [4 x int].
47///
48/// \code{.operation}
49/// FOR j := 0 to 3
50/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
51/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
52/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
53/// ENDFOR
54/// dst[MAX:128] := 0
55/// \endcode
56static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
57                                                                 __m128i __A,
58                                                                 __m128i __B) {
59  return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
60                                             (__v4si)__B);
61}
62
63/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
64///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
65///    signed 16-bit results. Sum these 2 results with the corresponding
66///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
67///
68/// \headerfile <immintrin.h>
69///
70/// \code
71/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
72/// \endcode
73///
74/// This intrinsic corresponds to the \c VPDPWSUD instruction.
75///
76/// \param __W
77///    A 256-bit vector of [8 x int].
78/// \param __A
79///    A 256-bit vector of [16 x short].
80/// \param __B
81///    A 256-bit vector of [16 x unsigned short].
82/// \returns
83///    A 256-bit vector of [8 x int].
84///
85/// \code{.operation}
86/// FOR j := 0 to 7
87/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
88/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
89/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
90/// ENDFOR
91/// dst[MAX:256] := 0
92/// \endcode
93static __inline__ __m256i __DEFAULT_FN_ATTRS256
94_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
95  return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
96                                             (__v8si)__B);
97}
98
99/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
100///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
101///    signed 16-bit results. Sum these 2 results with the corresponding
102///    32-bit integer in \a __W with signed saturation, and store the packed
103///    32-bit results in \a dst.
104///
105/// \headerfile <immintrin.h>
106///
107/// \code
108/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
109/// \endcode
110///
111/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
112///
113/// \param __W
114///    A 128-bit vector of [4 x int].
115/// \param __A
116///    A 128-bit vector of [8 x short].
117/// \param __B
118///    A 128-bit vector of [8 x unsigned short].
119/// \returns
120///    A 128-bit vector of [4 x int].
121///
122/// \code{.operation}
123/// FOR j := 0 to 3
124/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
125/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
126/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
127/// ENDFOR
128/// dst[MAX:128] := 0
129/// \endcode
130static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
131                                                                  __m128i __A,
132                                                                  __m128i __B) {
133  return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
134                                              (__v4si)__B);
135}
136
137/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
138///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
139///    signed 16-bit results. Sum these 2 results with the corresponding
140///    32-bit integer in \a __W with signed saturation, and store the packed
141///    32-bit results in \a dst.
142///
143/// \headerfile <immintrin.h>
144///
145/// \code
146/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
147/// \endcode
148///
149/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
150///
151/// \param __W
152///    A 256-bit vector of [8 x int].
153/// \param __A
154///    A 256-bit vector of [16 x short].
155/// \param __B
156///    A 256-bit vector of [16 x unsigned short].
157/// \returns
158///    A 256-bit vector of [8 x int].
159///
160/// \code{.operation}
161/// FOR j := 0 to 7
162/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
163/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
164/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
165/// ENDFOR
166/// dst[MAX:256] := 0
167/// \endcode
168static __inline__ __m256i __DEFAULT_FN_ATTRS256
169_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
170  return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
171                                              (__v8si)__B);
172}
173
174/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
175///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
176///    signed 16-bit results. Sum these 2 results with the corresponding
177///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
178///
179/// \headerfile <immintrin.h>
180///
181/// \code
182/// __m128i _mm_dpbusd_epi32(__m128i __W, __m128i __A, __m128i __B)
183/// \endcode
184///
185/// This intrinsic corresponds to the \c VPDPWUSD instruction.
186///
187/// \param __W
188///    A 128-bit vector of [4 x int].
189/// \param __A
190///    A 128-bit vector of [8 x unsigned short].
191/// \param __B
192///    A 128-bit vector of [8 x short].
193/// \returns
194///    A 128-bit vector of [4 x int].
195///
196/// \code{.operation}
197/// FOR j := 0 to 3
198/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
199/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
200/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
201/// ENDFOR
202/// dst[MAX:128] := 0
203/// \endcode
204static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
205                                                                 __m128i __A,
206                                                                 __m128i __B) {
207  return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
208                                             (__v4si)__B);
209}
210
211/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
212///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
213///    signed 16-bit results. Sum these 2 results with the corresponding
214///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
215///
216/// \headerfile <immintrin.h>
217///
218/// \code
219/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
220/// \endcode
221///
222/// This intrinsic corresponds to the \c VPDPWUSD instruction.
223///
224/// \param __W
225///    A 256-bit vector of [8 x int].
226/// \param __A
227///    A 256-bit vector of [16 x unsigned short].
228/// \param __B
229///    A 256-bit vector of [16 x short].
230/// \returns
231///    A 256-bit vector of [8 x int].
232///
233/// \code{.operation}
234/// FOR j := 0 to 7
235/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
236/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
237/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
238/// ENDFOR
239/// dst[MAX:256] := 0
240/// \endcode
241static __inline__ __m256i __DEFAULT_FN_ATTRS256
242_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
243  return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
244                                             (__v8si)__B);
245}
246
247/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
248///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
249///    signed 16-bit results. Sum these 2 results with the corresponding
250///    32-bit integer in \a __W with signed saturation, and store the packed
251///    32-bit results in \a dst.
252///
253/// \headerfile <immintrin.h>
254///
255/// \code
256/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
257/// \endcode
258///
259/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
260///
261/// \param __W
262///    A 128-bit vector of [4 x int].
263/// \param __A
264///    A 128-bit vector of [8 x unsigned short].
265/// \param __B
266///    A 128-bit vector of [8 x short].
267/// \returns
268///    A 128-bit vector of [4 x int].
269///
270/// \code{.operation}
271/// FOR j := 0 to 3
272/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
273/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
274/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
275/// ENDFOR
276/// dst[MAX:128] := 0
277/// \endcode
278static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
279                                                                  __m128i __A,
280                                                                  __m128i __B) {
281  return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
282                                              (__v4si)__B);
283}
284
285/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
286///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
287///    signed 16-bit results. Sum these 2 results with the corresponding
288///    32-bit integer in \a __W with signed saturation, and store the packed
289///    32-bit results in \a dst.
290///
291/// \headerfile <immintrin.h>
292///
293/// \code
294/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
295/// \endcode
296///
297/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
298///
299/// \param __W
300///    A 256-bit vector of [8 x int].
301/// \param __A
302///    A 256-bit vector of [16 x unsigned short].
303/// \param __B
304///    A 256-bit vector of [16 x short].
305/// \returns
306///    A 256-bit vector of [8 x int].
307///
308/// \code{.operation}
309/// FOR j := 0 to 7
310/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
311/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
312/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
313/// ENDFOR
314/// dst[MAX:256] := 0
315/// \endcode
316static __inline__ __m256i __DEFAULT_FN_ATTRS256
317_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
318  return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
319                                              (__v8si)__B);
320}
321
322/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
323///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
324///    signed 16-bit results. Sum these 2 results with the corresponding
325///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
326///
327/// \headerfile <immintrin.h>
328///
329/// \code
330/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
331/// \endcode
332///
333/// This intrinsic corresponds to the \c VPDPWUUD instruction.
334///
335/// \param __W
336///    A 128-bit vector of [4 x unsigned int].
337/// \param __A
338///    A 128-bit vector of [8 x unsigned short].
339/// \param __B
340///    A 128-bit vector of [8 x unsigned short].
341/// \returns
342///    A 128-bit vector of [4 x unsigned int].
343///
344/// \code{.operation}
345/// FOR j := 0 to 3
346/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
347/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
348/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
349/// ENDFOR
350/// dst[MAX:128] := 0
351/// \endcode
352static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
353                                                                 __m128i __A,
354                                                                 __m128i __B) {
355  return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
356                                             (__v4si)__B);
357}
358
359/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
360///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
361///    signed 16-bit results. Sum these 2 results with the corresponding
362///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
363///
364/// \headerfile <immintrin.h>
365///
366/// \code
367/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
368/// \endcode
369///
370/// This intrinsic corresponds to the \c VPDPWUUD instruction.
371///
372/// \param __W
373///    A 256-bit vector of [8 x unsigned int].
374/// \param __A
375///    A 256-bit vector of [16 x unsigned short].
376/// \param __B
377///    A 256-bit vector of [16 x unsigned short].
378/// \returns
379///    A 256-bit vector of [8 x unsigned int].
380///
381/// \code{.operation}
382/// FOR j := 0 to 7
383/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
384/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
385/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
386/// ENDFOR
387/// dst[MAX:256] := 0
388/// \endcode
389static __inline__ __m256i __DEFAULT_FN_ATTRS256
390_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
391  return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
392                                             (__v8si)__B);
393}
394
395/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
396///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
397///    signed 16-bit results. Sum these 2 results with the corresponding
398///    32-bit integer in \a __W with signed saturation, and store the packed
399///    32-bit results in \a dst.
400///
401/// \headerfile <immintrin.h>
402///
403/// \code
404/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
405/// \endcode
406///
407/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
408///
409/// \param __W
410///    A 128-bit vector of [4 x unsigned int].
411/// \param __A
412///    A 128-bit vector of [8 x unsigned short].
413/// \param __B
414///    A 128-bit vector of [8 x unsigned short].
415/// \returns
416///    A 128-bit vector of [4 x unsigned int].
417///
418/// \code{.operation}
419/// FOR j := 0 to 3
420/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
421/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
422/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
423/// ENDFOR
424/// dst[MAX:128] := 0
425/// \endcode
426static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
427                                                                  __m128i __A,
428                                                                  __m128i __B) {
429  return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
430                                              (__v4si)__B);
431}
432
433/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
434///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
435///    signed 16-bit results. Sum these 2 results with the corresponding
436///    32-bit integer in \a __W with signed saturation, and store the packed
437///    32-bit results in \a dst.
438///
439/// \headerfile <immintrin.h>
440///
441/// \code
442/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
443/// \endcode
444///
445/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
446///
447/// \param __W
448///    A 256-bit vector of [8 x unsigned int].
449/// \param __A
450///    A 256-bit vector of [16 x unsigned short].
451/// \param __B
452///    A 256-bit vector of [16 x unsigned short].
453/// \returns
454///    A 256-bit vector of [8 x unsigned int].
455///
456/// \code{.operation}
457/// FOR j := 0 to 7
458/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
459/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
460/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
461/// ENDFOR
462/// dst[MAX:256] := 0
463/// \endcode
464static __inline__ __m256i __DEFAULT_FN_ATTRS256
465_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
466  return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
467                                              (__v8si)__B);
468}
469
/* Undefine the attribute helper macros defined near the top of this header so
 * they are no longer visible past this point. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
472
473#endif // __AVXVNNIINT16INTRIN_H
474