/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9#ifndef __IMMINTRIN_H
10#error                                                                         \
11    "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXVNNIINT8INTRIN_H
15#define __AVXVNNIINT8INTRIN_H
16
/* Define the default attributes for the functions in this file.
 *
 * Both macros apply __always_inline__, __nodebug__ and
 * __target__("avxvnniint8"); they differ only in the __min_vector_width__
 * they request: 256 for __DEFAULT_FN_ATTRS256 and 128 for
 * __DEFAULT_FN_ATTRS128. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(128)))

25/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27///    signed 16-bit results. Sum these 4 results with the corresponding
28///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
29///
30/// \headerfile <x86intrin.h>
31///
32/// \code
33/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
34/// \endcode
35///
36/// This intrinsic corresponds to the \c VPDPBSSD instruction.
37///
38/// \param __A
39///    A 128-bit vector of [16 x char].
40/// \param __B
41///    A 128-bit vector of [16 x char].
42/// \returns
43///    A 128-bit vector of [4 x int].
44///
45/// \code{.operation}
46/// FOR j := 0 to 3
47/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
48/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
49/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
50/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
51/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
52/// ENDFOR
53/// dst[MAX:128] := 0
54/// \endcode
55static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
56                                                                 __m128i __A,
57                                                                 __m128i __B) {
58  return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
59                                             (__v4si)__B);
60}
61
62/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64///    signed 16-bit results. Sum these 4 results with the corresponding
65///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
66///
67/// \headerfile <x86intrin.h>
68///
69/// \code
70/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
71/// \endcode
72///
73/// This intrinsic corresponds to the \c VPDPBSSD instruction.
74///
75/// \param __A
76///    A 256-bit vector of [32 x char].
77/// \param __B
78///    A 256-bit vector of [32 x char].
79/// \returns
80///    A 256-bit vector of [8 x int].
81///
82/// \code{.operation}
83/// FOR j := 0 to 7
84/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
85/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
86/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
87/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
88/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
89/// ENDFOR
90/// dst[MAX:256] := 0
91/// \endcode
92static __inline__ __m256i __DEFAULT_FN_ATTRS256
93_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
94  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
95                                             (__v8si)__B);
96}
97
98/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100///    signed 16-bit results. Sum these 4 results with the corresponding
101///    32-bit integer in \a __W with signed saturation, and store the packed
102///    32-bit results in \a dst.
103///
104/// \headerfile <x86intrin.h>
105///
106/// \code
107/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
108/// \endcode
109///
110/// This intrinsic corresponds to the \c VPDPBSSD instruction.
111///
112/// \param __A
113///    A 128-bit vector of [16 x char].
114/// \param __B
115///    A 128-bit vector of [16 x char].
116/// \returns
117///    A 128-bit vector of [4 x int].
118///
119/// \code{.operation}
120/// FOR j := 0 to 3
121/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
122/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
123/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
124/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
125/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
126/// ENDFOR
127/// dst[MAX:128] := 0
128/// \endcode
129static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
130                                                                  __m128i __A,
131                                                                  __m128i __B) {
132  return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
133                                              (__v4si)__B);
134}
135
136/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138///    signed 16-bit results. Sum these 4 results with the corresponding
139///    32-bit integer in \a __W with signed saturation, and store the packed
140///    32-bit results in \a dst.
141///
142/// \headerfile <x86intrin.h>
143///
144/// \code
145/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
146/// \endcode
147///
148/// This intrinsic corresponds to the \c VPDPBSSD instruction.
149///
150/// \param __A
151///    A 256-bit vector of [32 x char].
152/// \param __B
153///    A 256-bit vector of [32 x char].
154/// \returns
155///    A 256-bit vector of [8 x int].
156///
157/// \code{.operation}
158/// FOR j := 0 to 7
159/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
160/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
161/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
162/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
163/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
164/// ENDFOR
165/// dst[MAX:256] := 0
166/// \endcode
167static __inline__ __m256i __DEFAULT_FN_ATTRS256
168_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
169  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
170                                              (__v8si)__B);
171}
172
173/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175///    signed 16-bit results. Sum these 4 results with the corresponding
176///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
177///
178/// \headerfile <x86intrin.h>
179///
180/// \code
181/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
182/// \endcode
183///
184/// This intrinsic corresponds to the \c VPDPBSSD instruction.
185///
186/// \param __A
187///    A 128-bit vector of [16 x char].
188/// \param __B
189///    A 128-bit vector of [16 x unsigned char].
190/// \returns
191///    A 128-bit vector of [4 x int].
192///
193/// \code{.operation}
194/// FOR j := 0 to 3
195/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
196/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
197/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
198/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
199/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
200/// ENDFOR
201/// dst[MAX:128] := 0
202/// \endcode
203static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
204                                                                 __m128i __A,
205                                                                 __m128i __B) {
206  return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
207                                             (__v4si)__B);
208}
209
210/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212///    signed 16-bit results. Sum these 4 results with the corresponding
213///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
214///
215/// \headerfile <x86intrin.h>
216///
217/// \code
218/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
219/// \endcode
220///
221/// This intrinsic corresponds to the \c VPDPBSSD instruction.
222///
223/// \param __A
224///    A 256-bit vector of [32 x char].
225/// \param __B
226///    A 256-bit vector of [32 x unsigned char].
227/// \returns
228///    A 256-bit vector of [8 x int].
229///
230/// \code{.operation}
231/// FOR j := 0 to 7
232/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
233/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
234/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
235/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
236/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
237/// ENDFOR
238/// dst[MAX:256] := 0
239/// \endcode
240static __inline__ __m256i __DEFAULT_FN_ATTRS256
241_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
242  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
243                                             (__v8si)__B);
244}
245
246/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248///    signed 16-bit results. Sum these 4 results with the corresponding
249///    32-bit integer in \a __W with signed saturation, and store the packed
250///    32-bit results in \a dst.
251///
252/// \headerfile <x86intrin.h>
253///
254/// \code
255/// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
256/// \endcode
257///
258/// This intrinsic corresponds to the \c VPDPBSSD instruction.
259///
260/// \param __A
261///    A 128-bit vector of [16 x char].
262/// \param __B
263///    A 128-bit vector of [16 x unsigned char].
264/// \returns
265///    A 128-bit vector of [4 x int].
266///
267/// \code{.operation}
268/// FOR j := 0 to 3
269/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
270/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
271/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
272/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
273/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
274/// ENDFOR
275/// dst[MAX:128] := 0
276/// \endcode
277static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
278                                                                  __m128i __A,
279                                                                  __m128i __B) {
280  return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
281                                              (__v4si)__B);
282}
283
284/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286///    signed 16-bit results. Sum these 4 results with the corresponding
287///    32-bit integer in \a __W with signed saturation, and store the packed
288///    32-bit results in \a dst.
289///
290/// \headerfile <x86intrin.h>
291///
292/// \code
293/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
294/// \endcode
295///
296/// This intrinsic corresponds to the \c VPDPBSSD instruction.
297///
298/// \param __A
299///    A 256-bit vector of [32 x char].
300/// \param __B
301///    A 256-bit vector of [32 x unsigned char].
302/// \returns
303///    A 256-bit vector of [8 x int].
304///
305/// \code{.operation}
306/// FOR j := 0 to 7
307/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
308/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
309/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
310/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
311/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
312/// ENDFOR
313/// dst[MAX:256] := 0
314/// \endcode
315static __inline__ __m256i __DEFAULT_FN_ATTRS256
316_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
317  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
318                                              (__v8si)__B);
319}
320
321/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323///    signed 16-bit results. Sum these 4 results with the corresponding
324///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
325///
326/// \headerfile <x86intrin.h>
327///
328/// \code
329/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
330/// \endcode
331///
332/// This intrinsic corresponds to the \c VPDPBSSD instruction.
333///
334/// \param __A
335///    A 128-bit vector of [16 x unsigned char].
336/// \param __B
337///    A 128-bit vector of [16 x unsigned char].
338/// \returns
339///    A 128-bit vector of [4 x int].
340///
341/// \code{.operation}
342/// FOR j := 0 to 3
343/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
344/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
345/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
346/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
347/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
348/// ENDFOR
349/// dst[MAX:128] := 0
350/// \endcode
351static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
352                                                                 __m128i __A,
353                                                                 __m128i __B) {
354  return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
355                                             (__v4si)__B);
356}
357
358/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360///    signed 16-bit results. Sum these 4 results with the corresponding
361///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
362///
363/// \headerfile <x86intrin.h>
364///
365/// \code
366/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
367/// \endcode
368///
369/// This intrinsic corresponds to the \c VPDPBSSD instruction.
370///
371/// \param __A
372///    A 256-bit vector of [32 x unsigned char].
373/// \param __B
374///    A 256-bit vector of [32 x unsigned char].
375/// \returns
376///    A 256-bit vector of [8 x int].
377///
378/// \code{.operation}
379/// FOR j := 0 to 7
380/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
381/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
382/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
383/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
384/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
385/// ENDFOR
386/// dst[MAX:256] := 0
387/// \endcode
388static __inline__ __m256i __DEFAULT_FN_ATTRS256
389_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
390  return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
391                                             (__v8si)__B);
392}
393
394/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
395///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
396///    signed 16-bit results. Sum these 4 results with the corresponding
397///    32-bit integer in \a __W with signed saturation, and store the packed
398///    32-bit results in \a dst.
399///
400/// \headerfile <x86intrin.h>
401///
402/// \code
403/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
404/// \endcode
405///
406/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
407///
408/// \param __A
409///    A 128-bit vector of [16 x unsigned char].
410/// \param __B
411///    A 128-bit vector of [16 x unsigned char].
412/// \returns
413///    A 128-bit vector of [4 x int].
414///
415/// \code{.operation}
416/// FOR j := 0 to 3
417/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
418/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
419/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
420/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
421/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
422/// ENDFOR
423/// dst[MAX:128] := 0
424/// \endcode
425static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
426                                                                  __m128i __A,
427                                                                  __m128i __B) {
428  return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
429                                              (__v4si)__B);
430}
431
432/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
433///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
434///    signed 16-bit results. Sum these 4 results with the corresponding
435///    32-bit integer in \a __W with signed saturation, and store the packed
436///    32-bit results in \a dst.
437///
438/// \headerfile <x86intrin.h>
439///
440/// \code
441/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
442/// \endcode
443///
444/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
445///
446/// \param __A
447///    A 256-bit vector of [32 x unsigned char].
448/// \param __B
449///    A 256-bit vector of [32 x unsigned char].
450/// \returns
451///    A 256-bit vector of [8 x int].
452///
453/// \code{.operation}
454/// FOR j := 0 to 7
455/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
456/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
457/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
458/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
459/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
460/// ENDFOR
461/// dst[MAX:256] := 0
462/// \endcode
463static __inline__ __m256i __DEFAULT_FN_ATTRS256
464_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
465  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
466                                              (__v8si)__B);
467}
468#undef __DEFAULT_FN_ATTRS128
469#undef __DEFAULT_FN_ATTRS256
470
471#endif // __AVXVNNIINT8INTRIN_H
472