1351280Sdim/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
2351280Sdim *
3351280Sdim * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4351280Sdim * See https://llvm.org/LICENSE.txt for license information.
5351280Sdim * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6351280Sdim *
7351280Sdim *===-----------------------------------------------------------------------===
8351280Sdim */
9351280Sdim#ifndef __IMMINTRIN_H
10351280Sdim#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
11351280Sdim#endif
12351280Sdim
13351280Sdim#ifndef __AVX512VLBF16INTRIN_H
14351280Sdim#define __AVX512VLBF16INTRIN_H
15351280Sdim
/// 128-bit vector type used by the intrinsics below to carry [8 x bfloat];
/// each BF16 element is stored as a raw 16-bit \c short lane.
typedef short __m128bh
    __attribute__((__aligned__(16), __vector_size__(16)));
17351280Sdim
/* Attribute sets applied to every intrinsic defined in this header. Both
 * request forced inlining, no debug info, and the avx512vl + avx512bf16
 * target features; they differ only in the minimum vector width (128 or 256).
 */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl, avx512bf16"),                           \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl, avx512bf16"),                           \
                 __min_vector_width__(256)))
24351280Sdim
25351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data.
26351280Sdim///
27351280Sdim/// \headerfile <x86intrin.h>
28351280Sdim///
29351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
30351280Sdim///
31351280Sdim/// \param __A
32351280Sdim///    A 128-bit vector of [4 x float].
33351280Sdim/// \param __B
34351280Sdim///    A 128-bit vector of [4 x float].
35351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
36351280Sdim///    conversion of __B, and higher 64 bits come from conversion of __A.
37351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128
38351280Sdim_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
39351280Sdim  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
40351280Sdim                                                    (__v4sf) __B);
41351280Sdim}
42351280Sdim
43351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data.
44351280Sdim///
45351280Sdim/// \headerfile <x86intrin.h>
46351280Sdim///
47351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48351280Sdim///
49351280Sdim/// \param __A
50351280Sdim///    A 128-bit vector of [4 x float].
51351280Sdim/// \param __B
52351280Sdim///    A 128-bit vector of [4 x float].
53351280Sdim/// \param __W
54351280Sdim///    A 128-bit vector of [8 x bfloat].
55351280Sdim/// \param __U
56351280Sdim///    A 8-bit mask value specifying what is chosen for each element.
57351280Sdim///    A 1 means conversion of __A or __B. A 0 means element from __W.
58351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
59351280Sdim///    conversion of __B, and higher 64 bits come from conversion of __A.
60351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128
61351280Sdim_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
62351280Sdim  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
63351280Sdim                                             (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
64351280Sdim                                             (__v8hi)__W);
65351280Sdim}
66351280Sdim
67351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data.
68351280Sdim///
69351280Sdim/// \headerfile <x86intrin.h>
70351280Sdim///
71351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
72351280Sdim///
73351280Sdim/// \param __A
74351280Sdim///    A 128-bit vector of [4 x float].
75351280Sdim/// \param __B
76351280Sdim///    A 128-bit vector of [4 x float].
77351280Sdim/// \param __U
78351280Sdim///    A 8-bit mask value specifying what is chosen for each element.
79351280Sdim///    A 1 means conversion of __A or __B. A 0 means element is zero.
80351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
81351280Sdim///    conversion of __B, and higher 64 bits come from conversion of __A.
82351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128
83351280Sdim_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
84351280Sdim  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
85351280Sdim                                             (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
86351280Sdim                                             (__v8hi)_mm_setzero_si128());
87351280Sdim}
88351280Sdim
89351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data.
90351280Sdim///
91351280Sdim/// \headerfile <x86intrin.h>
92351280Sdim///
93351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
94351280Sdim///
95351280Sdim/// \param __A
96351280Sdim///    A 256-bit vector of [8 x float].
97351280Sdim/// \param __B
98351280Sdim///    A 256-bit vector of [8 x float].
99351280Sdim/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
100351280Sdim///    conversion of __B, and higher 128 bits come from conversion of __A.
101351280Sdimstatic __inline__ __m256bh __DEFAULT_FN_ATTRS256
102351280Sdim_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
103351280Sdim  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
104351280Sdim                                                    (__v8sf) __B);
105351280Sdim}
106351280Sdim
107351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data.
108351280Sdim///
109351280Sdim/// \headerfile <x86intrin.h>
110351280Sdim///
111351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
112351280Sdim///
113351280Sdim/// \param __A
114351280Sdim///    A 256-bit vector of [8 x float].
115351280Sdim/// \param __B
116351280Sdim///    A 256-bit vector of [8 x float].
117351280Sdim/// \param __W
118351280Sdim///    A 256-bit vector of [16 x bfloat].
119351280Sdim/// \param __U
120351280Sdim///    A 16-bit mask value specifying what is chosen for each element.
121351280Sdim///    A 1 means conversion of __A or __B. A 0 means element from __W.
122351280Sdim/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
123351280Sdim///    conversion of __B, and higher 128 bits come from conversion of __A.
124351280Sdimstatic __inline__ __m256bh __DEFAULT_FN_ATTRS256
125351280Sdim_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
126351280Sdim  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
127351280Sdim                                         (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
128351280Sdim                                         (__v16hi)__W);
129351280Sdim}
130351280Sdim
131351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data.
132351280Sdim///
133351280Sdim/// \headerfile <x86intrin.h>
134351280Sdim///
135351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
136351280Sdim///
137351280Sdim/// \param __A
138351280Sdim///    A 256-bit vector of [8 x float].
139351280Sdim/// \param __B
140351280Sdim///    A 256-bit vector of [8 x float].
141351280Sdim/// \param __U
142351280Sdim///    A 16-bit mask value specifying what is chosen for each element.
143351280Sdim///    A 1 means conversion of __A or __B. A 0 means element is zero.
144351280Sdim/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
145351280Sdim///    conversion of __B, and higher 128 bits come from conversion of __A.
146351280Sdimstatic __inline__ __m256bh __DEFAULT_FN_ATTRS256
147351280Sdim_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
148351280Sdim  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
149351280Sdim                                         (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
150351280Sdim                                         (__v16hi)_mm256_setzero_si256());
151351280Sdim}
152351280Sdim
153351280Sdim/// Convert Packed Single Data to Packed BF16 Data.
154351280Sdim///
155351280Sdim/// \headerfile <x86intrin.h>
156351280Sdim///
157351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
158351280Sdim///
159351280Sdim/// \param __A
160351280Sdim///    A 128-bit vector of [4 x float].
161351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
162351280Sdim///    conversion of __A, and higher 64 bits are 0.
163351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128
164351280Sdim_mm_cvtneps_pbh(__m128 __A) {
165351280Sdim  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
166351280Sdim                                                  (__v8hi)_mm_undefined_si128(),
167351280Sdim                                                  (__mmask8)-1);
168351280Sdim}
169351280Sdim
170351280Sdim/// Convert Packed Single Data to Packed BF16 Data.
171351280Sdim///
172351280Sdim/// \headerfile <x86intrin.h>
173351280Sdim///
174351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
175351280Sdim///
176351280Sdim/// \param __A
177351280Sdim///    A 128-bit vector of [4 x float].
178351280Sdim/// \param __W
179351280Sdim///    A 128-bit vector of [8 x bfloat].
180351280Sdim/// \param __U
181351280Sdim///    A 4-bit mask value specifying what is chosen for each element.
182351280Sdim///    A 1 means conversion of __A. A 0 means element from __W.
183351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
184351280Sdim///    conversion of __A, and higher 64 bits are 0.
185351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128
186351280Sdim_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
187351280Sdim  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
188351280Sdim                                                        (__v8hi)__W,
189351280Sdim                                                        (__mmask8)__U);
190351280Sdim}
191351280Sdim
192351280Sdim/// Convert Packed Single Data to Packed BF16 Data.
193351280Sdim///
194351280Sdim/// \headerfile <x86intrin.h>
195351280Sdim///
196351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
197351280Sdim///
198351280Sdim/// \param __A
199351280Sdim///    A 128-bit vector of [4 x float].
200351280Sdim/// \param __U
201351280Sdim///    A 4-bit mask value specifying what is chosen for each element.
202351280Sdim///    A 1 means conversion of __A. A 0 means element is zero.
203351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
204351280Sdim///    conversion of __A, and higher 64 bits are 0.
205351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128
206351280Sdim_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
207351280Sdim  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
208351280Sdim                                                    (__v8hi)_mm_setzero_si128(),
209351280Sdim                                                    (__mmask8)__U);
210351280Sdim}
211351280Sdim
212351280Sdim/// Convert Packed Single Data to Packed BF16 Data.
213351280Sdim///
214351280Sdim/// \headerfile <x86intrin.h>
215351280Sdim///
216351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
217351280Sdim///
218351280Sdim/// \param __A
219351280Sdim///    A 256-bit vector of [8 x float].
220351280Sdim/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
221351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS256
222351280Sdim_mm256_cvtneps_pbh(__m256 __A) {
223351280Sdim  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
224351280Sdim                                                  (__v8hi)_mm_undefined_si128(),
225351280Sdim                                                  (__mmask8)-1);
226351280Sdim}
227351280Sdim
228351280Sdim/// Convert Packed Single Data to Packed BF16 Data.
229351280Sdim///
230351280Sdim/// \headerfile <x86intrin.h>
231351280Sdim///
232351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
233351280Sdim///
234351280Sdim/// \param __A
235351280Sdim///    A 256-bit vector of [8 x float].
236351280Sdim/// \param __W
237351280Sdim///    A 256-bit vector of [8 x bfloat].
238351280Sdim/// \param __U
239351280Sdim///    A 8-bit mask value specifying what is chosen for each element.
240351280Sdim///    A 1 means conversion of __A. A 0 means element from __W.
241351280Sdim/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
242351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS256
243351280Sdim_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
244351280Sdim  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
245351280Sdim                                                        (__v8hi)__W,
246351280Sdim                                                        (__mmask8)__U);
247351280Sdim}
248351280Sdim
249351280Sdim/// Convert Packed Single Data to Packed BF16 Data.
250351280Sdim///
251351280Sdim/// \headerfile <x86intrin.h>
252351280Sdim///
253351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
254351280Sdim///
255351280Sdim/// \param __A
256351280Sdim///    A 256-bit vector of [8 x float].
257351280Sdim/// \param __U
258351280Sdim///    A 8-bit mask value specifying what is chosen for each element.
259351280Sdim///    A 1 means conversion of __A. A 0 means element is zero.
260351280Sdim/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
261351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS256
262351280Sdim_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
263351280Sdim  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
264351280Sdim                                                    (__v8hi)_mm_setzero_si128(),
265351280Sdim                                                    (__mmask8)__U);
266351280Sdim}
267351280Sdim
268351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
269351280Sdim///
270351280Sdim/// \headerfile <x86intrin.h>
271351280Sdim///
272351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
273351280Sdim///
274351280Sdim/// \param __A
275351280Sdim///    A 128-bit vector of [8 x bfloat].
276351280Sdim/// \param __B
277351280Sdim///    A 128-bit vector of [8 x bfloat].
278351280Sdim/// \param __D
279351280Sdim///    A 128-bit vector of [4 x float].
280351280Sdim/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
281351280Sdim///  __A, __B and __D
282351280Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS128
283351280Sdim_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
284351280Sdim  return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
285351280Sdim                                             (__v4si)__A,
286351280Sdim                                             (__v4si)__B);
287351280Sdim}
288351280Sdim
289351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
290351280Sdim///
291351280Sdim/// \headerfile <x86intrin.h>
292351280Sdim///
293351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
294351280Sdim///
295351280Sdim/// \param __A
296351280Sdim///    A 128-bit vector of [8 x bfloat].
297351280Sdim/// \param __B
298351280Sdim///    A 128-bit vector of [8 x bfloat].
299351280Sdim/// \param __D
300351280Sdim///    A 128-bit vector of [4 x float].
301351280Sdim/// \param __U
302351280Sdim///    A 8-bit mask value specifying what is chosen for each element.
303351280Sdim///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
304351280Sdim/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
305351280Sdim///  __A, __B and __D
306351280Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS128
307351280Sdim_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
308351280Sdim  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
309351280Sdim                                           (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
310351280Sdim                                           (__v4sf)__D);
311351280Sdim}
312351280Sdim
313351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
314351280Sdim///
315351280Sdim/// \headerfile <x86intrin.h>
316351280Sdim///
317351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
318351280Sdim///
319351280Sdim/// \param __A
320351280Sdim///    A 128-bit vector of [8 x bfloat].
321351280Sdim/// \param __B
322351280Sdim///    A 128-bit vector of [8 x bfloat].
323351280Sdim/// \param __D
324351280Sdim///    A 128-bit vector of [4 x float].
325351280Sdim/// \param __U
326351280Sdim///    A 8-bit mask value specifying what is chosen for each element.
327351280Sdim///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
328351280Sdim/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
329351280Sdim///  __A, __B and __D
330351280Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS128
331351280Sdim_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
332351280Sdim  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
333351280Sdim                                           (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
334351280Sdim                                           (__v4sf)_mm_setzero_si128());
335351280Sdim}
336351280Sdim
337351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
338351280Sdim///
339351280Sdim/// \headerfile <x86intrin.h>
340351280Sdim///
341351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
342351280Sdim///
343351280Sdim/// \param __A
344351280Sdim///    A 256-bit vector of [16 x bfloat].
345351280Sdim/// \param __B
346351280Sdim///    A 256-bit vector of [16 x bfloat].
347351280Sdim/// \param __D
348351280Sdim///    A 256-bit vector of [8 x float].
349351280Sdim/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
350351280Sdim///  __A, __B and __D
351351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256
352351280Sdim_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
353351280Sdim  return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
354351280Sdim                                             (__v8si)__A,
355351280Sdim                                             (__v8si)__B);
356351280Sdim}
357351280Sdim
358351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
359351280Sdim///
360351280Sdim/// \headerfile <x86intrin.h>
361351280Sdim///
362351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
363351280Sdim///
364351280Sdim/// \param __A
365351280Sdim///    A 256-bit vector of [16 x bfloat].
366351280Sdim/// \param __B
367351280Sdim///    A 256-bit vector of [16 x bfloat].
368351280Sdim/// \param __D
369351280Sdim///    A 256-bit vector of [8 x float].
370351280Sdim/// \param __U
371351280Sdim///    A 16-bit mask value specifying what is chosen for each element.
372351280Sdim///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
373351280Sdim/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
374351280Sdim///  __A, __B and __D
375351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256
376351280Sdim_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
377351280Sdim  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
378351280Sdim                                        (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
379351280Sdim                                        (__v8sf)__D);
380351280Sdim}
381351280Sdim
382351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
383351280Sdim///
384351280Sdim/// \headerfile <x86intrin.h>
385351280Sdim///
386351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
387351280Sdim///
388351280Sdim/// \param __A
389351280Sdim///    A 256-bit vector of [16 x bfloat].
390351280Sdim/// \param __B
391351280Sdim///    A 256-bit vector of [16 x bfloat].
392351280Sdim/// \param __D
393351280Sdim///    A 256-bit vector of [8 x float].
394351280Sdim/// \param __U
395351280Sdim///    A 8-bit mask value specifying what is chosen for each element.
396351280Sdim///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
397351280Sdim/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
398351280Sdim///  __A, __B and __D
399351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256
400351280Sdim_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
401351280Sdim  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
402351280Sdim                                        (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
403351280Sdim                                        (__v8sf)_mm256_setzero_si256());
404351280Sdim}
405351280Sdim
406351280Sdim/// Convert One Single float Data to One BF16 Data.
407351280Sdim///
408351280Sdim/// \headerfile <x86intrin.h>
409351280Sdim///
410351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
411351280Sdim///
412351280Sdim/// \param __A
413351280Sdim///    A float data.
414351280Sdim/// \returns A bf16 data whose sign field and exponent field keep unchanged,
415351280Sdim///    and fraction field is truncated to 7 bits.
416351280Sdimstatic __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
417351280Sdim  __v4sf __V = {__A, 0, 0, 0};
418351280Sdim  __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
419351280Sdim      (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
420351280Sdim  return __R[0];
421351280Sdim}
422351280Sdim
423351280Sdim/// Convert Packed BF16 Data to Packed float Data.
424351280Sdim///
425351280Sdim/// \headerfile <x86intrin.h>
426351280Sdim///
427351280Sdim/// \param __A
428351280Sdim///    A 128-bit vector of [8 x bfloat].
429351280Sdim/// \returns A 256-bit vector of [8 x float] come from convertion of __A
430351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
431351280Sdim  return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
432351280Sdim      (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
433351280Sdim}
434351280Sdim
435351280Sdim/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
436351280Sdim///
437351280Sdim/// \headerfile <x86intrin.h>
438351280Sdim///
439351280Sdim/// \param __U
440351280Sdim///    A 8-bit mask. Elements are zeroed out when the corresponding mask
441351280Sdim///    bit is not set.
442351280Sdim/// \param __A
443351280Sdim///    A 128-bit vector of [8 x bfloat].
444351280Sdim/// \returns A 256-bit vector of [8 x float] come from convertion of __A
445351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256
446351280Sdim_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
447351280Sdim  return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
448351280Sdim      (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
449351280Sdim}
450351280Sdim
451351280Sdim/// Convert Packed BF16 Data to Packed float Data using merging mask.
452351280Sdim///
453351280Sdim/// \headerfile <x86intrin.h>
454351280Sdim///
455351280Sdim/// \param __S
456351280Sdim///    A 256-bit vector of [8 x float]. Elements are copied from __S when
457351280Sdim///     the corresponding mask bit is not set.
458351280Sdim/// \param __U
459351280Sdim///    A 8-bit mask. Elements are zeroed out when the corresponding mask
460351280Sdim///    bit is not set.
461351280Sdim/// \param __A
462351280Sdim///    A 128-bit vector of [8 x bfloat].
463351280Sdim/// \returns A 256-bit vector of [8 x float] come from convertion of __A
464351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256
465351280Sdim_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
466351280Sdim  return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
467351280Sdim      (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
468351280Sdim      16));
469351280Sdim}
470351280Sdim
/* The attribute helper macros are private to this header; remove them so they
   do not leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
473351280Sdim
474351280Sdim#endif
475