154359Sroberto/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
282498Sroberto *
354359Sroberto * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
454359Sroberto * See https://llvm.org/LICENSE.txt for license information.
554359Sroberto * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
654359Sroberto *
754359Sroberto *===-----------------------------------------------------------------------===
854359Sroberto */
954359Sroberto#ifndef __IMMINTRIN_H
1054359Sroberto#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
1154359Sroberto#endif
1254359Sroberto
1354359Sroberto#ifdef __SSE2__
1454359Sroberto
1554359Sroberto#ifndef __AVX512BF16INTRIN_H
1654359Sroberto#define __AVX512BF16INTRIN_H
1754359Sroberto
1854359Srobertotypedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
19182007Srobertotypedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
2054359Srobertotypedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
2182498Sroberto
2282498Sroberto#define __DEFAULT_FN_ATTRS512 \
2354359Sroberto  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
2482498Sroberto                 __min_vector_width__(512)))
2554359Sroberto#define __DEFAULT_FN_ATTRS                                                     \
2682498Sroberto  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
2782498Sroberto
2854359Sroberto/// Convert One BF16 Data to One Single Float Data.
2982498Sroberto///
3082498Sroberto/// \headerfile <x86intrin.h>
3182498Sroberto///
3282498Sroberto/// This intrinsic does not correspond to a specific instruction.
3382498Sroberto///
3482498Sroberto/// \param __A
3582498Sroberto///    A bfloat data.
3682498Sroberto/// \returns A float data whose sign field and exponent field keep unchanged,
3782498Sroberto///    and fraction field is extended to 23 bits.
3882498Srobertostatic __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
3982498Sroberto  return __builtin_ia32_cvtsbf162ss_32(__A);
4082498Sroberto}
4182498Sroberto
4282498Sroberto/// Convert Two Packed Single Data to One Packed BF16 Data.
4382498Sroberto///
4482498Sroberto/// \headerfile <x86intrin.h>
45182007Sroberto///
46182007Sroberto/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
4754359Sroberto///
4882498Sroberto/// \param __A
4982498Sroberto///    A 512-bit vector of [16 x float].
5082498Sroberto/// \param __B
5154359Sroberto///    A 512-bit vector of [16 x float].
5282498Sroberto/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
5382498Sroberto///    conversion of __B, and higher 256 bits come from conversion of __A.
5482498Srobertostatic __inline__ __m512bh __DEFAULT_FN_ATTRS512
55182007Sroberto_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
56182007Sroberto  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
5782498Sroberto                                                    (__v16sf) __B);
58182007Sroberto}
5982498Sroberto
6082498Sroberto/// Convert Two Packed Single Data to One Packed BF16 Data.
61182007Sroberto///
6282498Sroberto/// \headerfile <x86intrin.h>
6382498Sroberto///
6454359Sroberto/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
6582498Sroberto///
6654359Sroberto/// \param __A
6754359Sroberto///    A 512-bit vector of [16 x float].
6854359Sroberto/// \param __B
6982498Sroberto///    A 512-bit vector of [16 x float].
7054359Sroberto/// \param __W
7182498Sroberto///    A 512-bit vector of [32 x bfloat].
7282498Sroberto/// \param __U
7354359Sroberto///    A 32-bit mask value specifying what is chosen for each element.
7454359Sroberto///    A 1 means conversion of __A or __B. A 0 means element from __W.
75182007Sroberto/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
7682498Sroberto///    conversion of __B, and higher 256 bits come from conversion of __A.
7754359Srobertostatic __inline__ __m512bh __DEFAULT_FN_ATTRS512
78182007Sroberto_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
79182007Sroberto  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
80182007Sroberto                                        (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
8154359Sroberto                                        (__v32bf)__W);
8282498Sroberto}
8354359Sroberto
8454359Sroberto/// Convert Two Packed Single Data to One Packed BF16 Data.
8554359Sroberto///
8654359Sroberto/// \headerfile <x86intrin.h>
8754359Sroberto///
8854359Sroberto/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
8954359Sroberto///
9054359Sroberto/// \param __A
9154359Sroberto///    A 512-bit vector of [16 x float].
92285612Sdelphij/// \param __B
93285612Sdelphij///    A 512-bit vector of [16 x float].
94285612Sdelphij/// \param __U
95285612Sdelphij///    A 32-bit mask value specifying what is chosen for each element.
9654359Sroberto///    A 1 means conversion of __A or __B. A 0 means element is zero.
9754359Sroberto/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
9854359Sroberto///    conversion of __B, and higher 256 bits come from conversion of __A.
9954359Srobertostatic __inline__ __m512bh __DEFAULT_FN_ATTRS512
10054359Sroberto_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
10154359Sroberto  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
10254359Sroberto                                        (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
10354359Sroberto                                        (__v32bf)_mm512_setzero_si512());
10482498Sroberto}
10582498Sroberto
10682498Sroberto/// Convert Packed Single Data to Packed BF16 Data.
10782498Sroberto///
10854359Sroberto/// \headerfile <x86intrin.h>
10954359Sroberto///
11054359Sroberto/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
11154359Sroberto///
11254359Sroberto/// \param __A
11354359Sroberto///    A 512-bit vector of [16 x float].
11454359Sroberto/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
11554359Srobertostatic __inline__ __m256bh __DEFAULT_FN_ATTRS512
11654359Sroberto_mm512_cvtneps_pbh(__m512 __A) {
11754359Sroberto  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
11854359Sroberto                                              (__v16bf)_mm256_undefined_si256(),
11954359Sroberto                                              (__mmask16)-1);
12054359Sroberto}
12154359Sroberto
12282498Sroberto/// Convert Packed Single Data to Packed BF16 Data.
12354359Sroberto///
12482498Sroberto/// \headerfile <x86intrin.h>
12554359Sroberto///
12654359Sroberto/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
12754359Sroberto///
128285612Sdelphij/// \param __A
129285612Sdelphij///    A 512-bit vector of [16 x float].
130285612Sdelphij/// \param __W
13154359Sroberto///    A 256-bit vector of [16 x bfloat].
13254359Sroberto/// \param __U
13354359Sroberto///    A 16-bit mask value specifying what is chosen for each element.
13454359Sroberto///    A 1 means conversion of __A. A 0 means element from __W.
13554359Sroberto/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
136285612Sdelphijstatic __inline__ __m256bh __DEFAULT_FN_ATTRS512
137285612Sdelphij_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
13854359Sroberto  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
13954359Sroberto                                                        (__v16bf)__W,
140285612Sdelphij                                                        (__mmask16)__U);
14154359Sroberto}
14254359Sroberto
14354359Sroberto/// Convert Packed Single Data to Packed BF16 Data.
144285612Sdelphij///
145285612Sdelphij/// \headerfile <x86intrin.h>
14654359Sroberto///
14754359Sroberto/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
14854359Sroberto///
149285612Sdelphij/// \param __A
15054359Sroberto///    A 512-bit vector of [16 x float].
15154359Sroberto/// \param __U
15254359Sroberto///    A 16-bit mask value specifying what is chosen for each element.
15354359Sroberto///    A 1 means conversion of __A. A 0 means element is zero.
15454359Sroberto/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
15554359Srobertostatic __inline__ __m256bh __DEFAULT_FN_ATTRS512
15654359Sroberto_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
15754359Sroberto  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
15854359Sroberto                                                (__v16bf)_mm256_setzero_si256(),
15954359Sroberto                                                (__mmask16)__U);
16054359Sroberto}
16154359Sroberto
16254359Sroberto/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
16354359Sroberto///
16454359Sroberto/// \headerfile <x86intrin.h>
16554359Sroberto///
16654359Sroberto/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
16754359Sroberto///
16854359Sroberto/// \param __A
16954359Sroberto///    A 512-bit vector of [32 x bfloat].
17054359Sroberto/// \param __B
17154359Sroberto///    A 512-bit vector of [32 x bfloat].
17254359Sroberto/// \param __D
17354359Sroberto///    A 512-bit vector of [16 x float].
174285612Sdelphij/// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
175285612Sdelphij///  __A, __B and __D
176285612Sdelphijstatic __inline__ __m512 __DEFAULT_FN_ATTRS512
177285612Sdelphij_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
178285612Sdelphij  return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
17954359Sroberto                                             (__v32bf) __A,
18054359Sroberto                                             (__v32bf) __B);
18154359Sroberto}
18254359Sroberto
18354359Sroberto/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
18454359Sroberto///
18554359Sroberto/// \headerfile <x86intrin.h>
18654359Sroberto///
18754359Sroberto/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
18854359Sroberto///
18954359Sroberto/// \param __A
19054359Sroberto///    A 512-bit vector of [32 x bfloat].
19154359Sroberto/// \param __B
19254359Sroberto///    A 512-bit vector of [32 x bfloat].
19354359Sroberto/// \param __D
194285612Sdelphij///    A 512-bit vector of [16 x float].
195285612Sdelphij/// \param __U
196285612Sdelphij///    A 16-bit mask value specifying what is chosen for each element.
197285612Sdelphij///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
198285612Sdelphij/// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
199285612Sdelphij///  __A, __B and __D
200285612Sdelphijstatic __inline__ __m512 __DEFAULT_FN_ATTRS512
20182498Sroberto_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
20254359Sroberto  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
20354359Sroberto                                       (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
20454359Sroberto                                       (__v16sf)__D);
20554359Sroberto}
206285612Sdelphij
20754359Sroberto/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
208285612Sdelphij///
20954359Sroberto/// \headerfile <x86intrin.h>
21054359Sroberto///
21154359Sroberto/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
21254359Sroberto///
21382498Sroberto/// \param __A
21454359Sroberto///    A 512-bit vector of [32 x bfloat].
21554359Sroberto/// \param __B
21654359Sroberto///    A 512-bit vector of [32 x bfloat].
21754359Sroberto/// \param __D
21854359Sroberto///    A 512-bit vector of [16 x float].
21954359Sroberto/// \param __U
22054359Sroberto///    A 16-bit mask value specifying what is chosen for each element.
22154359Sroberto///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
22254359Sroberto/// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
22354359Sroberto///  __A, __B and __D
22454359Srobertostatic __inline__ __m512 __DEFAULT_FN_ATTRS512
22554359Sroberto_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
22654359Sroberto  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
22754359Sroberto                                       (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
22854359Sroberto                                       (__v16sf)_mm512_setzero_si512());
22954359Sroberto}
23054359Sroberto
23154359Sroberto/// Convert Packed BF16 Data to Packed float Data.
23254359Sroberto///
23354359Sroberto/// \headerfile <x86intrin.h>
23454359Sroberto///
23582498Sroberto/// \param __A
23682498Sroberto///    A 256-bit vector of [16 x bfloat].
23754359Sroberto/// \returns A 512-bit vector of [16 x float] come from conversion of __A
238182007Srobertostatic __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
23982498Sroberto  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
24082498Sroberto      (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
241182007Sroberto}
24282498Sroberto
243182007Sroberto/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
244182007Sroberto///
245182007Sroberto/// \headerfile <x86intrin.h>
246182007Sroberto///
247182007Sroberto/// \param __U
24882498Sroberto///    A 16-bit mask. Elements are zeroed out when the corresponding mask
249182007Sroberto///    bit is not set.
250182007Sroberto/// \param __A
251182007Sroberto///    A 256-bit vector of [16 x bfloat].
252182007Sroberto/// \returns A 512-bit vector of [16 x float] come from conversion of __A
253182007Srobertostatic __inline__ __m512 __DEFAULT_FN_ATTRS512
254182007Sroberto_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
255182007Sroberto  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
256182007Sroberto      (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
257182007Sroberto}
258182007Sroberto
259182007Sroberto/// Convert Packed BF16 Data to Packed float Data using merging mask.
260182007Sroberto///
261182007Sroberto/// \headerfile <x86intrin.h>
262182007Sroberto///
263182007Sroberto/// \param __S
264182007Sroberto///    A 512-bit vector of [16 x float]. Elements are copied from __S when
265182007Sroberto///     the corresponding mask bit is not set.
266182007Sroberto/// \param __U
267182007Sroberto///    A 16-bit mask.
268182007Sroberto/// \param __A
269182007Sroberto///    A 256-bit vector of [16 x bfloat].
270182007Sroberto/// \returns A 512-bit vector of [16 x float] come from conversion of __A
271182007Srobertostatic __inline__ __m512 __DEFAULT_FN_ATTRS512
272182007Sroberto_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
273182007Sroberto  return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
274182007Sroberto      (__m512i)__S, (__mmask16)__U,
275182007Sroberto      (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
276182007Sroberto}
277182007Sroberto
/* The attribute helper macros defined at the top of this header are removed
 * again here. */
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS
280182007Sroberto
281182007Sroberto#endif
282182007Sroberto#endif
283182007Sroberto