154359Sroberto/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------=== 282498Sroberto * 354359Sroberto * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 454359Sroberto * See https://llvm.org/LICENSE.txt for license information. 554359Sroberto * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 654359Sroberto * 754359Sroberto *===-----------------------------------------------------------------------=== 854359Sroberto */ 954359Sroberto#ifndef __IMMINTRIN_H 1054359Sroberto#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead." 1154359Sroberto#endif 1254359Sroberto 1354359Sroberto#ifdef __SSE2__ 1454359Sroberto 1554359Sroberto#ifndef __AVX512BF16INTRIN_H 1654359Sroberto#define __AVX512BF16INTRIN_H 1754359Sroberto 1854359Srobertotypedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64))); 19182007Srobertotypedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64))); 2054359Srobertotypedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead"))); 2182498Sroberto 2282498Sroberto#define __DEFAULT_FN_ATTRS512 \ 2354359Sroberto __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \ 2482498Sroberto __min_vector_width__(512))) 2554359Sroberto#define __DEFAULT_FN_ATTRS \ 2682498Sroberto __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"))) 2782498Sroberto 2854359Sroberto/// Convert One BF16 Data to One Single Float Data. 2982498Sroberto/// 3082498Sroberto/// \headerfile <x86intrin.h> 3182498Sroberto/// 3282498Sroberto/// This intrinsic does not correspond to a specific instruction. 3382498Sroberto/// 3482498Sroberto/// \param __A 3582498Sroberto/// A bfloat data. 3682498Sroberto/// \returns A float data whose sign field and exponent field keep unchanged, 3782498Sroberto/// and fraction field is extended to 23 bits. 3882498Srobertostatic __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) { 3982498Sroberto return __builtin_ia32_cvtsbf162ss_32(__A); 4082498Sroberto} 4182498Sroberto 4282498Sroberto/// Convert Two Packed Single Data to One Packed BF16 Data. 4382498Sroberto/// 4482498Sroberto/// \headerfile <x86intrin.h> 45182007Sroberto/// 46182007Sroberto/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 4754359Sroberto/// 4882498Sroberto/// \param __A 4982498Sroberto/// A 512-bit vector of [16 x float]. 5082498Sroberto/// \param __B 5154359Sroberto/// A 512-bit vector of [16 x float]. 5282498Sroberto/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from 5382498Sroberto/// conversion of __B, and higher 256 bits come from conversion of __A. 5482498Srobertostatic __inline__ __m512bh __DEFAULT_FN_ATTRS512 55182007Sroberto_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) { 56182007Sroberto return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A, 5782498Sroberto (__v16sf) __B); 58182007Sroberto} 5982498Sroberto 6082498Sroberto/// Convert Two Packed Single Data to One Packed BF16 Data. 61182007Sroberto/// 6282498Sroberto/// \headerfile <x86intrin.h> 6382498Sroberto/// 6454359Sroberto/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 6582498Sroberto/// 6654359Sroberto/// \param __A 6754359Sroberto/// A 512-bit vector of [16 x float]. 6854359Sroberto/// \param __B 6982498Sroberto/// A 512-bit vector of [16 x float]. 7054359Sroberto/// \param __W 7182498Sroberto/// A 512-bit vector of [32 x bfloat]. 7282498Sroberto/// \param __U 7354359Sroberto/// A 32-bit mask value specifying what is chosen for each element. 7454359Sroberto/// A 1 means conversion of __A or __B. A 0 means element from __W. 75182007Sroberto/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from 7682498Sroberto/// conversion of __B, and higher 256 bits come from conversion of __A. 7754359Srobertostatic __inline__ __m512bh __DEFAULT_FN_ATTRS512 78182007Sroberto_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { 79182007Sroberto return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, 80182007Sroberto (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), 8154359Sroberto (__v32bf)__W); 8282498Sroberto} 8354359Sroberto 8454359Sroberto/// Convert Two Packed Single Data to One Packed BF16 Data. 8554359Sroberto/// 8654359Sroberto/// \headerfile <x86intrin.h> 8754359Sroberto/// 8854359Sroberto/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 8954359Sroberto/// 9054359Sroberto/// \param __A 9154359Sroberto/// A 512-bit vector of [16 x float]. 92285612Sdelphij/// \param __B 93285612Sdelphij/// A 512-bit vector of [16 x float]. 94285612Sdelphij/// \param __U 95285612Sdelphij/// A 32-bit mask value specifying what is chosen for each element. 9654359Sroberto/// A 1 means conversion of __A or __B. A 0 means element is zero. 9754359Sroberto/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from 9854359Sroberto/// conversion of __B, and higher 256 bits come from conversion of __A. 9954359Srobertostatic __inline__ __m512bh __DEFAULT_FN_ATTRS512 10054359Sroberto_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) { 10154359Sroberto return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, 10254359Sroberto (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), 10354359Sroberto (__v32bf)_mm512_setzero_si512()); 10482498Sroberto} 10582498Sroberto 10682498Sroberto/// Convert Packed Single Data to Packed BF16 Data. 10782498Sroberto/// 10854359Sroberto/// \headerfile <x86intrin.h> 10954359Sroberto/// 11054359Sroberto/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 11154359Sroberto/// 11254359Sroberto/// \param __A 11354359Sroberto/// A 512-bit vector of [16 x float]. 11454359Sroberto/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. 11554359Srobertostatic __inline__ __m256bh __DEFAULT_FN_ATTRS512 11654359Sroberto_mm512_cvtneps_pbh(__m512 __A) { 11754359Sroberto return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, 11854359Sroberto (__v16bf)_mm256_undefined_si256(), 11954359Sroberto (__mmask16)-1); 12054359Sroberto} 12154359Sroberto 12282498Sroberto/// Convert Packed Single Data to Packed BF16 Data. 12354359Sroberto/// 12482498Sroberto/// \headerfile <x86intrin.h> 12554359Sroberto/// 12654359Sroberto/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 12754359Sroberto/// 128285612Sdelphij/// \param __A 129285612Sdelphij/// A 512-bit vector of [16 x float]. 130285612Sdelphij/// \param __W 13154359Sroberto/// A 256-bit vector of [16 x bfloat]. 13254359Sroberto/// \param __U 13354359Sroberto/// A 16-bit mask value specifying what is chosen for each element. 13454359Sroberto/// A 1 means conversion of __A. A 0 means element from __W. 13554359Sroberto/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. 136285612Sdelphijstatic __inline__ __m256bh __DEFAULT_FN_ATTRS512 137285612Sdelphij_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) { 13854359Sroberto return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, 13954359Sroberto (__v16bf)__W, 140285612Sdelphij (__mmask16)__U); 14154359Sroberto} 14254359Sroberto 14354359Sroberto/// Convert Packed Single Data to Packed BF16 Data. 144285612Sdelphij/// 145285612Sdelphij/// \headerfile <x86intrin.h> 14654359Sroberto/// 14754359Sroberto/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 14854359Sroberto/// 149285612Sdelphij/// \param __A 15054359Sroberto/// A 512-bit vector of [16 x float]. 15154359Sroberto/// \param __U 15254359Sroberto/// A 16-bit mask value specifying what is chosen for each element. 15354359Sroberto/// A 1 means conversion of __A. A 0 means element is zero. 15454359Sroberto/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. 15554359Srobertostatic __inline__ __m256bh __DEFAULT_FN_ATTRS512 15654359Sroberto_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) { 15754359Sroberto return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, 15854359Sroberto (__v16bf)_mm256_setzero_si256(), 15954359Sroberto (__mmask16)__U); 16054359Sroberto} 16154359Sroberto 16254359Sroberto/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 16354359Sroberto/// 16454359Sroberto/// \headerfile <x86intrin.h> 16554359Sroberto/// 16654359Sroberto/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 16754359Sroberto/// 16854359Sroberto/// \param __A 16954359Sroberto/// A 512-bit vector of [32 x bfloat]. 17054359Sroberto/// \param __B 17154359Sroberto/// A 512-bit vector of [32 x bfloat]. 17254359Sroberto/// \param __D 17354359Sroberto/// A 512-bit vector of [16 x float]. 174285612Sdelphij/// \returns A 512-bit vector of [16 x float] comes from Dot Product of 175285612Sdelphij/// __A, __B and __D 176285612Sdelphijstatic __inline__ __m512 __DEFAULT_FN_ATTRS512 177285612Sdelphij_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) { 178285612Sdelphij return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D, 17954359Sroberto (__v32bf) __A, 18054359Sroberto (__v32bf) __B); 18154359Sroberto} 18254359Sroberto 18354359Sroberto/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 18454359Sroberto/// 18554359Sroberto/// \headerfile <x86intrin.h> 18654359Sroberto/// 18754359Sroberto/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 18854359Sroberto/// 18954359Sroberto/// \param __A 19054359Sroberto/// A 512-bit vector of [32 x bfloat]. 19154359Sroberto/// \param __B 19254359Sroberto/// A 512-bit vector of [32 x bfloat]. 19354359Sroberto/// \param __D 194285612Sdelphij/// A 512-bit vector of [16 x float]. 195285612Sdelphij/// \param __U 196285612Sdelphij/// A 16-bit mask value specifying what is chosen for each element. 197285612Sdelphij/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. 198285612Sdelphij/// \returns A 512-bit vector of [16 x float] comes from Dot Product of 199285612Sdelphij/// __A, __B and __D 200285612Sdelphijstatic __inline__ __m512 __DEFAULT_FN_ATTRS512 20182498Sroberto_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) { 20254359Sroberto return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 20354359Sroberto (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), 20454359Sroberto (__v16sf)__D); 20554359Sroberto} 206285612Sdelphij 20754359Sroberto/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 208285612Sdelphij/// 20954359Sroberto/// \headerfile <x86intrin.h> 21054359Sroberto/// 21154359Sroberto/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 21254359Sroberto/// 21382498Sroberto/// \param __A 21454359Sroberto/// A 512-bit vector of [32 x bfloat]. 21554359Sroberto/// \param __B 21654359Sroberto/// A 512-bit vector of [32 x bfloat]. 21754359Sroberto/// \param __D 21854359Sroberto/// A 512-bit vector of [16 x float]. 21954359Sroberto/// \param __U 22054359Sroberto/// A 16-bit mask value specifying what is chosen for each element. 22154359Sroberto/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. 22254359Sroberto/// \returns A 512-bit vector of [16 x float] comes from Dot Product of 22354359Sroberto/// __A, __B and __D 22454359Srobertostatic __inline__ __m512 __DEFAULT_FN_ATTRS512 22554359Sroberto_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { 22654359Sroberto return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 22754359Sroberto (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), 22854359Sroberto (__v16sf)_mm512_setzero_si512()); 22954359Sroberto} 23054359Sroberto 23154359Sroberto/// Convert Packed BF16 Data to Packed float Data. 23254359Sroberto/// 23354359Sroberto/// \headerfile <x86intrin.h> 23454359Sroberto/// 23582498Sroberto/// \param __A 23682498Sroberto/// A 256-bit vector of [16 x bfloat]. 23754359Sroberto/// \returns A 512-bit vector of [16 x float] come from conversion of __A 238182007Srobertostatic __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { 23982498Sroberto return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( 24082498Sroberto (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); 241182007Sroberto} 24282498Sroberto 243182007Sroberto/// Convert Packed BF16 Data to Packed float Data using zeroing mask. 244182007Sroberto/// 245182007Sroberto/// \headerfile <x86intrin.h> 246182007Sroberto/// 247182007Sroberto/// \param __U 24882498Sroberto/// A 16-bit mask. Elements are zeroed out when the corresponding mask 249182007Sroberto/// bit is not set. 250182007Sroberto/// \param __A 251182007Sroberto/// A 256-bit vector of [16 x bfloat]. 252182007Sroberto/// \returns A 512-bit vector of [16 x float] come from conversion of __A 253182007Srobertostatic __inline__ __m512 __DEFAULT_FN_ATTRS512 254182007Sroberto_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { 255182007Sroberto return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( 256182007Sroberto (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); 257182007Sroberto} 258182007Sroberto 259182007Sroberto/// Convert Packed BF16 Data to Packed float Data using merging mask. 260182007Sroberto/// 261182007Sroberto/// \headerfile <x86intrin.h> 262182007Sroberto/// 263182007Sroberto/// \param __S 264182007Sroberto/// A 512-bit vector of [16 x float]. Elements are copied from __S when 265182007Sroberto/// the corresponding mask bit is not set. 266182007Sroberto/// \param __U 267182007Sroberto/// A 16-bit mask. 268182007Sroberto/// \param __A 269182007Sroberto/// A 256-bit vector of [16 x bfloat]. 270182007Sroberto/// \returns A 512-bit vector of [16 x float] come from conversion of __A 271182007Srobertostatic __inline__ __m512 __DEFAULT_FN_ATTRS512 272182007Sroberto_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { 273182007Sroberto return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( 274182007Sroberto (__m512i)__S, (__mmask16)__U, 275182007Sroberto (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); 276182007Sroberto} 277182007Sroberto 278182007Sroberto#undef __DEFAULT_FN_ATTRS 279182007Sroberto#undef __DEFAULT_FN_ATTRS512 280182007Sroberto 281182007Sroberto#endif 282182007Sroberto#endif 283182007Sroberto