1351280Sdim/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------=== 2351280Sdim * 3351280Sdim * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4351280Sdim * See https://llvm.org/LICENSE.txt for license information. 5351280Sdim * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6351280Sdim * 7351280Sdim *===-----------------------------------------------------------------------=== 8351280Sdim */ 9351280Sdim#ifndef __IMMINTRIN_H 10351280Sdim#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead." 11351280Sdim#endif 12351280Sdim 13351280Sdim#ifndef __AVX512VLBF16INTRIN_H 14351280Sdim#define __AVX512VLBF16INTRIN_H 15351280Sdim 16351280Sdimtypedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16))); 17351280Sdim 18351280Sdim#define __DEFAULT_FN_ATTRS128 \ 19351280Sdim __attribute__((__always_inline__, __nodebug__, \ 20351280Sdim __target__("avx512vl, avx512bf16"), __min_vector_width__(128))) 21351280Sdim#define __DEFAULT_FN_ATTRS256 \ 22351280Sdim __attribute__((__always_inline__, __nodebug__, \ 23351280Sdim __target__("avx512vl, avx512bf16"), __min_vector_width__(256))) 24351280Sdim 25351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data. 26351280Sdim/// 27351280Sdim/// \headerfile <x86intrin.h> 28351280Sdim/// 29351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 30351280Sdim/// 31351280Sdim/// \param __A 32351280Sdim/// A 128-bit vector of [4 x float]. 33351280Sdim/// \param __B 34351280Sdim/// A 128-bit vector of [4 x float]. 35351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 36351280Sdim/// conversion of __B, and higher 64 bits come from conversion of __A. 37351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128 38351280Sdim_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) { 39351280Sdim return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A, 40351280Sdim (__v4sf) __B); 41351280Sdim} 42351280Sdim 43351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data. 44351280Sdim/// 45351280Sdim/// \headerfile <x86intrin.h> 46351280Sdim/// 47351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 48351280Sdim/// 49351280Sdim/// \param __A 50351280Sdim/// A 128-bit vector of [4 x float]. 51351280Sdim/// \param __B 52351280Sdim/// A 128-bit vector of [4 x float]. 53351280Sdim/// \param __W 54351280Sdim/// A 128-bit vector of [8 x bfloat]. 55351280Sdim/// \param __U 56351280Sdim/// A 8-bit mask value specifying what is chosen for each element. 57351280Sdim/// A 1 means conversion of __A or __B. A 0 means element from __W. 58351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 59351280Sdim/// conversion of __B, and higher 64 bits come from conversion of __A. 60351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128 61351280Sdim_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) { 62351280Sdim return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, 63351280Sdim (__v8hi)_mm_cvtne2ps_pbh(__A, __B), 64351280Sdim (__v8hi)__W); 65351280Sdim} 66351280Sdim 67351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data. 68351280Sdim/// 69351280Sdim/// \headerfile <x86intrin.h> 70351280Sdim/// 71351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 72351280Sdim/// 73351280Sdim/// \param __A 74351280Sdim/// A 128-bit vector of [4 x float]. 75351280Sdim/// \param __B 76351280Sdim/// A 128-bit vector of [4 x float]. 77351280Sdim/// \param __U 78351280Sdim/// A 8-bit mask value specifying what is chosen for each element. 79351280Sdim/// A 1 means conversion of __A or __B. A 0 means element is zero. 80351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 81351280Sdim/// conversion of __B, and higher 64 bits come from conversion of __A. 82351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128 83351280Sdim_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) { 84351280Sdim return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, 85351280Sdim (__v8hi)_mm_cvtne2ps_pbh(__A, __B), 86351280Sdim (__v8hi)_mm_setzero_si128()); 87351280Sdim} 88351280Sdim 89351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data. 90351280Sdim/// 91351280Sdim/// \headerfile <x86intrin.h> 92351280Sdim/// 93351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 94351280Sdim/// 95351280Sdim/// \param __A 96351280Sdim/// A 256-bit vector of [8 x float]. 97351280Sdim/// \param __B 98351280Sdim/// A 256-bit vector of [8 x float]. 99351280Sdim/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from 100351280Sdim/// conversion of __B, and higher 128 bits come from conversion of __A. 101351280Sdimstatic __inline__ __m256bh __DEFAULT_FN_ATTRS256 102351280Sdim_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) { 103351280Sdim return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A, 104351280Sdim (__v8sf) __B); 105351280Sdim} 106351280Sdim 107351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data. 108351280Sdim/// 109351280Sdim/// \headerfile <x86intrin.h> 110351280Sdim/// 111351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 112351280Sdim/// 113351280Sdim/// \param __A 114351280Sdim/// A 256-bit vector of [8 x float]. 115351280Sdim/// \param __B 116351280Sdim/// A 256-bit vector of [8 x float]. 117351280Sdim/// \param __W 118351280Sdim/// A 256-bit vector of [16 x bfloat]. 119351280Sdim/// \param __U 120351280Sdim/// A 16-bit mask value specifying what is chosen for each element. 121351280Sdim/// A 1 means conversion of __A or __B. A 0 means element from __W. 122351280Sdim/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from 123351280Sdim/// conversion of __B, and higher 128 bits come from conversion of __A. 124351280Sdimstatic __inline__ __m256bh __DEFAULT_FN_ATTRS256 125351280Sdim_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) { 126351280Sdim return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, 127351280Sdim (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), 128351280Sdim (__v16hi)__W); 129351280Sdim} 130351280Sdim 131351280Sdim/// Convert Two Packed Single Data to One Packed BF16 Data. 132351280Sdim/// 133351280Sdim/// \headerfile <x86intrin.h> 134351280Sdim/// 135351280Sdim/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 136351280Sdim/// 137351280Sdim/// \param __A 138351280Sdim/// A 256-bit vector of [8 x float]. 139351280Sdim/// \param __B 140351280Sdim/// A 256-bit vector of [8 x float]. 141351280Sdim/// \param __U 142351280Sdim/// A 16-bit mask value specifying what is chosen for each element. 143351280Sdim/// A 1 means conversion of __A or __B. A 0 means element is zero. 144351280Sdim/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from 145351280Sdim/// conversion of __B, and higher 128 bits come from conversion of __A. 146351280Sdimstatic __inline__ __m256bh __DEFAULT_FN_ATTRS256 147351280Sdim_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) { 148351280Sdim return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, 149351280Sdim (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), 150351280Sdim (__v16hi)_mm256_setzero_si256()); 151351280Sdim} 152351280Sdim 153351280Sdim/// Convert Packed Single Data to Packed BF16 Data. 154351280Sdim/// 155351280Sdim/// \headerfile <x86intrin.h> 156351280Sdim/// 157351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 158351280Sdim/// 159351280Sdim/// \param __A 160351280Sdim/// A 128-bit vector of [4 x float]. 161351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 162351280Sdim/// conversion of __A, and higher 64 bits are 0. 163351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128 164351280Sdim_mm_cvtneps_pbh(__m128 __A) { 165351280Sdim return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, 166351280Sdim (__v8hi)_mm_undefined_si128(), 167351280Sdim (__mmask8)-1); 168351280Sdim} 169351280Sdim 170351280Sdim/// Convert Packed Single Data to Packed BF16 Data. 171351280Sdim/// 172351280Sdim/// \headerfile <x86intrin.h> 173351280Sdim/// 174351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 175351280Sdim/// 176351280Sdim/// \param __A 177351280Sdim/// A 128-bit vector of [4 x float]. 178351280Sdim/// \param __W 179351280Sdim/// A 128-bit vector of [8 x bfloat]. 180351280Sdim/// \param __U 181351280Sdim/// A 4-bit mask value specifying what is chosen for each element. 182351280Sdim/// A 1 means conversion of __A. A 0 means element from __W. 183351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 184351280Sdim/// conversion of __A, and higher 64 bits are 0. 185351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128 186351280Sdim_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) { 187351280Sdim return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, 188351280Sdim (__v8hi)__W, 189351280Sdim (__mmask8)__U); 190351280Sdim} 191351280Sdim 192351280Sdim/// Convert Packed Single Data to Packed BF16 Data. 193351280Sdim/// 194351280Sdim/// \headerfile <x86intrin.h> 195351280Sdim/// 196351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 197351280Sdim/// 198351280Sdim/// \param __A 199351280Sdim/// A 128-bit vector of [4 x float]. 200351280Sdim/// \param __U 201351280Sdim/// A 4-bit mask value specifying what is chosen for each element. 202351280Sdim/// A 1 means conversion of __A. A 0 means element is zero. 203351280Sdim/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 204351280Sdim/// conversion of __A, and higher 64 bits are 0. 205351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS128 206351280Sdim_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) { 207351280Sdim return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, 208351280Sdim (__v8hi)_mm_setzero_si128(), 209351280Sdim (__mmask8)__U); 210351280Sdim} 211351280Sdim 212351280Sdim/// Convert Packed Single Data to Packed BF16 Data. 213351280Sdim/// 214351280Sdim/// \headerfile <x86intrin.h> 215351280Sdim/// 216351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 217351280Sdim/// 218351280Sdim/// \param __A 219351280Sdim/// A 256-bit vector of [8 x float]. 220351280Sdim/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. 221351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS256 222351280Sdim_mm256_cvtneps_pbh(__m256 __A) { 223351280Sdim return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, 224351280Sdim (__v8hi)_mm_undefined_si128(), 225351280Sdim (__mmask8)-1); 226351280Sdim} 227351280Sdim 228351280Sdim/// Convert Packed Single Data to Packed BF16 Data. 229351280Sdim/// 230351280Sdim/// \headerfile <x86intrin.h> 231351280Sdim/// 232351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 233351280Sdim/// 234351280Sdim/// \param __A 235351280Sdim/// A 256-bit vector of [8 x float]. 236351280Sdim/// \param __W 237351280Sdim/// A 256-bit vector of [8 x bfloat]. 238351280Sdim/// \param __U 239351280Sdim/// A 8-bit mask value specifying what is chosen for each element. 240351280Sdim/// A 1 means conversion of __A. A 0 means element from __W. 241351280Sdim/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. 242351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS256 243351280Sdim_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) { 244351280Sdim return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, 245351280Sdim (__v8hi)__W, 246351280Sdim (__mmask8)__U); 247351280Sdim} 248351280Sdim 249351280Sdim/// Convert Packed Single Data to Packed BF16 Data. 250351280Sdim/// 251351280Sdim/// \headerfile <x86intrin.h> 252351280Sdim/// 253351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 254351280Sdim/// 255351280Sdim/// \param __A 256351280Sdim/// A 256-bit vector of [8 x float]. 257351280Sdim/// \param __U 258351280Sdim/// A 8-bit mask value specifying what is chosen for each element. 259351280Sdim/// A 1 means conversion of __A. A 0 means element is zero. 260351280Sdim/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. 261351280Sdimstatic __inline__ __m128bh __DEFAULT_FN_ATTRS256 262351280Sdim_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) { 263351280Sdim return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, 264351280Sdim (__v8hi)_mm_setzero_si128(), 265351280Sdim (__mmask8)__U); 266351280Sdim} 267351280Sdim 268351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 269351280Sdim/// 270351280Sdim/// \headerfile <x86intrin.h> 271351280Sdim/// 272351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 273351280Sdim/// 274351280Sdim/// \param __A 275351280Sdim/// A 128-bit vector of [8 x bfloat]. 276351280Sdim/// \param __B 277351280Sdim/// A 128-bit vector of [8 x bfloat]. 278351280Sdim/// \param __D 279351280Sdim/// A 128-bit vector of [4 x float]. 280351280Sdim/// \returns A 128-bit vector of [4 x float] comes from Dot Product of 281351280Sdim/// __A, __B and __D 282351280Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS128 283351280Sdim_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) { 284351280Sdim return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D, 285351280Sdim (__v4si)__A, 286351280Sdim (__v4si)__B); 287351280Sdim} 288351280Sdim 289351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 290351280Sdim/// 291351280Sdim/// \headerfile <x86intrin.h> 292351280Sdim/// 293351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 294351280Sdim/// 295351280Sdim/// \param __A 296351280Sdim/// A 128-bit vector of [8 x bfloat]. 297351280Sdim/// \param __B 298351280Sdim/// A 128-bit vector of [8 x bfloat]. 299351280Sdim/// \param __D 300351280Sdim/// A 128-bit vector of [4 x float]. 301351280Sdim/// \param __U 302351280Sdim/// A 8-bit mask value specifying what is chosen for each element. 303351280Sdim/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. 304351280Sdim/// \returns A 128-bit vector of [4 x float] comes from Dot Product of 305351280Sdim/// __A, __B and __D 306351280Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS128 307351280Sdim_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) { 308351280Sdim return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, 309351280Sdim (__v4sf)_mm_dpbf16_ps(__D, __A, __B), 310351280Sdim (__v4sf)__D); 311351280Sdim} 312351280Sdim 313351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 314351280Sdim/// 315351280Sdim/// \headerfile <x86intrin.h> 316351280Sdim/// 317351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 318351280Sdim/// 319351280Sdim/// \param __A 320351280Sdim/// A 128-bit vector of [8 x bfloat]. 321351280Sdim/// \param __B 322351280Sdim/// A 128-bit vector of [8 x bfloat]. 323351280Sdim/// \param __D 324351280Sdim/// A 128-bit vector of [4 x float]. 325351280Sdim/// \param __U 326351280Sdim/// A 8-bit mask value specifying what is chosen for each element. 327351280Sdim/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. 328351280Sdim/// \returns A 128-bit vector of [4 x float] comes from Dot Product of 329351280Sdim/// __A, __B and __D 330351280Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS128 331351280Sdim_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) { 332351280Sdim return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, 333351280Sdim (__v4sf)_mm_dpbf16_ps(__D, __A, __B), 334351280Sdim (__v4sf)_mm_setzero_si128()); 335351280Sdim} 336351280Sdim 337351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 338351280Sdim/// 339351280Sdim/// \headerfile <x86intrin.h> 340351280Sdim/// 341351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 342351280Sdim/// 343351280Sdim/// \param __A 344351280Sdim/// A 256-bit vector of [16 x bfloat]. 345351280Sdim/// \param __B 346351280Sdim/// A 256-bit vector of [16 x bfloat]. 347351280Sdim/// \param __D 348351280Sdim/// A 256-bit vector of [8 x float]. 349351280Sdim/// \returns A 256-bit vector of [8 x float] comes from Dot Product of 350351280Sdim/// __A, __B and __D 351351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256 352351280Sdim_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) { 353351280Sdim return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D, 354351280Sdim (__v8si)__A, 355351280Sdim (__v8si)__B); 356351280Sdim} 357351280Sdim 358351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 359351280Sdim/// 360351280Sdim/// \headerfile <x86intrin.h> 361351280Sdim/// 362351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 363351280Sdim/// 364351280Sdim/// \param __A 365351280Sdim/// A 256-bit vector of [16 x bfloat]. 366351280Sdim/// \param __B 367351280Sdim/// A 256-bit vector of [16 x bfloat]. 368351280Sdim/// \param __D 369351280Sdim/// A 256-bit vector of [8 x float]. 370351280Sdim/// \param __U 371351280Sdim/// A 16-bit mask value specifying what is chosen for each element. 372351280Sdim/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. 373351280Sdim/// \returns A 256-bit vector of [8 x float] comes from Dot Product of 374351280Sdim/// __A, __B and __D 375351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256 376351280Sdim_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) { 377351280Sdim return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, 378351280Sdim (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), 379351280Sdim (__v8sf)__D); 380351280Sdim} 381351280Sdim 382351280Sdim/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 383351280Sdim/// 384351280Sdim/// \headerfile <x86intrin.h> 385351280Sdim/// 386351280Sdim/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 387351280Sdim/// 388351280Sdim/// \param __A 389351280Sdim/// A 256-bit vector of [16 x bfloat]. 390351280Sdim/// \param __B 391351280Sdim/// A 256-bit vector of [16 x bfloat]. 392351280Sdim/// \param __D 393351280Sdim/// A 256-bit vector of [8 x float]. 394351280Sdim/// \param __U 395351280Sdim/// A 8-bit mask value specifying what is chosen for each element. 396351280Sdim/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. 397351280Sdim/// \returns A 256-bit vector of [8 x float] comes from Dot Product of 398351280Sdim/// __A, __B and __D 399351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256 400351280Sdim_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) { 401351280Sdim return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, 402351280Sdim (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), 403351280Sdim (__v8sf)_mm256_setzero_si256()); 404351280Sdim} 405351280Sdim 406351280Sdim/// Convert One Single float Data to One BF16 Data. 407351280Sdim/// 408351280Sdim/// \headerfile <x86intrin.h> 409351280Sdim/// 410351280Sdim/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 411351280Sdim/// 412351280Sdim/// \param __A 413351280Sdim/// A float data. 414351280Sdim/// \returns A bf16 data whose sign field and exponent field keep unchanged, 415351280Sdim/// and fraction field is truncated to 7 bits. 416351280Sdimstatic __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { 417351280Sdim __v4sf __V = {__A, 0, 0, 0}; 418351280Sdim __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask( 419351280Sdim (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); 420351280Sdim return __R[0]; 421351280Sdim} 422351280Sdim 423351280Sdim/// Convert Packed BF16 Data to Packed float Data. 424351280Sdim/// 425351280Sdim/// \headerfile <x86intrin.h> 426351280Sdim/// 427351280Sdim/// \param __A 428351280Sdim/// A 128-bit vector of [8 x bfloat]. 429351280Sdim/// \returns A 256-bit vector of [8 x float] come from convertion of __A 430351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { 431351280Sdim return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( 432351280Sdim (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16)); 433351280Sdim} 434351280Sdim 435351280Sdim/// Convert Packed BF16 Data to Packed float Data using zeroing mask. 436351280Sdim/// 437351280Sdim/// \headerfile <x86intrin.h> 438351280Sdim/// 439351280Sdim/// \param __U 440351280Sdim/// A 8-bit mask. Elements are zeroed out when the corresponding mask 441351280Sdim/// bit is not set. 442351280Sdim/// \param __A 443351280Sdim/// A 128-bit vector of [8 x bfloat]. 444351280Sdim/// \returns A 256-bit vector of [8 x float] come from convertion of __A 445351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256 446351280Sdim_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { 447351280Sdim return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( 448351280Sdim (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); 449351280Sdim} 450351280Sdim 451351280Sdim/// Convert Packed BF16 Data to Packed float Data using merging mask. 452351280Sdim/// 453351280Sdim/// \headerfile <x86intrin.h> 454351280Sdim/// 455351280Sdim/// \param __S 456351280Sdim/// A 256-bit vector of [8 x float]. Elements are copied from __S when 457351280Sdim/// the corresponding mask bit is not set. 458351280Sdim/// \param __U 459351280Sdim/// A 8-bit mask. Elements are zeroed out when the corresponding mask 460351280Sdim/// bit is not set. 461351280Sdim/// \param __A 462351280Sdim/// A 128-bit vector of [8 x bfloat]. 463351280Sdim/// \returns A 256-bit vector of [8 x float] come from convertion of __A 464351280Sdimstatic __inline__ __m256 __DEFAULT_FN_ATTRS256 465351280Sdim_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { 466351280Sdim return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32( 467351280Sdim (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 468351280Sdim 16)); 469351280Sdim} 470351280Sdim 471351280Sdim#undef __DEFAULT_FN_ATTRS128 472351280Sdim#undef __DEFAULT_FN_ATTRS256 473351280Sdim 474351280Sdim#endif 475