1/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10#ifndef __IMMINTRIN_H 11#error \ 12 "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead." 13#endif // __IMMINTRIN_H 14 15#ifdef __SSE2__ 16 17#ifndef __AVXNECONVERTINTRIN_H 18#define __AVXNECONVERTINTRIN_H 19 20/* Define the default attributes for the functions in this file. */ 21#define __DEFAULT_FN_ATTRS128 \ 22 __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \ 23 __min_vector_width__(128))) 24#define __DEFAULT_FN_ATTRS256 \ 25 __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \ 26 __min_vector_width__(256))) 27 28/// Convert scalar BF16 (16-bit) floating-point element 29/// stored at memory locations starting at location \a __A to a 30/// single-precision (32-bit) floating-point, broadcast it to packed 31/// single-precision (32-bit) floating-point elements, and store the results in 32/// \a dst. 33/// 34/// \headerfile <x86intrin.h> 35/// 36/// \code 37/// _mm_bcstnebf16_ps(const void *__A); 38/// \endcode 39/// 40/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction. 41/// 42/// \param __A 43/// A pointer to a 16-bit memory location. The address of the memory 44/// location does not have to be aligned. 45/// \returns 46/// A 128-bit vector of [4 x float]. 47/// 48/// \code{.operation} 49/// b := Convert_BF16_To_FP32(MEM[__A+15:__A]) 50/// FOR j := 0 to 3 51/// m := j*32 52/// dst[m+31:m] := b 53/// ENDFOR 54/// dst[MAX:128] := 0 55/// \endcode 56static __inline__ __m128 __DEFAULT_FN_ATTRS128 57_mm_bcstnebf16_ps(const void *__A) { 58 return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A); 59} 60 61/// Convert scalar BF16 (16-bit) floating-point element 62/// stored at memory locations starting at location \a __A to a 63/// single-precision (32-bit) floating-point, broadcast it to packed 64/// single-precision (32-bit) floating-point elements, and store the results in 65/// \a dst. 66/// 67/// \headerfile <x86intrin.h> 68/// 69/// \code 70/// _mm256_bcstnebf16_ps(const void *__A); 71/// \endcode 72/// 73/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction. 74/// 75/// \param __A 76/// A pointer to a 16-bit memory location. The address of the memory 77/// location does not have to be aligned. 78/// \returns 79/// A 256-bit vector of [8 x float]. 80/// 81/// \code{.operation} 82/// b := Convert_BF16_To_FP32(MEM[__A+15:__A]) 83/// FOR j := 0 to 7 84/// m := j*32 85/// dst[m+31:m] := b 86/// ENDFOR 87/// dst[MAX:256] := 0 88/// \endcode 89static __inline__ __m256 __DEFAULT_FN_ATTRS256 90_mm256_bcstnebf16_ps(const void *__A) { 91 return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A); 92} 93 94/// Convert scalar half-precision (16-bit) floating-point element 95/// stored at memory locations starting at location \a __A to a 96/// single-precision (32-bit) floating-point, broadcast it to packed 97/// single-precision (32-bit) floating-point elements, and store the results in 98/// \a dst. 99/// 100/// \headerfile <x86intrin.h> 101/// 102/// \code 103/// _mm_bcstnesh_ps(const void *__A); 104/// \endcode 105/// 106/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction. 107/// 108/// \param __A 109/// A pointer to a 16-bit memory location. The address of the memory 110/// location does not have to be aligned. 111/// \returns 112/// A 128-bit vector of [4 x float]. 113/// 114/// \code{.operation} 115/// b := Convert_FP16_To_FP32(MEM[__A+15:__A]) 116/// FOR j := 0 to 3 117/// m := j*32 118/// dst[m+31:m] := b 119/// ENDFOR 120/// dst[MAX:128] := 0 121/// \endcode 122static __inline__ __m128 __DEFAULT_FN_ATTRS128 123_mm_bcstnesh_ps(const void *__A) { 124 return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A); 125} 126 127/// Convert scalar half-precision (16-bit) floating-point element 128/// stored at memory locations starting at location \a __A to a 129/// single-precision (32-bit) floating-point, broadcast it to packed 130/// single-precision (32-bit) floating-point elements, and store the results in 131/// \a dst. 132/// 133/// \headerfile <x86intrin.h> 134/// 135/// \code 136/// _mm256_bcstnesh_ps(const void *__A); 137/// \endcode 138/// 139/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction. 140/// 141/// \param __A 142/// A pointer to a 16-bit memory location. The address of the memory 143/// location does not have to be aligned. 144/// \returns 145/// A 256-bit vector of [8 x float]. 146/// 147/// \code{.operation} 148/// b := Convert_FP16_To_FP32(MEM[__A+15:__A]) 149/// FOR j := 0 to 7 150/// m := j*32 151/// dst[m+31:m] := b 152/// ENDFOR 153/// dst[MAX:256] := 0 154/// \endcode 155static __inline__ __m256 __DEFAULT_FN_ATTRS256 156_mm256_bcstnesh_ps(const void *__A) { 157 return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A); 158} 159 160/// Convert packed BF16 (16-bit) floating-point even-indexed elements 161/// stored at memory locations starting at location \a __A to packed 162/// single-precision (32-bit) floating-point elements, and store the results in 163/// \a dst. 164/// 165/// \headerfile <x86intrin.h> 166/// 167/// \code 168/// _mm_cvtneebf16_ps(const __m128bh *__A); 169/// \endcode 170/// 171/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction. 172/// 173/// \param __A 174/// A pointer to a 128-bit memory location containing 8 consecutive 175/// BF16 (16-bit) floating-point values. 176/// \returns 177/// A 128-bit vector of [4 x float]. 178/// 179/// \code{.operation} 180/// FOR j := 0 to 3 181/// k := j*2 182/// i := k*16 183/// m := j*32 184/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) 185/// ENDFOR 186/// dst[MAX:128] := 0 187/// \endcode 188static __inline__ __m128 __DEFAULT_FN_ATTRS128 189_mm_cvtneebf16_ps(const __m128bh *__A) { 190 return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A); 191} 192 193/// Convert packed BF16 (16-bit) floating-point even-indexed elements 194/// stored at memory locations starting at location \a __A to packed 195/// single-precision (32-bit) floating-point elements, and store the results in 196/// \a dst. 197/// 198/// \headerfile <x86intrin.h> 199/// 200/// \code 201/// _mm256_cvtneebf16_ps(const __m256bh *__A); 202/// \endcode 203/// 204/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction. 205/// 206/// \param __A 207/// A pointer to a 256-bit memory location containing 16 consecutive 208/// BF16 (16-bit) floating-point values. 209/// \returns 210/// A 256-bit vector of [8 x float]. 211/// 212/// \code{.operation} 213/// FOR j := 0 to 7 214/// k := j*2 215/// i := k*16 216/// m := j*32 217/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) 218/// ENDFOR 219/// dst[MAX:256] := 0 220/// \endcode 221static __inline__ __m256 __DEFAULT_FN_ATTRS256 222_mm256_cvtneebf16_ps(const __m256bh *__A) { 223 return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A); 224} 225 226/// Convert packed half-precision (16-bit) floating-point even-indexed elements 227/// stored at memory locations starting at location \a __A to packed 228/// single-precision (32-bit) floating-point elements, and store the results in 229/// \a dst. 230/// 231/// \headerfile <x86intrin.h> 232/// 233/// \code 234/// _mm_cvtneeph_ps(const __m128h *__A); 235/// \endcode 236/// 237/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction. 238/// 239/// \param __A 240/// A pointer to a 128-bit memory location containing 8 consecutive 241/// half-precision (16-bit) floating-point values. 242/// \returns 243/// A 128-bit vector of [4 x float]. 244/// 245/// \code{.operation} 246/// FOR j := 0 to 3 247/// k := j*2 248/// i := k*16 249/// m := j*32 250/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) 251/// ENDFOR 252/// dst[MAX:128] := 0 253/// \endcode 254static __inline__ __m128 __DEFAULT_FN_ATTRS128 255_mm_cvtneeph_ps(const __m128h *__A) { 256 return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A); 257} 258 259/// Convert packed half-precision (16-bit) floating-point even-indexed elements 260/// stored at memory locations starting at location \a __A to packed 261/// single-precision (32-bit) floating-point elements, and store the results in 262/// \a dst. 263/// 264/// \headerfile <x86intrin.h> 265/// 266/// \code 267/// _mm256_cvtneeph_ps(const __m256h *__A); 268/// \endcode 269/// 270/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction. 271/// 272/// \param __A 273/// A pointer to a 256-bit memory location containing 16 consecutive 274/// half-precision (16-bit) floating-point values. 275/// \returns 276/// A 256-bit vector of [8 x float]. 277/// 278/// \code{.operation} 279/// FOR j := 0 to 7 280/// k := j*2 281/// i := k*16 282/// m := j*32 283/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) 284/// ENDFOR 285/// dst[MAX:256] := 0 286/// \endcode 287static __inline__ __m256 __DEFAULT_FN_ATTRS256 288_mm256_cvtneeph_ps(const __m256h *__A) { 289 return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A); 290} 291 292/// Convert packed BF16 (16-bit) floating-point odd-indexed elements 293/// stored at memory locations starting at location \a __A to packed 294/// single-precision (32-bit) floating-point elements, and store the results in 295/// \a dst. 296/// 297/// \headerfile <x86intrin.h> 298/// 299/// \code 300/// _mm_cvtneobf16_ps(const __m128bh *__A); 301/// \endcode 302/// 303/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction. 304/// 305/// \param __A 306/// A pointer to a 128-bit memory location containing 8 consecutive 307/// BF16 (16-bit) floating-point values. 308/// \returns 309/// A 128-bit vector of [4 x float]. 310/// 311/// \code{.operation} 312/// FOR j := 0 to 3 313/// k := j*2+1 314/// i := k*16 315/// m := j*32 316/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) 317/// ENDFOR 318/// dst[MAX:128] := 0 319/// \endcode 320static __inline__ __m128 __DEFAULT_FN_ATTRS128 321_mm_cvtneobf16_ps(const __m128bh *__A) { 322 return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A); 323} 324 325/// Convert packed BF16 (16-bit) floating-point odd-indexed elements 326/// stored at memory locations starting at location \a __A to packed 327/// single-precision (32-bit) floating-point elements, and store the results in 328/// \a dst. 329/// 330/// \headerfile <x86intrin.h> 331/// 332/// \code 333/// _mm256_cvtneobf16_ps(const __m256bh *__A); 334/// \endcode 335/// 336/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction. 337/// 338/// \param __A 339/// A pointer to a 256-bit memory location containing 16 consecutive 340/// BF16 (16-bit) floating-point values. 341/// \returns 342/// A 256-bit vector of [8 x float]. 343/// 344/// \code{.operation} 345/// FOR j := 0 to 7 346/// k := j*2+1 347/// i := k*16 348/// m := j*32 349/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) 350/// ENDFOR 351/// dst[MAX:256] := 0 352/// \endcode 353static __inline__ __m256 __DEFAULT_FN_ATTRS256 354_mm256_cvtneobf16_ps(const __m256bh *__A) { 355 return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A); 356} 357 358/// Convert packed half-precision (16-bit) floating-point odd-indexed elements 359/// stored at memory locations starting at location \a __A to packed 360/// single-precision (32-bit) floating-point elements, and store the results in 361/// \a dst. 362/// 363/// \headerfile <x86intrin.h> 364/// 365/// \code 366/// _mm_cvtneoph_ps(const __m128h *__A); 367/// \endcode 368/// 369/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction. 370/// 371/// \param __A 372/// A pointer to a 128-bit memory location containing 8 consecutive 373/// half-precision (16-bit) floating-point values. 374/// \returns 375/// A 128-bit vector of [4 x float]. 376/// 377/// \code{.operation} 378/// FOR j := 0 to 3 379/// k := j*2+1 380/// i := k*16 381/// m := j*32 382/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) 383/// ENDFOR 384/// dst[MAX:128] := 0 385/// \endcode 386static __inline__ __m128 __DEFAULT_FN_ATTRS128 387_mm_cvtneoph_ps(const __m128h *__A) { 388 return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A); 389} 390 391/// Convert packed half-precision (16-bit) floating-point odd-indexed elements 392/// stored at memory locations starting at location \a __A to packed 393/// single-precision (32-bit) floating-point elements, and store the results in 394/// \a dst. 395/// 396/// \headerfile <x86intrin.h> 397/// 398/// \code 399/// _mm256_cvtneoph_ps(const __m256h *__A); 400/// \endcode 401/// 402/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction. 403/// 404/// \param __A 405/// A pointer to a 256-bit memory location containing 16 consecutive 406/// half-precision (16-bit) floating-point values. 407/// \returns 408/// A 256-bit vector of [8 x float]. 409/// 410/// \code{.operation} 411/// FOR j := 0 to 7 412/// k := j*2+1 413/// i := k*16 414/// m := j*32 415/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) 416/// ENDFOR 417/// dst[MAX:256] := 0 418/// \endcode 419static __inline__ __m256 __DEFAULT_FN_ATTRS256 420_mm256_cvtneoph_ps(const __m256h *__A) { 421 return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A); 422} 423 424/// Convert packed single-precision (32-bit) floating-point elements in \a __A 425/// to packed BF16 (16-bit) floating-point elements, and store the results in \a 426/// dst. 427/// 428/// \headerfile <x86intrin.h> 429/// 430/// \code 431/// _mm_cvtneps_avx_pbh(__m128 __A); 432/// \endcode 433/// 434/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction. 435/// 436/// \param __A 437/// A 128-bit vector of [4 x float]. 438/// \returns 439/// A 128-bit vector of [8 x bfloat]. 440/// 441/// \code{.operation} 442/// FOR j := 0 to 3 443/// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) 444/// ENDFOR 445/// dst[MAX:128] := 0 446/// \endcode 447static __inline__ __m128bh __DEFAULT_FN_ATTRS128 448_mm_cvtneps_avx_pbh(__m128 __A) { 449 return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A); 450} 451 452/// Convert packed single-precision (32-bit) floating-point elements in \a __A 453/// to packed BF16 (16-bit) floating-point elements, and store the results in \a 454/// dst. 455/// 456/// \headerfile <x86intrin.h> 457/// 458/// \code 459/// _mm256_cvtneps_avx_pbh(__m256 __A); 460/// \endcode 461/// 462/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction. 463/// 464/// \param __A 465/// A 256-bit vector of [8 x float]. 466/// \returns 467/// A 128-bit vector of [8 x bfloat]. 468/// 469/// \code{.operation} 470/// FOR j := 0 to 7 471/// dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) 472/// ENDFOR 473/// dst[MAX:128] := 0 474/// \endcode 475static __inline__ __m128bh __DEFAULT_FN_ATTRS256 476_mm256_cvtneps_avx_pbh(__m256 __A) { 477 return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A); 478} 479 480#undef __DEFAULT_FN_ATTRS128 481#undef __DEFAULT_FN_ATTRS256 482 483#endif // __AVXNECONVERTINTRIN_H 484#endif // __SSE2__ 485