1/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10#ifndef __IMMINTRIN_H 11#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead." 12#endif 13 14#ifndef __AVXIFMAINTRIN_H 15#define __AVXIFMAINTRIN_H 16 17/* Define the default attributes for the functions in this file. */ 18#define __DEFAULT_FN_ATTRS128 \ 19 __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ 20 __min_vector_width__(128))) 21#define __DEFAULT_FN_ATTRS256 \ 22 __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ 23 __min_vector_width__(256))) 24 25// must vex-encoding 26 27/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y 28/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit 29/// unsigned integer from the intermediate result with the corresponding 30/// unsigned 64-bit integer in \a __X, and store the results in \a dst. 31/// 32/// \headerfile <immintrin.h> 33/// 34/// \code 35/// __m128i 36/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z) 37/// \endcode 38/// 39/// This intrinsic corresponds to the \c VPMADD52HUQ instruction. 40/// 41/// \return 42/// return __m128i dst. 43/// \param __X 44/// A 128-bit vector of [2 x i64] 45/// \param __Y 46/// A 128-bit vector of [2 x i64] 47/// \param __Z 48/// A 128-bit vector of [2 x i64] 49/// 50/// \code{.operation} 51/// FOR j := 0 to 1 52/// i := j*64 53/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) 54/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) 55/// ENDFOR 56/// dst[MAX:128] := 0 57/// \endcode 58static __inline__ __m128i __DEFAULT_FN_ATTRS128 59_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { 60 return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y, 61 (__v2di)__Z); 62} 63 64/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y 65/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit 66/// unsigned integer from the intermediate result with the corresponding 67/// unsigned 64-bit integer in \a __X, and store the results in \a dst. 68/// 69/// \headerfile <immintrin.h> 70/// 71/// \code 72/// __m256i 73/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z) 74/// \endcode 75/// 76/// This intrinsic corresponds to the \c VPMADD52HUQ instruction. 77/// 78/// \return 79/// return __m256i dst. 80/// \param __X 81/// A 256-bit vector of [4 x i64] 82/// \param __Y 83/// A 256-bit vector of [4 x i64] 84/// \param __Z 85/// A 256-bit vector of [4 x i64] 86/// 87/// \code{.operation} 88/// FOR j := 0 to 3 89/// i := j*64 90/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) 91/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) 92/// ENDFOR 93/// dst[MAX:256] := 0 94/// \endcode 95static __inline__ __m256i __DEFAULT_FN_ATTRS256 96_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { 97 return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, 98 (__v4di)__Z); 99} 100 101/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y 102/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit 103/// unsigned integer from the intermediate result with the corresponding 104/// unsigned 64-bit integer in \a __X, and store the results in \a dst. 105/// 106/// \headerfile <immintrin.h> 107/// 108/// \code 109/// __m128i 110/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z) 111/// \endcode 112/// 113/// This intrinsic corresponds to the \c VPMADD52LUQ instruction. 114/// 115/// \return 116/// return __m128i dst. 117/// \param __X 118/// A 128-bit vector of [2 x i64] 119/// \param __Y 120/// A 128-bit vector of [2 x i64] 121/// \param __Z 122/// A 128-bit vector of [2 x i64] 123/// 124/// \code{.operation} 125/// FOR j := 0 to 1 126/// i := j*64 127/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) 128/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) 129/// ENDFOR 130/// dst[MAX:128] := 0 131/// \endcode 132static __inline__ __m128i __DEFAULT_FN_ATTRS128 133_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { 134 return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, 135 (__v2di)__Z); 136} 137 138/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y 139/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit 140/// unsigned integer from the intermediate result with the corresponding 141/// unsigned 64-bit integer in \a __X, and store the results in \a dst. 142/// 143/// \headerfile <immintrin.h> 144/// 145/// \code 146/// __m256i 147/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z) 148/// \endcode 149/// 150/// This intrinsic corresponds to the \c VPMADD52LUQ instruction. 151/// 152/// \return 153/// return __m256i dst. 154/// \param __X 155/// A 256-bit vector of [4 x i64] 156/// \param __Y 157/// A 256-bit vector of [4 x i64] 158/// \param __Z 159/// A 256-bit vector of [4 x i64] 160/// 161/// \code{.operation} 162/// FOR j := 0 to 3 163/// i := j*64 164/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) 165/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) 166/// ENDFOR 167/// dst[MAX:256] := 0 168/// \endcode 169static __inline__ __m256i __DEFAULT_FN_ATTRS256 170_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { 171 return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, 172 (__v4di)__Z); 173} 174#undef __DEFAULT_FN_ATTRS128 175#undef __DEFAULT_FN_ATTRS256 176 177#endif // __AVXIFMAINTRIN_H 178