/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __GFNIINTRIN_H
#define __GFNIINTRIN_H

/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))

/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))

/* Default attributes for ZMM unmasked forms. */
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), __min_vector_width__(512)))

/* Default attributes for ZMM masked forms. */
#define __DEFAULT_FN_ATTRS_Z_MASK __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))

/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))

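/* Computes an affine transformation of the GF(2^8) multiplicative inverse of
 * each byte of A: the result byte is M * inv(x) + b, where M is the 8x8 bit
 * matrix stored in the corresponding 64-bit element of B (the row producing
 * result bit i is byte 7-i of that element) and b is the 8-bit immediate I.
 * inv(x) is taken with respect to the AES reduction polynomial
 * x^8 + x^4 + x^3 + x + 1 (0x11B), with inv(0) defined as 0. As an
 * illustrative (non-normative) example, the AES S-box can be expressed as
 * _mm_gf2p8affineinv_epi64_epi8(X, _mm_set1_epi64x(0xF1E3C78F1F3E7CF8), 0x63). */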
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
                                                   (__v16qi)(__m128i)(B), \
                                                   (char)(I)))

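/* Computes an affine transformation over GF(2) of each byte of A: the result
 * byte is M * x + b, with M and b encoded exactly as above (no field
 * inversion). Illustrative examples: the matrix 0x0102040810204080 is the
 * identity, and _mm_gf2p8affine_epi64_epi8(X,
 * _mm_set1_epi64x(0x8040201008040201), 0) reverses the bits of each byte. */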
#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
  ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
                                                (__v16qi)(__m128i)(B), \
                                                (char)(I)))

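/* Multiplies each byte of A by the corresponding byte of B in GF(2^8),
 * reducing products by the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11B). */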
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
              (__v16qi) __B);
}

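/* 256-bit (VEX-encoded) forms. Guarded so they are only visible when
 * <avxintrin.h> has already been pulled in through <immintrin.h>. */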
#ifdef __AVXINTRIN_H
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
  ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
                                                   (__v32qi)(__m256i)(B), \
                                                   (char)(I)))

#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
  ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
                                                (__v32qi)(__m256i)(B), \
                                                (char)(I)))

static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
              (__v32qi) __B);
}
#endif /* __AVXINTRIN_H */

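/* 512-bit (EVEX-encoded) forms plus merge- and zero-masked variants, visible
 * only when <avx512bwintrin.h> has been pulled in. */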
#ifdef __AVX512BWINTRIN_H
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
  ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
                                                   (__v64qi)(__m512i)(B), \
                                                   (char)(I)))

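/* The masked forms follow the usual AVX-512 convention: _mask_ variants take
 * a source S whose bytes pass through where the corresponding writemask bit
 * in U is clear, and _maskz_ variants zero those bytes instead. */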
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
         (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
         (__v64qi)(__m512i)(S)))

#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
  _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
         U, A, B, I)

#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
  ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
                                                (__v64qi)(__m512i)(B), \
                                                (char)(I)))

#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
         (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \
         (__v64qi)(__m512i)(S)))

#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
  _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
         U, A, B, I)

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
              (__v64qi) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_selectb_512(__U,
              (__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
              (__v64qi) __S);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
              __U, __A, __B);
}
#endif /* __AVX512BWINTRIN_H */

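/* 128- and 256-bit masked forms; these additionally require AVX512VL and are
 * visible only when <avx512vlbwintrin.h> has been pulled in. */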
#ifdef __AVX512VLBWINTRIN_H
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
         (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
         (__v16qi)(__m128i)(S)))

#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
  _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
         U, A, B, I)

#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
         (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
         (__v32qi)(__m256i)(S)))

#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
  _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
         U, A, B, I)

#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
         (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
         (__v16qi)(__m128i)(S)))

#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
  _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)

#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
         (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
         (__v32qi)(__m256i)(S)))

#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
  _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
         U, A, B, I)

static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_selectb_128(__U,
              (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
              (__v16qi) __S);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
              __U, __A, __B);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_selectb_256(__U,
              (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
              (__v32qi) __S);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
              __U, __A, __B);
}
#endif /* __AVX512VLBWINTRIN_H */

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_Y
#undef __DEFAULT_FN_ATTRS_Z
#undef __DEFAULT_FN_ATTRS_Z_MASK
#undef __DEFAULT_FN_ATTRS_VL128
#undef __DEFAULT_FN_ATTRS_VL256

#endif /* __GFNIINTRIN_H */