1/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9#ifndef __IMMINTRIN_H
10#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
11#endif
12
13#ifdef __SSE2__
14
15#ifndef __AVX512FP16INTRIN_H
16#define __AVX512FP16INTRIN_H
17
/* Define the vector types used by the functions in this file. */
19typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
20typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
21typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
22
23/* Define the default attributes for the functions in this file. */
/* Every intrinsic here requires the "avx512fp16" target feature; the
   __min_vector_width__ attribute records the widest vector register each
   function needs so the backend selects the right register class. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(512)))
/* Attributes for 256-bit (_mm256_*) intrinsics. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(256)))
/* Attributes for 128-bit (_mm_*) intrinsics. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(128)))
33
/// Returns the lowest (index 0) half-precision element of a 512-bit vector
/// of [32 x half].
static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}
37
/// Returns a 128-bit vector of [8 x half] with all elements set to zero.
static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

/// Returns a 256-bit vector of [16 x half] with all elements set to zero.
static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

/// Returns a 256-bit vector of [16 x half] with undefined contents.
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

/// Returns a 512-bit vector of [32 x half] with all elements set to zero.
static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

/// Returns a 128-bit vector of [8 x half] with undefined contents.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

/// Returns a 512-bit vector of [32 x half] with undefined contents.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}
64
/// Returns a 512-bit vector of [32 x half] with every element set to __h.
static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

/// Constructs a 512-bit vector of [32 x half] from 32 scalars.  Arguments
/// are given from the highest element down: __h1 becomes element 31 and
/// __h32 becomes element 0 (hence the reversed initializer below).
static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}

/* Reverse-order variant of _mm512_set_ph: h1 becomes element 0.  Implemented
   by forwarding to _mm512_set_ph with the arguments reversed. */
#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
                (h5), (h4), (h3), (h2), (h1))
95
/// Broadcasts a half-precision complex value (one 32-bit real/imaginary
/// pair) to all 16 complex lanes of a 512-bit vector.  The complex value is
/// bit-cast to a 32-bit float solely to reuse _mm512_set1_ps for the
/// broadcast; no numeric conversion takes place.
static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
}
100
/* Bit casts between half-precision vectors and same-width float / double /
   integer vectors.  These reinterpret the bits only; they generate no
   instructions and perform no value conversion. */

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}
176
/* Width-changing casts between half-precision vectors.  Narrowing casts keep
   the low elements; widening casts leave the upper elements UNDEFINED (the
   -1 shuffle indices), unlike the zext* intrinsics below which zero them. */

/// Returns the low 128 bits (elements 0-7) of a 256-bit vector of [16 x half].
static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

/// Returns the low 128 bits (elements 0-7) of a 512-bit vector of [32 x half].
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

/// Returns the low 256 bits (elements 0-15) of a 512-bit vector of
/// [32 x half].
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

/// Widens a 128-bit vector of [8 x half] to 256 bits; the upper 8 elements
/// are undefined.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1);
}

/// Widens a 128-bit vector of [8 x half] to 512 bits; the upper 24 elements
/// are undefined.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

/// Widens a 256-bit vector of [16 x half] to 512 bits; the upper 16 elements
/// are undefined.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}
212
/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a) {
  /* Indices 8-15 select from the second (zero) operand. */
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
231
/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a) {
  /* Indices 8-15 select from the second (zero) operand; they are repeated
     three times to fill the upper 24 result elements with zeros. */
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}
251
/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    256-bit floating-point vector of [16 x half]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a) {
  /* Indices 16-31 select from the second (zero) operand. */
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}
272
/* Scalar half-precision compare (VCOMISH) with explicit predicate P and
   rounding/exception control R; returns 0 or 1. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

/* Same with the current rounding direction. */
#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
278
/* Scalar half-precision comparisons of element 0, returning 0 or 1.
   The _mm_comi* forms use ordered-signaling predicates (_CMP_*_OS; *neq uses
   _CMP_NEQ_US) while the _mm_ucomi* forms use the quiet (_OQ/_UQ)
   predicates, which do not signal on quiet NaN operands. */

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
                                                            __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}
350
/// Adds two 512-bit vectors of [32 x half] elementwise.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

/// Merge-masked add: result lanes with a 0 bit in __U take the value of __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

/// Zero-masked add: result lanes with a 0 bit in __U are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

/* Rounding-control variants; R is an _MM_FROUND_* immediate. */
#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
382
/// Subtracts two 512-bit vectors of [32 x half] elementwise (__A - __B).
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

/// Merge-masked subtract: lanes with a 0 bit in __U take the value of __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

/// Zero-masked subtract: lanes with a 0 bit in __U are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

/* Rounding-control variants; R is an _MM_FROUND_* immediate. */
#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
414
/// Multiplies two 512-bit vectors of [32 x half] elementwise.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

/// Merge-masked multiply: lanes with a 0 bit in __U take the value of __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

/// Zero-masked multiply: lanes with a 0 bit in __U are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

/* Rounding-control variants; R is an _MM_FROUND_* immediate. */
#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
446
/// Divides two 512-bit vectors of [32 x half] elementwise (__A / __B).
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

/// Merge-masked divide: lanes with a 0 bit in __U take the value of __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

/// Zero-masked divide: lanes with a 0 bit in __U are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

/* Rounding-control variants; R is an _MM_FROUND_* immediate. */
#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
478
/// Elementwise minimum of two 512-bit vectors of [32 x half] (VMINPH).
/// NOTE(review): x86 min semantics, not IEEE minNum — when the operands
/// compare unordered the second operand is presumably returned; confirm
/// against the Intel SDM if NaN behavior matters to callers.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

/// Merge-masked minimum: lanes with a 0 bit in __U take the value of __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

/// Zero-masked minimum: lanes with a 0 bit in __U are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

/* Rounding-control variants; R is an _MM_FROUND_* immediate. */
#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
511
/// Elementwise maximum of two 512-bit vectors of [32 x half] (VMAXPH).
/// NOTE(review): x86 max semantics, not IEEE maxNum — when the operands
/// compare unordered the second operand is presumably returned; confirm
/// against the Intel SDM if NaN behavior matters to callers.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

/// Merge-masked maximum: lanes with a 0 bit in __U take the value of __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

/// Zero-masked maximum: lanes with a 0 bit in __U are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

/* Rounding-control variants; R is an _MM_FROUND_* immediate. */
#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
544
/// Elementwise absolute value of a 512-bit vector of [32 x half], computed
/// by clearing the sign bit of each 16-bit element (the AND mask 0x7FFF7FFF
/// covers two half elements per 32-bit lane).
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}
548
/// Complex conjugate of 16 packed half-precision complex numbers.  Each
/// 32-bit lane holds one complex value; XOR with -0.0f flips bit 31, which
/// is the sign bit of the high (imaginary) half element.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
}

/// Merge-masked conjugate: 32-bit complex lanes with a 0 bit in __U take
/// the value of __W (mask is per complex element, hence __mmask16).
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

/// Zero-masked conjugate: complex lanes with a 0 bit in __U are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}
565
/// Scalar half-precision add: element 0 of the result is __A[0] + __B[0];
/// elements 1-7 are copied from __A.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

/// Merge-masked scalar add: element 0 comes from __W when bit 0 of __U is 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

/// Zero-masked scalar add: element 0 is zeroed when bit 0 of __U is 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

/* Rounding-control variants; R is an _MM_FROUND_* immediate. */
#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
601
/// Scalar half-precision subtract: element 0 of the result is
/// __A[0] - __B[0]; elements 1-7 are copied from __A.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

/// Merge-masked scalar subtract: element 0 comes from __W when bit 0 of __U
/// is 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

/// Zero-masked scalar subtract: element 0 is zeroed when bit 0 of __U is 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

/* Rounding-control variants; R is an _MM_FROUND_* immediate. */
#define _mm_sub_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
637
/// Scalar half-precision multiply: element 0 of the result is
/// __A[0] * __B[0]; elements 1-7 are copied from __A.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

/// Merge-masked scalar multiply: element 0 comes from __W when bit 0 of __U
/// is 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

/// Zero-masked scalar multiply: element 0 is zeroed when bit 0 of __U is 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

/* Rounding-control variants; R is an _MM_FROUND_* immediate. */
#define _mm_mul_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
673
674static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
675                                                           __m128h __B) {
676  __A[0] /= __B[0];
677  return __A;
678}
679
680static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
681                                                                __mmask8 __U,
682                                                                __m128h __A,
683                                                                __m128h __B) {
684  __A = _mm_div_sh(__A, __B);
685  return __builtin_ia32_selectsh_128(__U, __A, __W);
686}
687
688static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
689                                                                 __m128h __A,
690                                                                 __m128h __B) {
691  __A = _mm_div_sh(__A, __B);
692  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
693}
694
// Scalar FP16 divide with an explicit rounding-mode immediate R (unmasked).
#define _mm_div_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

// Merge-masked variant: W supplies lane 0 when mask bit 0 of U is clear.
#define _mm_mask_div_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

// Zero-masked variant: lane 0 is zeroed when mask bit 0 of U is clear.
#define _mm_maskz_div_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
709
// Scalar FP16 minimum of the low elements, current rounding direction.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}
716
// Merge-masked scalar FP16 minimum: __W supplies lane 0 on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}
725
// Zero-masked scalar FP16 minimum: lane 0 is zeroed on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
733
// Scalar FP16 minimum with an explicit rounding/SAE immediate R (unmasked).
#define _mm_min_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

// Merge-masked variant: W supplies lane 0 when mask bit 0 of U is clear.
#define _mm_mask_min_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

// Zero-masked variant: lane 0 is zeroed when mask bit 0 of U is clear.
#define _mm_maskz_min_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
748
// Scalar FP16 maximum of the low elements, current rounding direction.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}
755
// Merge-masked scalar FP16 maximum: __W supplies lane 0 on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}
764
// Zero-masked scalar FP16 maximum: lane 0 is zeroed on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
772
// Scalar FP16 maximum with an explicit rounding/SAE immediate R (unmasked).
#define _mm_max_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

// Merge-masked variant: W supplies lane 0 when mask bit 0 of U is clear.
#define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

// Zero-masked variant: lane 0 is zeroed when mask bit 0 of U is clear.
#define _mm_maskz_max_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
787
// 512-bit packed FP16 compare: predicate immediate P selects the comparison,
// producing one mask bit per element; R is an explicit rounding/SAE control.
#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)-1, (int)(R)))

// Masked compare: result bits are zeroed where U is clear.
#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)(U), (int)(R)))

// Same compares using the current rounding direction.
#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
803
// Scalar FP16 compare of the low elements; predicate P selects the
// comparison and R is an explicit rounding/SAE control.
#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)-1, (int)(R)))

// Masked scalar compare: result bit is zeroed when M bit 0 is clear.
#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)(M), (int)(R)))

// Same compares using the current rounding direction.
#define _mm_cmp_sh_mask(X, Y, P)                                               \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
      _MM_FROUND_CUR_DIRECTION))
823// loads with vmovsh:
824static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
825  struct __mm_load_sh_struct {
826    _Float16 __u;
827  } __attribute__((__packed__, __may_alias__));
828  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
829  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
830}
831
// Merge-masked scalar load. The shuffle builds the passthrough vector:
// lane 0 from __W, lanes 1..7 zero (index 8 selects from the zero vector),
// so only lane 0 of __W can survive a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  // Only mask bit 0 is meaningful for a scalar load.
  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}
839
// Zero-masked scalar load: lane 0 is loaded or zeroed per __U bit 0;
// upper lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}
845
846static __inline__ __m512h __DEFAULT_FN_ATTRS512
847_mm512_load_ph(void const *__p) {
848  return *(const __m512h *)__p;
849}
850
851static __inline__ __m256h __DEFAULT_FN_ATTRS256
852_mm256_load_ph(void const *__p) {
853  return *(const __m256h *)__p;
854}
855
856static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
857  return *(const __m128h *)__p;
858}
859
// Unaligned 512-bit load; the packed, may_alias struct legalizes the
// unaligned access.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}
867
// Unaligned 256-bit load; the packed, may_alias struct legalizes the
// unaligned access.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}
875
// Unaligned 128-bit load; the packed, may_alias struct legalizes the
// unaligned access.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}
882
883// stores with vmovsh:
// Store element 0 of __a as a single _Float16 to an arbitrarily aligned
// address; the packed, may_alias struct legalizes the unaligned write.
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}
891
// Masked scalar store: element 0 of __A is written only when __U bit 0 is
// set (only bit 0 is meaningful for a scalar store).
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}
897
898static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
899                                                             __m512h __A) {
900  *(__m512h *)__P = __A;
901}
902
903static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
904                                                             __m256h __A) {
905  *(__m256h *)__P = __A;
906}
907
908static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
909                                                          __m128h __A) {
910  *(__m128h *)__P = __A;
911}
912
// Unaligned 512-bit store; the packed, may_alias struct legalizes the
// unaligned write.
static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}
920
// Unaligned 256-bit store; the packed, may_alias struct legalizes the
// unaligned write.
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}
928
// Unaligned 128-bit store; the packed, may_alias struct legalizes the
// unaligned write.
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}
936
937// moves with vmovsh:
938static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
939                                                            __m128h __b) {
940  __a[0] = __b[0];
941  return __a;
942}
943
944static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
945                                                                 __mmask8 __U,
946                                                                 __m128h __A,
947                                                                 __m128h __B) {
948  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
949}
950
951static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
952                                                                  __m128h __A,
953                                                                  __m128h __B) {
954  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
955                                     _mm_setzero_ph());
956}
957
958// vmovw:
959static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
960  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
961}
962
963static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
964  __v8hi __b = (__v8hi)__a;
965  return __b[0];
966}
967
// Approximate reciprocal of 32 packed FP16 elements (unmasked; the
// undefined passthrough is never selected since the mask is all-ones).
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}
972
// Merge-masked reciprocal: elements with a clear mask bit come from __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}
978
// Zero-masked reciprocal: elements with a clear mask bit are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}
984
// Approximate reciprocal square root of 32 packed FP16 elements (unmasked).
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}
989
// Merge-masked rsqrt: elements with a clear mask bit come from __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}
995
// Zero-masked rsqrt: elements with a clear mask bit are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}
1001
// Extract normalized mantissas of 32 packed FP16 elements. B selects the
// interval, C the sign control; they are packed into one immediate as
// (C << 2) | B.
#define _mm512_getmant_ph(A, B, C)                                             \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
      _MM_FROUND_CUR_DIRECTION))

// Merge-masked variant: elements with a clear mask bit come from W.
#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

// Zero-masked variant: elements with a clear mask bit are zeroed.
#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

// Same three forms with an explicit rounding/SAE immediate R.
#define _mm512_getmant_round_ph(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1032
// Extract exponents of 32 packed FP16 elements as FP16 values (unmasked).
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}
1038
// Merge-masked getexp: elements with a clear mask bit come from __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
1044
// Zero-masked getexp: elements with a clear mask bit are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1051
// getexp with an explicit rounding/SAE immediate R (unmasked, merge-masked,
// and zero-masked forms).
#define _mm512_getexp_round_ph(A, R)                                           \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R)                                  \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))
1065
// Scale 32 packed FP16 elements of __A by powers of two taken from __B
// (unmasked).
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}
1072
// Merge-masked scalef: elements with a clear mask bit come from __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}
1079
// Zero-masked scalef: elements with a clear mask bit are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1086
// scalef with an explicit rounding immediate R (unmasked, merge-masked,
// and zero-masked forms).
#define _mm512_scalef_round_ph(A, B, R)                                        \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1101
// Round 32 packed FP16 elements to the precision given by immediate B.
// The passthrough is A itself; it is never selected because the mask is
// all-ones.
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1,   \
      _MM_FROUND_CUR_DIRECTION))

// Merge-masked: A is the passthrough, B the mask, C the source, imm the
// precision immediate.
#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

// Zero-masked: A is the mask, B the source.
#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

// Same forms with an explicit rounding/SAE immediate R.
#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
                                           (__v32hf)(__m512h)(A),              \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
                                           (__v32hf)_mm512_setzero_ph(),       \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
                                           (__v32hf)_mm512_undefined_ph(),     \
                                           (__mmask32)-1, (int)(R)))
1131
// Per-element reduction transform of 32 packed FP16 elements controlled by
// the imm immediate (unmasked form).
#define _mm512_reduce_ph(A, imm)                                               \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

// Merge-masked variant: elements with a clear mask bit come from W.
#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

// Zero-masked variant: elements with a clear mask bit are zeroed.
#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

// Same forms with an explicit rounding/SAE immediate R.
#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W),             \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))
1161
// Approximate reciprocal of the low FP16 element (unmasked).
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}
1167
// Merge-masked scalar reciprocal: __W supplies lane 0 on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}
1175
// Zero-masked scalar reciprocal: lane 0 is zeroed on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
1182
// Approximate reciprocal square root of the low FP16 element (unmasked).
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}
1188
// Merge-masked scalar rsqrt: __W supplies lane 0 on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}
1196
// Zero-masked scalar rsqrt: lane 0 is zeroed on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
1202
// Scalar FP16 getmant: C selects the interval, D the sign control, packed
// into one immediate as (D << 2) | C; R is an explicit rounding/SAE control.
#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

// Current-rounding-direction form.
#define _mm_getmant_sh(A, B, C, D)                                             \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

// Merge-masked forms: W supplies lane 0 when mask bit 0 of U is clear.
#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

// Zero-masked forms: lane 0 is zeroed when mask bit 0 of U is clear.
#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1232
// Scalar FP16 getexp with an explicit rounding/SAE immediate R (unmasked).
#define _mm_getexp_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

// Scalar FP16 getexp of the low element, current rounding direction.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

// Merge-masked getexp: __W supplies lane 0 on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Merge-masked getexp with an explicit rounding/SAE immediate R.
#define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

// Zero-masked getexp: lane 0 is zeroed on a clear mask bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Zero-masked getexp with an explicit rounding/SAE immediate R.
#define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1268
1269#define _mm_scalef_round_sh(A, B, R)                                           \
1270  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
1271      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
1272      (__mmask8)-1, (int)(R)))
1273
1274static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
1275                                                              __m128h __B) {
1276  return (__m128h)__builtin_ia32_scalefsh_round_mask(
1277      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1278      _MM_FROUND_CUR_DIRECTION);
1279}
1280
1281static __inline__ __m128h __DEFAULT_FN_ATTRS128
1282_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1283  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1284                                                     (__v8hf)__W, (__mmask8)__U,
1285                                                     _MM_FROUND_CUR_DIRECTION);
1286}
1287
1288#define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
1289  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
1290      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
1291      (__mmask8)(U), (int)(R)))
1292
1293static __inline__ __m128h __DEFAULT_FN_ATTRS128
1294_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1295  return (__m128h)__builtin_ia32_scalefsh_round_mask(
1296      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1297      _MM_FROUND_CUR_DIRECTION);
1298}
1299
/* Zero-masked scalar SCALEF with explicit rounding R. */
#define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1304
/* Scalar RNDSCALE family: imm/I is the VRNDSCALESH immediate (rounding
   behavior selector); R, where present, is an explicit _MM_FROUND_* rounding
   control, otherwise the current direction is used.  Unmasked variants pass
   an all-ones mask; maskz variants pass a zero source vector. */
#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm)                                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

/* Merge-masked: W is the masked-off source. */
#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), (int)(R)))

/* Zero-masked. */
#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), (int)(R)))
1334
/* Scalar REDUCE family: C is the VREDUCESH immediate.  Unmasked / merge-mask
   / zero-mask variants with and without an explicit rounding control R. */
#define _mm_reduce_sh(A, B, C)                                                 \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), (int)(R)))
1364
/* Packed FP16 square root with explicit rounding R; masked variants are
   built by wrapping the unmasked result in a selectph. */
#define _mm512_sqrt_round_ph(A, R)                                             \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)_mm512_setzero_ph()))
1377
/* Packed FP16 square root of all 32 elements, current rounding direction. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}
1382
1383static __inline__ __m512h __DEFAULT_FN_ATTRS512
1384_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1385  return (__m512h)__builtin_ia32_selectph_512(
1386      (__mmask32)(__U),
1387      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1388      (__v32hf)(__m512h)(__W));
1389}
1390
1391static __inline__ __m512h __DEFAULT_FN_ATTRS512
1392_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
1393  return (__m512h)__builtin_ia32_selectph_512(
1394      (__mmask32)(__U),
1395      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1396      (__v32hf)_mm512_setzero_ph());
1397}
1398
/* Scalar FP16 square root (SQRTSH) with explicit rounding R: unmasked,
   merge-masked (masked-off source W), and zero-masked forms. */
#define _mm_sqrt_round_sh(A, B, R)                                             \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R)                                    \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1413
1414static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
1415                                                            __m128h __B) {
1416  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1417      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1418      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
1419}
1420
1421static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
1422                                                                 __mmask32 __U,
1423                                                                 __m128h __A,
1424                                                                 __m128h __B) {
1425  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1426      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1427      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1428}
1429
1430static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
1431                                                                  __m128h __A,
1432                                                                  __m128h __B) {
1433  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1434      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1435      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1436}
1437
/* FPCLASS: test each FP16 element (packed) or the lower element (scalar)
   against the categories selected by imm; result is a bitmask, optionally
   ANDed with U by the masked forms. */
#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))
1453
/* Convert 8 packed doubles to FP16 with explicit rounding R; unmasked form
   uses an undefined passthrough (never selected), maskz zeroes. */
#define _mm512_cvt_roundpd_ph(A, R)                                            \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundpd_ph(W, U, A, R)                                 \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ph(U, A, R)                                   \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1465
1466static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1467  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1468      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1469      _MM_FROUND_CUR_DIRECTION);
1470}
1471
/* Merge-masked double -> FP16 conversion; masked-off lanes come from __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}
1477
1478static __inline__ __m128h __DEFAULT_FN_ATTRS512
1479_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1480  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1481      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1482      _MM_FROUND_CUR_DIRECTION);
1483}
1484
/* Convert 8 FP16 elements to packed doubles with explicit rounding R. */
#define _mm512_cvt_roundph_pd(A, R)                                            \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
      (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_pd(W, U, A, R)                                 \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_pd(U, A, R)                                   \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
      (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1496
1497static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1498  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1499      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1500      _MM_FROUND_CUR_DIRECTION);
1501}
1502
/* Merge-masked FP16 -> double conversion; masked-off lanes come from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}
1508
1509static __inline__ __m512d __DEFAULT_FN_ATTRS512
1510_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1511  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1512      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1513      _MM_FROUND_CUR_DIRECTION);
1514}
1515
/* Scalar FP16 -> float conversion with explicit rounding R. */
#define _mm_cvt_roundsh_ss(A, B, R)                                            \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
                                               (__v4sf)_mm_undefined_ps(),     \
                                               (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R)                                 \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask(                                \
      (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_ss(U, A, B, R)                                   \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
                                               (__v4sf)_mm_setzero_ps(),       \
                                               (__mmask8)(U), (int)(R)))
1529
1530static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1531                                                            __m128h __B) {
1532  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1533      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1534      _MM_FROUND_CUR_DIRECTION);
1535}
1536
/* Merge-masked scalar FP16 -> float conversion; masked-off source __W. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
                                                                 __mmask8 __U,
                                                                 __m128 __A,
                                                                 __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
                                                     (__v4sf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}
1545
1546static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1547                                                                  __m128 __A,
1548                                                                  __m128h __B) {
1549  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1550      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1551      _MM_FROUND_CUR_DIRECTION);
1552}
1553
/* Scalar float -> FP16 conversion with explicit rounding R. */
#define _mm_cvt_roundss_sh(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
                                                (__v8hf)_mm_undefined_ph(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundss_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask(                               \
      (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sh(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
                                                (__v8hf)_mm_setzero_ph(),      \
                                                (__mmask8)(U), (int)(R)))
1567
1568static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1569                                                             __m128 __B) {
1570  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1571      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1572      _MM_FROUND_CUR_DIRECTION);
1573}
1574
/* Merge-masked scalar float -> FP16 conversion; masked-off source __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1583
1584static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1585                                                                   __m128h __A,
1586                                                                   __m128 __B) {
1587  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1588      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1589      _MM_FROUND_CUR_DIRECTION);
1590}
1591
/* Scalar double -> FP16 conversion with explicit rounding R. */
#define _mm_cvt_roundsd_sh(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
                                                (__v8hf)_mm_undefined_ph(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask(                               \
      (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_sh(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
                                                (__v8hf)_mm_setzero_ph(),      \
                                                (__mmask8)(U), (int)(R)))
1605
1606static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1607                                                             __m128d __B) {
1608  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1609      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1610      _MM_FROUND_CUR_DIRECTION);
1611}
1612
/* Merge-masked scalar double -> FP16 conversion; masked-off source __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1621
1622static __inline__ __m128h __DEFAULT_FN_ATTRS128
1623_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1624  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1625      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1626      _MM_FROUND_CUR_DIRECTION);
1627}
1628
/* Scalar FP16 -> double conversion with explicit rounding R. */
#define _mm_cvt_roundsh_sd(A, B, R)                                            \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_undefined_pd(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R)                                 \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask(                               \
      (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_sd(U, A, B, R)                                   \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_setzero_pd(),      \
                                                (__mmask8)(U), (int)(R)))
1642
1643static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1644                                                             __m128h __B) {
1645  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1646      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1647      _MM_FROUND_CUR_DIRECTION);
1648}
1649
/* Merge-masked scalar FP16 -> double conversion; masked-off source __W. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
                                                                  __mmask8 __U,
                                                                  __m128d __A,
                                                                  __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1658
1659static __inline__ __m128d __DEFAULT_FN_ATTRS128
1660_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1661  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1662      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1663      _MM_FROUND_CUR_DIRECTION);
1664}
1665
/* Packed FP16 -> signed 16-bit integer conversion with explicit rounding R. */
#define _mm512_cvt_roundph_epi16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_undefined_epi32(), \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_setzero_epi32(),   \
                                            (__mmask32)(U), (int)(R)))
1679
1680static __inline__ __m512i __DEFAULT_FN_ATTRS512
1681_mm512_cvtph_epi16(__m512h __A) {
1682  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1683      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1684      _MM_FROUND_CUR_DIRECTION);
1685}
1686
/* Merge-masked FP16 -> signed word conversion; masked-off lanes from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
1692
1693static __inline__ __m512i __DEFAULT_FN_ATTRS512
1694_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1695  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1696      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1697      _MM_FROUND_CUR_DIRECTION);
1698}
1699
/* Truncating packed FP16 -> signed 16-bit integer conversion; R only
   controls SAE behavior for the truncating form. */
#define _mm512_cvtt_roundph_epi16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask(                                  \
      (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A),                     \
                                             (__v32hi)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))
1713
1714static __inline__ __m512i __DEFAULT_FN_ATTRS512
1715_mm512_cvttph_epi16(__m512h __A) {
1716  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1717      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1718      _MM_FROUND_CUR_DIRECTION);
1719}
1720
/* Merge-masked truncating FP16 -> signed word; masked-off lanes from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
1726
1727static __inline__ __m512i __DEFAULT_FN_ATTRS512
1728_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1729  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1730      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1731      _MM_FROUND_CUR_DIRECTION);
1732}
1733
/* Signed 16-bit integer -> packed FP16 conversion with explicit rounding R. */
#define _mm512_cvt_roundepi16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A),                      \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask(                                   \
      (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1746
1747static __inline__ __m512h __DEFAULT_FN_ATTRS512
1748_mm512_cvtepi16_ph(__m512i __A) {
1749  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1750      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1751      _MM_FROUND_CUR_DIRECTION);
1752}
1753
/* Merge-masked signed word -> FP16 conversion; masked-off lanes from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
1759
1760static __inline__ __m512h __DEFAULT_FN_ATTRS512
1761_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1762  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1763      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1764      _MM_FROUND_CUR_DIRECTION);
1765}
1766
/* Packed FP16 -> unsigned 16-bit integer conversion with explicit rounding. */
#define _mm512_cvt_roundph_epu16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask(                                  \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A),                     \
                                             (__v32hu)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))
1780
1781static __inline__ __m512i __DEFAULT_FN_ATTRS512
1782_mm512_cvtph_epu16(__m512h __A) {
1783  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1784      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1785      _MM_FROUND_CUR_DIRECTION);
1786}
1787
/* Merge-masked FP16 -> unsigned word; masked-off lanes from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
1793
1794static __inline__ __m512i __DEFAULT_FN_ATTRS512
1795_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1796  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1797      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1798      _MM_FROUND_CUR_DIRECTION);
1799}
1800
/* Truncating packed FP16 -> unsigned 16-bit integer conversion. */
#define _mm512_cvtt_roundph_epu16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask(                                 \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W),      \
                                              (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A),                    \
                                              (__v32hu)_mm512_setzero_epi32(), \
                                              (__mmask32)(U), (int)(R)))
1814
1815static __inline__ __m512i __DEFAULT_FN_ATTRS512
1816_mm512_cvttph_epu16(__m512h __A) {
1817  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1818      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1819      _MM_FROUND_CUR_DIRECTION);
1820}
1821
/* Merge-masked truncating FP16 -> unsigned word; masked-off lanes from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
1827
1828static __inline__ __m512i __DEFAULT_FN_ATTRS512
1829_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1830  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1831      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1832      _MM_FROUND_CUR_DIRECTION);
1833}
1834
/* Unsigned 16-bit integer -> packed FP16 conversion with explicit rounding. */
#define _mm512_cvt_roundepu16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A),                     \
                                             (__v32hf)_mm512_undefined_ph(),   \
                                             (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask(                                  \
      (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1847
1848static __inline__ __m512h __DEFAULT_FN_ATTRS512
1849_mm512_cvtepu16_ph(__m512i __A) {
1850  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1851      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1852      _MM_FROUND_CUR_DIRECTION);
1853}
1854
/* Merge-masked unsigned word -> FP16 conversion; masked-off lanes from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
1860
1861static __inline__ __m512h __DEFAULT_FN_ATTRS512
1862_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1863  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1864      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1865      _MM_FROUND_CUR_DIRECTION);
1866}
1867
/* 16 packed FP16 elements -> signed 32-bit integers with explicit rounding. */
#define _mm512_cvt_roundph_epi32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask(                                  \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A),                     \
                                             (__v16si)_mm512_setzero_epi32(),  \
                                             (__mmask16)(U), (int)(R)))
1881
1882static __inline__ __m512i __DEFAULT_FN_ATTRS512
1883_mm512_cvtph_epi32(__m256h __A) {
1884  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1885      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1886      _MM_FROUND_CUR_DIRECTION);
1887}
1888
1889static __inline__ __m512i __DEFAULT_FN_ATTRS512
1890_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1891  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1892      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1893}
1894
1895static __inline__ __m512i __DEFAULT_FN_ATTRS512
1896_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1897  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1898      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1899      _MM_FROUND_CUR_DIRECTION);
1900}
1901
/* 16 FP16 elements -> 16 unsigned 32-bit integers (VCVTPH2UDQ); same
   unmasked/merge/zero masking pattern as the signed variants above. */
#define _mm512_cvt_roundph_epu32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask(                                 \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A),                    \
                                              (__v16su)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1935
/* 16 signed 32-bit integers (512-bit source) -> 16 FP16 elements
   (VCVTDQ2PH), narrowing to a 256-bit result. */
#define _mm512_cvt_roundepi32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A),                     \
                                             (__v16hf)_mm256_undefined_ph(),   \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask(                                  \
      (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: masked-off FP16 elements come from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: masked-off FP16 elements are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1968
/* 16 unsigned 32-bit integers -> 16 FP16 elements (VCVTUDQ2PH), narrowing to
   a 256-bit result; same masking pattern as the signed variants above. */
#define _mm512_cvt_roundepu32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask(                                 \
      (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2001
/* Truncating (round-toward-zero) conversion: 16 FP16 elements -> 16 signed
   32-bit integers (VCVTTPH2DQ).  For these "cvtt" forms R controls SAE
   (exception suppression), not the rounding direction. */
#define _mm512_cvtt_roundph_epi32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask(                                 \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A),                    \
                                              (__v16si)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2035
/* Truncating conversion: 16 FP16 elements -> 16 unsigned 32-bit integers
   (VCVTTPH2UDQ); same masking pattern as the signed "cvtt" variants above. */
#define _mm512_cvtt_roundph_epu32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W),     \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U),           \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2069
/* 8 signed 64-bit integers (512-bit source) -> 8 FP16 elements (VCVTQQ2PH),
   narrowing to a 128-bit result. */
#define _mm512_cvt_roundepi64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2101
/* 8 FP16 elements (128-bit source) -> 8 signed 64-bit integers (VCVTPH2QQ),
   widening to a 512-bit result.  _mm512_setzero_epi32() is reused as the
   all-zero 512-bit source; the __v8di cast reinterprets it as qwords. */
#define _mm512_cvt_roundph_epi64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A),                      \
                                             (__v8di)_mm512_undefined_epi32(), \
                                             (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask(                                  \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2134
/* 8 unsigned 64-bit integers -> 8 FP16 elements (VCVTUQQ2PH), narrowing to a
   128-bit result; same masking pattern as the signed variants above. */
#define _mm512_cvt_roundepu64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2166
/* 8 FP16 elements -> 8 unsigned 64-bit integers (VCVTPH2UQQ), widening to a
   512-bit result; same masking pattern as the signed variants above. */
#define _mm512_cvt_roundph_epu64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2199
/* Truncating conversion: 8 FP16 elements -> 8 signed 64-bit integers
   (VCVTTPH2QQ); R in the *_round_* macros controls SAE. */
#define _mm512_cvtt_roundph_epi64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2232
/* Truncating conversion: 8 FP16 elements -> 8 unsigned 64-bit integers
   (VCVTTPH2UQQ); same masking pattern as the signed "cvtt" variants above. */
#define _mm512_cvtt_roundph_epu64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W),       \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2265
/* Scalar: convert the lowest FP16 element of A to a signed 32-bit integer
   (VCVTSH2SI) with rounding control R. */
#define _mm_cvt_roundsh_i32(A, R)                                              \
  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))

/* Scalar FP16 -> signed 32-bit int, current rounding direction. */
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}

/* Scalar: convert the lowest FP16 element of A to an unsigned 32-bit integer
   (VCVTSH2USI) with rounding control R. */
#define _mm_cvt_roundsh_u32(A, R)                                              \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))

/* Scalar FP16 -> unsigned 32-bit int, current rounding direction. */
static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvtsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2281
/* 64-bit scalar conversions are only available when targeting x86-64. */
#ifdef __x86_64__
/* Scalar: lowest FP16 element of A -> signed 64-bit integer (VCVTSH2SI). */
#define _mm_cvt_roundsh_i64(A, R)                                              \
  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
                                               _MM_FROUND_CUR_DIRECTION);
}

/* Scalar: lowest FP16 element of A -> unsigned 64-bit integer (VCVTSH2USI). */
#define _mm_cvt_roundsh_u64(A, R)                                              \
  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif // __x86_64__
2300
/* Scalar: convert unsigned 32-bit integer B to FP16 and insert it as element
   0 of A (VCVTUSI2SH) with rounding control R; upper elements of A pass
   through. */
#define _mm_cvt_roundu32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))

/* Non-rounding form: the ordinary C conversion of __B to _Float16 replaces
   element 0 of __A; the remaining elements are returned unchanged. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
/* Scalar: unsigned 64-bit integer B -> FP16 in element 0 of A (VCVTUSI2SH),
   x86-64 only. */
#define _mm_cvt_roundu64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
                                        (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
  __A[0] = __B;
  return __A;
}
#endif
2321
/* Scalar: convert signed 32-bit integer B to FP16 and insert it as element 0
   of A (VCVTSI2SH) with rounding control R; upper elements pass through. */
#define _mm_cvt_roundi32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))

/* Non-rounding form: plain C conversion of __B into element 0 of __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
                                                              int __B) {
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
/* Scalar: signed 64-bit integer B -> FP16 in element 0 of A (VCVTSI2SH),
   x86-64 only. */
#define _mm_cvt_roundi64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
                                                              long long __B) {
  __A[0] = __B;
  return __A;
}
#endif
2341
/* Truncating scalar conversions: lowest FP16 element -> integer, rounding
   toward zero (VCVTTSH2SI/VCVTTSH2USI).  R in the *_round_* macros controls
   SAE; the 64-bit forms are x86-64 only. */
#define _mm_cvtt_roundsh_i32(A, R)                                             \
  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
                                          _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsh_i64(A, R)                                             \
  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
                                                _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundsh_u32(A, R)                                             \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvttsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsh_u64(A, R)                                             \
  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif
2379
/* 16 FP16 elements (256-bit source) -> 16 single-precision floats
   (VCVTPH2PSX), widening to a 512-bit result. */
#define _mm512_cvtx_roundph_ps(A, R)                                           \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A),                     \
                                             (__v16sf)_mm512_undefined_ps(),   \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundph_ps(W, U, A, R)                                \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundph_ps(U, A, R)                                  \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask(                                  \
      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: masked-off float elements come from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: masked-off float elements are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2411
/* 16 single-precision floats (512-bit source) -> 16 FP16 elements
   (VCVTPS2PHX), narrowing to a 256-bit result. */
#define _mm512_cvtx_roundps_ph(A, R)                                           \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundps_ph(W, U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundps_ph(U, A, R)                                  \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask(                                 \
      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: masked-off FP16 elements come from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: masked-off FP16 elements are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2443
/* Fused multiply-add family on 32 FP16 elements with explicit rounding
   control R.  All forms are expressed through the vfmaddph512 builtins by
   negating operands: fmsub negates C, fnmadd negates one multiplicand, and
   fnmsub negates a multiplicand and C.  Masking convention: the _mask
   builtin merges masked-off elements from the first operand, _mask3 from
   the third, and _maskz zeroes them. */
#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* fmsub = A*B - C: fmadd with C negated. */
#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* fnmadd = -(A*B) + C: one multiplicand is negated.  Which operand carries
   the negation is chosen per variant so the masked (merged) operand is
   passed through unnegated. */
#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* fnmsub = -(A*B) - C: a multiplicand and C are both negated. */
#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))
2503
/* A*B + C on 32 FP16 elements, current rounding direction. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
                                                                __m512h __B,
                                                                __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: masked-off elements keep the value from __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking: masked-off elements keep the value from __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking: masked-off elements are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* A*B - C: fmadd with __C negated. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
                                                                __m512h __B,
                                                                __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2554
2555static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2556                                                                 __m512h __B,
2557                                                                 __m512h __C) {
2558  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2559                                                  (__v32hf)__C, (__mmask32)-1,
2560                                                  _MM_FROUND_CUR_DIRECTION);
2561}
2562
2563static __inline__ __m512h __DEFAULT_FN_ATTRS512
2564_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2565  return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2566                                                   (__v32hf)__C, (__mmask32)__U,
2567                                                   _MM_FROUND_CUR_DIRECTION);
2568}
2569
2570static __inline__ __m512h __DEFAULT_FN_ATTRS512
2571_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2572  return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2573                                                   (__v32hf)__C, (__mmask32)__U,
2574                                                   _MM_FROUND_CUR_DIRECTION);
2575}
2576
2577static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2578                                                                 __m512h __B,
2579                                                                 __m512h __C) {
2580  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2581                                                  -(__v32hf)__C, (__mmask32)-1,
2582                                                  _MM_FROUND_CUR_DIRECTION);
2583}
2584
2585static __inline__ __m512h __DEFAULT_FN_ATTRS512
2586_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2587  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2588      -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2589      _MM_FROUND_CUR_DIRECTION);
2590}
2591
/* 512-bit packed FP16 fmaddsub/fmsubadd with an explicit rounding mode R.
 * fmaddsub alternates: even lanes compute A*B - C, odd lanes A*B + C
 * (Intel VFMADDSUBPH semantics); fmsubadd is the opposite parity, obtained
 * here by negating C.  Mask conventions match the rest of the file:
 * _mask merges with A, _mask3 merges with C, _maskz zeroes.  */

#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* fmsubadd = fmaddsub with C negated (flips the add/sub parity). */
#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* Note: no mask3 fmsubadd here — that form needs the dedicated
   vfmsubaddph512_mask3 builtin (defined further below) because the
   pass-through operand C must not be negated. */
#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2626
/* Current-rounding-mode versions of the fmaddsub/fmsubadd family above.
 * Even lanes: A*B - C; odd lanes: A*B + C (swapped for fmsubadd). */

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked with __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked with __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* fmsubadd = fmaddsub with C negated. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked with __A; safe to negate C since C is not the pass-through. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2675
/* mask3 variants of fmsub/fmsubadd/fnmsub need dedicated vfmsub* builtins:
 * the masked pass-through operand is C, so C cannot be negated the way the
 * _mask/_maskz forms do.  The remaining fnmadd/fnmsub merge forms below can
 * still reuse vfmaddph512 by negating non-pass-through operands. */

/* A*B - C with explicit rounding, inactive lanes keep C. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* fmsubadd with explicit rounding, inactive lanes keep C. */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* -(A*B) + C, inactive lanes keep A (so only B is negated). */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* -(A*B) - C, inactive lanes keep A (B and C negated). */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))

/* -(A*B) - C, inactive lanes keep C: use the fmsub builtin with A negated. */
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2735
/* Scalar _Float16 FMA (element 0 only): result[0] = W[0]*A[0] + B[0];
 * the upper seven elements pass through from the first operand per the
 * Intel scalar-FMA convention.  _mask keeps W[0] when __U bit 0 is clear,
 * _maskz zeroes it, _mask3 keeps Y[0]. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
                                                             __m128h __A,
                                                             __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Same operations with an explicit rounding mode R. */
#define _mm_fmadd_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
2784
/* Scalar FP16 fused multiply-subtract: result[0] = W[0]*A[0] - B[0],
 * derived from the fmadd builtin by negating the addend operand. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
                                                             __m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
                                                -(__v8hf)__B, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: element 0 keeps __W when __U bit 0 is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
                                                -(__v8hf)__B, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

/* Explicit-rounding versions. */
#define _mm_fmsub_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmsub_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),       \
      (__mmask8)(U), (int)(R)))

/* Zero-masked: element 0 becomes 0 when __U bit 0 is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
                                                 -(__v8hf)__C, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}
2818
/* Scalar FP16 fused multiply-subtract, zero-masked, with an explicit rounding
 * mode: result[0] = U[0] ? A[0]*B[0] - C[0] : 0.
 * Fix: parenthesize the R macro argument — every other *_round_* macro in this
 * file expands R as (int)(R); the bare (int)R binds the cast to only the first
 * token of a compound argument expression. */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))
2823
/* mask3 fmsub: inactive element 0 keeps __Y, so the subtrahend cannot be
 * negated — a dedicated vfmsubsh3 builtin is used instead. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Explicit-rounding version of the above. */
#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
2835
/* Scalar FP16 fused negated-multiply-add: result[0] = -(W[0]*A[0]) + B[0],
 * obtained by negating the second multiplicand of the fmadd builtin. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
                                                              __m128h __A,
                                                              __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked with __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Explicit-rounding versions. */
#define _mm_fnmadd_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmadd_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),       \
      (__mmask8)(U), (int)(R)))

/* Zero-masked. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))

/* mask3: inactive element 0 keeps __Y; negating __X is safe. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))
2882
/* Scalar FP16 fused negated-multiply-subtract:
 * result[0] = -(W[0]*A[0]) - B[0], via fmadd with both A and B negated. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
                                                              __m128h __A,
                                                              __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked with __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Explicit-rounding versions. */
#define _mm_fnmsub_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmsub_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),      \
      (__mmask8)(U), (int)(R)))

/* Zero-masked. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)(U), (int)(R)))

/* mask3: inactive element 0 keeps __Y, so the fmsub builtin is used and only
 * the multiplicand __X is negated. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))
2929
/* Scalar complex FP16 conjugate multiply-add (VFCMADDCSH).  Each complex
 * number is a (real, imag) pair of _Float16, i.e. 32 bits — hence the
 * operands are reinterpreted as __v4sf.  The 'c' builtin computes the
 * complex-conjugate form of A*B + C on element 0 per the ISA definition.
 * NOTE(review): the unmasked form uses the plain _mask builtin while the
 * merge forms use the _round_mask/_round_mask3 builtins — presumably they
 * differ in upper-element/blend handling; confirm against the builtin
 * definitions if changing. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
                                                               __m128h __B,
                                                               __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)-1,
                                                 _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked with __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                  (__v4sf)__C, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked with __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

/* Explicit-rounding versions of the four forms above. */
#define _mm_fcmadd_round_sch(A, B, C, R)                                       \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmadd_round_sch(A, U, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz(                                   \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(                             \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
2976
/* Scalar complex FP16 multiply-add (VFMADDCSH) — the non-conjugating
 * counterpart of the fcmadd_sch block above; identical structure, with the
 * vfmaddcsh builtins instead of vfcmaddcsh. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__C, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked with __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked with __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

/* Explicit-rounding versions. */
#define _mm_fmadd_round_sch(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sch(A, U, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask(                               \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmadd_round_sch(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fmadd_round_sch(A, B, C, U, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
3023
/* Scalar complex FP16 conjugate multiply (VFCMULCSH).  The builtin's third
 * operand is the merge source: _mm_undefined_ph() for the unmasked form
 * (no pass-through needed), the caller's __W for _mask, and zeros for
 * _maskz.  Complex pairs are viewed as __v4sf (32 bits each). */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__W, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Explicit-rounding versions. */
#define _mm_fcmul_round_sch(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3059
/* Scalar complex FP16 multiply (VFMULCSH) — non-conjugating counterpart of
 * the fcmul_sch block above; same merge-source convention (undefined for
 * unmasked, __W for _mask, zeros for _maskz). */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                               (__v4sf)__W, (__mmask8)__U,
                                               _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Explicit-rounding versions. */
#define _mm_fmul_round_sch(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3097
3098static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3099                                                                 __m512h __B) {
3100  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3101      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3102      _MM_FROUND_CUR_DIRECTION);
3103}
3104
/// Merge-masking conjugate complex multiply: each bit of \a __U covers one
/// complex pair (32-bit lane); masked-off lanes take the corresponding lane
/// of \a __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__W, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
3111
/// Zero-masking conjugate complex multiply: masked-off complex pairs are 0
/// (pass-through source is _mm512_setzero_ph()).
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3118
/// Conjugate complex multiply with explicit rounding control \a R
/// (an _MM_FROUND_* value); see _mm512_fcmul_pch.
#define _mm512_fcmul_round_pch(A, B, R)                                        \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

/// Merge-masking form: masked-off complex pairs take the lanes of \a W.
#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

/// Zero-masking form: masked-off complex pairs are 0.
#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3133
/// Multiply the 16 packed complex FP16 values in \a __A by those in \a __B
/// (VFMULCPH, no conjugation). Each complex value occupies one 32-bit lane.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
                                                                __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}
3140
/// Merge-masking complex multiply: each bit of \a __U covers one complex
/// pair; masked-off lanes take the corresponding lane of \a __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                  (__v16sf)__W, (__mmask16)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}
3147
/// Zero-masking complex multiply: masked-off complex pairs are 0.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3154
/// Complex multiply with explicit rounding control \a R (an _MM_FROUND_*
/// value); see _mm512_fmul_pch.
#define _mm512_fmul_round_pch(A, B, R)                                         \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

/// Merge-masking form: masked-off complex pairs take the lanes of \a W.
#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

/// Zero-masking form: masked-off complex pairs are 0.
#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3169
/// Complex conjugate fused multiply-add (VFCMADDCPH): multiply the packed
/// complex FP16 values in \a __A by the complex conjugates of those in
/// \a __B and accumulate into \a __C. Uses the _mask3 builtin with an
/// all-ones mask for the unmasked form.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
                                                                  __m512h __B,
                                                                  __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}
3177
/// Merge-masking conjugate complex FMA using the _mask builtin: per the
/// _mask/_mask3 builtin convention, masked-off complex pairs pass through
/// from \a __A (the first source) here, vs. \a __C in the _mask3 variant.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3184
/// Merge-masking conjugate complex FMA, _mask3 flavor: masked-off complex
/// pairs pass through from \a __C (the accumulator).
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3191
/// Zero-masking conjugate complex FMA (_maskz builtin): masked-off complex
/// pairs are 0.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3198
/// Conjugate complex FMA with explicit rounding control \a R
/// (an _MM_FROUND_* value); see _mm512_fcmadd_pch.
#define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

/// Merge-masking (_mask) rounding form; see _mm512_mask_fcmadd_pch.
#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

/// Merge-masking (_mask3) rounding form; see _mm512_mask3_fcmadd_pch.
#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

/// Zero-masking rounding form; see _mm512_maskz_fcmadd_pch.
#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))
3218
/// Complex fused multiply-add (VFMADDCPH, no conjugation): multiply the
/// packed complex FP16 values in \a __A by those in \a __B and accumulate
/// into \a __C. Uses the _mask3 builtin with an all-ones mask.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
                                                    (__v16sf)__C, (__mmask16)-1,
                                                    _MM_FROUND_CUR_DIRECTION);
}
3226
/// Merge-masking complex FMA using the _mask builtin: per the _mask/_mask3
/// builtin convention, masked-off complex pairs pass through from \a __A.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__C, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
3233
/// Merge-masking complex FMA, _mask3 flavor: masked-off complex pairs pass
/// through from \a __C (the accumulator).
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3240
/// Zero-masking complex FMA (_maskz builtin): masked-off complex pairs are 0.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3247
/// Complex FMA with explicit rounding control \a R (an _MM_FROUND_* value);
/// see _mm512_fmadd_pch.
#define _mm512_fmadd_round_pch(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

/// Merge-masking (_mask) rounding form; see _mm512_mask_fmadd_pch.
#define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

/// Merge-masking (_mask3) rounding form; see _mm512_mask3_fmadd_pch.
#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

/// Zero-masking rounding form; see _mm512_maskz_fmadd_pch.
#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))
3267
/// Horizontal sum of all 32 FP16 elements of \a __W. The initial accumulator
/// is -0.0 (the identity for FP addition: -0.0 + x == x, including x == -0.0).
/// NOTE(review): evaluation order/associativity is determined by the
/// reduce_fadd builtin — confirm before relying on bit-exact results.
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
}
3272
/// Horizontal product of all 32 FP16 elements of \a __W; 1.0 is the
/// multiplicative identity used as the initial accumulator.
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W) {
  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
}
3277
/// Horizontal maximum of all 32 FP16 elements of \a __V.
/// NOTE(review): NaN/signed-zero handling is defined by the reduce_fmax
/// builtin — confirm against clang's builtin semantics if it matters.
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmax_ph512(__V);
}
3282
/// Horizontal minimum of all 32 FP16 elements of \a __V (see the NaN note on
/// _mm512_reduce_max_ph).
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmin_ph512(__V);
}
3287
/// Per-element blend: result element i is __W[i] when bit i of \a __U is set,
/// otherwise __A[i] (note the swapped argument order into selectph: the
/// select builtin takes the "true" source first).
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
                                              (__v32hf)__A);
}
3293
/// Two-source element permute: selects 16-bit elements from \a __A and \a __B
/// using the per-element indices in \a __I. Reuses the integer vpermi2var
/// builtin — FP16 elements are bit-cast to 16-bit integers for the shuffle.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                 (__v32hi)__B);
}
3299
/// Single-source element permute: gathers 16-bit elements of \a __B using the
/// indices in \a __A. Note the builtin takes (data, indices), so the
/// intrinsic's arguments are passed in reversed order.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}
3304
// The intrinsics below are aliases for the f[c]mul variants above: Intel
// documents both the mul_*/cmul_* and fmul_*/fcmul_* spellings, so both are
// provided. cmul == fcmul (conjugate multiply), mul == fmul.
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3340
3341#undef __DEFAULT_FN_ATTRS128
3342#undef __DEFAULT_FN_ATTRS256
3343#undef __DEFAULT_FN_ATTRS512
3344
3345#endif
3346#endif
3347