1/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9#ifndef __IMMINTRIN_H 10#error \ 11 "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead." 12#endif 13 14#ifdef __SSE2__ 15 16#ifndef __AVX512VLFP16INTRIN_H 17#define __AVX512VLFP16INTRIN_H 18 19/* Define the default attributes for the functions in this file. */ 20#define __DEFAULT_FN_ATTRS256 \ 21 __attribute__((__always_inline__, __nodebug__, \ 22 __target__("avx512fp16, avx512vl"), \ 23 __min_vector_width__(256))) 24#define __DEFAULT_FN_ATTRS128 \ 25 __attribute__((__always_inline__, __nodebug__, \ 26 __target__("avx512fp16, avx512vl"), \ 27 __min_vector_width__(128))) 28 29static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) { 30 return __a[0]; 31} 32 33static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) { 34 return __a[0]; 35} 36 37static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) { 38 return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0}; 39} 40 41static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) { 42 return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h}; 43} 44 45static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) { 46 return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h, 47 __h, __h, __h, __h, __h, __h, __h, __h}; 48} 49 50static __inline __m128h __DEFAULT_FN_ATTRS128 51_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, 52 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { 53 return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1}; 54} 55 56static __inline __m256h __DEFAULT_FN_ATTRS256 57_mm256_set1_pch(_Float16 _Complex h) { 58 return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h)); 59} 60 61static __inline __m128h __DEFAULT_FN_ATTRS128 62_mm_set1_pch(_Float16 _Complex h) { 63 return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h)); 64} 65 66static __inline __m256h __DEFAULT_FN_ATTRS256 67_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, 68 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, 69 _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, 70 _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { 71 return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11, 72 __h10, __h9, __h8, __h7, __h6, __h5, 73 __h4, __h3, __h2, __h1}; 74} 75 76#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8) \ 77 _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1)) 78 79#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \ 80 h14, h15, h16) \ 81 _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \ 82 (h7), (h6), (h5), (h4), (h3), (h2), (h1)) 83 84static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, 85 __m256h __B) { 86 return (__m256h)((__v16hf)__A + (__v16hf)__B); 87} 88 89static __inline__ __m256h __DEFAULT_FN_ATTRS256 90_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 91 return (__m256h)__builtin_ia32_selectph_256( 92 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W); 93} 94 95static __inline__ __m256h __DEFAULT_FN_ATTRS256 96_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) { 97 return (__m256h)__builtin_ia32_selectph_256( 98 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 99} 100 101static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, 102 __m128h __B) { 103 return (__m128h)((__v8hf)__A + (__v8hf)__B); 104} 105 106static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W, 107 __mmask8 __U, 108 __m128h __A, 109 __m128h __B) { 110 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), 111 (__v8hf)__W); 112} 113 114static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U, 115 __m128h __A, 116 __m128h __B) { 117 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), 118 (__v8hf)_mm_setzero_ph()); 119} 120 121static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, 122 __m256h __B) { 123 return (__m256h)((__v16hf)__A - (__v16hf)__B); 124} 125 126static __inline__ __m256h __DEFAULT_FN_ATTRS256 127_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 128 return (__m256h)__builtin_ia32_selectph_256( 129 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W); 130} 131 132static __inline__ __m256h __DEFAULT_FN_ATTRS256 133_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) { 134 return (__m256h)__builtin_ia32_selectph_256( 135 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 136} 137 138static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, 139 __m128h __B) { 140 return (__m128h)((__v8hf)__A - (__v8hf)__B); 141} 142 143static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W, 144 __mmask8 __U, 145 __m128h __A, 146 __m128h __B) { 147 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), 148 (__v8hf)__W); 149} 150 151static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U, 152 __m128h __A, 153 __m128h __B) { 154 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), 155 (__v8hf)_mm_setzero_ph()); 156} 157 158static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, 159 __m256h __B) { 160 return (__m256h)((__v16hf)__A * (__v16hf)__B); 161} 162 163static __inline__ __m256h __DEFAULT_FN_ATTRS256 164_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 165 return (__m256h)__builtin_ia32_selectph_256( 166 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W); 167} 168 169static __inline__ __m256h __DEFAULT_FN_ATTRS256 170_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) { 171 return (__m256h)__builtin_ia32_selectph_256( 172 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 173} 174 175static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, 176 __m128h __B) { 177 return (__m128h)((__v8hf)__A * (__v8hf)__B); 178} 179 180static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W, 181 __mmask8 __U, 182 __m128h __A, 183 __m128h __B) { 184 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), 185 (__v8hf)__W); 186} 187 188static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U, 189 __m128h __A, 190 __m128h __B) { 191 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), 192 (__v8hf)_mm_setzero_ph()); 193} 194 195static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, 196 __m256h __B) { 197 return (__m256h)((__v16hf)__A / (__v16hf)__B); 198} 199 200static __inline__ __m256h __DEFAULT_FN_ATTRS256 201_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 202 return (__m256h)__builtin_ia32_selectph_256( 203 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W); 204} 205 206static __inline__ __m256h __DEFAULT_FN_ATTRS256 207_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) { 208 return (__m256h)__builtin_ia32_selectph_256( 209 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 210} 211 212static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, 213 __m128h __B) { 214 return (__m128h)((__v8hf)__A / (__v8hf)__B); 215} 216 217static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W, 218 __mmask8 __U, 219 __m128h __A, 220 __m128h __B) { 221 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), 222 (__v8hf)__W); 223} 224 225static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U, 226 __m128h __A, 227 __m128h __B) { 228 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), 229 (__v8hf)_mm_setzero_ph()); 230} 231 232static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, 233 __m256h __B) { 234 return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B); 235} 236 237static __inline__ __m256h __DEFAULT_FN_ATTRS256 238_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 239 return (__m256h)__builtin_ia32_selectph_256( 240 (__mmask16)__U, 241 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), 242 (__v16hf)__W); 243} 244 245static __inline__ __m256h __DEFAULT_FN_ATTRS256 246_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) { 247 return (__m256h)__builtin_ia32_selectph_256( 248 (__mmask16)__U, 249 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), 250 (__v16hf)_mm256_setzero_ph()); 251} 252 253static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, 254 __m128h __B) { 255 return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B); 256} 257 258static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W, 259 __mmask8 __U, 260 __m128h __A, 261 __m128h __B) { 262 return (__m128h)__builtin_ia32_selectph_128( 263 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), 264 (__v8hf)__W); 265} 266 267static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U, 268 __m128h __A, 269 __m128h __B) { 270 return (__m128h)__builtin_ia32_selectph_128( 271 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), 272 (__v8hf)_mm_setzero_ph()); 273} 274 275static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, 276 __m256h __B) { 277 return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B); 278} 279 280static __inline__ __m256h __DEFAULT_FN_ATTRS256 281_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 282 return (__m256h)__builtin_ia32_selectph_256( 283 (__mmask16)__U, 284 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), 285 (__v16hf)__W); 286} 287 288static __inline__ __m256h __DEFAULT_FN_ATTRS256 289_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) { 290 return (__m256h)__builtin_ia32_selectph_256( 291 (__mmask16)__U, 292 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), 293 (__v16hf)_mm256_setzero_ph()); 294} 295 296static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, 297 __m128h __B) { 298 return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B); 299} 300 301static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W, 302 __mmask8 __U, 303 __m128h __A, 304 __m128h __B) { 305 return (__m128h)__builtin_ia32_selectph_128( 306 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), 307 (__v8hf)__W); 308} 309 310static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, 311 __m128h __A, 312 __m128h __B) { 313 return (__m128h)__builtin_ia32_selectph_128( 314 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), 315 (__v8hf)_mm_setzero_ph()); 316} 317 318static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { 319 return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); 320} 321 322static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { 323 return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); 324} 325 326static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) { 327 return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f)); 328} 329 330static __inline__ __m256h __DEFAULT_FN_ATTRS256 331_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) { 332 return (__m256h)__builtin_ia32_selectps_256( 333 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W); 334} 335 336static __inline__ __m256h __DEFAULT_FN_ATTRS256 337_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) { 338 return (__m256h)__builtin_ia32_selectps_256( 339 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps()); 340} 341 342static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) { 343 return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f)); 344} 345 346static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W, 347 __mmask8 __U, 348 __m128h __A) { 349 return (__m128h)__builtin_ia32_selectps_128( 350 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W); 351} 352 353static __inline__ __m128h __DEFAULT_FN_ATTRS128 354_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) { 355 return (__m128h)__builtin_ia32_selectps_128( 356 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps()); 357} 358 359#define _mm256_cmp_ph_mask(a, b, p) \ 360 ((__mmask16)__builtin_ia32_cmpph256_mask( \ 361 (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1)) 362 363#define _mm256_mask_cmp_ph_mask(m, a, b, p) \ 364 ((__mmask16)__builtin_ia32_cmpph256_mask( \ 365 (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m))) 366 367#define _mm_cmp_ph_mask(a, b, p) \ 368 ((__mmask8)__builtin_ia32_cmpph128_mask( \ 369 (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1)) 370 371#define _mm_mask_cmp_ph_mask(m, a, b, p) \ 372 ((__mmask8)__builtin_ia32_cmpph128_mask( \ 373 (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) 374 375static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) { 376 return (__m256h)__builtin_ia32_rcpph256_mask( 377 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); 378} 379 380static __inline__ __m256h __DEFAULT_FN_ATTRS256 381_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) { 382 return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W, 383 (__mmask16)__U); 384} 385 386static __inline__ __m256h __DEFAULT_FN_ATTRS256 387_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) { 388 return (__m256h)__builtin_ia32_rcpph256_mask( 389 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 390} 391 392static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) { 393 return (__m128h)__builtin_ia32_rcpph128_mask( 394 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 395} 396 397static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W, 398 __mmask8 __U, 399 __m128h __A) { 400 return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W, 401 (__mmask8)__U); 402} 403 404static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U, 405 __m128h __A) { 406 return (__m128h)__builtin_ia32_rcpph128_mask( 407 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 408} 409 410static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) { 411 return (__m256h)__builtin_ia32_rsqrtph256_mask( 412 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); 413} 414 415static __inline__ __m256h __DEFAULT_FN_ATTRS256 416_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { 417 return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W, 418 (__mmask16)__U); 419} 420 421static __inline__ __m256h __DEFAULT_FN_ATTRS256 422_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) { 423 return (__m256h)__builtin_ia32_rsqrtph256_mask( 424 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 425} 426 427static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) { 428 return (__m128h)__builtin_ia32_rsqrtph128_mask( 429 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 430} 431 432static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W, 433 __mmask8 __U, 434 __m128h __A) { 435 return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W, 436 (__mmask8)__U); 437} 438 439static __inline__ __m128h __DEFAULT_FN_ATTRS128 440_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) { 441 return (__m128h)__builtin_ia32_rsqrtph128_mask( 442 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 443} 444 445static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) { 446 return (__m128h)__builtin_ia32_getexpph128_mask( 447 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 448} 449 450static __inline__ __m128h __DEFAULT_FN_ATTRS128 451_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) { 452 return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W, 453 (__mmask8)__U); 454} 455 456static __inline__ __m128h __DEFAULT_FN_ATTRS128 457_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) { 458 return (__m128h)__builtin_ia32_getexpph128_mask( 459 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 460} 461 462static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) { 463 return (__m256h)__builtin_ia32_getexpph256_mask( 464 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); 465} 466 467static __inline__ __m256h __DEFAULT_FN_ATTRS256 468_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) { 469 return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W, 470 (__mmask16)__U); 471} 472 473static __inline__ __m256h __DEFAULT_FN_ATTRS256 474_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) { 475 return (__m256h)__builtin_ia32_getexpph256_mask( 476 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 477} 478 479#define _mm_getmant_ph(A, B, C) \ 480 ((__m128h)__builtin_ia32_getmantph128_mask( \ 481 (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ 482 (__mmask8)-1)) 483 484#define _mm_mask_getmant_ph(W, U, A, B, C) \ 485 ((__m128h)__builtin_ia32_getmantph128_mask( \ 486 (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \ 487 (__mmask8)(U))) 488 489#define _mm_maskz_getmant_ph(U, A, B, C) \ 490 ((__m128h)__builtin_ia32_getmantph128_mask( \ 491 (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ 492 (__mmask8)(U))) 493 494#define _mm256_getmant_ph(A, B, C) \ 495 ((__m256h)__builtin_ia32_getmantph256_mask( \ 496 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ 497 (__v16hf)_mm256_setzero_ph(), (__mmask16)-1)) 498 499#define _mm256_mask_getmant_ph(W, U, A, B, C) \ 500 ((__m256h)__builtin_ia32_getmantph256_mask( \ 501 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \ 502 (__mmask16)(U))) 503 504#define _mm256_maskz_getmant_ph(U, A, B, C) \ 505 ((__m256h)__builtin_ia32_getmantph256_mask( \ 506 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ 507 (__v16hf)_mm256_setzero_ph(), (__mmask16)(U))) 508 509static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A, 510 __m128h __B) { 511 return (__m128h)__builtin_ia32_scalefph128_mask( 512 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 513} 514 515static __inline__ __m128h __DEFAULT_FN_ATTRS128 516_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 517 return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B, 518 (__v8hf)__W, (__mmask8)__U); 519} 520 521static __inline__ __m128h __DEFAULT_FN_ATTRS128 522_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) { 523 return (__m128h)__builtin_ia32_scalefph128_mask( 524 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 525} 526 527static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A, 528 __m256h __B) { 529 return (__m256h)__builtin_ia32_scalefph256_mask( 530 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); 531} 532 533static __inline__ __m256h __DEFAULT_FN_ATTRS256 534_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 535 return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B, 536 (__v16hf)__W, (__mmask16)__U); 537} 538 539static __inline__ __m256h __DEFAULT_FN_ATTRS256 540_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) { 541 return (__m256h)__builtin_ia32_scalefph256_mask( 542 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 543} 544 545#define _mm_roundscale_ph(A, imm) \ 546 ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ 547 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ 548 (__mmask8)-1)) 549 550#define _mm_mask_roundscale_ph(W, U, A, imm) \ 551 ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ 552 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) 553 554#define _mm_maskz_roundscale_ph(U, A, imm) \ 555 ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ 556 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ 557 (__mmask8)(U))) 558 559#define _mm256_roundscale_ph(A, imm) \ 560 ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ 561 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ 562 (__mmask16)-1)) 563 564#define _mm256_mask_roundscale_ph(W, U, A, imm) \ 565 ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ 566 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \ 567 (__mmask16)(U))) 568 569#define _mm256_maskz_roundscale_ph(U, A, imm) \ 570 ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ 571 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ 572 (__mmask16)(U))) 573 574#define _mm_reduce_ph(A, imm) \ 575 ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ 576 (__v8hf)_mm_setzero_ph(), \ 577 (__mmask8)-1)) 578 579#define _mm_mask_reduce_ph(W, U, A, imm) \ 580 ((__m128h)__builtin_ia32_reduceph128_mask( \ 581 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) 582 583#define _mm_maskz_reduce_ph(U, A, imm) \ 584 ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ 585 (__v8hf)_mm_setzero_ph(), \ 586 (__mmask8)(U))) 587 588#define _mm256_reduce_ph(A, imm) \ 589 ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ 590 (__v16hf)_mm256_setzero_ph(), \ 591 (__mmask16)-1)) 592 593#define _mm256_mask_reduce_ph(W, U, A, imm) \ 594 ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ 595 (__v16hf)(__m256h)(W), \ 596 (__mmask16)(U))) 597 598#define _mm256_maskz_reduce_ph(U, A, imm) \ 599 ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ 600 (__v16hf)_mm256_setzero_ph(), \ 601 (__mmask16)(U))) 602 603static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) { 604 return __builtin_ia32_sqrtph((__v8hf)__a); 605} 606 607static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W, 608 __mmask8 __U, 609 __m128h __A) { 610 return (__m128h)__builtin_ia32_selectph_128( 611 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W); 612} 613 614static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U, 615 __m128h __A) { 616 return (__m128h)__builtin_ia32_selectph_128( 617 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph()); 618} 619 620static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { 621 return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a); 622} 623 624static __inline__ __m256h __DEFAULT_FN_ATTRS256 625_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { 626 return (__m256h)__builtin_ia32_selectph_256( 627 (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W); 628} 629 630static __inline__ __m256h __DEFAULT_FN_ATTRS256 631_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) { 632 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, 633 (__v16hf)_mm256_sqrt_ph(__A), 634 (__v16hf)_mm256_setzero_ph()); 635} 636 637#define _mm_mask_fpclass_ph_mask(U, A, imm) \ 638 ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ 639 (int)(imm), (__mmask8)(U))) 640 641#define _mm_fpclass_ph_mask(A, imm) \ 642 ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ 643 (int)(imm), (__mmask8)-1)) 644 645#define _mm256_mask_fpclass_ph_mask(U, A, imm) \ 646 ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ 647 (int)(imm), (__mmask16)(U))) 648 649#define _mm256_fpclass_ph_mask(A, imm) \ 650 ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ 651 (int)(imm), (__mmask16)-1)) 652 653static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) { 654 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( 655 (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 656} 657 658static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W, 659 __mmask8 __U, 660 __m128d __A) { 661 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W, 662 (__mmask8)__U); 663} 664 665static __inline__ __m128h __DEFAULT_FN_ATTRS128 666_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) { 667 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( 668 (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 669} 670 671static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) { 672 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( 673 (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 674} 675 676static __inline__ __m128h __DEFAULT_FN_ATTRS256 677_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) { 678 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W, 679 (__mmask8)__U); 680} 681 682static __inline__ __m128h __DEFAULT_FN_ATTRS256 683_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) { 684 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( 685 (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 686} 687 688static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) { 689 return (__m128d)__builtin_ia32_vcvtph2pd128_mask( 690 (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1); 691} 692 693static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W, 694 __mmask8 __U, 695 __m128h __A) { 696 return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W, 697 (__mmask8)__U); 698} 699 700static __inline__ __m128d __DEFAULT_FN_ATTRS128 701_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { 702 return (__m128d)__builtin_ia32_vcvtph2pd128_mask( 703 (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); 704} 705 706static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) { 707 return (__m256d)__builtin_ia32_vcvtph2pd256_mask( 708 (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); 709} 710 711static __inline__ __m256d __DEFAULT_FN_ATTRS256 712_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) { 713 return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W, 714 (__mmask8)__U); 715} 716 717static __inline__ __m256d __DEFAULT_FN_ATTRS256 718_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { 719 return (__m256d)__builtin_ia32_vcvtph2pd256_mask( 720 (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); 721} 722 723static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) { 724 return (__m128i)__builtin_ia32_vcvtph2w128_mask( 725 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); 726} 727 728static __inline__ __m128i __DEFAULT_FN_ATTRS128 729_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { 730 return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W, 731 (__mmask8)__U); 732} 733 734static __inline__ __m128i __DEFAULT_FN_ATTRS128 735_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) { 736 return (__m128i)__builtin_ia32_vcvtph2w128_mask( 737 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); 738} 739 740static __inline__ __m256i __DEFAULT_FN_ATTRS256 741_mm256_cvtph_epi16(__m256h __A) { 742 return (__m256i)__builtin_ia32_vcvtph2w256_mask( 743 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); 744} 745 746static __inline__ __m256i __DEFAULT_FN_ATTRS256 747_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { 748 return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W, 749 (__mmask16)__U); 750} 751 752static __inline__ __m256i __DEFAULT_FN_ATTRS256 753_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) { 754 return (__m256i)__builtin_ia32_vcvtph2w256_mask( 755 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); 756} 757 758static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) { 759 return (__m128i)__builtin_ia32_vcvttph2w128_mask( 760 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); 761} 762 763static __inline__ __m128i __DEFAULT_FN_ATTRS128 764_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { 765 return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W, 766 (__mmask8)__U); 767} 768 769static __inline__ __m128i __DEFAULT_FN_ATTRS128 770_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) { 771 return (__m128i)__builtin_ia32_vcvttph2w128_mask( 772 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); 773} 774 775static __inline__ __m256i __DEFAULT_FN_ATTRS256 776_mm256_cvttph_epi16(__m256h __A) { 777 return (__m256i)__builtin_ia32_vcvttph2w256_mask( 778 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); 779} 780 781static __inline__ __m256i __DEFAULT_FN_ATTRS256 782_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { 783 return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W, 784 (__mmask16)__U); 785} 786 787static __inline__ __m256i __DEFAULT_FN_ATTRS256 788_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) { 789 return (__m256i)__builtin_ia32_vcvttph2w256_mask( 790 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); 791} 792 793static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) { 794 return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf); 795} 796 797static __inline__ __m128h __DEFAULT_FN_ATTRS128 798_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) { 799 return (__m128h)__builtin_ia32_selectph_128( 800 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W); 801} 802 803static __inline__ __m128h __DEFAULT_FN_ATTRS128 804_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) { 805 return (__m128h)__builtin_ia32_selectph_128( 806 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph()); 807} 808 809static __inline__ __m256h __DEFAULT_FN_ATTRS256 810_mm256_cvtepi16_ph(__m256i __A) { 811 return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf); 812} 813 814static __inline__ __m256h __DEFAULT_FN_ATTRS256 815_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) { 816 return (__m256h)__builtin_ia32_selectph_256( 817 (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W); 818} 819 820static __inline__ __m256h __DEFAULT_FN_ATTRS256 821_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) { 822 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, 823 (__v16hf)_mm256_cvtepi16_ph(__A), 824 (__v16hf)_mm256_setzero_ph()); 825} 826 827static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) { 828 return (__m128i)__builtin_ia32_vcvtph2uw128_mask( 829 (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); 830} 831 832static __inline__ __m128i __DEFAULT_FN_ATTRS128 833_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { 834 return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W, 835 (__mmask8)__U); 836} 837 838static __inline__ __m128i __DEFAULT_FN_ATTRS128 839_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) { 840 return (__m128i)__builtin_ia32_vcvtph2uw128_mask( 841 (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); 842} 843 844static __inline__ __m256i __DEFAULT_FN_ATTRS256 845_mm256_cvtph_epu16(__m256h __A) { 846 return (__m256i)__builtin_ia32_vcvtph2uw256_mask( 847 (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); 848} 849 850static __inline__ __m256i __DEFAULT_FN_ATTRS256 851_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { 852 return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W, 853 (__mmask16)__U); 854} 855 856static __inline__ __m256i __DEFAULT_FN_ATTRS256 857_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) { 858 return (__m256i)__builtin_ia32_vcvtph2uw256_mask( 859 (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); 860} 861 862static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) { 863 return (__m128i)__builtin_ia32_vcvttph2uw128_mask( 864 (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); 865} 866 867static __inline__ __m128i __DEFAULT_FN_ATTRS128 868_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { 869 return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W, 870 (__mmask8)__U); 871} 872 873static __inline__ __m128i __DEFAULT_FN_ATTRS128 874_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) { 875 return (__m128i)__builtin_ia32_vcvttph2uw128_mask( 876 (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); 877} 878 879static __inline__ __m256i __DEFAULT_FN_ATTRS256 880_mm256_cvttph_epu16(__m256h __A) { 881 return (__m256i)__builtin_ia32_vcvttph2uw256_mask( 882 (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); 883} 884 885static __inline__ __m256i __DEFAULT_FN_ATTRS256 886_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { 887 return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W, 888 (__mmask16)__U); 889} 890 891static __inline__ __m256i __DEFAULT_FN_ATTRS256 892_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) { 893 return (__m256i)__builtin_ia32_vcvttph2uw256_mask( 894 (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); 895} 896 897static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) { 898 return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf); 899} 900 901static __inline__ __m128h __DEFAULT_FN_ATTRS128 902_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) { 903 return (__m128h)__builtin_ia32_selectph_128( 904 (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W); 905} 906 907static __inline__ __m128h __DEFAULT_FN_ATTRS128 908_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) { 909 return (__m128h)__builtin_ia32_selectph_128( 910 (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph()); 911} 912 913static __inline__ __m256h __DEFAULT_FN_ATTRS256 914_mm256_cvtepu16_ph(__m256i __A) { 915 return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf); 916} 917 918static __inline__ __m256h __DEFAULT_FN_ATTRS256 919_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) { 920 return (__m256h)__builtin_ia32_selectph_256( 921 (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W); 922} 923 924static __inline__ __m256h __DEFAULT_FN_ATTRS256 925_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) { 926 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, 927 (__v16hf)_mm256_cvtepu16_ph(__A), 928 (__v16hf)_mm256_setzero_ph()); 929} 930 931static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) { 932 return (__m128i)__builtin_ia32_vcvtph2dq128_mask( 933 (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); 934} 935 936static __inline__ __m128i __DEFAULT_FN_ATTRS128 937_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { 938 return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W, 939 (__mmask8)__U); 940} 941 942static __inline__ __m128i __DEFAULT_FN_ATTRS128 943_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { 944 return (__m128i)__builtin_ia32_vcvtph2dq128_mask( 945 (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); 946} 947 948static __inline__ __m256i __DEFAULT_FN_ATTRS256 949_mm256_cvtph_epi32(__m128h __A) { 950 return (__m256i)__builtin_ia32_vcvtph2dq256_mask( 951 (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); 952} 953 954static __inline__ __m256i __DEFAULT_FN_ATTRS256 955_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { 956 return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W, 957 (__mmask8)__U); 958} 959 960static __inline__ __m256i __DEFAULT_FN_ATTRS256 961_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { 962 return (__m256i)__builtin_ia32_vcvtph2dq256_mask( 963 (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); 964} 965 966static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) { 967 return (__m128i)__builtin_ia32_vcvtph2udq128_mask( 968 (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); 969} 970 971static __inline__ __m128i __DEFAULT_FN_ATTRS128 972_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { 973 return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W, 974 (__mmask8)__U); 975} 976 977static __inline__ __m128i __DEFAULT_FN_ATTRS128 978_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { 979 return (__m128i)__builtin_ia32_vcvtph2udq128_mask( 980 (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); 981} 982 983static __inline__ __m256i __DEFAULT_FN_ATTRS256 984_mm256_cvtph_epu32(__m128h __A) { 985 return (__m256i)__builtin_ia32_vcvtph2udq256_mask( 986 (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); 987} 988 989static __inline__ __m256i __DEFAULT_FN_ATTRS256 990_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { 991 return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W, 992 (__mmask8)__U); 993} 994 995static __inline__ __m256i __DEFAULT_FN_ATTRS256 996_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { 997 return (__m256i)__builtin_ia32_vcvtph2udq256_mask( 998 (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); 999} 1000 1001static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) { 1002 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( 1003 (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1004} 1005 1006static __inline__ __m128h __DEFAULT_FN_ATTRS128 1007_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) { 1008 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W, 1009 (__mmask8)__U); 1010} 1011 1012static __inline__ __m128h __DEFAULT_FN_ATTRS128 1013_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) { 1014 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( 1015 (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1016} 1017 1018static __inline__ __m128h __DEFAULT_FN_ATTRS256 1019_mm256_cvtepi32_ph(__m256i __A) { 1020 return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf); 1021} 1022 1023static __inline__ __m128h __DEFAULT_FN_ATTRS256 1024_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) { 1025 return (__m128h)__builtin_ia32_selectph_128( 1026 (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W); 1027} 1028 1029static __inline__ __m128h __DEFAULT_FN_ATTRS256 1030_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) { 1031 return (__m128h)__builtin_ia32_selectph_128( 1032 (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph()); 1033} 1034 1035static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) { 1036 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( 1037 (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1038} 1039 1040static __inline__ __m128h __DEFAULT_FN_ATTRS128 1041_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) { 1042 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W, 1043 (__mmask8)__U); 1044} 1045 1046static __inline__ __m128h __DEFAULT_FN_ATTRS128 1047_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) { 1048 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( 1049 (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1050} 1051 1052static __inline__ __m128h __DEFAULT_FN_ATTRS256 1053_mm256_cvtepu32_ph(__m256i __A) { 1054 return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf); 1055} 1056 1057static __inline__ __m128h __DEFAULT_FN_ATTRS256 1058_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) { 1059 return (__m128h)__builtin_ia32_selectph_128( 1060 (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W); 1061} 1062 1063static __inline__ __m128h __DEFAULT_FN_ATTRS256 1064_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) { 1065 return (__m128h)__builtin_ia32_selectph_128( 1066 (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph()); 1067} 1068 1069static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) { 1070 return (__m128i)__builtin_ia32_vcvttph2dq128_mask( 1071 (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); 1072} 1073 1074static __inline__ __m128i __DEFAULT_FN_ATTRS128 1075_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { 1076 return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W, 1077 (__mmask8)__U); 1078} 1079 1080static __inline__ __m128i __DEFAULT_FN_ATTRS128 1081_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { 1082 return (__m128i)__builtin_ia32_vcvttph2dq128_mask( 1083 (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); 1084} 1085 1086static __inline__ __m256i __DEFAULT_FN_ATTRS256 1087_mm256_cvttph_epi32(__m128h __A) { 1088 return (__m256i)__builtin_ia32_vcvttph2dq256_mask( 1089 (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); 1090} 1091 1092static __inline__ __m256i __DEFAULT_FN_ATTRS256 1093_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { 1094 return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W, 1095 (__mmask8)__U); 1096} 1097 1098static __inline__ __m256i __DEFAULT_FN_ATTRS256 1099_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { 1100 return (__m256i)__builtin_ia32_vcvttph2dq256_mask( 1101 (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); 1102} 1103 1104static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) { 1105 return (__m128i)__builtin_ia32_vcvttph2udq128_mask( 1106 (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); 1107} 1108 1109static __inline__ __m128i __DEFAULT_FN_ATTRS128 1110_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { 1111 return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W, 1112 (__mmask8)__U); 1113} 1114 1115static __inline__ __m128i __DEFAULT_FN_ATTRS128 1116_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { 1117 return (__m128i)__builtin_ia32_vcvttph2udq128_mask( 1118 (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); 1119} 1120 1121static __inline__ __m256i __DEFAULT_FN_ATTRS256 1122_mm256_cvttph_epu32(__m128h __A) { 1123 return (__m256i)__builtin_ia32_vcvttph2udq256_mask( 1124 (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); 1125} 1126 1127static __inline__ __m256i __DEFAULT_FN_ATTRS256 1128_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { 1129 return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W, 1130 (__mmask8)__U); 1131} 1132 1133static __inline__ __m256i __DEFAULT_FN_ATTRS256 1134_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { 1135 return (__m256i)__builtin_ia32_vcvttph2udq256_mask( 1136 (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); 1137} 1138 1139static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) { 1140 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( 1141 (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1142} 1143 1144static __inline__ __m128h __DEFAULT_FN_ATTRS128 1145_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) { 1146 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W, 1147 (__mmask8)__U); 1148} 1149 1150static __inline__ __m128h __DEFAULT_FN_ATTRS128 1151_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) { 1152 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( 1153 (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1154} 1155 1156static __inline__ __m128h __DEFAULT_FN_ATTRS256 1157_mm256_cvtepi64_ph(__m256i __A) { 1158 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( 1159 (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1160} 1161 1162static __inline__ __m128h __DEFAULT_FN_ATTRS256 1163_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) { 1164 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W, 1165 (__mmask8)__U); 1166} 1167 1168static __inline__ __m128h __DEFAULT_FN_ATTRS256 1169_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) { 1170 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( 1171 (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1172} 1173 1174static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) { 1175 return (__m128i)__builtin_ia32_vcvtph2qq128_mask( 1176 (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); 1177} 1178 1179static __inline__ __m128i __DEFAULT_FN_ATTRS128 1180_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { 1181 return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W, 1182 (__mmask8)__U); 1183} 1184 1185static __inline__ __m128i __DEFAULT_FN_ATTRS128 1186_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { 1187 return (__m128i)__builtin_ia32_vcvtph2qq128_mask( 1188 (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); 1189} 1190 1191static __inline__ __m256i __DEFAULT_FN_ATTRS256 1192_mm256_cvtph_epi64(__m128h __A) { 1193 return (__m256i)__builtin_ia32_vcvtph2qq256_mask( 1194 (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); 1195} 1196 1197static __inline__ __m256i __DEFAULT_FN_ATTRS256 1198_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { 1199 return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W, 1200 (__mmask8)__U); 1201} 1202 1203static __inline__ __m256i __DEFAULT_FN_ATTRS256 1204_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { 1205 return (__m256i)__builtin_ia32_vcvtph2qq256_mask( 1206 (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); 1207} 1208 1209static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) { 1210 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( 1211 (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1212} 1213 1214static __inline__ __m128h __DEFAULT_FN_ATTRS128 1215_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) { 1216 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W, 1217 (__mmask8)__U); 1218} 1219 1220static __inline__ __m128h __DEFAULT_FN_ATTRS128 1221_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) { 1222 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( 1223 (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1224} 1225 1226static __inline__ __m128h __DEFAULT_FN_ATTRS256 1227_mm256_cvtepu64_ph(__m256i __A) { 1228 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( 1229 (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1230} 1231 1232static __inline__ __m128h __DEFAULT_FN_ATTRS256 1233_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) { 1234 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W, 1235 (__mmask8)__U); 1236} 1237 1238static __inline__ __m128h __DEFAULT_FN_ATTRS256 1239_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) { 1240 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( 1241 (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1242} 1243 1244static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) { 1245 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( 1246 (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); 1247} 1248 1249static __inline__ __m128i __DEFAULT_FN_ATTRS128 1250_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { 1251 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W, 1252 (__mmask8)__U); 1253} 1254 1255static __inline__ __m128i __DEFAULT_FN_ATTRS128 1256_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { 1257 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( 1258 (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); 1259} 1260 1261static __inline__ __m256i __DEFAULT_FN_ATTRS256 1262_mm256_cvtph_epu64(__m128h __A) { 1263 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( 1264 (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); 1265} 1266 1267static __inline__ __m256i __DEFAULT_FN_ATTRS256 1268_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { 1269 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W, 1270 (__mmask8)__U); 1271} 1272 1273static __inline__ __m256i __DEFAULT_FN_ATTRS256 1274_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { 1275 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( 1276 (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); 1277} 1278 1279static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) { 1280 return (__m128i)__builtin_ia32_vcvttph2qq128_mask( 1281 (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); 1282} 1283 1284static __inline__ __m128i __DEFAULT_FN_ATTRS128 1285_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { 1286 return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W, 1287 (__mmask8)__U); 1288} 1289 1290static __inline__ __m128i __DEFAULT_FN_ATTRS128 1291_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { 1292 return (__m128i)__builtin_ia32_vcvttph2qq128_mask( 1293 (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); 1294} 1295 1296static __inline__ __m256i __DEFAULT_FN_ATTRS256 1297_mm256_cvttph_epi64(__m128h __A) { 1298 return (__m256i)__builtin_ia32_vcvttph2qq256_mask( 1299 (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); 1300} 1301 1302static __inline__ __m256i __DEFAULT_FN_ATTRS256 1303_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { 1304 return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W, 1305 (__mmask8)__U); 1306} 1307 1308static __inline__ __m256i __DEFAULT_FN_ATTRS256 1309_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { 1310 return (__m256i)__builtin_ia32_vcvttph2qq256_mask( 1311 (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); 1312} 1313 1314static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) { 1315 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( 1316 (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); 1317} 1318 1319static __inline__ __m128i __DEFAULT_FN_ATTRS128 1320_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { 1321 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W, 1322 (__mmask8)__U); 1323} 1324 1325static __inline__ __m128i __DEFAULT_FN_ATTRS128 1326_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { 1327 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( 1328 (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); 1329} 1330 1331static __inline__ __m256i __DEFAULT_FN_ATTRS256 1332_mm256_cvttph_epu64(__m128h __A) { 1333 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( 1334 (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); 1335} 1336 1337static __inline__ __m256i __DEFAULT_FN_ATTRS256 1338_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { 1339 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W, 1340 (__mmask8)__U); 1341} 1342 1343static __inline__ __m256i __DEFAULT_FN_ATTRS256 1344_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { 1345 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( 1346 (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); 1347} 1348 1349static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) { 1350 return (__m128)__builtin_ia32_vcvtph2psx128_mask( 1351 (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1); 1352} 1353 1354static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W, 1355 __mmask8 __U, 1356 __m128h __A) { 1357 return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W, 1358 (__mmask8)__U); 1359} 1360 1361static __inline__ __m128 __DEFAULT_FN_ATTRS128 1362_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { 1363 return (__m128)__builtin_ia32_vcvtph2psx128_mask( 1364 (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); 1365} 1366 1367static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) { 1368 return (__m256)__builtin_ia32_vcvtph2psx256_mask( 1369 (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1); 1370} 1371 1372static __inline__ __m256 __DEFAULT_FN_ATTRS256 1373_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) { 1374 return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W, 1375 (__mmask8)__U); 1376} 1377 1378static __inline__ __m256 __DEFAULT_FN_ATTRS256 1379_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { 1380 return (__m256)__builtin_ia32_vcvtph2psx256_mask( 1381 (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); 1382} 1383 1384static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) { 1385 return (__m128h)__builtin_ia32_vcvtps2phx128_mask( 1386 (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1387} 1388 1389static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W, 1390 __mmask8 __U, 1391 __m128 __A) { 1392 return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W, 1393 (__mmask8)__U); 1394} 1395 1396static __inline__ __m128h __DEFAULT_FN_ATTRS128 1397_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) { 1398 return (__m128h)__builtin_ia32_vcvtps2phx128_mask( 1399 (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1400} 1401 1402static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) { 1403 return (__m128h)__builtin_ia32_vcvtps2phx256_mask( 1404 (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1405} 1406 1407static __inline__ __m128h __DEFAULT_FN_ATTRS256 1408_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) { 1409 return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W, 1410 (__mmask8)__U); 1411} 1412 1413static __inline__ __m128h __DEFAULT_FN_ATTRS256 1414_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) { 1415 return (__m128h)__builtin_ia32_vcvtps2phx256_mask( 1416 (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1417} 1418 1419static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A, 1420 __m128h __B, 1421 __m128h __C) { 1422 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, 1423 (__v8hf)__C); 1424} 1425 1426static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A, 1427 __mmask8 __U, 1428 __m128h __B, 1429 __m128h __C) { 1430 return (__m128h)__builtin_ia32_selectph_128( 1431 (__mmask8)__U, 1432 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1433 (__v8hf)__A); 1434} 1435 1436static __inline__ __m128h __DEFAULT_FN_ATTRS128 1437_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1438 return (__m128h)__builtin_ia32_selectph_128( 1439 (__mmask8)__U, 1440 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1441 (__v8hf)__C); 1442} 1443 1444static __inline__ __m128h __DEFAULT_FN_ATTRS128 1445_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1446 return (__m128h)__builtin_ia32_selectph_128( 1447 (__mmask8)__U, 1448 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1449 (__v8hf)_mm_setzero_ph()); 1450} 1451 1452static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A, 1453 __m128h __B, 1454 __m128h __C) { 1455 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, 1456 -(__v8hf)__C); 1457} 1458 1459static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A, 1460 __mmask8 __U, 1461 __m128h __B, 1462 __m128h __C) { 1463 return (__m128h)__builtin_ia32_selectph_128( 1464 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1465 (__v8hf)__A); 1466} 1467 1468static __inline__ __m128h __DEFAULT_FN_ATTRS128 1469_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1470 return (__m128h)__builtin_ia32_selectph_128( 1471 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1472 (__v8hf)_mm_setzero_ph()); 1473} 1474 1475static __inline__ __m128h __DEFAULT_FN_ATTRS128 1476_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1477 return (__m128h)__builtin_ia32_selectph_128( 1478 (__mmask8)__U, 1479 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1480 (__v8hf)__C); 1481} 1482 1483static __inline__ __m128h __DEFAULT_FN_ATTRS128 1484_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1485 return (__m128h)__builtin_ia32_selectph_128( 1486 (__mmask8)__U, 1487 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1488 (__v8hf)_mm_setzero_ph()); 1489} 1490 1491static __inline__ __m128h __DEFAULT_FN_ATTRS128 1492_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1493 return (__m128h)__builtin_ia32_selectph_128( 1494 (__mmask8)__U, 1495 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1496 (__v8hf)_mm_setzero_ph()); 1497} 1498 1499static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A, 1500 __m256h __B, 1501 __m256h __C) { 1502 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, 1503 (__v16hf)__C); 1504} 1505 1506static __inline__ __m256h __DEFAULT_FN_ATTRS256 1507_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1508 return (__m256h)__builtin_ia32_selectph_256( 1509 (__mmask16)__U, 1510 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1511 (__v16hf)__A); 1512} 1513 1514static __inline__ __m256h __DEFAULT_FN_ATTRS256 1515_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1516 return (__m256h)__builtin_ia32_selectph_256( 1517 (__mmask16)__U, 1518 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1519 (__v16hf)__C); 1520} 1521 1522static __inline__ __m256h __DEFAULT_FN_ATTRS256 1523_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1524 return (__m256h)__builtin_ia32_selectph_256( 1525 (__mmask16)__U, 1526 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1527 (__v16hf)_mm256_setzero_ph()); 1528} 1529 1530static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A, 1531 __m256h __B, 1532 __m256h __C) { 1533 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, 1534 -(__v16hf)__C); 1535} 1536 1537static __inline__ __m256h __DEFAULT_FN_ATTRS256 1538_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1539 return (__m256h)__builtin_ia32_selectph_256( 1540 (__mmask16)__U, 1541 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1542 (__v16hf)__A); 1543} 1544 1545static __inline__ __m256h __DEFAULT_FN_ATTRS256 1546_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1547 return (__m256h)__builtin_ia32_selectph_256( 1548 (__mmask16)__U, 1549 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1550 (__v16hf)_mm256_setzero_ph()); 1551} 1552 1553static __inline__ __m256h __DEFAULT_FN_ATTRS256 1554_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1555 return (__m256h)__builtin_ia32_selectph_256( 1556 (__mmask16)__U, 1557 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1558 (__v16hf)__C); 1559} 1560 1561static __inline__ __m256h __DEFAULT_FN_ATTRS256 1562_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1563 return (__m256h)__builtin_ia32_selectph_256( 1564 (__mmask16)__U, 1565 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1566 (__v16hf)_mm256_setzero_ph()); 1567} 1568 1569static __inline__ __m256h __DEFAULT_FN_ATTRS256 1570_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1571 return (__m256h)__builtin_ia32_selectph_256( 1572 (__mmask16)__U, 1573 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1574 (__v16hf)_mm256_setzero_ph()); 1575} 1576 1577static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A, 1578 __m128h __B, 1579 __m128h __C) { 1580 return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, 1581 (__v8hf)__C); 1582} 1583 1584static __inline__ __m128h __DEFAULT_FN_ATTRS128 1585_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1586 return (__m128h)__builtin_ia32_selectph_128( 1587 (__mmask8)__U, 1588 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1589 (__v8hf)__A); 1590} 1591 1592static __inline__ __m128h __DEFAULT_FN_ATTRS128 1593_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1594 return (__m128h)__builtin_ia32_selectph_128( 1595 (__mmask8)__U, 1596 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1597 (__v8hf)__C); 1598} 1599 1600static __inline__ __m128h __DEFAULT_FN_ATTRS128 1601_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1602 return (__m128h)__builtin_ia32_selectph_128( 1603 (__mmask8)__U, 1604 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1605 (__v8hf)_mm_setzero_ph()); 1606} 1607 1608static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A, 1609 __m128h __B, 1610 __m128h __C) { 1611 return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, 1612 -(__v8hf)__C); 1613} 1614 1615static __inline__ __m128h __DEFAULT_FN_ATTRS128 1616_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1617 return (__m128h)__builtin_ia32_selectph_128( 1618 (__mmask8)__U, 1619 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1620 (__v8hf)__A); 1621} 1622 1623static __inline__ __m128h __DEFAULT_FN_ATTRS128 1624_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1625 return (__m128h)__builtin_ia32_selectph_128( 1626 (__mmask8)__U, 1627 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1628 (__v8hf)_mm_setzero_ph()); 1629} 1630 1631static __inline__ __m256h __DEFAULT_FN_ATTRS256 1632_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) { 1633 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, 1634 (__v16hf)__C); 1635} 1636 1637static __inline__ __m256h __DEFAULT_FN_ATTRS256 1638_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1639 return (__m256h)__builtin_ia32_selectph_256( 1640 (__mmask16)__U, 1641 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1642 (__v16hf)__A); 1643} 1644 1645static __inline__ __m256h __DEFAULT_FN_ATTRS256 1646_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1647 return (__m256h)__builtin_ia32_selectph_256( 1648 (__mmask16)__U, 1649 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1650 (__v16hf)__C); 1651} 1652 1653static __inline__ __m256h __DEFAULT_FN_ATTRS256 1654_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1655 return (__m256h)__builtin_ia32_selectph_256( 1656 (__mmask16)__U, 1657 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1658 (__v16hf)_mm256_setzero_ph()); 1659} 1660 1661static __inline__ __m256h __DEFAULT_FN_ATTRS256 1662_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) { 1663 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, 1664 -(__v16hf)__C); 1665} 1666 1667static __inline__ __m256h __DEFAULT_FN_ATTRS256 1668_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1669 return (__m256h)__builtin_ia32_selectph_256( 1670 (__mmask16)__U, 1671 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1672 (__v16hf)__A); 1673} 1674 1675static __inline__ __m256h __DEFAULT_FN_ATTRS256 1676_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1677 return (__m256h)__builtin_ia32_selectph_256( 1678 (__mmask16)__U, 1679 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1680 (__v16hf)_mm256_setzero_ph()); 1681} 1682 1683static __inline__ __m128h __DEFAULT_FN_ATTRS128 1684_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1685 return (__m128h)__builtin_ia32_selectph_128( 1686 (__mmask8)__U, 1687 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1688 (__v8hf)__C); 1689} 1690 1691static __inline__ __m256h __DEFAULT_FN_ATTRS256 1692_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1693 return (__m256h)__builtin_ia32_selectph_256( 1694 (__mmask16)__U, 1695 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1696 (__v16hf)__C); 1697} 1698 1699static __inline__ __m128h __DEFAULT_FN_ATTRS128 1700_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1701 return (__m128h)__builtin_ia32_selectph_128( 1702 (__mmask8)__U, 1703 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1704 (__v8hf)__C); 1705} 1706 1707static __inline__ __m256h __DEFAULT_FN_ATTRS256 1708_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1709 return (__m256h)__builtin_ia32_selectph_256( 1710 (__mmask16)__U, 1711 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1712 (__v16hf)__C); 1713} 1714 1715static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, 1716 __m128h __B, 1717 __m128h __C) { 1718 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, 1719 (__v8hf)__C); 1720} 1721 1722static __inline__ __m128h __DEFAULT_FN_ATTRS128 1723_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1724 return (__m128h)__builtin_ia32_selectph_128( 1725 (__mmask8)__U, 1726 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C), 1727 (__v8hf)__A); 1728} 1729 1730static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, 1731 __m256h __B, 1732 __m256h __C) { 1733 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, 1734 (__v16hf)__C); 1735} 1736 1737static __inline__ __m256h __DEFAULT_FN_ATTRS256 1738_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1739 return (__m256h)__builtin_ia32_selectph_256( 1740 (__mmask16)__U, 1741 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C), 1742 (__v16hf)__A); 1743} 1744 1745static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, 1746 __m128h __B, 1747 __m128h __C) { 1748 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, 1749 -(__v8hf)__C); 1750} 1751 1752static __inline__ __m128h __DEFAULT_FN_ATTRS128 1753_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1754 return (__m128h)__builtin_ia32_selectph_128( 1755 (__mmask8)__U, 1756 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), 1757 (__v8hf)__A); 1758} 1759 1760static __inline__ __m128h __DEFAULT_FN_ATTRS128 1761_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1762 return (__m128h)__builtin_ia32_selectph_128( 1763 (__mmask8)__U, 1764 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), 1765 (__v8hf)__C); 1766} 1767 1768static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, 1769 __m256h __B, 1770 __m256h __C) { 1771 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, 1772 -(__v16hf)__C); 1773} 1774 1775static __inline__ __m256h __DEFAULT_FN_ATTRS256 1776_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1777 return (__m256h)__builtin_ia32_selectph_256( 1778 (__mmask16)__U, 1779 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), 1780 (__v16hf)__A); 1781} 1782 1783static __inline__ __m256h __DEFAULT_FN_ATTRS256 1784_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1785 return (__m256h)__builtin_ia32_selectph_256( 1786 (__mmask16)__U, 1787 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), 1788 (__v16hf)__C); 1789} 1790 1791static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A, 1792 __m128h __B) { 1793 return (__m128h)__builtin_ia32_vfcmulcph128_mask( 1794 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); 1795} 1796 1797static __inline__ __m128h __DEFAULT_FN_ATTRS128 1798_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 1799 return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B, 1800 (__v4sf)__W, (__mmask8)__U); 1801} 1802 1803static __inline__ __m128h __DEFAULT_FN_ATTRS128 1804_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { 1805 return (__m128h)__builtin_ia32_vfcmulcph128_mask( 1806 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); 1807} 1808 1809static __inline__ __m256h __DEFAULT_FN_ATTRS128 _mm256_fcmul_pch(__m256h __A, 1810 __m256h __B) { 1811 return (__m256h)__builtin_ia32_vfcmulcph256_mask( 1812 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); 1813} 1814 1815static __inline__ __m256h __DEFAULT_FN_ATTRS256 1816_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { 1817 return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B, 1818 (__v8sf)__W, (__mmask8)__U); 1819} 1820 1821static __inline__ __m256h __DEFAULT_FN_ATTRS256 1822_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { 1823 return (__m256h)__builtin_ia32_vfcmulcph256_mask( 1824 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); 1825} 1826 1827static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A, 1828 __m128h __B, 1829 __m128h __C) { 1830 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1831 (__v4sf)__C, (__mmask8)-1); 1832} 1833 1834static __inline__ __m128h __DEFAULT_FN_ATTRS128 1835_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1836 return (__m128h)__builtin_ia32_selectps_128( 1837 __U, 1838 __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B, 1839 (__v4sf)__C, (__mmask8)__U), 1840 (__v4sf)__A); 1841} 1842 1843static __inline__ __m128h __DEFAULT_FN_ATTRS128 1844_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1845 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1846 (__v4sf)__C, (__mmask8)__U); 1847} 1848 1849static __inline__ __m128h __DEFAULT_FN_ATTRS128 1850_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1851 return (__m128h)__builtin_ia32_vfcmaddcph128_maskz( 1852 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); 1853} 1854 1855static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A, 1856 __m256h __B, 1857 __m256h __C) { 1858 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1859 (__v8sf)__C, (__mmask8)-1); 1860} 1861 1862static __inline__ __m256h __DEFAULT_FN_ATTRS256 1863_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { 1864 return (__m256h)__builtin_ia32_selectps_256( 1865 __U, 1866 __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, 1867 (__mmask8)__U), 1868 (__v8sf)__A); 1869} 1870 1871static __inline__ __m256h __DEFAULT_FN_ATTRS256 1872_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { 1873 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1874 (__v8sf)__C, (__mmask8)__U); 1875} 1876 1877static __inline__ __m256h __DEFAULT_FN_ATTRS256 1878_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { 1879 return (__m256h)__builtin_ia32_vfcmaddcph256_maskz( 1880 (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); 1881} 1882 1883static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A, 1884 __m128h __B) { 1885 return (__m128h)__builtin_ia32_vfmulcph128_mask( 1886 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); 1887} 1888 1889static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W, 1890 __mmask8 __U, 1891 __m128h __A, 1892 __m128h __B) { 1893 return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B, 1894 (__v4sf)__W, (__mmask8)__U); 1895} 1896 1897static __inline__ __m128h __DEFAULT_FN_ATTRS128 1898_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { 1899 return (__m128h)__builtin_ia32_vfmulcph128_mask( 1900 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); 1901} 1902 1903static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A, 1904 __m256h __B) { 1905 return (__m256h)__builtin_ia32_vfmulcph256_mask( 1906 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); 1907} 1908 1909static __inline__ __m256h __DEFAULT_FN_ATTRS256 1910_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { 1911 return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B, 1912 (__v8sf)__W, (__mmask8)__U); 1913} 1914 1915static __inline__ __m256h __DEFAULT_FN_ATTRS256 1916_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { 1917 return (__m256h)__builtin_ia32_vfmulcph256_mask( 1918 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); 1919} 1920 1921static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A, 1922 __m128h __B, 1923 __m128h __C) { 1924 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1925 (__v4sf)__C, (__mmask8)-1); 1926} 1927 1928static __inline__ __m128h __DEFAULT_FN_ATTRS128 1929_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1930 return (__m128h)__builtin_ia32_selectps_128( 1931 __U, 1932 __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, 1933 (__mmask8)__U), 1934 (__v4sf)__A); 1935} 1936 1937static __inline__ __m128h __DEFAULT_FN_ATTRS128 1938_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1939 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1940 (__v4sf)__C, (__mmask8)__U); 1941} 1942 1943static __inline__ __m128h __DEFAULT_FN_ATTRS128 1944_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1945 return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B, 1946 (__v4sf)__C, (__mmask8)__U); 1947} 1948 1949static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A, 1950 __m256h __B, 1951 __m256h __C) { 1952 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1953 (__v8sf)__C, (__mmask8)-1); 1954} 1955 1956static __inline__ __m256h __DEFAULT_FN_ATTRS256 1957_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { 1958 return (__m256h)__builtin_ia32_selectps_256( 1959 __U, 1960 __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, 1961 (__mmask8)__U), 1962 (__v8sf)__A); 1963} 1964 1965static __inline__ __m256h __DEFAULT_FN_ATTRS256 1966_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { 1967 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1968 (__v8sf)__C, (__mmask8)__U); 1969} 1970 1971static __inline__ __m256h __DEFAULT_FN_ATTRS256 1972_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { 1973 return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B, 1974 (__v8sf)__C, (__mmask8)__U); 1975} 1976 1977static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, 1978 __m128h __A, 1979 __m128h __W) { 1980 return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W, 1981 (__v8hf)__A); 1982} 1983 1984static __inline__ __m256h __DEFAULT_FN_ATTRS256 1985_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { 1986 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W, 1987 (__v16hf)__A); 1988} 1989 1990static __inline__ __m128h __DEFAULT_FN_ATTRS128 1991_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { 1992 return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, 1993 (__v8hi)__B); 1994} 1995 1996static __inline__ __m256h __DEFAULT_FN_ATTRS256 1997_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { 1998 return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, 1999 (__v16hi)__B); 2000} 2001 2002static __inline__ __m128h __DEFAULT_FN_ATTRS128 2003_mm_permutexvar_ph(__m128i __A, __m128h __B) { 2004 return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); 2005} 2006 2007static __inline__ __m256h __DEFAULT_FN_ATTRS256 2008_mm256_permutexvar_ph(__m256i __A, __m256h __B) { 2009 return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); 2010} 2011 2012static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2013_mm256_reduce_add_ph(__m256h __W) { 2014 return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W); 2015} 2016 2017static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2018_mm256_reduce_mul_ph(__m256h __W) { 2019 return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W); 2020} 2021 2022static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2023_mm256_reduce_max_ph(__m256h __V) { 2024 return __builtin_ia32_reduce_fmax_ph256(__V); 2025} 2026 2027static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2028_mm256_reduce_min_ph(__m256h __V) { 2029 return __builtin_ia32_reduce_fmin_ph256(__V); 2030} 2031 2032static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2033_mm_reduce_add_ph(__m128h __W) { 2034 return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W); 2035} 2036 2037static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2038_mm_reduce_mul_ph(__m128h __W) { 2039 return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W); 2040} 2041 2042static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2043_mm_reduce_max_ph(__m128h __V) { 2044 return __builtin_ia32_reduce_fmax_ph128(__V); 2045} 2046 2047static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2048_mm_reduce_min_ph(__m128h __V) { 2049 return __builtin_ia32_reduce_fmin_ph128(__V); 2050} 2051 2052// intrinsics below are alias for f*mul_*ch 2053#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B) 2054#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B) 2055#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B) 2056#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B) 2057#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B) 2058#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B) 2059 2060#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B) 2061#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B) 2062#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B) 2063#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B) 2064#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B) 2065#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B) 2066 2067#undef __DEFAULT_FN_ATTRS128 2068#undef __DEFAULT_FN_ATTRS256 2069 2070#endif 2071#endif 2072