1// Simd x86 specific implementations -*- C++ -*-
2
3// Copyright (C) 2020-2022 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library.  This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
26#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
27
28#if __cplusplus >= 201703L
29
30#if !_GLIBCXX_SIMD_X86INTRIN
31#error                                                                         \
32  "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
33#endif
34
35_GLIBCXX_SIMD_BEGIN_NAMESPACE
36
37// __to_masktype {{{
38// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
39// __vector_type_t.
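// Illustration: applied to a _SimdWrapper<float, 4>, the result is a
// _SimdWrapper<__int_for_sizeof_t<float>, 4> (4 x 32-bit signed integers)
// carrying the same bit pattern; only the element type changes.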
40template <typename _Tp, size_t _Np>
41  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
42  __to_masktype(_SimdWrapper<_Tp, _Np> __x)
43  {
44    return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(
45      __x._M_data);
46  }
47
48template <typename _TV,
49	  typename _TVT
50	  = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
51	  typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
52  _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
53  __to_masktype(_TV __x)
54  { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
55
56// }}}
57// __interleave128_lo {{{
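// Interleaves the low halves of each 128-bit lane of __a and __b (an unpacklo
// within every lane). E.g. for 32-byte vectors of 8 ints:
//   {a0..a7}, {b0..b7} -> {a0,b0,a1,b1, a4,b4,a5,b5}.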
58template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
59	  typename _Trait = _VectorTraits<_Tp>>
60  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
61  __interleave128_lo(const _Ap& __av, const _Bp& __bv)
62  {
63    const _Tp __a(__av);
64    const _Tp __b(__bv);
65    if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
66      return _Tp{__a[0], __b[0]};
67    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
68      return _Tp{__a[0], __b[0], __a[1], __b[1]};
69    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
70      return _Tp{__a[0], __b[0], __a[1], __b[1],
71		 __a[2], __b[2], __a[3], __b[3]};
72    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
73      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
74		 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
75		 __a[6], __b[6], __a[7], __b[7]};
76    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
77      return _Tp{__a[0], __b[0], __a[2], __b[2]};
78    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
79      return _Tp{__a[0], __b[0], __a[1], __b[1],
80		 __a[4], __b[4], __a[5], __b[5]};
81    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
82      return _Tp{__a[0],  __b[0],  __a[1],  __b[1], __a[2], __b[2],
83		 __a[3],  __b[3],  __a[8],  __b[8], __a[9], __b[9],
84		 __a[10], __b[10], __a[11], __b[11]};
85    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
86      return _Tp{__a[0],  __b[0],  __a[1],  __b[1],  __a[2],  __b[2],  __a[3],
87		 __b[3],  __a[4],  __b[4],  __a[5],  __b[5],  __a[6],  __b[6],
88		 __a[7],  __b[7],  __a[16], __b[16], __a[17], __b[17], __a[18],
89		 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
90		 __a[22], __b[22], __a[23], __b[23]};
91    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
92      return _Tp{__a[0], __b[0], __a[2], __b[2],
93		 __a[4], __b[4], __a[6], __b[6]};
94    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
95      return _Tp{__a[0],  __b[0],  __a[1],  __b[1], __a[4], __b[4],
96		 __a[5],  __b[5],  __a[8],  __b[8], __a[9], __b[9],
97		 __a[12], __b[12], __a[13], __b[13]};
98    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
99      return _Tp{__a[0],  __b[0],  __a[1],  __b[1],  __a[2],  __b[2],  __a[3],
100		 __b[3],  __a[8],  __b[8],  __a[9],  __b[9],  __a[10], __b[10],
101		 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
102		 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
103		 __a[26], __b[26], __a[27], __b[27]};
104    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
105      return _Tp{__a[0],  __b[0],  __a[1],  __b[1],  __a[2],  __b[2],  __a[3],
106		 __b[3],  __a[4],  __b[4],  __a[5],  __b[5],  __a[6],  __b[6],
107		 __a[7],  __b[7],  __a[16], __b[16], __a[17], __b[17], __a[18],
108		 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
109		 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
110		 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
111		 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
112		 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
113		 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
114		 __b[55]};
115    else
116      __assert_unreachable<_Tp>();
117  }
118
119// }}}
120// __is_zero{{{
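// Intended result: true iff all bits of __a are zero. The constant-evaluation
// branch checks exactly that; the vtestp[sd] paths only examine sign bits,
// which suffices for the all-ones/all-zeros mask vectors used here.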
121template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
122  _GLIBCXX_SIMD_INTRINSIC constexpr bool
123  __is_zero(_Tp __a)
124  {
125    if (!__builtin_is_constant_evaluated())
126      {
127	if constexpr (__have_avx)
128	  {
129	    if constexpr (_TVT::template _S_is<float, 8>)
130	      return _mm256_testz_ps(__a, __a);
131	    else if constexpr (_TVT::template _S_is<double, 4>)
132	      return _mm256_testz_pd(__a, __a);
133	    else if constexpr (sizeof(_Tp) == 32)
134	      return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
135	    else if constexpr (_TVT::template _S_is<float>)
136	      return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
137	    else if constexpr (_TVT::template _S_is<double, 2>)
138	      return _mm_testz_pd(__a, __a);
139	    else
140	      return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
141	  }
142	else if constexpr (__have_sse4_1)
143	  return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
144				 __intrin_bitcast<__m128i>(__a));
145      }
146    else if constexpr (sizeof(_Tp) <= 8)
147      return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
148    else
149      {
150	const auto __b = __vector_bitcast<_LLong>(__a);
151	if constexpr (sizeof(__b) == 16)
152	  return (__b[0] | __b[1]) == 0;
153	else if constexpr (sizeof(__b) == 32)
154	  return __is_zero(__lo128(__b) | __hi128(__b));
155	else if constexpr (sizeof(__b) == 64)
156	  return __is_zero(__lo256(__b) | __hi256(__b));
157	else
158	  __assert_unreachable<_Tp>();
159      }
160  }
161
162// }}}
163// __movemask{{{
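// Collects the most significant bit of each element (float/double) or of each
// byte (integer vectors) into the low bits of the returned int, i.e. the
// movmskp[sd] / pmovmskb semantics.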
164template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
165  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
166  __movemask(_Tp __a)
167  {
168    if constexpr (sizeof(_Tp) == 32)
169      {
170	if constexpr (_TVT::template _S_is<float>)
171	  return _mm256_movemask_ps(__to_intrin(__a));
172	else if constexpr (_TVT::template _S_is<double>)
173	  return _mm256_movemask_pd(__to_intrin(__a));
174	else
175	  return _mm256_movemask_epi8(__to_intrin(__a));
176      }
177    else if constexpr (_TVT::template _S_is<float>)
178      return _mm_movemask_ps(__to_intrin(__a));
179    else if constexpr (_TVT::template _S_is<double>)
180      return _mm_movemask_pd(__to_intrin(__a));
181    else
182      return _mm_movemask_epi8(__to_intrin(__a));
183  }
184
185// }}}
186// __testz{{{
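// Intended as the ZF result of ptest/vtestp[sd]: nonzero when __a & __b has
// no bits set (the constant-evaluation branch checks exactly that; the p[sd]
// forms only inspect sign bits, which suffices for mask vectors).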
187template <typename _TI, typename _TVT = _VectorTraits<_TI>>
188  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
189  __testz(_TI __a, _TI __b)
190  {
191    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
192						    _TVT::_S_full_size>>);
193    if (!__builtin_is_constant_evaluated())
194      {
195	if constexpr (sizeof(_TI) == 32)
196	  {
197	    if constexpr (_TVT::template _S_is<float>)
198	      return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
199	    else if constexpr (_TVT::template _S_is<double>)
200	      return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
201	    else
202	      return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
203	  }
204	else if constexpr (_TVT::template _S_is<float> && __have_avx)
205	  return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
206	else if constexpr (_TVT::template _S_is<double> && __have_avx)
207	  return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
208	else if constexpr (__have_sse4_1)
209	  return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
210				 __intrin_bitcast<__m128i>(__to_intrin(__b)));
211	else
212	  return __movemask(0 == __and(__a, __b)) != 0;
213      }
214    else
215      return __is_zero(__and(__a, __b));
216  }
217
218// }}}
219// __testc{{{
220// requires SSE4.1 or above
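// Returns nonzero iff ~__a & __b is all zeros, i.e. the CF result of
// ptest/vtestp[sd] (sign bits only for the p[sd] forms).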
221template <typename _TI, typename _TVT = _VectorTraits<_TI>>
222  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
223  __testc(_TI __a, _TI __b)
224  {
225    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
226						    _TVT::_S_full_size>>);
227    if (__builtin_is_constant_evaluated())
228      return __is_zero(__andnot(__a, __b));
229
230    if constexpr (sizeof(_TI) == 32)
231      {
232	if constexpr (_TVT::template _S_is<float>)
233	  return _mm256_testc_ps(__a, __b);
234	else if constexpr (_TVT::template _S_is<double>)
235	  return _mm256_testc_pd(__a, __b);
236	else
237	  return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
238      }
239    else if constexpr (_TVT::template _S_is<float> && __have_avx)
240      return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
241    else if constexpr (_TVT::template _S_is<double> && __have_avx)
242      return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
243    else
244      {
245	static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
246	return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
247			       __intrin_bitcast<__m128i>(__to_intrin(__b)));
248      }
249  }
250
251// }}}
252// __testnzc{{{
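// Intended as ZF == 0 && CF == 0 of ptest/vtestp[sd]: nonzero when both
// __a & __b and ~__a & __b have bits set (cf. the constant-evaluation branch).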
253template <typename _TI, typename _TVT = _VectorTraits<_TI>>
254  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
255  __testnzc(_TI __a, _TI __b)
256  {
257    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
258						    _TVT::_S_full_size>>);
259    if (!__builtin_is_constant_evaluated())
260      {
261	if constexpr (sizeof(_TI) == 32)
262	  {
263	    if constexpr (_TVT::template _S_is<float>)
264	      return _mm256_testnzc_ps(__a, __b);
265	    else if constexpr (_TVT::template _S_is<double>)
266	      return _mm256_testnzc_pd(__a, __b);
267	    else
268	      return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
269	  }
270	else if constexpr (_TVT::template _S_is<float> && __have_avx)
271	  return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
272	else if constexpr (_TVT::template _S_is<double> && __have_avx)
273	  return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
274	else if constexpr (__have_sse4_1)
275	  return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
276				   __intrin_bitcast<__m128i>(__to_intrin(__b)));
277	else
278	  return __movemask(0 == __and(__a, __b)) == 0
279		 && __movemask(0 == __andnot(__a, __b)) == 0;
280      }
281    else
282      return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
283  }
284
285// }}}
286// __xzyw{{{
// Shuffles the complete vector, swapping its two inner quarters. Often useful
// with AVX for fixing up a shuffle result.
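// E.g. a 32-byte vector of 8 ints {0,1,2,3,4,5,6,7} becomes {0,1,4,5,2,3,6,7}
// (quarters q0 q1 q2 q3 -> q0 q2 q1 q3).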
289template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
290  _GLIBCXX_SIMD_INTRINSIC _Tp
291  __xzyw(_Tp __a)
292  {
293    if constexpr (sizeof(_Tp) == 16)
294      {
295	const auto __x = __vector_bitcast<conditional_t<
296	  is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
297	return reinterpret_cast<_Tp>(
298	  decltype(__x){__x[0], __x[2], __x[1], __x[3]});
299      }
300    else if constexpr (sizeof(_Tp) == 32)
301      {
302	const auto __x = __vector_bitcast<conditional_t<
303	  is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
304	return reinterpret_cast<_Tp>(
305	  decltype(__x){__x[0], __x[2], __x[1], __x[3]});
306      }
307    else if constexpr (sizeof(_Tp) == 64)
308      {
309	const auto __x = __vector_bitcast<conditional_t<
310	  is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
311	return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
312						   __x[5], __x[2], __x[3],
313						   __x[6], __x[7]});
314      }
315    else
316      __assert_unreachable<_Tp>();
317  }
318
319// }}}
320// __maskload_epi32{{{
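// The __maskload_* helpers below dispatch to the 128-bit or 256-bit AVX(2)
// masked-load intrinsics depending on the width of the mask vector __k;
// elements whose mask element has the most significant bit clear are zeroed.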
321template <typename _Tp>
322  _GLIBCXX_SIMD_INTRINSIC auto
323  __maskload_epi32(const int* __ptr, _Tp __k)
324  {
325    if constexpr (sizeof(__k) == 16)
326      return _mm_maskload_epi32(__ptr, __k);
327    else
328      return _mm256_maskload_epi32(__ptr, __k);
329  }
330
331// }}}
332// __maskload_epi64{{{
333template <typename _Tp>
334  _GLIBCXX_SIMD_INTRINSIC auto
335  __maskload_epi64(const _LLong* __ptr, _Tp __k)
336  {
337    if constexpr (sizeof(__k) == 16)
338      return _mm_maskload_epi64(__ptr, __k);
339    else
340      return _mm256_maskload_epi64(__ptr, __k);
341  }
342
343// }}}
344// __maskload_ps{{{
345template <typename _Tp>
346  _GLIBCXX_SIMD_INTRINSIC auto
347  __maskload_ps(const float* __ptr, _Tp __k)
348  {
349    if constexpr (sizeof(__k) == 16)
350      return _mm_maskload_ps(__ptr, __k);
351    else
352      return _mm256_maskload_ps(__ptr, __k);
353  }
354
355// }}}
356// __maskload_pd{{{
357template <typename _Tp>
358  _GLIBCXX_SIMD_INTRINSIC auto
359  __maskload_pd(const double* __ptr, _Tp __k)
360  {
361    if constexpr (sizeof(__k) == 16)
362      return _mm_maskload_pd(__ptr, __k);
363    else
364      return _mm256_maskload_pd(__ptr, __k);
365  }
366
367// }}}
368
369#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
370#include "simd_x86_conversions.h"
371#endif
372
373// ISA & type detection {{{
374template <typename _Tp, size_t _Np>
375  constexpr bool
376  __is_sse_ps()
377  {
    return __have_sse && is_same_v<_Tp, float>
	   && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
381  }
382
383template <typename _Tp, size_t _Np>
384  constexpr bool
385  __is_sse_pd()
386  {
    return __have_sse2 && is_same_v<_Tp, double>
	   && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
390  }
391
392template <typename _Tp, size_t _Np>
393  constexpr bool
394  __is_avx_ps()
395  {
    return __have_avx && is_same_v<_Tp, float>
	   && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
399  }
400
401template <typename _Tp, size_t _Np>
402  constexpr bool
403  __is_avx_pd()
404  {
    return __have_avx && is_same_v<_Tp, double>
	   && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
408  }
409
410template <typename _Tp, size_t _Np>
411  constexpr bool
412  __is_avx512_ps()
413  {
    return __have_avx512f && is_same_v<_Tp, float>
	   && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
417  }
418
419template <typename _Tp, size_t _Np>
420  constexpr bool
421  __is_avx512_pd()
422  {
    return __have_avx512f && is_same_v<_Tp, double>
	   && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
426  }
427
428// }}}
429struct _MaskImplX86Mixin;
430
431// _CommonImplX86 {{{
432struct _CommonImplX86 : _CommonImplBuiltin
433{
434#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
435  // _S_converts_via_decomposition {{{
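  // Heuristic: true if the _From -> _To conversion (into a _ToSize-byte
  // vector) has no sufficiently direct instruction sequence on the enabled
  // ISA and is therefore expected to be decomposed into scalar conversions,
  // in which case callers may prefer a different code path.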
436  template <typename _From, typename _To, size_t _ToSize>
437    static constexpr bool _S_converts_via_decomposition()
438    {
      if constexpr (is_integral_v<_From> && is_integral_v<_To>
		    && sizeof(_From) == 8 && _ToSize == 16)
442	return (sizeof(_To) == 2 && !__have_ssse3)
443	       || (sizeof(_To) == 1 && !__have_avx512f);
444      else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
445	return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
446		&& !__have_avx512dq)
447	       || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
448		   && _ToSize == 16);
449      else if constexpr (
450	is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8
451	&& !__have_avx512dq)
452	return (sizeof(_To) == 4 && _ToSize == 16)
453	       || (sizeof(_To) == 8 && _ToSize < 64);
454      else
455	return false;
456    }
457
458  template <typename _From, typename _To, size_t _ToSize>
459    static inline constexpr bool __converts_via_decomposition_v
460      = _S_converts_via_decomposition<_From, _To, _ToSize>();
461
462  // }}}
463#endif
464  // _S_store {{{
465  using _CommonImplBuiltin::_S_store;
466
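  // For sizes that are not a power of two (e.g. a 24-byte store of 3 doubles)
  // and with AVX512BW+VL available, a masked store writes exactly _Bytes
  // bytes and never past the end; everything else is forwarded to the base
  // implementation.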
467  template <typename _Tp, size_t _Np>
468    _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __x,
469						 void* __addr)
470    {
471      constexpr size_t _Bytes = _Np * sizeof(_Tp);
472
473      if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
474	{
475	  const auto __v = __to_intrin(__x);
476
477	  if constexpr (_Bytes & 1)
478	    {
479	      if constexpr (_Bytes < 16)
480		_mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
481				     __intrin_bitcast<__m128i>(__v));
482	      else if constexpr (_Bytes < 32)
483		_mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
484					__intrin_bitcast<__m256i>(__v));
485	      else
486		_mm512_mask_storeu_epi8(__addr,
487					0xffffffffffffffffull >> (64 - _Bytes),
488					__intrin_bitcast<__m512i>(__v));
489	    }
490	  else if constexpr (_Bytes & 2)
491	    {
492	      if constexpr (_Bytes < 16)
493		_mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
494				      __intrin_bitcast<__m128i>(__v));
495	      else if constexpr (_Bytes < 32)
496		_mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
497					 __intrin_bitcast<__m256i>(__v));
498	      else
499		_mm512_mask_storeu_epi16(__addr,
500					 0xffffffffull >> (32 - _Bytes / 2),
501					 __intrin_bitcast<__m512i>(__v));
502	    }
503	  else if constexpr (_Bytes & 4)
504	    {
505	      if constexpr (_Bytes < 16)
506		_mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
507				      __intrin_bitcast<__m128i>(__v));
508	      else if constexpr (_Bytes < 32)
509		_mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
510					 __intrin_bitcast<__m256i>(__v));
511	      else
512		_mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
513					 __intrin_bitcast<__m512i>(__v));
514	    }
515	  else
516	    {
517	      static_assert(
518		_Bytes > 16,
519		"_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
520		"- 1)) != 0 is impossible");
521	      if constexpr (_Bytes < 32)
522		_mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
523					 __intrin_bitcast<__m256i>(__v));
524	      else
525		_mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
526					 __intrin_bitcast<__m512i>(__v));
527	    }
528	}
529      else
530	_CommonImplBuiltin::_S_store(__x, __addr);
531    }
532
533  // }}}
534  // _S_store_bool_array(_BitMask) {{{
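  // Expands the _Np mask bits into _Np bytes of value 0 or 1 and stores them
  // to __mem, e.g. bits 0b1011 (LSB first) become the bytes {1, 1, 0, 1}.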
535  template <size_t _Np, bool _Sanitized>
536    _GLIBCXX_SIMD_INTRINSIC static constexpr void
537    _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
538    {
539      if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
540	_S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>([=]() constexpr {
541			if constexpr (_Np <= 16)
542			  return _mm_movm_epi8(__x._M_to_bits());
543			else if constexpr (_Np <= 32)
544			  return _mm256_movm_epi8(__x._M_to_bits());
545			else if constexpr (_Np <= 64)
546			  return _mm512_movm_epi8(__x._M_to_bits());
547			else
548			  __assert_unreachable<_SizeConstant<_Np>>();
549		      }()),
550		      __mem);
551      else if constexpr (__have_bmi2)
552	{
553	  if constexpr (_Np <= 4)
554	    _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
555	  else
556	    __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
557	      [&](auto __i) {
558		constexpr size_t __offset = __i * sizeof(size_t);
559		constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
560		if constexpr (__todo == 1)
561		  __mem[__offset] = __x[__offset];
562		else
563		  {
564		    const auto __bools =
565#ifdef __x86_64__
566		      _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
567				0x0101010101010101ULL);
568#else // __x86_64__
569		      _pdep_u32(
570			__x.template _M_extract<__offset>()._M_to_bits(),
571			0x01010101U);
572#endif // __x86_64__
573		    _S_store<__todo>(__bools, __mem + __offset);
574		  }
575	      });
576	}
577      else if constexpr (__have_sse2 && _Np > 7)
578	__execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) {
579	  constexpr int __offset = __i * 16;
580	  constexpr int __todo = std::min(16, int(_Np) - __offset);
581	  const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
582	  __vector_type16_t<_UChar> __bools;
583	  if constexpr (__have_avx512f)
584	    {
585	      auto __as32bits
586		= _mm512_maskz_mov_epi32(__bits, __to_intrin(
587						   __vector_broadcast<16>(1)));
588	      auto __as16bits
589		= __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
590					    __todo > 8 ? __hi256(__as32bits)
591						       : __m256i()));
592	      __bools = __vector_bitcast<_UChar>(
593		_mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
594	    }
595	  else
596	    {
597	      using _V = __vector_type_t<_UChar, 16>;
598	      auto __tmp = _mm_cvtsi32_si128(__bits);
599	      __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
600	      __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
601	      __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
602	      _V __tmp2 = reinterpret_cast<_V>(__tmp);
603	      __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
604			   1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index
605	      __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01
606	    }
607	  _S_store<__todo>(__bools, __mem + __offset);
608	});
609      else
610	_CommonImplBuiltin::_S_store_bool_array(__x, __mem);
611    }
612
613  // }}}
614  // _S_blend_avx512 {{{
615  // Returns: __k ? __b : __a
616  // TODO: reverse __a and __b to match COND_EXPR
617  // Requires: _TV to be a __vector_type_t matching valuetype for the bitmask
618  //           __k
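  // E.g. __k = 0b0101, __a = {a0,a1,a2,a3}, __b = {b0,b1,b2,b3}
  //   -> {b0, a1, b2, a3}: bit __i of __k set selects __b[__i].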
619  template <typename _Kp, typename _TV>
620    _GLIBCXX_SIMD_INTRINSIC static _TV
621    _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
622    {
623#ifdef __clang__
624      // FIXME: this does a boolean choice, not a blend
625      return __k ? __a : __b;
626#else
627      static_assert(__is_vector_type_v<_TV>);
628      using _Tp = typename _VectorTraits<_TV>::value_type;
629      static_assert(sizeof(_TV) >= 16);
630      static_assert(sizeof(_Tp) <= 8);
631      using _IntT
632	= conditional_t<(sizeof(_Tp) > 2),
633			conditional_t<sizeof(_Tp) == 4, int, long long>,
634			conditional_t<sizeof(_Tp) == 1, char, short>>;
635      [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
636      [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
637      if constexpr (sizeof(_TV) == 64)
638	{
639	  if constexpr (sizeof(_Tp) == 1)
640	    return reinterpret_cast<_TV>(
641	      __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
642	  else if constexpr (sizeof(_Tp) == 2)
643	    return reinterpret_cast<_TV>(
644	      __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
645	  else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
646	    return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
647	  else if constexpr (sizeof(_Tp) == 4)
648	    return reinterpret_cast<_TV>(
649	      __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
650	  else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
651	    return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
652	  else if constexpr (sizeof(_Tp) == 8)
653	    return reinterpret_cast<_TV>(
654	      __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
655	}
656      else if constexpr (sizeof(_TV) == 32)
657	{
658	  if constexpr (sizeof(_Tp) == 1)
659	    return reinterpret_cast<_TV>(
660	      __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
661	  else if constexpr (sizeof(_Tp) == 2)
662	    return reinterpret_cast<_TV>(
663	      __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
664	  else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
665	    return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
666	  else if constexpr (sizeof(_Tp) == 4)
667	    return reinterpret_cast<_TV>(
668	      __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
669	  else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
670	    return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
671	  else if constexpr (sizeof(_Tp) == 8)
672	    return reinterpret_cast<_TV>(
673	      __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
674	}
675      else if constexpr (sizeof(_TV) == 16)
676	{
677	  if constexpr (sizeof(_Tp) == 1)
678	    return reinterpret_cast<_TV>(
679	      __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
680	  else if constexpr (sizeof(_Tp) == 2)
681	    return reinterpret_cast<_TV>(
682	      __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
683	  else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
684	    return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
685	  else if constexpr (sizeof(_Tp) == 4)
686	    return reinterpret_cast<_TV>(
687	      __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
688	  else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
689	    return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
690	  else if constexpr (sizeof(_Tp) == 8)
691	    return reinterpret_cast<_TV>(
692	      __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
693	}
694#endif
695    }
696
697  // }}}
698  // _S_blend_intrin {{{
699  // Returns: __k ? __b : __a
700  // TODO: reverse __a and __b to match COND_EXPR
701  // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32
702  //           Bytes wide
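  // Note: the blendv intrinsics select on the most significant bit of each
  // element (blendvp[sd]) resp. of each byte (pblendvb), so __k is expected
  // to have all bits of a selected lane set.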
703  template <typename _Tp>
704    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_blend_intrin(_Tp __k, _Tp __a,
705						       _Tp __b) noexcept
706    {
707      static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
708      constexpr struct
709      {
710	_GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
711						  __m128 __k) const noexcept
712	{
713	  return __builtin_ia32_blendvps(__a, __b, __k);
714	}
715	_GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
716						   __m128d __k) const noexcept
717	{
718	  return __builtin_ia32_blendvpd(__a, __b, __k);
719	}
720	_GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
721						   __m128i __k) const noexcept
722	{
723	  return reinterpret_cast<__m128i>(
724	    __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
725				       reinterpret_cast<__v16qi>(__b),
726				       reinterpret_cast<__v16qi>(__k)));
727	}
728	_GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
729						  __m256 __k) const noexcept
730	{
731	  return __builtin_ia32_blendvps256(__a, __b, __k);
732	}
733	_GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
734						   __m256d __k) const noexcept
735	{
736	  return __builtin_ia32_blendvpd256(__a, __b, __k);
737	}
738	_GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
739						   __m256i __k) const noexcept
740	{
741	  if constexpr (__have_avx2)
742	    return reinterpret_cast<__m256i>(
743	      __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
744					 reinterpret_cast<__v32qi>(__b),
745					 reinterpret_cast<__v32qi>(__k)));
746	  else
747	    return reinterpret_cast<__m256i>(
748	      __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
749					 reinterpret_cast<__v8sf>(__b),
750					 reinterpret_cast<__v8sf>(__k)));
751	}
752      } __eval;
753      return __eval(__a, __b, __k);
754    }
755
756  // }}}
757  // _S_blend {{{
758  // Returns: __k ? __at1 : __at0
759  // TODO: reverse __at0 and __at1 to match COND_EXPR
760  template <typename _Tp, size_t _Np>
761    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
762    _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
763	     _SimdWrapper<_Tp, _Np> __at1)
764    {
765      static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
766      if (__k._M_is_constprop() && __at0._M_is_constprop()
767	  && __at1._M_is_constprop())
768	return __generate_from_n_evaluations<_Np,
769					     __vector_type_t<_Tp, _Np>>([&](
770	  auto __i) constexpr { return __k[__i] ? __at1[__i] : __at0[__i]; });
771      else if constexpr (sizeof(__at0) == 64
772			 || (__have_avx512vl && sizeof(__at0) >= 16))
773	return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
774      else
775	{
776	  static_assert((__have_avx512vl && sizeof(__at0) < 16)
777			|| !__have_avx512vl);
778	  constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
779	  return __vector_bitcast<_Tp, _Np>(
780	    _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
781			    __vector_bitcast<_Tp, __size>(__at1)));
782	}
783    }
784
785  template <typename _Tp, size_t _Np>
786    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
787    _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
788	     _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
789    {
790      const auto __kk = __wrapper_bitcast<_Tp>(__k);
791      if (__builtin_is_constant_evaluated()
792	  || (__kk._M_is_constprop() && __at0._M_is_constprop()
793	      && __at1._M_is_constprop()))
794	{
795	  auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
796	  if (__r._M_is_constprop())
797	    return __r;
798	}
799      if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
800		    && (sizeof(_Tp) >= 4 || __have_avx512bw))
801	// convert to bitmask and call overload above
802	return _S_blend(
803	  _SimdWrapper<bool, _Np>(
804	    __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
805	      ._M_to_bits()),
806	  __at0, __at1);
807      else
808	{
	  // Since GCC does not assume __k to be a mask, using the builtin
	  // conditional operator introduces an extra compare against 0 before
	  // blending. So we call the intrinsic here instead.
812	  if constexpr (__have_sse4_1)
813	    return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
814				   __to_intrin(__at1));
815	  else
816	    return __or(__andnot(__kk, __at0), __and(__kk, __at1));
817	}
818    }
819
820  // }}}
821};
822
823// }}}
824// _SimdImplX86 {{{
825template <typename _Abi, typename>
826  struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
827  {
828    using _Base = _SimdImplBuiltin<_Abi>;
829
830    template <typename _Tp>
831      using _MaskMember = typename _Base::template _MaskMember<_Tp>;
832
833    template <typename _Tp>
834      static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
835
836    template <typename _Tp>
837      static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
838
839    template <typename _Tp>
840      static constexpr size_t _S_max_store_size
	= (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw   ? 64
	  : (is_floating_point_v<_Tp> && __have_avx) || __have_avx2 ? 32
								    : 16;
844    using _MaskImpl = typename _Abi::_MaskImpl;
845
846    // _S_masked_load {{{
847    template <typename _Tp, size_t _Np, typename _Up>
848      static inline _SimdWrapper<_Tp, _Np>
849      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
850		     const _Up* __mem) noexcept
851      {
852	static_assert(_Np == _S_size<_Tp>);
	if constexpr (is_same_v<_Tp, _Up> // no conversion
		      || (sizeof(_Tp) == sizeof(_Up)
			  && is_integral_v<_Tp> == is_integral_v<_Up>)
		      // conversion via bit reinterpretation
	)
859	  {
860	    [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
861	    if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
862			  && sizeof(_Tp) == 1)
863	      {
864		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
865		if constexpr (sizeof(__intrin) == 16)
866		  __merge = __vector_bitcast<_Tp, _Np>(
867		    _mm_mask_loadu_epi8(__intrin, __kk, __mem));
868		else if constexpr (sizeof(__merge) == 32)
869		  __merge = __vector_bitcast<_Tp, _Np>(
870		    _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
871		else if constexpr (sizeof(__merge) == 64)
872		  __merge = __vector_bitcast<_Tp, _Np>(
873		    _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
874		else
875		  __assert_unreachable<_Tp>();
876	      }
877	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
878			       && sizeof(_Tp) == 2)
879	      {
880		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
881		if constexpr (sizeof(__intrin) == 16)
882		  __merge = __vector_bitcast<_Tp, _Np>(
883		    _mm_mask_loadu_epi16(__intrin, __kk, __mem));
884		else if constexpr (sizeof(__intrin) == 32)
885		  __merge = __vector_bitcast<_Tp, _Np>(
886		    _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
887		else if constexpr (sizeof(__intrin) == 64)
888		  __merge = __vector_bitcast<_Tp, _Np>(
889		    _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
890		else
891		  __assert_unreachable<_Tp>();
892	      }
893	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
894			       && sizeof(_Tp) == 4 && is_integral_v<_Up>)
895	      {
896		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
897		if constexpr (sizeof(__intrin) == 16)
898		  __merge = __vector_bitcast<_Tp, _Np>(
899		    _mm_mask_loadu_epi32(__intrin, __kk, __mem));
900		else if constexpr (sizeof(__intrin) == 32)
901		  __merge = __vector_bitcast<_Tp, _Np>(
902		    _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
903		else if constexpr (sizeof(__intrin) == 64)
904		  __merge = __vector_bitcast<_Tp, _Np>(
905		    _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
906		else
907		  __assert_unreachable<_Tp>();
908	      }
909	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
910			       && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
911	      {
912		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
913		if constexpr (sizeof(__intrin) == 16)
914		  __merge = __vector_bitcast<_Tp, _Np>(
915		    _mm_mask_loadu_ps(__intrin, __kk, __mem));
916		else if constexpr (sizeof(__intrin) == 32)
917		  __merge = __vector_bitcast<_Tp, _Np>(
918		    _mm256_mask_loadu_ps(__intrin, __kk, __mem));
919		else if constexpr (sizeof(__intrin) == 64)
920		  __merge = __vector_bitcast<_Tp, _Np>(
921		    _mm512_mask_loadu_ps(__intrin, __kk, __mem));
922		else
923		  __assert_unreachable<_Tp>();
924	      }
925	    else if constexpr (__have_avx2 && sizeof(_Tp) == 4
926			       && is_integral_v<_Up>)
927	      {
928		static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
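		// Keep __merge where __k is false and take the (zero-filling)
		// masked load where __k is true.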
929		__merge
930		  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
931			 __vector_bitcast<_Tp, _Np>(
932			   __maskload_epi32(reinterpret_cast<const int*>(__mem),
933					    __to_intrin(__k))));
934	      }
935	    else if constexpr (__have_avx && sizeof(_Tp) == 4)
936	      {
937		static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
938		__merge
939		  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
940			 __vector_bitcast<_Tp, _Np>(
941			   __maskload_ps(reinterpret_cast<const float*>(__mem),
942					 __to_intrin(__k))));
943	      }
944	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
945			       && sizeof(_Tp) == 8 && is_integral_v<_Up>)
946	      {
947		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
948		if constexpr (sizeof(__intrin) == 16)
949		  __merge = __vector_bitcast<_Tp, _Np>(
950		    _mm_mask_loadu_epi64(__intrin, __kk, __mem));
951		else if constexpr (sizeof(__intrin) == 32)
952		  __merge = __vector_bitcast<_Tp, _Np>(
953		    _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
954		else if constexpr (sizeof(__intrin) == 64)
955		  __merge = __vector_bitcast<_Tp, _Np>(
956		    _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
957		else
958		  __assert_unreachable<_Tp>();
959	      }
960	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
961			       && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
962	      {
963		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
964		if constexpr (sizeof(__intrin) == 16)
965		  __merge = __vector_bitcast<_Tp, _Np>(
966		    _mm_mask_loadu_pd(__intrin, __kk, __mem));
967		else if constexpr (sizeof(__intrin) == 32)
968		  __merge = __vector_bitcast<_Tp, _Np>(
969		    _mm256_mask_loadu_pd(__intrin, __kk, __mem));
970		else if constexpr (sizeof(__intrin) == 64)
971		  __merge = __vector_bitcast<_Tp, _Np>(
972		    _mm512_mask_loadu_pd(__intrin, __kk, __mem));
973		else
974		  __assert_unreachable<_Tp>();
975	      }
976	    else if constexpr (__have_avx2 && sizeof(_Tp) == 8
977			       && is_integral_v<_Up>)
978	      {
979		static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
980		__merge
981		  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
982			 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
983			   reinterpret_cast<const _LLong*>(__mem),
984			   __to_intrin(__k))));
985	      }
986	    else if constexpr (__have_avx && sizeof(_Tp) == 8)
987	      {
988		static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
989		__merge
990		  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
991			 __vector_bitcast<_Tp, _Np>(
992			   __maskload_pd(reinterpret_cast<const double*>(__mem),
993					 __to_intrin(__k))));
994	      }
995	    else
996	      _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
997					[&](auto __i) {
998					  __merge._M_set(__i, static_cast<_Tp>(
999								__mem[__i]));
1000					});
1001	  }
	/* Very uncertain that the following improves anything. Needs
	 * benchmarking before it's activated.
1005	else if constexpr (sizeof(_Up) <= 8 && // no long double
1006			   !__converts_via_decomposition_v<
1007			     _Up, _Tp,
1008			     sizeof(__merge)> // conversion via decomposition
1009					      // is better handled via the
1010					      // bit_iteration fallback below
1011	)
1012	  {
1013	    // TODO: copy pattern from _S_masked_store, which doesn't resort to
1014	    // fixed_size
1015	    using _Ap       = simd_abi::deduce_t<_Up, _Np>;
1016	    using _ATraits = _SimdTraits<_Up, _Ap>;
1017	    using _AImpl   = typename _ATraits::_SimdImpl;
1018	    typename _ATraits::_SimdMember __uncvted{};
1019	    typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template
1020	_S_convert<_Up>(__k);
1021	    __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem);
1022	    _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter;
1023	    _Base::_S_masked_assign(__k, __merge, __converter(__uncvted));
1024	  }
1025	  */
1026	else
1027	  __merge = _Base::_S_masked_load(__merge, __k, __mem);
1028	return __merge;
1029      }
1030
1031    // }}}
1032    // _S_masked_store_nocvt {{{
1033    template <typename _Tp, size_t _Np>
1034      _GLIBCXX_SIMD_INTRINSIC static void
1035      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1036			    _SimdWrapper<bool, _Np> __k)
1037      {
1038	[[maybe_unused]] const auto __vi = __to_intrin(__v);
1039	if constexpr (sizeof(__vi) == 64)
1040	  {
1041	    static_assert(sizeof(__v) == 64 && __have_avx512f);
1042	    if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
1043	      _mm512_mask_storeu_epi8(__mem, __k, __vi);
1044	    else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
1045	      _mm512_mask_storeu_epi16(__mem, __k, __vi);
1046	    else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
1047	      {
1048		if constexpr (is_integral_v<_Tp>)
1049		  _mm512_mask_storeu_epi32(__mem, __k, __vi);
1050		else
1051		  _mm512_mask_storeu_ps(__mem, __k, __vi);
1052	      }
1053	    else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
1054	      {
1055		if constexpr (is_integral_v<_Tp>)
1056		  _mm512_mask_storeu_epi64(__mem, __k, __vi);
1057		else
1058		  _mm512_mask_storeu_pd(__mem, __k, __vi);
1059	      }
#if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(__vi) <= 32
1061      // with Skylake-AVX512, __have_avx512bw is true
1062	  else if constexpr (__have_sse2)
1063	    {
1064	      using _M   = __vector_type_t<_Tp, _Np>;
1065	      using _MVT = _VectorTraits<_M>;
1066	      _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)),
1067				  __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
1068				  reinterpret_cast<char*>(__mem));
1069	      _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)),
1070				  __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1071				    __k._M_data >> 1 * _MVT::_S_full_size)),
1072				  reinterpret_cast<char*>(__mem) + 1 * 16);
1073	      _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)),
1074				  __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1075				    __k._M_data >> 2 * _MVT::_S_full_size)),
1076				  reinterpret_cast<char*>(__mem) + 2 * 16);
1077	      if constexpr (_Np > 48 / sizeof(_Tp))
1078		_mm_maskmoveu_si128(
1079		  __auto_bitcast(__extract<3, 4>(__v._M_data)),
1080		  __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1081		    __k._M_data >> 3 * _MVT::_S_full_size)),
1082		  reinterpret_cast<char*>(__mem) + 3 * 16);
1083	    }
1084#endif
1085	    else
1086	      __assert_unreachable<_Tp>();
1087	  }
1088	else if constexpr (sizeof(__vi) == 32)
1089	  {
1090	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1091	      _mm256_mask_storeu_epi8(__mem, __k, __vi);
1092	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1093	      _mm256_mask_storeu_epi16(__mem, __k, __vi);
1094	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1095	      {
1096		if constexpr (is_integral_v<_Tp>)
1097		  _mm256_mask_storeu_epi32(__mem, __k, __vi);
1098		else
1099		  _mm256_mask_storeu_ps(__mem, __k, __vi);
1100	      }
1101	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1102	      {
1103		if constexpr (is_integral_v<_Tp>)
1104		  _mm256_mask_storeu_epi64(__mem, __k, __vi);
1105		else
1106		  _mm256_mask_storeu_pd(__mem, __k, __vi);
1107	      }
1108	    else if constexpr (__have_avx512f
1109			       && (sizeof(_Tp) >= 4 || __have_avx512bw))
1110	      {
1111		// use a 512-bit maskstore, using zero-extension of the bitmask
1112		_S_masked_store_nocvt(
1113		  _SimdWrapper64<_Tp>(
1114		    __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
1115		  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1116	      }
1117	    else
1118	      _S_masked_store_nocvt(__v, __mem,
1119				    _MaskImpl::template _S_to_maskvector<
1120				      __int_for_sizeof_t<_Tp>, _Np>(__k));
1121	  }
1122	else if constexpr (sizeof(__vi) == 16)
1123	  {
1124	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1125	      _mm_mask_storeu_epi8(__mem, __k, __vi);
1126	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1127	      _mm_mask_storeu_epi16(__mem, __k, __vi);
1128	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1129	      {
1130		if constexpr (is_integral_v<_Tp>)
1131		  _mm_mask_storeu_epi32(__mem, __k, __vi);
1132		else
1133		  _mm_mask_storeu_ps(__mem, __k, __vi);
1134	      }
1135	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1136	      {
1137		if constexpr (is_integral_v<_Tp>)
1138		  _mm_mask_storeu_epi64(__mem, __k, __vi);
1139		else
1140		  _mm_mask_storeu_pd(__mem, __k, __vi);
1141	      }
1142	    else if constexpr (__have_avx512f
1143			       && (sizeof(_Tp) >= 4 || __have_avx512bw))
1144	      {
1145		// use a 512-bit maskstore, using zero-extension of the bitmask
1146		_S_masked_store_nocvt(
1147		  _SimdWrapper64<_Tp>(
1148		    __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
1149		  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1150	      }
1151	    else
1152	      _S_masked_store_nocvt(__v, __mem,
1153				    _MaskImpl::template _S_to_maskvector<
1154				      __int_for_sizeof_t<_Tp>, _Np>(__k));
1155	  }
1156	else
1157	  __assert_unreachable<_Tp>();
1158      }
1159
1160    template <typename _Tp, size_t _Np>
1161      _GLIBCXX_SIMD_INTRINSIC static void
1162      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1163			    _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
1164      {
1165	if constexpr (sizeof(__v) <= 16)
1166	  {
1167	    [[maybe_unused]] const auto __vi
1168	      = __intrin_bitcast<__m128i>(__as_vector(__v));
1169	    [[maybe_unused]] const auto __ki
1170	      = __intrin_bitcast<__m128i>(__as_vector(__k));
1171	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1172	      _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
1173	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1174	      _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
1175	    else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1176			       && is_integral_v<_Tp>)
1177	      _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1178	    else if constexpr (__have_avx && sizeof(_Tp) == 4)
1179	      _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1180			       __vector_bitcast<float>(__vi));
1181	    else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1182			       && is_integral_v<_Tp>)
1183	      _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
1184	    else if constexpr (__have_avx && sizeof(_Tp) == 8)
1185	      _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1186			       __vector_bitcast<double>(__vi));
1187	    else if constexpr (__have_sse2)
1188	      _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<char*>(__mem));
1189	  }
1190	else if constexpr (sizeof(__v) == 32)
1191	  {
1192	    [[maybe_unused]] const auto __vi
1193	      = __intrin_bitcast<__m256i>(__as_vector(__v));
1194	    [[maybe_unused]] const auto __ki
1195	      = __intrin_bitcast<__m256i>(__as_vector(__k));
1196	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1197	      _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
1198	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1199	      _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
1200	    else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1201			       && is_integral_v<_Tp>)
1202	      _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1203	    else if constexpr (sizeof(_Tp) == 4)
1204	      _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1205				  __vector_bitcast<float>(__v));
1206	    else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1207			       && is_integral_v<_Tp>)
1208	      _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
1209				     __vi);
1210	    else if constexpr (__have_avx && sizeof(_Tp) == 8)
1211	      _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1212				  __vector_bitcast<double>(__v));
1213	    else if constexpr (__have_sse2)
1214	      {
1215		_mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
1216				    reinterpret_cast<char*>(__mem));
1217		_mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
1218				    reinterpret_cast<char*>(__mem) + 16);
1219	      }
1220	  }
1221	else
1222	  __assert_unreachable<_Tp>();
1223      }
1224
1225    // }}}
1226    // _S_masked_store {{{
1227    template <typename _Tp, size_t _Np, typename _Up>
1228      _GLIBCXX_SIMD_INTRINSIC static void
1229      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
1230		      const _MaskMember<_Tp> __k) noexcept
1231      {
	if constexpr (is_integral_v<_Tp> && is_integral_v<_Up>
		      && sizeof(_Tp) > sizeof(_Up) && __have_avx512f
		      && (sizeof(_Tp) >= 4 || __have_avx512bw)
		      && (sizeof(__v) == 64 || __have_avx512vl))
1236	  { // truncating store
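	    // E.g. a vector of int64 stored to an int32 array uses the masked
	    // memory form of vpmovqd (_mm*_mask_cvtepi64_storeu_epi32).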
1237	    const auto __vi = __to_intrin(__v);
1238	    const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1239	    if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1240			  && sizeof(__vi) == 64)
1241	      _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1242	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1243			       && sizeof(__vi) == 32)
1244	      _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1245	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1246			       && sizeof(__vi) == 16)
1247	      _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1248	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1249			       && sizeof(__vi) == 64)
1250	      _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1251	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1252			       && sizeof(__vi) == 32)
1253	      _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1254	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1255			       && sizeof(__vi) == 16)
1256	      _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1257	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1258			       && sizeof(__vi) == 64)
1259	      _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1260	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1261			       && sizeof(__vi) == 32)
1262	      _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1263	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1264			       && sizeof(__vi) == 16)
1265	      _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1266	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1267			       && sizeof(__vi) == 64)
1268	      _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1269	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1270			       && sizeof(__vi) == 32)
1271	      _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1272	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1273			       && sizeof(__vi) == 16)
1274	      _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1275	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1276			       && sizeof(__vi) == 64)
1277	      _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1278	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1279			       && sizeof(__vi) == 32)
1280	      _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1281	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1282			       && sizeof(__vi) == 16)
1283	      _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1284	    else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1285			       && sizeof(__vi) == 64)
1286	      _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1287	    else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1288			       && sizeof(__vi) == 32)
1289	      _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1290	    else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1291			       && sizeof(__vi) == 16)
1292	      _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1293	    else
1294	      __assert_unreachable<_Tp>();
1295	  }
1296	else
1297	  _Base::_S_masked_store(__v, __mem, __k);
1298      }
1299
1300    // }}}
1301    // _S_multiplies {{{
1302    template <typename _V, typename _VVT = _VectorTraits<_V>>
1303      _GLIBCXX_SIMD_INTRINSIC static constexpr _V _S_multiplies(_V __x, _V __y)
1304      {
1305	using _Tp = typename _VVT::value_type;
1306	if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
1307	    || __y._M_is_constprop())
1308	  return __as_vector(__x) * __as_vector(__y);
1309	else if constexpr (sizeof(_Tp) == 1)
1310	  {
1311	    if constexpr (sizeof(_V) == 2)
1312	      {
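		// One scalar multiplication yields both byte products: the
		// low-byte product stays in bits 0..7 (cross terms are masked
		// off), the high-byte product lands in bits 8..15 via the
		// shift/mask, and anything above bit 15 is dropped by the
		// conversion to short.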
1313		const auto __xs = reinterpret_cast<short>(__x._M_data);
1314		const auto __ys = reinterpret_cast<short>(__y._M_data);
1315		return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
1316		  ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
1317	      }
1318	    else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
1319	      {
1320		const auto __xi = reinterpret_cast<int>(__x._M_data);
1321		const auto __yi = reinterpret_cast<int>(__y._M_data);
1322		return reinterpret_cast<__vector_type_t<_Tp, 3>>(
1323		  ((__xi * __yi) & 0xff)
1324		  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1325		  | ((__xi >> 16) * (__yi & 0xff0000)));
1326	      }
1327	    else if constexpr (sizeof(_V) == 4)
1328	      {
1329		const auto __xi = reinterpret_cast<int>(__x._M_data);
1330		const auto __yi = reinterpret_cast<int>(__y._M_data);
1331		return reinterpret_cast<__vector_type_t<_Tp, 4>>(
1332		  ((__xi * __yi) & 0xff)
1333		  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1334		  | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
1335		  | ((__xi >> 24) * (__yi & 0xff000000u)));
1336	      }
1337	    else if constexpr (sizeof(_V) == 8 && __have_avx2
1338			       && is_signed_v<_Tp>)
1339	      return __convert<typename _VVT::type>(
1340		__vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
1341		* __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
1342	    else if constexpr (sizeof(_V) == 8 && __have_avx2
1343			       && is_unsigned_v<_Tp>)
1344	      return __convert<typename _VVT::type>(
1345		__vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
1346		* __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
1347	    else
1348	      {
1349		// codegen of `x*y` is suboptimal (as of GCC 9.0.1)
1350		constexpr size_t __full_size = _VVT::_S_full_size;
1351		constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
1352		using _ShortW = _SimdWrapper<short, _Np>;
1353		const _ShortW __even = __vector_bitcast<short, _Np>(__x)
1354				       * __vector_bitcast<short, _Np>(__y);
1355		_ShortW __high_byte = _ShortW()._M_data - 256;
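		// 0 - 256 == 0xff00 in every 16-bit element: a mask selecting
		// the high byte of each short.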
1356		//[&]() { asm("" : "+x"(__high_byte._M_data)); }();
1357		const _ShortW __odd
1358		  = (__vector_bitcast<short, _Np>(__x) >> 8)
1359		    * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
1360		if constexpr (__have_avx512bw && sizeof(_V) > 2)
1361		  return _CommonImplX86::_S_blend_avx512(
1362		    0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
1363		    __vector_bitcast<_Tp>(__odd));
1364		else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
1365		  return _CommonImplX86::_S_blend_intrin(__to_intrin(
1366							   __high_byte),
1367							 __to_intrin(__even),
1368							 __to_intrin(__odd));
1369		else
1370		  return __to_intrin(
1371		    __or(__andnot(__high_byte, __even), __odd));
1372	      }
1373	  }
1374	else
1375	  return _Base::_S_multiplies(__x, __y);
1376      }
1377
1378    // }}}
1379    // _S_divides {{{
1380#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
1381    template <typename _Tp, size_t _Np>
1382      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1383      _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1384      {
1385	if (!__builtin_is_constant_evaluated()
1386	    && !__builtin_constant_p(__y._M_data))
1387	  if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
1388	    { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1)
1389	      // Note that using floating-point division is likely to raise the
1390	      // *Inexact* exception flag and thus appears like an invalid
1391	      // "as-if" transformation. However, C++ doesn't specify how the
1392	      // fpenv can be observed and points to C. C says that function
1393	      // calls are assumed to potentially raise fp exceptions, unless
1394	      // documented otherwise. Consequently, operator/, which is a
1395	      // function call, may raise fp exceptions.
1396	      /*const struct _CsrGuard
1397	      {
1398		const unsigned _M_data = _mm_getcsr();
1399		_CsrGuard()
1400		{
		  _mm_setcsr(0x9f80); // turn off FP exceptions and flush-to-zero
1403		}
1404		~_CsrGuard() { _mm_setcsr(_M_data); }
1405	      } __csr;*/
1406	      using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
1407	      constexpr size_t __n_intermediate
1408		= std::min(_Np, (__have_avx512f ? 64
1409				 : __have_avx   ? 32
1410						: 16)
1411				  / sizeof(_Float));
1412	      using _FloatV = __vector_type_t<_Float, __n_intermediate>;
1413	      constexpr size_t __n_floatv
1414		= __div_roundup(_Np, __n_intermediate);
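	      // E.g. _Tp = int, _Np = 16 with AVX-512F: _Float = double,
	      // __n_intermediate = 8 and __n_floatv = 2, i.e. two divisions
	      // of 8 doubles each.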
1415	      using _R = __vector_type_t<_Tp, _Np>;
1416	      const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
1417	      const auto __yf = __convert_all<_FloatV, __n_floatv>(
1418		_Abi::__make_padding_nonzero(__as_vector(__y)));
1419	      return __call_with_n_evaluations<__n_floatv>(
1420		[](auto... __quotients) {
1421		  return __vector_convert<_R>(__quotients...);
1422		},
1423		[&__xf,
1424		 &__yf](auto __i) -> _SimdWrapper<_Float, __n_intermediate> {
1425#if __RECIPROCAL_MATH__
1426		  // If -freciprocal-math is active, using the `/` operator is
1427		  // incorrect because it may be translated to an imprecise
1428		  // multiplication with reciprocal. We need to use inline
1429		  // assembly to force a real division.
1430		  _FloatV __r;
1431		  if constexpr (__have_avx) // -mno-sse2avx is irrelevant
1432					    // because once -mavx is given, GCC
1433					    // emits VEX encoded vdivp[sd]
1434		    {
1435		      if constexpr (sizeof(_Tp) == 4)
1436			asm("vdivpd\t{%2, %1, %0|%0, %1, %2}"
1437			    : "=x"(__r)
1438			    : "x"(__xf[__i]), "x"(__yf[__i]));
1439		      else
1440			asm("vdivps\t{%2, %1, %0|%0, %1, %2}"
1441			    : "=x"(__r)
1442			    : "x"(__xf[__i]), "x"(__yf[__i]));
1443		    }
1444		  else
1445		    {
1446		      __r = __xf[__i];
1447		      if constexpr (sizeof(_Tp) == 4)
1448			asm("divpd\t{%1, %0|%0, %1}"
1449			    : "=x"(__r)
1450			    : "x"(__yf[__i]));
1451		      else
1452			asm("divps\t{%1, %0|%0, %1}"
1453			    : "=x"(__r)
1454			    : "x"(__yf[__i]));
1455		    }
1456		  return __r;
1457#else
1458		  return __xf[__i] / __yf[__i];
1459#endif
1460		});
1461	    }
1462	/* 64-bit int division is potentially optimizable via double division if
1463	 * the value in __x is small enough and the conversion between
1464	 * int<->double is efficient enough:
1465	else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
1466			   sizeof(_Tp) == 8)
1467	  {
1468	    if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1469	      {
1470		if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull,
1471						    0xffe0'0000'0000'0000ull}))
1472		  {
1473		    __x._M_data | 0x __vector_convert<__m128d>(__x._M_data)
1474		  }
1475	      }
1476	  }
1477	  */
1478	return _Base::_S_divides(__x, __y);
1479      }
1480#endif // _GLIBCXX_SIMD_WORKAROUND_PR90993
1481
1482    // }}}
1483    // _S_modulus {{{
1484    template <typename _Tp, size_t _Np>
1485      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1486      _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1487      {
1488	if (__builtin_is_constant_evaluated()
1489	    || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8)
1490	  return _Base::_S_modulus(__x, __y);
1491	else
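	  // a % b == a - (a / b) * b, reusing the optimized division above;
	  // e.g. 7 % 3 = 7 - 2 * 3 = 1.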
1492	  return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
1493      }
1494
1495    // }}}
1496    // _S_bit_shift_left {{{
1497    // Notes on UB. C++2a [expr.shift] says:
1498    // -1- [...] The operands shall be of integral or unscoped enumeration type
1499    //     and integral promotions are performed. The type of the result is that
1500    //     of the promoted left operand. The behavior is undefined if the right
1501    //     operand is negative, or greater than or equal to the width of the
1502    //     promoted left operand.
    // -2- The value of E1 << E2 is the unique value congruent to E1 × 2^E2
    //     modulo 2^N, where N is the width of the type of the result.
1505    //
1506    // C++17 [expr.shift] says:
1507    // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated
1508    //     bits are zero-filled. If E1 has an unsigned type, the value of the
    //     result is E1 × 2^E2, reduced modulo one more than the maximum value
1510    //     representable in the result type. Otherwise, if E1 has a signed type
    //     and non-negative value, and E1 × 2^E2 is representable in the
1512    //     corresponding unsigned type of the result type, then that value,
1513    //     converted to the result type, is the resulting value; otherwise, the
1514    //     behavior is undefined.
1515    //
1516    // Consequences:
1517    // With C++2a signed and unsigned types have the same UB
1518    // characteristics:
1519    // - left shift is not UB for 0 <= RHS < max(32, #bits(T))
1520    //
1521    // With C++17 there's little room for optimizations because the standard
1522    // requires all shifts to happen on promoted integrals (i.e. int). Thus,
1523    // short and char shifts must assume shifts affect bits of neighboring
1524    // values.
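    // Example: for simd<unsigned char>, x << 10 is well-defined under int
    // promotion and the element ends up 0 after conversion back to unsigned
    // char; the vector code below therefore must handle shift counts >= 8
    // (returning 0) and must not let shifted bits leak into neighboring
    // elements.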
1525  #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1526    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1527      constexpr inline _GLIBCXX_CONST static typename _TVT::type
1528      _S_bit_shift_left(_Tp __xx, int __y)
1529      {
1530	using _V = typename _TVT::type;
1531	using _Up = typename _TVT::value_type;
1532	_V __x = __xx;
1533	[[maybe_unused]] const auto __ix = __to_intrin(__x);
1534	if (__builtin_is_constant_evaluated())
1535	  return __x << __y;
1536#if __cplusplus > 201703
1537	// after C++17, signed shifts have no UB, and behave just like unsigned
1538	// shifts
1539	else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>)
1540	  return __vector_bitcast<_Up>(
1541	    _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1542			      __y));
1543#endif
1544	else if constexpr (sizeof(_Up) == 1)
1545	  {
1546	    // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894)
1547	    if (__builtin_constant_p(__y))
1548	      {
1549		if (__y == 0)
1550		  return __x;
1551		else if (__y == 1)
1552		  return __x + __x;
1553		else if (__y == 2)
1554		  {
1555		    __x = __x + __x;
1556		    return __x + __x;
1557		  }
1558		else if (__y > 2 && __y < 8)
1559		  {
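		    // For 2 < __y < 8: shift in 32-bit lanes (or as a single
		    // integer for vectors no larger than unsigned) and clear
		    // the bits that crossed a byte boundary. E.g. __y == 3
		    // masks each byte with 0xff << 3 == 0xf8.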
1560		    if constexpr (sizeof(__x) > sizeof(unsigned))
1561		      {
1562			const _UChar __mask = 0xff << __y; // precomputed vector
1563			return __vector_bitcast<_Up>(
1564			  __vector_bitcast<_UChar>(
1565			    __vector_bitcast<unsigned>(__x) << __y)
1566			  & __mask);
1567		      }
1568		    else
1569		      {
1570			const unsigned __mask
1571			  = (0xff & (0xff << __y)) * 0x01010101u;
1572			return reinterpret_cast<_V>(
1573			  static_cast<__int_for_sizeof_t<_V>>(
1574			    unsigned(
1575			      reinterpret_cast<__int_for_sizeof_t<_V>>(__x)
1576			      << __y)
1577			    & __mask));
1578		      }
1579		  }
1580		else if (__y >= 8 && __y < 32)
1581		  return _V();
1582		else
1583		  __builtin_unreachable();
1584	      }
1585	    // general strategy in the following: use an sllv instead of sll
1586	    // instruction, because it's 2 to 4 times faster:
1587	    else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16)
1588	      return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8(
1589		_mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix),
1590				  _mm256_set1_epi16(__y))));
1591	    else if constexpr (__have_avx512bw && sizeof(__x) == 32)
1592	      return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1593		_mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix),
1594				  _mm512_set1_epi16(__y))));
1595	    else if constexpr (__have_avx512bw && sizeof(__x) == 64)
1596	      {
1597		const auto __shift = _mm512_set1_epi16(__y);
1598		return __vector_bitcast<_Up>(
1599		  __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1600			     _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)),
1601			   _mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1602			     _mm512_cvtepi8_epi16(__hi256(__ix)), __shift))));
1603	      }
1604	    else if constexpr (__have_avx2 && sizeof(__x) == 32)
1605	      {
1606#if 1
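		// Build the per-byte mask 0xff << __y without a byte shift:
		// start from 0xff00 in every 16-bit lane, shift it left by
		// __y, then OR in the same value shifted right by 8 so both
		// bytes of the lane carry the mask.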
1607		const auto __shift = _mm_cvtsi32_si128(__y);
1608		auto __k
1609		  = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift);
1610		__k |= _mm256_srli_epi16(__k, 8);
1611		return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift)
1612					     & __k);
1613#else
1614		const _Up __k = 0xff << __y;
1615		return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y)
1616		       & __k;
1617#endif
1618	      }
1619	    else
1620	      {
1621		const auto __shift = _mm_cvtsi32_si128(__y);
1622		auto __k
1623		  = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift);
1624		__k |= _mm_srli_epi16(__k, 8);
1625		return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k);
1626	      }
1627	  }
1628	return __x << __y;
1629      }
1630
1631    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1632      constexpr inline _GLIBCXX_CONST static typename _TVT::type
1633      _S_bit_shift_left(_Tp __xx, typename _TVT::type __y)
1634      {
1635	using _V = typename _TVT::type;
1636	using _Up = typename _TVT::value_type;
1637	_V __x = __xx;
1638	[[maybe_unused]] const auto __ix = __to_intrin(__x);
1639	[[maybe_unused]] const auto __iy = __to_intrin(__y);
1640	if (__builtin_is_constant_evaluated())
1641	  return __x << __y;
1642#if __cplusplus > 201703
1643	// after C++17, signed shifts have no UB, and behave just like unsigned
1644	// shifts
1645	else if constexpr (is_signed_v<_Up>)
1646	  return __vector_bitcast<_Up>(
1647	    _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1648			      __vector_bitcast<make_unsigned_t<_Up>>(__y)));
1649#endif
1650	else if constexpr (sizeof(_Up) == 1)
1651	  {
1652	    if constexpr (sizeof __ix == 64 && __have_avx512bw)
1653	      return __vector_bitcast<_Up>(__concat(
1654		_mm512_cvtepi16_epi8(
1655		  _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)),
1656				    _mm512_cvtepu8_epi16(__lo256(__iy)))),
1657		_mm512_cvtepi16_epi8(
1658		  _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)),
1659				    _mm512_cvtepu8_epi16(__hi256(__iy))))));
1660	    else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1661	      return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1662		_mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix),
1663				  _mm512_cvtepu8_epi16(__iy))));
1664	    else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl)
1665	      return __intrin_bitcast<_V>(
1666		_mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix),
1667						 _mm_cvtepu8_epi16(__iy))));
1668	    else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1669	      return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1670		_mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix),
1671				  _mm256_cvtepu8_epi16(__iy))));
1672	    else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1673	      return __intrin_bitcast<_V>(
1674		__lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1675		  _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)),
1676		  _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy))))));
1677	    else if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1678	      {
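		// Variable per-byte shift without a dedicated instruction:
		// conditionally shift by 4, 2 and 1 depending on bits 2..0 of
		// __y. __vector_bitcast<short>(__y) << 5 moves bit 2 of each
		// byte into that byte's sign bit, which is what the byte
		// blend tests; __mask += __mask advances to the next lower
		// bit. Counts of 8 or more zero the element at the end.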
1679		auto __mask
1680		  = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5);
1681		auto __x4
1682		  = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1683		__x4 &= char(0xf0);
1684		__x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1685		  __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4)));
1686		__mask += __mask;
1687		auto __x2
1688		  = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1689		__x2 &= char(0xfc);
1690		__x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1691		  __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2)));
1692		__mask += __mask;
1693		auto __x1 = __x + __x;
1694		__x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1695		  __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1)));
1696		return __x
1697		       & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1698	      }
1699	    else if constexpr (sizeof(__x) == 16)
1700	      {
1701		auto __mask
1702		  = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5);
1703		auto __x4
1704		  = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1705		__x4 &= char(0xf0);
1706		__x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x;
1707		__mask += __mask;
1708		auto __x2
1709		  = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1710		__x2 &= char(0xfc);
1711		__x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x;
1712		__mask += __mask;
1713		auto __x1 = __x + __x;
1714		__x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x;
1715		return __x
1716		       & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1717	      }
1718	    else
1719	      return __x << __y;
1720	  }
1721	else if constexpr (sizeof(_Up) == 2)
1722	  {
1723	    if constexpr (sizeof __ix == 64 && __have_avx512bw)
1724	      return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy));
1725	    else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl)
1726	      return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy));
1727	    else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1728	      return __vector_bitcast<_Up>(
1729		__lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix),
1730					  _mm512_castsi256_si512(__iy))));
1731	    else if constexpr (sizeof __ix == 32 && __have_avx2)
1732	      {
1733		const auto __ux = __vector_bitcast<unsigned>(__x);
1734		const auto __uy = __vector_bitcast<unsigned>(__y);
1735		return __vector_bitcast<_Up>(_mm256_blend_epi16(
1736		  __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1737		  __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1738	      }
1739	    else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1740	      return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy));
1741	    else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1742	      return __intrin_bitcast<_V>(
1743		__lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix),
1744					  _mm512_castsi128_si512(__iy))));
1745	    else if constexpr (sizeof __ix == 16 && __have_avx2)
1746	      {
1747		const auto __ux = __vector_bitcast<unsigned>(__ix);
1748		const auto __uy = __vector_bitcast<unsigned>(__iy);
1749		return __intrin_bitcast<_V>(_mm_blend_epi16(
1750		  __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1751		  __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1752	      }
1753	    else if constexpr (sizeof __ix == 16)
1754	      {
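		// SSE2 fallback: compute 2^__y per 16-bit element through the
		// float exponent field and multiply. Adding 0x3f8 >> 3 == 127
		// and shifting by 23 places the biased exponent 127 + __y in
		// the float, so the reinterpreted value is exactly 2^__y; the
		// low and high 16-bit halves of each 32-bit lane are converted
		// separately and recombined.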
1755		using _Float4 = __vector_type_t<float, 4>;
1756		using _Int4 = __vector_type_t<int, 4>;
1757		using _UInt4 = __vector_type_t<unsigned, 4>;
1758		const _UInt4 __yu
1759		  = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3)));
1760		return __x
1761		       * __intrin_bitcast<_V>(
1762			 __vector_convert<_Int4>(_SimdWrapper<float, 4>(
1763			   reinterpret_cast<_Float4>(__yu << 23)))
1764			 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>(
1765			      reinterpret_cast<_Float4>((__yu >> 16) << 23)))
1766			    << 16));
1767	      }
1768	    else
1769	      __assert_unreachable<_Tp>();
1770	  }
1771	else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16
1772			   && !__have_avx2)
	  // latency is suboptimal, but throughput still gets the full SIMD speedup
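	  // (same 2^__y-via-float-exponent trick as for 16-bit elements:
	  //  (__y << 23) + 0x3f80'0000 is the bit pattern of 2^__y, which is
	  //  converted to int and multiplied in)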
1774	  return __intrin_bitcast<_V>(
1775	    __vector_bitcast<unsigned>(__ix)
1776	    * __vector_convert<__vector_type16_t<int>>(
1777	      _SimdWrapper<float, 4>(__vector_bitcast<float>(
1778		(__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000))));
1779	else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16
1780			   && !__have_avx2)
1781	  {
1782	    const auto __lo = _mm_sll_epi64(__ix, __iy);
1783	    const auto __hi
1784	      = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy));
1785	    if constexpr (__have_sse4_1)
1786	      return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0));
1787	    else
1788	      return __vector_bitcast<_Up>(
1789		_mm_move_sd(__vector_bitcast<double>(__hi),
1790			    __vector_bitcast<double>(__lo)));
1791	  }
1792	else
1793	  return __x << __y;
1794      }
1795#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
1796
1797    // }}}
1798    // _S_bit_shift_right {{{
1799#ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1800    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1801      constexpr inline _GLIBCXX_CONST static typename _TVT::type
1802      _S_bit_shift_right(_Tp __xx, int __y)
1803      {
1804	using _V = typename _TVT::type;
1805	using _Up = typename _TVT::value_type;
1806	_V __x = __xx;
1807	[[maybe_unused]] const auto __ix = __to_intrin(__x);
1808	if (__builtin_is_constant_evaluated())
1809	  return __x >> __y;
	else if (__builtin_constant_p(__y) && is_unsigned_v<_Up>
		 && __y >= int(sizeof(_Up) * __CHAR_BIT__))
1813	  return _V();
1814	else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{
1815	  return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y)
1816		 & _Up(0xff >> __y);
1817	//}}}
1818	else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{
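	  // Arithmetic per-byte shift via 16-bit shifts: the odd (high) bytes
	  // are shifted as shorts by __y + 8 and moved back up; the even
	  // (low) bytes are first moved into the high byte so their sign bit
	  // is in place, shifted by __y, and moved back down.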
1819	  return __intrin_bitcast<_V>(
1820	    (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix)
1821				       >> (__y + 8))
1822	     << 8)
1823	    | (__vector_bitcast<_UShort>(
1824		 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8)
1825		 >> __y)
1826	       >> 8));
1827	//}}}
1828	// GCC optimizes sizeof == 2, 4, and unsigned 8 as expected
1829	else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{
1830	  {
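	    // SSE2/AVX2 have no 64-bit arithmetic right shift (vpsraq is
	    // AVX-512 only), so it is composed from 32-bit arithmetic shifts
	    // and a 64-bit logical shift such that the vacated high bits are
	    // filled with copies of the sign bit.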
1831	    if (__y > 32)
1832	      return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32)
1833		      & _Up(0xffff'ffff'0000'0000ull))
1834		     | __vector_bitcast<_Up>(
1835		       __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix)
1836					     >> 32)
1837		       >> (__y - 32));
1838	    else
1839	      return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix)
1840					  >> __y)
1841		     | __vector_bitcast<_Up>(
1842		       __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll)
1843		       >> __y);
1844	  }
1845	//}}}
1846	else
1847	  return __x >> __y;
1848      }
1849
1850    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1851      constexpr inline _GLIBCXX_CONST static typename _TVT::type
1852      _S_bit_shift_right(_Tp __xx, typename _TVT::type __y)
1853      {
1854	using _V = typename _TVT::type;
1855	using _Up = typename _TVT::value_type;
1856	_V __x = __xx;
1857	[[maybe_unused]] const auto __ix = __to_intrin(__x);
1858	[[maybe_unused]] const auto __iy = __to_intrin(__y);
1859	if (__builtin_is_constant_evaluated()
1860	    || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
1861	  return __x >> __y;
1862	else if constexpr (sizeof(_Up) == 1) //{{{
1863	  {
1864	    if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl)
1865	      return __intrin_bitcast<_V>(_mm_cvtepi16_epi8(
1866		is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix),
1867						  _mm_cvtepi8_epi16(__iy))
1868				 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix),
1869						  _mm_cvtepu8_epi16(__iy))));
	    else if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl)
1871	      return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1872		is_signed_v<_Up>
1873		  ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix),
1874				      _mm256_cvtepi8_epi16(__iy))
1875		  : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix),
1876				      _mm256_cvtepu8_epi16(__iy))));
1877	    else if constexpr (sizeof(__x) == 32 && __have_avx512bw)
1878	      return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1879		is_signed_v<_Up>
1880		  ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix),
1881				      _mm512_cvtepi8_epi16(__iy))
1882		  : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix),
1883				      _mm512_cvtepu8_epi16(__iy))));
1884	    else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>)
1885	      return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1886		_mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1887		0x5555'5555'5555'5555ull,
1888		_mm512_srav_epi16(
1889		  _mm512_slli_epi16(__ix, 8),
1890		  _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy,
1891					_mm512_set1_epi16(8)))));
1892	    else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>)
1893	      return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1894		_mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1895		0x5555'5555'5555'5555ull,
1896		_mm512_srlv_epi16(
1897		  _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix),
1898		  _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy))));
1899	    /* This has better throughput but higher latency than the impl below
1900	    else if constexpr (__have_avx2 && sizeof(__x) == 16 &&
1901			       is_unsigned_v<_Up>)
1902	      {
1903		const auto __shorts = __to_intrin(_S_bit_shift_right(
1904		  __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)),
1905		  __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy))));
1906		return __vector_bitcast<_Up>(
1907		  _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts)));
1908	      }
1909	      */
1910	    else if constexpr (__have_avx2 && sizeof(__x) > 8)
1911	      // the following uses vpsr[al]vd, which requires AVX2
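	      // Each byte is moved into the top byte of its 32-bit lane,
	      // shifted there with its own count taken from the matching byte
	      // of __y, keeping only the resulting top byte; the four partial
	      // results r3..r0 are then shifted back into place and ORed.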
1912	      if constexpr (is_signed_v<_Up>)
1913		{
1914		  const auto r3 = __vector_bitcast<_UInt>(
1915				    (__vector_bitcast<int>(__x)
1916				     >> (__vector_bitcast<_UInt>(__y) >> 24)))
1917				  & 0xff000000u;
1918		  const auto r2
1919		    = __vector_bitcast<_UInt>(
1920			((__vector_bitcast<int>(__x) << 8)
1921			 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)))
1922		      & 0xff000000u;
1923		  const auto r1
1924		    = __vector_bitcast<_UInt>(
1925			((__vector_bitcast<int>(__x) << 16)
1926			 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)))
1927		      & 0xff000000u;
1928		  const auto r0 = __vector_bitcast<_UInt>(
1929		    (__vector_bitcast<int>(__x) << 24)
1930		    >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24));
1931		  return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1932					       | (r0 >> 24));
1933		}
1934	      else
1935		{
1936		  const auto r3 = (__vector_bitcast<_UInt>(__x)
1937				   >> (__vector_bitcast<_UInt>(__y) >> 24))
1938				  & 0xff000000u;
1939		  const auto r2
1940		    = ((__vector_bitcast<_UInt>(__x) << 8)
1941		       >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))
1942		      & 0xff000000u;
1943		  const auto r1
1944		    = ((__vector_bitcast<_UInt>(__x) << 16)
1945		       >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))
1946		      & 0xff000000u;
1947		  const auto r0
1948		    = (__vector_bitcast<_UInt>(__x) << 24)
1949		      >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24);
1950		  return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1951					       | (r0 >> 24));
1952		}
1953	    else if constexpr (__have_sse4_1
1954			       && is_unsigned_v<_Up> && sizeof(__x) > 2)
1955	      {
1956		auto __x128 = __vector_bitcast<_Up>(__ix);
1957		auto __mask
1958		  = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5);
1959		auto __x4 = __vector_bitcast<_Up>(
1960		  (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f));
1961		__x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1962		  __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4)));
1963		__mask += __mask;
1964		auto __x2 = __vector_bitcast<_Up>(
1965		  (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f));
1966		__x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1967		  __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2)));
1968		__mask += __mask;
1969		auto __x1 = __vector_bitcast<_Up>(
1970		  (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f));
1971		__x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1972		  __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1)));
1973		return __intrin_bitcast<_V>(
1974		  __x128
1975		  & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
1976		     == 0)); // y > 7 nulls the result
1977	      }
1978	    else if constexpr (__have_sse4_1
1979			       && is_signed_v<_Up> && sizeof(__x) > 2)
1980	      {
1981		auto __mask = __vector_bitcast<_UChar>(
1982		  __vector_bitcast<_UShort>(__iy) << 5);
1983		auto __maskl = [&]() {
1984		  return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
1985		};
1986		auto __xh = __vector_bitcast<short>(__ix);
1987		auto __xl = __vector_bitcast<short>(__ix) << 8;
1988		auto __xh4 = __xh >> 4;
1989		auto __xl4 = __xl >> 4;
1990		__xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
1991		  __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4)));
1992		__xl = __vector_bitcast<short>(
1993		  _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
1994						  __to_intrin(__xl4)));
1995		__mask += __mask;
1996		auto __xh2 = __xh >> 2;
1997		auto __xl2 = __xl >> 2;
1998		__xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
1999		  __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2)));
2000		__xl = __vector_bitcast<short>(
2001		  _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2002						  __to_intrin(__xl2)));
2003		__mask += __mask;
2004		auto __xh1 = __xh >> 1;
2005		auto __xl1 = __xl >> 1;
2006		__xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2007		  __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1)));
2008		__xl = __vector_bitcast<short>(
2009		  _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2010						  __to_intrin(__xl1)));
2011		return __intrin_bitcast<_V>(
2012		  (__vector_bitcast<_Up>((__xh & short(0xff00)))
2013		   | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2014					   >> 8))
2015		  & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2016		     == 0)); // y > 7 nulls the result
2017	      }
2018	    else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2
2019	      {
2020		auto __mask
2021		  = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5);
2022		auto __x4 = __vector_bitcast<_Up>(
2023		  (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f));
2024		__x = __mask > 0x7f ? __x4 : __x;
2025		__mask += __mask;
2026		auto __x2 = __vector_bitcast<_Up>(
2027		  (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f));
2028		__x = __mask > 0x7f ? __x2 : __x;
2029		__mask += __mask;
2030		auto __x1 = __vector_bitcast<_Up>(
2031		  (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f));
2032		__x = __mask > 0x7f ? __x1 : __x;
2033		return __x
2034		       & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2035	      }
2036	    else if constexpr (sizeof(__x) > 2) // signed SSE2
2037	      {
2038		static_assert(is_signed_v<_Up>);
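		// Split each 16-bit lane: __xh handles the odd (high) bytes
		// with 16-bit arithmetic shifts directly; __xl moves the even
		// (low) bytes into the high byte first so their sign bit is
		// in place. __maskh/__maskl walk through bits 2..0 of the
		// respective shift counts, as in the unsigned variant above.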
2039		auto __maskh = __vector_bitcast<_UShort>(__y) << 5;
2040		auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8);
2041		auto __xh = __vector_bitcast<short>(__x);
2042		auto __xl = __vector_bitcast<short>(__x) << 8;
2043		auto __xh4 = __xh >> 4;
2044		auto __xl4 = __xl >> 4;
2045		__xh = __maskh > 0x7fff ? __xh4 : __xh;
2046		__xl = __maskl > 0x7fff ? __xl4 : __xl;
2047		__maskh += __maskh;
2048		__maskl += __maskl;
2049		auto __xh2 = __xh >> 2;
2050		auto __xl2 = __xl >> 2;
2051		__xh = __maskh > 0x7fff ? __xh2 : __xh;
2052		__xl = __maskl > 0x7fff ? __xl2 : __xl;
2053		__maskh += __maskh;
2054		__maskl += __maskl;
2055		auto __xh1 = __xh >> 1;
2056		auto __xl1 = __xl >> 1;
2057		__xh = __maskh > 0x7fff ? __xh1 : __xh;
2058		__xl = __maskl > 0x7fff ? __xl1 : __xl;
2059		__x = __vector_bitcast<_Up>((__xh & short(0xff00)))
2060		      | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2061					      >> 8);
2062		return __x
2063		       & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2064	      }
2065	    else
2066	      return __x >> __y;
2067	  }                                                      //}}}
2068	else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
2069	  {
2070	    [[maybe_unused]] auto __blend_0xaa = [](auto __a, auto __b) {
2071	      if constexpr (sizeof(__a) == 16)
2072		return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2073				       0xaa);
2074	      else if constexpr (sizeof(__a) == 32)
2075		return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2076					  0xaa);
2077	      else if constexpr (sizeof(__a) == 64)
2078		return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
2079					       __to_intrin(__b));
2080	      else
2081		__assert_unreachable<decltype(__a)>();
2082	    };
2083	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
2084	      return __intrin_bitcast<_V>(is_signed_v<_Up>
2085					    ? _mm_srav_epi16(__ix, __iy)
2086					    : _mm_srlv_epi16(__ix, __iy));
2087	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32)
2088	      return __vector_bitcast<_Up>(is_signed_v<_Up>
2089					     ? _mm256_srav_epi16(__ix, __iy)
2090					     : _mm256_srlv_epi16(__ix, __iy));
2091	    else if constexpr (__have_avx512bw && sizeof(_Tp) == 64)
2092	      return __vector_bitcast<_Up>(is_signed_v<_Up>
2093					     ? _mm512_srav_epi16(__ix, __iy)
2094					     : _mm512_srlv_epi16(__ix, __iy));
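	    // Without 16-bit variable shifts, shift the even and odd 16-bit
	    // lanes separately as 32-bit lanes (masking the count or the
	    // value so lanes stay independent) and recombine with a 0xaa
	    // blend.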
2095	    else if constexpr (__have_avx2 && is_signed_v<_Up>)
2096	      return __intrin_bitcast<_V>(
2097		__blend_0xaa(((__vector_bitcast<int>(__ix) << 16)
2098			      >> (__vector_bitcast<int>(__iy) & 0xffffu))
2099			       >> 16,
2100			     __vector_bitcast<int>(__ix)
2101			       >> (__vector_bitcast<int>(__iy) >> 16)));
2102	    else if constexpr (__have_avx2 && is_unsigned_v<_Up>)
2103	      return __intrin_bitcast<_V>(
2104		__blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu)
2105			       >> (__vector_bitcast<_UInt>(__iy) & 0xffffu),
2106			     __vector_bitcast<_UInt>(__ix)
2107			       >> (__vector_bitcast<_UInt>(__iy) >> 16)));
2108	    else if constexpr (__have_sse4_1)
2109	      {
2110		auto __mask = __vector_bitcast<_UShort>(__iy);
2111		auto __x128 = __vector_bitcast<_Up>(__ix);
2112		//__mask *= 0x0808;
2113		__mask = (__mask << 3) | (__mask << 11);
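		// << 3 moves bit 4 of the 16-bit shift count into bit 7 (sign
		// of the low byte), << 11 moves it into bit 15 (sign of the
		// high byte); _mm_blendv_epi8 tests each byte's sign bit, so
		// both bytes of a lane are blended consistently. Each
		// __mask += __mask then selects the next lower bit.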
2114		// do __x128 = 0 where __y[4] is set
2115		__x128 = __vector_bitcast<_Up>(
2116		  _mm_blendv_epi8(__to_intrin(__x128), __m128i(),
2117				  __to_intrin(__mask)));
2118		// do __x128 =>> 8 where __y[3] is set
2119		__x128 = __vector_bitcast<_Up>(
2120		  _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8),
2121				  __to_intrin(__mask += __mask)));
2122		// do __x128 =>> 4 where __y[2] is set
2123		__x128 = __vector_bitcast<_Up>(
2124		  _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4),
2125				  __to_intrin(__mask += __mask)));
2126		// do __x128 =>> 2 where __y[1] is set
2127		__x128 = __vector_bitcast<_Up>(
2128		  _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2),
2129				  __to_intrin(__mask += __mask)));
2130		// do __x128 =>> 1 where __y[0] is set
2131		return __intrin_bitcast<_V>(
2132		  _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1),
2133				  __to_intrin(__mask + __mask)));
2134	      }
2135	    else
2136	      {
2137		auto __k = __vector_bitcast<_UShort>(__iy) << 11;
2138		auto __x128 = __vector_bitcast<_Up>(__ix);
2139		auto __mask = [](__vector_type16_t<_UShort> __kk) {
2140		  return __vector_bitcast<short>(__kk) < 0;
2141		};
2142		// do __x128 = 0 where __y[4] is set
2143		__x128 = __mask(__k) ? decltype(__x128)() : __x128;
2144		// do __x128 =>> 8 where __y[3] is set
2145		__x128 = __mask(__k += __k) ? __x128 >> 8 : __x128;
2146		// do __x128 =>> 4 where __y[2] is set
2147		__x128 = __mask(__k += __k) ? __x128 >> 4 : __x128;
2148		// do __x128 =>> 2 where __y[1] is set
2149		__x128 = __mask(__k += __k) ? __x128 >> 2 : __x128;
2150		// do __x128 =>> 1 where __y[0] is set
2151		return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1
2152							      : __x128);
2153	      }
2154	  }                                                  //}}}
2155	else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{
2156	  {
2157	    if constexpr (is_unsigned_v<_Up>)
2158	      {
2159		// x >> y == x * 2^-y == (x * 2^(31-y)) >> 31
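		// 0x4f00'0000 is the single-precision bit pattern of 2^31;
		// subtracting __y << 23 decrements the exponent, so
		// __factor_f holds 2^(31-__y) per lane. _mm_mul_epu32 only
		// multiplies the even 32-bit lanes, hence the second multiply
		// on the odd lanes brought down by 4 bytes.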
2160		const __m128 __factor_f = reinterpret_cast<__m128>(
2161		  0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23));
2162		const __m128i __factor
2163		  = __builtin_constant_p(__factor_f)
2164		      ? __to_intrin(
2165			__make_vector<unsigned>(__factor_f[0], __factor_f[1],
2166						__factor_f[2], __factor_f[3]))
2167		      : _mm_cvttps_epi32(__factor_f);
2168		const auto __r02
2169		  = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31);
2170		const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4),
2171						 _mm_srli_si128(__factor, 4));
2172		if constexpr (__have_sse4_1)
2173		  return __intrin_bitcast<_V>(
2174		    _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33));
2175		else
2176		  return __intrin_bitcast<_V>(
2177		    __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4));
2178	      }
2179	    else
2180	      {
2181		auto __shift = [](auto __a, auto __b) {
2182		  if constexpr (is_signed_v<_Up>)
2183		    return _mm_sra_epi32(__a, __b);
2184		  else
2185		    return _mm_srl_epi32(__a, __b);
2186		};
2187		const auto __r0
2188		  = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i()));
2189		const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32));
2190		const auto __r2
2191		  = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i()));
2192		const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12));
2193		if constexpr (__have_sse4_1)
2194		  return __intrin_bitcast<_V>(
2195		    _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3),
2196				    _mm_blend_epi16(__r3, __r2, 0x30), 0xf0));
2197		else
2198		  return __intrin_bitcast<_V>(_mm_unpacklo_epi64(
2199		    _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)),
2200		    _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4))));
2201	      }
2202	  } //}}}
2203	else
2204	  return __x >> __y;
2205      }
2206#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
2207
2208    // }}}
2209    // compares {{{
2210    // _S_equal_to {{{
2211    template <typename _Tp, size_t _Np>
2212      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2213      _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2214      {
2215	if constexpr (__is_avx512_abi<_Abi>()) // {{{
2216	  {
2217	    if (__builtin_is_constant_evaluated()
2218		|| (__x._M_is_constprop() && __y._M_is_constprop()))
2219	      return _MaskImpl::_S_to_bits(
2220		__as_wrapper<_Np>(__x._M_data == __y._M_data));
2221
2222	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2223	    [[maybe_unused]] const auto __xi = __to_intrin(__x);
2224	    [[maybe_unused]] const auto __yi = __to_intrin(__y);
2225	    if constexpr (is_floating_point_v<_Tp>)
2226	      {
2227		if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2228		  return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2229		else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2230		  return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2231		else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2232		  return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2233		else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2234		  return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2235		else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2236		  return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2237		else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2238		  return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2239		else
2240		  __assert_unreachable<_Tp>();
2241	      }
2242	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2243	      return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2244	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2245	      return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2246	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2247	      return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2248	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2249	      return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2250	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2251	      return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2252	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2253	      return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2254	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2255	      return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2256	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2257	      return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2258	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2259	      return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2260	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2261	      return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2262	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2263	      return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2264	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2265	      return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2266	    else
2267	      __assert_unreachable<_Tp>();
2268	  } // }}}
2269	else if (__builtin_is_constant_evaluated())
2270	  return _Base::_S_equal_to(__x, __y);
2271	else if constexpr (sizeof(__x) == 8) // {{{
2272	  {
2273	    const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2274				== __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2275	    _MaskMember<_Tp> __r64;
2276	    __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2277	    return __r64;
2278	  } // }}}
2279	else
2280	  return _Base::_S_equal_to(__x, __y);
2281      }
2282
2283    // }}}
2284    // _S_not_equal_to {{{
2285    template <typename _Tp, size_t _Np>
2286      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2287      _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2288      {
2289	if constexpr (__is_avx512_abi<_Abi>()) // {{{
2290	  {
2291	    if (__builtin_is_constant_evaluated()
2292		|| (__x._M_is_constprop() && __y._M_is_constprop()))
2293	      return _MaskImpl::_S_to_bits(
2294		__as_wrapper<_Np>(__x._M_data != __y._M_data));
2295
2296	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2297	    [[maybe_unused]] const auto __xi = __to_intrin(__x);
2298	    [[maybe_unused]] const auto __yi = __to_intrin(__y);
2299	    if constexpr (is_floating_point_v<_Tp>)
2300	      {
2301		if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2302		  return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2303		else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2304		  return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2305		else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2306		  return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2307		else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2308		  return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2309		else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2310		  return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2311		else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2312		  return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2313		else
2314		  __assert_unreachable<_Tp>();
2315	      }
2316	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2317	      return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2318	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2319	      return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2320	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2321	      return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2322	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2323	      return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2324	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2325	      return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2326	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2327	      return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2328	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2329	      return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2330	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2331	      return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2332	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2333	      return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2334	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2335	      return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2336	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2337	      return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2338	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2339	      return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2340	    else
2341	      __assert_unreachable<_Tp>();
2342	  }                                                   // }}}
2343	else if (__builtin_is_constant_evaluated())
2344	  return _Base::_S_not_equal_to(__x, __y);
2345	else if constexpr (sizeof(__x) == 8)
2346	  {
2347	    const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2348				!= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2349	    _MaskMember<_Tp> __r64;
2350	    __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2351	    return __r64;
2352	  }
2353	else
2354	  return _Base::_S_not_equal_to(__x, __y);
2355      }
2356
2357    // }}}
2358    // _S_less {{{
2359    template <typename _Tp, size_t _Np>
2360      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2361      _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2362      {
2363	if constexpr (__is_avx512_abi<_Abi>()) // {{{
2364	  {
2365	    if (__builtin_is_constant_evaluated()
2366		|| (__x._M_is_constprop() && __y._M_is_constprop()))
2367	      return _MaskImpl::_S_to_bits(
2368		__as_wrapper<_Np>(__x._M_data < __y._M_data));
2369
2370	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2371	    [[maybe_unused]] const auto __xi = __to_intrin(__x);
2372	    [[maybe_unused]] const auto __yi = __to_intrin(__y);
2373	    if constexpr (sizeof(__xi) == 64)
2374	      {
2375		if constexpr (is_same_v<_Tp, float>)
2376		  return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2377		else if constexpr (is_same_v<_Tp, double>)
2378		  return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2379		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2380		  return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2381		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2382		  return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2383		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2384		  return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2385		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2386		  return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2387		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2388		  return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2389		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2390		  return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2391		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2392		  return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2393		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2394		  return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2395		else
2396		  __assert_unreachable<_Tp>();
2397	      }
2398	    else if constexpr (sizeof(__xi) == 32)
2399	      {
2400		if constexpr (is_same_v<_Tp, float>)
2401		  return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2402		else if constexpr (is_same_v<_Tp, double>)
2403		  return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2404		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2405		  return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2406		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2407		  return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2408		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2409		  return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2410		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2411		  return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2412		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2413		  return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2414		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2415		  return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2416		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2417		  return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2418		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2419		  return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2420		else
2421		  __assert_unreachable<_Tp>();
2422	      }
2423	    else if constexpr (sizeof(__xi) == 16)
2424	      {
2425		if constexpr (is_same_v<_Tp, float>)
2426		  return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2427		else if constexpr (is_same_v<_Tp, double>)
2428		  return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2429		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2430		  return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2431		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2432		  return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2433		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2434		  return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2435		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2436		  return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2437		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2438		  return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2439		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2440		  return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2441		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2442		  return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2443		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2444		  return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2445		else
2446		  __assert_unreachable<_Tp>();
2447	      }
2448	    else
2449	      __assert_unreachable<_Tp>();
2450	  }                                                   // }}}
2451	else if (__builtin_is_constant_evaluated())
2452	  return _Base::_S_less(__x, __y);
2453	else if constexpr (sizeof(__x) == 8)
2454	  {
2455	    const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2456				< __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2457	    _MaskMember<_Tp> __r64;
2458	    __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2459	    return __r64;
2460	  }
2461	else
2462	  return _Base::_S_less(__x, __y);
2463      }
2464
2465    // }}}
2466    // _S_less_equal {{{
2467    template <typename _Tp, size_t _Np>
2468      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2469      _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2470      {
2471	if constexpr (__is_avx512_abi<_Abi>()) // {{{
2472	  {
2473	    if (__builtin_is_constant_evaluated()
2474		|| (__x._M_is_constprop() && __y._M_is_constprop()))
2475	      return _MaskImpl::_S_to_bits(
2476		__as_wrapper<_Np>(__x._M_data <= __y._M_data));
2477
2478	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2479	    [[maybe_unused]] const auto __xi = __to_intrin(__x);
2480	    [[maybe_unused]] const auto __yi = __to_intrin(__y);
2481	    if constexpr (sizeof(__xi) == 64)
2482	      {
2483		if constexpr (is_same_v<_Tp, float>)
2484		  return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2485		else if constexpr (is_same_v<_Tp, double>)
2486		  return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2487		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2488		  return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi);
2489		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2490		  return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi);
2491		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2492		  return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi);
2493		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2494		  return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi);
2495		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2496		  return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi);
2497		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2498		  return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi);
2499		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2500		  return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi);
2501		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2502		  return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi);
2503		else
2504		  __assert_unreachable<_Tp>();
2505	      }
2506	    else if constexpr (sizeof(__xi) == 32)
2507	      {
2508		if constexpr (is_same_v<_Tp, float>)
2509		  return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2510		else if constexpr (is_same_v<_Tp, double>)
2511		  return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2512		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2513		  return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi);
2514		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2515		  return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi);
2516		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2517		  return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi);
2518		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2519		  return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi);
2520		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2521		  return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi);
2522		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2523		  return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi);
2524		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2525		  return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi);
2526		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2527		  return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi);
2528		else
2529		  __assert_unreachable<_Tp>();
2530	      }
2531	    else if constexpr (sizeof(__xi) == 16)
2532	      {
2533		if constexpr (is_same_v<_Tp, float>)
2534		  return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2535		else if constexpr (is_same_v<_Tp, double>)
2536		  return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2537		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2538		  return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi);
2539		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2540		  return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi);
2541		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2542		  return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi);
2543		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2544		  return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi);
2545		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2546		  return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi);
2547		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2548		  return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi);
2549		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2550		  return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi);
2551		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2552		  return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi);
2553		else
2554		  __assert_unreachable<_Tp>();
2555	      }
2556	    else
2557	      __assert_unreachable<_Tp>();
2558	  }                                                   // }}}
2559	else if (__builtin_is_constant_evaluated())
2560	  return _Base::_S_less_equal(__x, __y);
2561	else if constexpr (sizeof(__x) == 8)
2562	  {
2563	    const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2564				<= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2565	    _MaskMember<_Tp> __r64;
2566	    __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2567	    return __r64;
2568	  }
2569	else
2570	  return _Base::_S_less_equal(__x, __y);
2571      }
2572
2573    // }}} }}}
2574    // negation {{{
2575    template <typename _Tp, size_t _Np>
2576      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2577      _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
2578      {
2579	if constexpr (__is_avx512_abi<_Abi>())
2580	  return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>());
2581	else
2582	  return _Base::_S_negate(__x);
2583      }
2584
2585    // }}}
2586    // math {{{
2587    using _Base::_S_abs;
2588
2589    // _S_sqrt {{{
2590    template <typename _Tp, size_t _Np>
2591      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2592      _S_sqrt(_SimdWrapper<_Tp, _Np> __x)
2593      {
2594	if constexpr (__is_sse_ps<_Tp, _Np>())
2595	  return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x)));
2596	else if constexpr (__is_sse_pd<_Tp, _Np>())
2597	  return _mm_sqrt_pd(__x);
2598	else if constexpr (__is_avx_ps<_Tp, _Np>())
2599	  return _mm256_sqrt_ps(__x);
2600	else if constexpr (__is_avx_pd<_Tp, _Np>())
2601	  return _mm256_sqrt_pd(__x);
2602	else if constexpr (__is_avx512_ps<_Tp, _Np>())
2603	  return _mm512_sqrt_ps(__x);
2604	else if constexpr (__is_avx512_pd<_Tp, _Np>())
2605	  return _mm512_sqrt_pd(__x);
2606	else
2607	  __assert_unreachable<_Tp>();
2608      }
2609
2610    // }}}
2611    // _S_ldexp {{{
2612    template <typename _Tp, size_t _Np>
2613      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2614      _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
2615	       __fixed_size_storage_t<int, _Np> __exp)
2616      {
2617	if constexpr (sizeof(__x) == 64 || __have_avx512vl)
2618	  {
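	    // vscalefp[sd] computes __x * 2^trunc(__exp) per element, which
	    // is exactly ldexp for integral exponents; the zero-masking with
	    // __k1 keeps the padding elements beyond _Np at zero.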
2619	    const auto __xi = __to_intrin(__x);
2620	    constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi>
2621	      __cvt;
2622	    const auto __expi = __to_intrin(__cvt(__exp));
2623	    using _Up = __bool_storage_member_type_t<_Np>;
	    constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__
				   ? _Up((1ULL << _Np) - 1) : ~_Up();
2625	    if constexpr (sizeof(__xi) == 16)
2626	      {
2627		if constexpr (sizeof(_Tp) == 8)
2628		  return _mm_maskz_scalef_pd(__k1, __xi, __expi);
2629		else
2630		  return _mm_maskz_scalef_ps(__k1, __xi, __expi);
2631	      }
2632	    else if constexpr (sizeof(__xi) == 32)
2633	      {
2634		if constexpr (sizeof(_Tp) == 8)
2635		  return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
2636		else
2637		  return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
2638	      }
2639	    else
2640	      {
2641		static_assert(sizeof(__xi) == 64);
2642		if constexpr (sizeof(_Tp) == 8)
2643		  return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
2644		else
2645		  return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
2646	      }
2647	  }
2648	else
2649	  return _Base::_S_ldexp(__x, __exp);
2650      }
2651
2652    // }}}
2653    // _S_trunc {{{
2654    template <typename _Tp, size_t _Np>
2655      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2656      _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2657      {
2658	if constexpr (__is_avx512_ps<_Tp, _Np>())
2659	  return _mm512_roundscale_ps(__x, 0x0b);
2660	else if constexpr (__is_avx512_pd<_Tp, _Np>())
2661	  return _mm512_roundscale_pd(__x, 0x0b);
2662	else if constexpr (__is_avx_ps<_Tp, _Np>())
2663	  return _mm256_round_ps(__x, 0xb);
2664	else if constexpr (__is_avx_pd<_Tp, _Np>())
2665	  return _mm256_round_pd(__x, 0xb);
2666	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2667	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb));
2668	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2669	  return _mm_round_pd(__x, 0xb);
2670	else if constexpr (__is_sse_ps<_Tp, _Np>())
2671	  {
2672	    auto __truncated
2673	      = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
2674	    const auto __no_fractional_values
2675	      = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
2676				      & 0x7f800000u)
2677		< 0x4b000000; // the exponent is so large that no mantissa bits
2678			      // signify fractional values (0x3f8 + 23*8 =
2679			      // 0x4b0)
2680	    return __no_fractional_values ? __truncated : __to_intrin(__x);
2681	  }
2682	else
2683	  return _Base::_S_trunc(__x);
2684      }
2685
2686    // }}}
2687    // _S_round {{{
2688    template <typename _Tp, size_t _Np>
2689      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2690      _S_round(_SimdWrapper<_Tp, _Np> __x)
2691      {
2692	// Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
2693	// from zero as required by std::round. Therefore this function is more
2694	// complicated.
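	// Strategy: truncate toward zero, then add copysign(1, __x) wherever
	// |__x - __truncated| >= 0.5. E.g. __x = 2.5: __truncated = 2, the
	// difference is 0.5, so 1 is added and the result is 3 (away from
	// zero), whereas rounding to nearest-even would give 2.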
2695	using _V = __vector_type_t<_Tp, _Np>;
2696	_V __truncated;
2697	if constexpr (__is_avx512_ps<_Tp, _Np>())
2698	  __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
2699	else if constexpr (__is_avx512_pd<_Tp, _Np>())
2700	  __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
2701	else if constexpr (__is_avx_ps<_Tp, _Np>())
2702	  __truncated = _mm256_round_ps(__x._M_data,
2703					_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2704	else if constexpr (__is_avx_pd<_Tp, _Np>())
2705	  __truncated = _mm256_round_pd(__x._M_data,
2706					_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2707	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2708	  __truncated = __auto_bitcast(
2709	    _mm_round_ps(__to_intrin(__x),
2710			 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
2711	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2712	  __truncated
2713	    = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2714	else if constexpr (__is_sse_ps<_Tp, _Np>())
2715	  __truncated = __auto_bitcast(
2716	    _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
2717	else
2718	  return _Base::_S_round(__x);
2719
2720	// x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
2721	// x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0
2722
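	// Where |x - truncated| >= .5, add +-1 with the sign of x: OR-ing the
	// sign bit of x into 1.0 produces copysign(1, x).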
2723	const _V __rounded
2724	  = __truncated
2725	    + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
2726		 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1)
2727		 : _V());
2728	if constexpr (__have_sse4_1)
2729	  return __rounded;
2730	else // adjust for missing range in cvttps_epi32
2731	  return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded
2732							      : __x._M_data;
2733      }
2734
2735    // }}}
2736    // _S_nearbyint {{{
2737    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2738      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_nearbyint(_Tp __x) noexcept
2739      {
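	// 0x0c (roundscale) and _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC
	// round according to the current rounding mode without raising
	// FE_INEXACT, as nearbyint requires.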
2740	if constexpr (_TVT::template _S_is<float, 16>)
2741	  return _mm512_roundscale_ps(__x, 0x0c);
2742	else if constexpr (_TVT::template _S_is<double, 8>)
2743	  return _mm512_roundscale_pd(__x, 0x0c);
2744	else if constexpr (_TVT::template _S_is<float, 8>)
2745	  return _mm256_round_ps(__x,
2746				 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2747	else if constexpr (_TVT::template _S_is<double, 4>)
2748	  return _mm256_round_pd(__x,
2749				 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2750	else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2751	  return _mm_round_ps(__x,
2752			      _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2753	else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2754	  return _mm_round_pd(__x,
2755			      _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2756	else
2757	  return _Base::_S_nearbyint(__x);
2758      }
2759
2760    // }}}
2761    // _S_rint {{{
2762    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2763      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_rint(_Tp __x) noexcept
2764      {
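	// 0x04 (roundscale) and _MM_FROUND_CUR_DIRECTION also use the current
	// rounding mode, but do not suppress exceptions, so FE_INEXACT may be
	// raised as rint allows.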
2765	if constexpr (_TVT::template _S_is<float, 16>)
2766	  return _mm512_roundscale_ps(__x, 0x04);
2767	else if constexpr (_TVT::template _S_is<double, 8>)
2768	  return _mm512_roundscale_pd(__x, 0x04);
2769	else if constexpr (_TVT::template _S_is<float, 8>)
2770	  return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2771	else if constexpr (_TVT::template _S_is<double, 4>)
2772	  return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2773	else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2774	  return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2775	else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2776	  return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2777	else
2778	  return _Base::_S_rint(__x);
2779      }
2780
2781    // }}}
2782    // _S_floor {{{
2783    template <typename _Tp, size_t _Np>
2784      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2785      _S_floor(_SimdWrapper<_Tp, _Np> __x)
2786      {
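	// 0x09 (roundscale) and 0x9 (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
	// round toward negative infinity.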
2787	if constexpr (__is_avx512_ps<_Tp, _Np>())
2788	  return _mm512_roundscale_ps(__x, 0x09);
2789	else if constexpr (__is_avx512_pd<_Tp, _Np>())
2790	  return _mm512_roundscale_pd(__x, 0x09);
2791	else if constexpr (__is_avx_ps<_Tp, _Np>())
2792	  return _mm256_round_ps(__x, 0x9);
2793	else if constexpr (__is_avx_pd<_Tp, _Np>())
2794	  return _mm256_round_pd(__x, 0x9);
2795	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2796	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9));
2797	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2798	  return _mm_round_pd(__x, 0x9);
2799	else
2800	  return _Base::_S_floor(__x);
2801      }
2802
2803    // }}}
2804    // _S_ceil {{{
2805    template <typename _Tp, size_t _Np>
2806      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2807      _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2808      {
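	// 0x0a (roundscale) and 0xa (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
	// round toward positive infinity.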
2809	if constexpr (__is_avx512_ps<_Tp, _Np>())
2810	  return _mm512_roundscale_ps(__x, 0x0a);
2811	else if constexpr (__is_avx512_pd<_Tp, _Np>())
2812	  return _mm512_roundscale_pd(__x, 0x0a);
2813	else if constexpr (__is_avx_ps<_Tp, _Np>())
2814	  return _mm256_round_ps(__x, 0xa);
2815	else if constexpr (__is_avx_pd<_Tp, _Np>())
2816	  return _mm256_round_pd(__x, 0xa);
2817	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2818	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa));
2819	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2820	  return _mm_round_pd(__x, 0xa);
2821	else
2822	  return _Base::_S_ceil(__x);
2823      }
2824
2825    // }}}
2826    // _S_signbit {{{
2827    template <typename _Tp, size_t _Np>
2828      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2829      _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2830      {
2831	if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2832	  {
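	    // With AVX512DQ the movepi32/movepi64_mask intrinsics copy each
	    // element's most significant bit (the sign bit) directly into a
	    // mask register.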
2833	    if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
2834	      return _mm512_movepi32_mask(
2835		__intrin_bitcast<__m512i>(__x._M_data));
2836	    else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
2837	      return _mm512_movepi64_mask(
2838		__intrin_bitcast<__m512i>(__x._M_data));
2839	    else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
2840	      return _mm256_movepi32_mask(
2841		__intrin_bitcast<__m256i>(__x._M_data));
2842	    else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
2843	      return _mm256_movepi64_mask(
2844		__intrin_bitcast<__m256i>(__x._M_data));
2845	    else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4)
2846	      return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data));
2847	    else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8)
2848	      return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data));
2849	  }
2850	else if constexpr (__is_avx512_abi<_Abi>())
2851	  {
2852	    const auto __xi = __to_intrin(__x);
2853	    [[maybe_unused]] constexpr auto __k1
2854	      = _Abi::template _S_implicit_mask_intrin<_Tp>();
2855	    if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2856	      return _mm_movemask_ps(__xi);
2857	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2858	      return _mm_movemask_pd(__xi);
2859	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2860	      return _mm256_movemask_ps(__xi);
2861	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2862	      return _mm256_movemask_pd(__xi);
2863	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2864	      return _mm512_mask_cmplt_epi32_mask(
2865		__k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2866	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2867	      return _mm512_mask_cmplt_epi64_mask(
2868		__k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2869	    else
2870	      __assert_unreachable<_Tp>();
2871	  }
2872	else
2873	  return _Base::_S_signbit(__x);
2874	/*{
2875	  using _I = __int_for_sizeof_t<_Tp>;
2876	  if constexpr (sizeof(__x) == 64)
2877	    return _S_less(__vector_bitcast<_I>(__x), _I());
2878	  else
2879	    {
2880	      const auto __xx = __vector_bitcast<_I>(__x._M_data);
2881	      [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>;
2882	      if constexpr ((sizeof(_Tp) == 4 &&
2883			     (__have_avx2 || sizeof(__x) == 16)) ||
2884			    __have_avx512vl)
2885		{
2886		  return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>);
2887		}
2888	      else if constexpr ((__have_avx2 ||
2889				  (__have_ssse3 && sizeof(__x) == 16)))
2890		{
2891		  return __vector_bitcast<_Tp>((__xx & __signmask) ==
2892					       __signmask);
2893		}
2894	      else
2895		{ // SSE2/3 or AVX (w/o AVX2)
2896		  constexpr auto __one = __vector_broadcast<_Np, _Tp>(1);
2897		  return __vector_bitcast<_Tp>(
2898		    __vector_bitcast<_Tp>(
2899		      (__xx & __signmask) |
2900		      __vector_bitcast<_I>(__one)) // -1 or 1
2901		    != __one);
2902		}
2903	    }
2904	}*/
2905      }
2906
2907    // }}}
2908    // _S_isnonzerovalue_mask {{{
2909    // isnormal || issubnormal, i.e. !isinf && !isnan && !iszero
2910    template <typename _Tp>
2911      _GLIBCXX_SIMD_INTRINSIC static auto _S_isnonzerovalue_mask(_Tp __x)
2912      {
2913	using _Traits = _VectorTraits<_Tp>;
2914	if constexpr (__have_avx512dq_vl)
2915	  {
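	    // fpclass immediate 0x9f matches QNaN, SNaN, +/-0 and +/-inf;
	    // negating the result (knot) leaves exactly the finite non-zero
	    // elements.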
2916	    if constexpr (_Traits::template _S_is<
2917			    float, 2> || _Traits::template _S_is<float, 4>)
2918	      return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f));
2919	    else if constexpr (_Traits::template _S_is<float, 8>)
2920	      return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f));
2921	    else if constexpr (_Traits::template _S_is<float, 16>)
2922	      return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f));
2923	    else if constexpr (_Traits::template _S_is<double, 2>)
2924	      return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f));
2925	    else if constexpr (_Traits::template _S_is<double, 4>)
2926	      return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f));
2927	    else if constexpr (_Traits::template _S_is<double, 8>)
2928	      return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f));
2929	    else
2930	      __assert_unreachable<_Tp>();
2931	  }
2932	else
2933	  {
2934	    using _Up = typename _Traits::value_type;
2935	    constexpr size_t _Np = _Traits::_S_full_size;
2936	    const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0
2937	    const auto __b = __x * _Up();             // NaN if __x == inf
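	    // Both products are non-NaN exactly when __x is a finite non-zero
	    // value, which the ordered (_CMP_ORD_Q) comparisons below test.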
2938	    if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>())
2939	      return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b),
2940				     _CMP_ORD_Q);
2941	    else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>())
2942	      return __mmask8(0xf
2943			      & _mm512_cmp_ps_mask(__auto_bitcast(__a),
2944						   __auto_bitcast(__b),
2945						   _CMP_ORD_Q));
2946	    else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>())
2947	      return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2948	    else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>())
2949	      return __mmask8(0x3
2950			      & _mm512_cmp_pd_mask(__auto_bitcast(__a),
2951						   __auto_bitcast(__b),
2952						   _CMP_ORD_Q));
2953	    else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>())
2954	      return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
2955	    else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>())
2956	      return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a),
2957						 __auto_bitcast(__b),
2958						 _CMP_ORD_Q));
2959	    else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>())
2960	      return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2961	    else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>())
2962	      return __mmask8(0xf
2963			      & _mm512_cmp_pd_mask(__auto_bitcast(__a),
2964						   __auto_bitcast(__b),
2965						   _CMP_ORD_Q));
2966	    else if constexpr (__is_avx512_ps<_Up, _Np>())
2967	      return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
2968	    else if constexpr (__is_avx512_pd<_Up, _Np>())
2969	      return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2970	    else
2971	      __assert_unreachable<_Tp>();
2972	  }
2973      }
2974
2975    // }}}
2976    // _S_isfinite {{{
2977    template <typename _Tp, size_t _Np>
2978      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2979      _S_isfinite(_SimdWrapper<_Tp, _Np> __x)
2980      {
2981	static_assert(is_floating_point_v<_Tp>);
2982#if !__FINITE_MATH_ONLY__
2983	if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2984	  {
2985	    const auto __xi = __to_intrin(__x);
2986	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
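	    // fpclass immediate 0x99 matches QNaN, SNaN, +inf and -inf; the
	    // XOR with the implicit mask __k1 turns "NaN or infinite" into
	    // "finite" for the active elements.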
2987	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2988	      return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99);
2989	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2990	      return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99);
2991	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2992	      return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99);
2993	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2994	      return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99);
2995	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2996	      return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99);
2997	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2998	      return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99);
2999	  }
3000	else if constexpr (__is_avx512_abi<_Abi>())
3001	  {
3002	    // if all exponent bits are set, __x is either inf or NaN
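	    // ANDing with the bit pattern of infinity keeps only the exponent
	    // field, so the signed compare is true iff at least one exponent
	    // bit is clear.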
3003	    using _I = __int_for_sizeof_t<_Tp>;
3004	    const auto __inf = __vector_bitcast<_I>(
3005	      __vector_broadcast<_Np>(__infinity_v<_Tp>));
3006	    return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf);
3007	  }
3008	else
3009#endif
3010	  return _Base::_S_isfinite(__x);
3011      }
3012
3013    // }}}
3014    // _S_isinf {{{
3015    template <typename _Tp, size_t _Np>
3016      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3017      _S_isinf(_SimdWrapper<_Tp, _Np> __x)
3018      {
3019#if !__FINITE_MATH_ONLY__
3020	if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3021	  {
3022	    const auto __xi = __to_intrin(__x);
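	    // fpclass immediate 0x18 = +inf (0x08) | -inf (0x10).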
3023	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3024	      return _mm512_fpclass_ps_mask(__xi, 0x18);
3025	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3026	      return _mm512_fpclass_pd_mask(__xi, 0x18);
3027	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3028	      return _mm256_fpclass_ps_mask(__xi, 0x18);
3029	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3030	      return _mm256_fpclass_pd_mask(__xi, 0x18);
3031	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3032	      return _mm_fpclass_ps_mask(__xi, 0x18);
3033	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3034	      return _mm_fpclass_pd_mask(__xi, 0x18);
3035	    else
3036	      __assert_unreachable<_Tp>();
3037	  }
3038	else if constexpr (__have_avx512dq_vl)
3039	  {
3040	    if constexpr (__is_sse_pd<_Tp, _Np>())
3041	      return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18));
3042	    else if constexpr (__is_avx_pd<_Tp, _Np>())
3043	      return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18));
3044	    else if constexpr (__is_sse_ps<_Tp, _Np>())
3045	      return _mm_movm_epi32(
3046		_mm_fpclass_ps_mask(__to_intrin(__x), 0x18));
3047	    else if constexpr (__is_avx_ps<_Tp, _Np>())
3048	      return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18));
3049	    else
3050	      __assert_unreachable<_Tp>();
3051	  }
3052	else
3053#endif
3054	  return _Base::_S_isinf(__x);
3055      }
3056
3057    // }}}
3058    // _S_isnormal {{{
3059    template <typename _Tp, size_t _Np>
3060      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3061      _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
3062      {
3063#if __FINITE_MATH_ONLY__
3064	[[maybe_unused]] constexpr int __mode = 0x26;
3065#else
3066      [[maybe_unused]] constexpr int __mode = 0xbf;
3067#endif
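	// fpclass immediate 0xbf matches every non-normal class (QNaN, SNaN,
	// +/-0, +/-inf, denormal); with finite math 0x26 (+/-0 | denormal)
	// suffices because NaN and infinity are assumed absent. The result is
	// inverted below (XOR with __k1, or knot) to yield isnormal.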
3068	if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3069	  {
3070	    const auto __xi = __to_intrin(__x);
3071	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3072	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3073	      return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode);
3074	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3075	      return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode);
3076	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3077	      return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode);
3078	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3079	      return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode);
3080	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3081	      return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode);
3082	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3083	      return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode);
3084	    else
3085	      __assert_unreachable<_Tp>();
3086	  }
3087	else if constexpr (__have_avx512dq)
3088	  {
3089	    if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>())
3090	      return _mm_movm_epi32(
3091		_knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode)));
3092	    else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>())
3093	      return _mm256_movm_epi32(
3094		_knot_mask8(_mm256_fpclass_ps_mask(__x, __mode)));
3095	    else if constexpr (__is_avx512_ps<_Tp, _Np>())
3096	      return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode));
3097	    else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>())
3098	      return _mm_movm_epi64(
3099		_knot_mask8(_mm_fpclass_pd_mask(__x, __mode)));
3100	    else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>())
3101	      return _mm256_movm_epi64(
3102		_knot_mask8(_mm256_fpclass_pd_mask(__x, __mode)));
3103	    else if constexpr (__is_avx512_pd<_Tp, _Np>())
3104	      return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode));
3105	    else
3106	      __assert_unreachable<_Tp>();
3107	  }
3108	else if constexpr (__is_avx512_abi<_Abi>())
3109	  {
3110	    using _I = __int_for_sizeof_t<_Tp>;
3111	    const auto absn = __vector_bitcast<_I>(_S_abs(__x));
3112	    const auto minn = __vector_bitcast<_I>(
3113	      __vector_broadcast<_Np>(__norm_min_v<_Tp>));
3114#if __FINITE_MATH_ONLY__
3115	    return _S_less_equal<_I, _Np>(minn, absn);
3116#else
3117	  const auto infn
3118	    = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
3119	  return __and(_S_less_equal<_I, _Np>(minn, absn),
3120		       _S_less<_I, _Np>(absn, infn));
3121#endif
3122	  }
3123	else
3124	  return _Base::_S_isnormal(__x);
3125      }
3126
3127    // }}}
3128    // _S_isnan {{{
3129    template <typename _Tp, size_t _Np>
3130      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3131      _S_isnan(_SimdWrapper<_Tp, _Np> __x)
3132      { return _S_isunordered(__x, __x); }
3133
3134    // }}}
3135    // _S_isunordered {{{
3136    template <typename _Tp, size_t _Np>
3137      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3138      _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
3139		     [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
3140      {
3141#if __FINITE_MATH_ONLY__
3142	return {}; // false
3143#else
3144	const auto __xi = __to_intrin(__x);
3145	const auto __yi = __to_intrin(__y);
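	// _CMP_UNORD_Q is true iff at least one operand is NaN and, being a
	// quiet predicate, raises no FE_INVALID for quiet NaNs.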
3146	if constexpr (__is_avx512_abi<_Abi>())
3147	  {
3148	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3149	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3150	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3151	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3152	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3153	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3154	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3155	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3156	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3157	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3158	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3159	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3160	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3161	  }
3162      else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3163	return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
3164      else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3165	return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
3166      else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3167	return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
3168      else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3169	return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
3170      else
3171	__assert_unreachable<_Tp>();
3172#endif
3173      }
3174
3175    // }}}
3176    // _S_isgreater {{{
3177    template <typename _Tp, size_t _Np>
3178      static constexpr _MaskMember<_Tp> _S_isgreater(_SimdWrapper<_Tp, _Np> __x,
3179						     _SimdWrapper<_Tp, _Np> __y)
3180      {
3181	const auto __xi = __to_intrin(__x);
3182	const auto __yi = __to_intrin(__y);
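	// _CMP_GT_OQ is the ordered, quiet greater-than predicate: it is false
	// whenever an operand is NaN and raises no FE_INVALID for quiet NaNs,
	// as std::isgreater requires.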
3183	if constexpr (__is_avx512_abi<_Abi>())
3184	  {
3185	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3186	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3187	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3188	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3189	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3190	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3191	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3192	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3193	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3194	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3195	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3196	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3197	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3198	    else
3199	      __assert_unreachable<_Tp>();
3200	  }
3201	else if constexpr (__have_avx)
3202	  {
3203	    if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3204	      return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3205	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3206	      return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3207	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3208	      return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3209	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3210	      return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3211	    else
3212	      __assert_unreachable<_Tp>();
3213	  }
3214	else if constexpr (__have_sse2 && sizeof(__xi) == 16
3215			   && sizeof(_Tp) == 4)
3216	  {
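	    // Map the float bit patterns to integers whose ordering matches
	    // the floats: non-negative patterns already compare correctly as
	    // signed ints; for negative values negate the magnitude bits.
	    // NaNs are discarded via the cmpord mask.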
3217	    const auto __xn = __vector_bitcast<int>(__xi);
3218	    const auto __yn = __vector_bitcast<int>(__yi);
3219	    const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3220	    const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3221	    return __auto_bitcast(
3222	      __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp));
3223	  }
3224	else if constexpr (__have_sse2 && sizeof(__xi) == 16
3225			   && sizeof(_Tp) == 8)
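	  // _mm_ucomigt_sd compares only the low element and returns 0 or 1
	  // (0 for unordered); negation yields the 0/all-ones mask element,
	  // and the high lane is handled by a second compare after unpackhi.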
3226	  return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3227	    -_mm_ucomigt_sd(__xi, __yi),
3228	    -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi),
3229			    _mm_unpackhi_pd(__yi, __yi))};
3230	else
3231	  return _Base::_S_isgreater(__x, __y);
3232      }
3233
3234    // }}}
3235    // _S_isgreaterequal {{{
3236    template <typename _Tp, size_t _Np>
3237      static constexpr _MaskMember<_Tp>
3238      _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3239      {
3240	const auto __xi = __to_intrin(__x);
3241	const auto __yi = __to_intrin(__y);
3242	if constexpr (__is_avx512_abi<_Abi>())
3243	  {
3244	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3245	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3246	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3247	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3248	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3249	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3250	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3251	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3252	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3253	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3254	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3255	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3256	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3257	    else
3258	      __assert_unreachable<_Tp>();
3259	  }
3260	else if constexpr (__have_avx)
3261	  {
3262	    if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3263	      return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3264	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3265	      return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3266	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3267	      return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3268	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3269	      return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3270	    else
3271	      __assert_unreachable<_Tp>();
3272	  }
3273	else if constexpr (__have_sse2 && sizeof(__xi) == 16
3274			   && sizeof(_Tp) == 4)
3275	  {
3276	    const auto __xn = __vector_bitcast<int>(__xi);
3277	    const auto __yn = __vector_bitcast<int>(__yi);
3278	    const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3279	    const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3280	    return __auto_bitcast(
3281	      __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp));
3282	  }
3283	else if constexpr (__have_sse2 && sizeof(__xi) == 16
3284			   && sizeof(_Tp) == 8)
3285	  return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3286	    -_mm_ucomige_sd(__xi, __yi),
3287	    -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi),
3288			    _mm_unpackhi_pd(__yi, __yi))};
3289	else
3290	  return _Base::_S_isgreaterequal(__x, __y);
3291      }
3292
3293    // }}}
3294    // _S_isless {{{
3295    template <typename _Tp, size_t _Np>
3296      static constexpr _MaskMember<_Tp> _S_isless(_SimdWrapper<_Tp, _Np> __x,
3297						  _SimdWrapper<_Tp, _Np> __y)
3298      {
3299	const auto __xi = __to_intrin(__x);
3300	const auto __yi = __to_intrin(__y);
3301	if constexpr (__is_avx512_abi<_Abi>())
3302	  {
3303	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3304	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3305	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3306	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3307	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3308	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3309	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3310	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3311	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3312	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3313	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3314	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3315	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3316	    else
3317	      __assert_unreachable<_Tp>();
3318	  }
3319	else if constexpr (__have_avx)
3320	  {
3321	    if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3322	      return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3323	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3324	      return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3325	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3326	      return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3327	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3328	      return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3329	    else
3330	      __assert_unreachable<_Tp>();
3331	  }
3332	else if constexpr (__have_sse2 && sizeof(__xi) == 16
3333			   && sizeof(_Tp) == 4)
3334	  {
3335	    const auto __xn = __vector_bitcast<int>(__xi);
3336	    const auto __yn = __vector_bitcast<int>(__yi);
3337	    const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3338	    const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3339	    return __auto_bitcast(
3340	      __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp));
3341	  }
3342	else if constexpr (__have_sse2 && sizeof(__xi) == 16
3343			   && sizeof(_Tp) == 8)
3344	  return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3345	    -_mm_ucomigt_sd(__yi, __xi),
3346	    -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi),
3347			    _mm_unpackhi_pd(__xi, __xi))};
3348	else
3349	  return _Base::_S_isless(__x, __y);
3350      }
3351
3352    // }}}
3353    // _S_islessequal {{{
3354    template <typename _Tp, size_t _Np>
3355      static constexpr _MaskMember<_Tp>
3356      _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3357      {
3358	const auto __xi = __to_intrin(__x);
3359	const auto __yi = __to_intrin(__y);
3360	if constexpr (__is_avx512_abi<_Abi>())
3361	  {
3362	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3363	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3364	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3365	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3366	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3367	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3368	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3369	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3370	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3371	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3372	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3373	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3374	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3375	    else
3376	      __assert_unreachable<_Tp>();
3377	  }
3378	else if constexpr (__have_avx)
3379	  {
3380	    if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3381	      return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3382	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3383	      return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3384	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3385	      return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3386	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3387	      return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3388	    else
3389	      __assert_unreachable<_Tp>();
3390	  }
3391	else if constexpr (__have_sse2 && sizeof(__xi) == 16
3392			   && sizeof(_Tp) == 4)
3393	  {
3394	    const auto __xn = __vector_bitcast<int>(__xi);
3395	    const auto __yn = __vector_bitcast<int>(__yi);
3396	    const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3397	    const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3398	    return __auto_bitcast(
3399	      __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp));
3400	  }
3401	else if constexpr (__have_sse2 && sizeof(__xi) == 16
3402			   && sizeof(_Tp) == 8)
3403	  return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3404	    -_mm_ucomige_sd(__yi, __xi),
3405	    -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi),
3406			    _mm_unpackhi_pd(__xi, __xi))};
3407	else
3408	  return _Base::_S_islessequal(__x, __y);
3409      }
3410
3411    // }}}
3412    // _S_islessgreater {{{
3413    template <typename _Tp, size_t _Np>
3414      static constexpr _MaskMember<_Tp>
3415      _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3416      {
3417	const auto __xi = __to_intrin(__x);
3418	const auto __yi = __to_intrin(__y);
3419	if constexpr (__is_avx512_abi<_Abi>())
3420	  {
3421	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3422	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3423	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3424	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3425	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3426	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3427	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3428	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3429	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3430	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3431	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3432	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3433	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3434	    else
3435	      __assert_unreachable<_Tp>();
3436	  }
3437	else if constexpr (__have_avx)
3438	  {
3439	    if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3440	      return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3441	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3442	      return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3443	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3444	      return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3445	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3446	      return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3447	    else
3448	      __assert_unreachable<_Tp>();
3449	  }
3450	else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3451	  return __auto_bitcast(
3452	    __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi)));
3453	else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3454	  return __to_masktype(
3455	    __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi)));
3456	else
3457	  __assert_unreachable<_Tp>();
3458      }
3459
3460    //}}} }}}
3461  };
3462
3463// }}}
3464// _MaskImplX86Mixin {{{
3465struct _MaskImplX86Mixin
3466{
3467  template <typename _Tp>
3468    using _TypeTag = _Tp*;
3469
3470  using _Base = _MaskImplBuiltinMixin;
3471
3472  // _S_to_maskvector(bool) {{{
3473  template <typename _Up, size_t _ToN = 1, typename _Tp>
3474    _GLIBCXX_SIMD_INTRINSIC static constexpr enable_if_t<
3475      is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>>
3476    _S_to_maskvector(_Tp __x)
3477    {
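      // Converting a single bool: element 0 becomes ~0 or 0; any further
      // elements of the vector are value-initialized to zero.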
3478      static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3479      return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
3480		 : __vector_type_t<_Up, _ToN>();
3481    }
3482
3483  // }}}
3484  // _S_to_maskvector(_SanitizedBitMask) {{{
3485  template <typename _Up, size_t _UpN = 0, size_t _Np,
3486	    size_t _ToN = _UpN == 0 ? _Np : _UpN>
3487    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3488    _S_to_maskvector(_SanitizedBitMask<_Np> __x)
3489    {
3490      static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3491      using _UV = __vector_type_t<_Up, _ToN>;
3492      using _UI = __intrinsic_type_t<_Up, _ToN>;
3493      [[maybe_unused]] const auto __k = __x._M_to_bits();
3494      if constexpr (_Np == 1)
3495	return _S_to_maskvector<_Up, _ToN>(__k);
3496      else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3497	return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
3498	  [&](auto __i) -> _Up { return -__x[__i.value]; });
3499      else if constexpr (sizeof(_Up) == 1)
3500	{
3501	  if constexpr (sizeof(_UI) == 16)
3502	    {
3503	      if constexpr (__have_avx512bw_vl)
3504		return __intrin_bitcast<_UV>(_mm_movm_epi8(__k));
3505	      else if constexpr (__have_avx512bw)
3506		return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k)));
3507	      else if constexpr (__have_avx512f)
3508		{
3509		  auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3510		  auto __as16bits
3511		    = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3512						__hi256(__as32bits)));
3513		  return __intrin_bitcast<_UV>(
3514		    _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
3515		}
3516	      else if constexpr (__have_ssse3)
3517		{
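		  // pshufb broadcasts mask byte 0 into lanes 0-7 and byte 1
		  // into lanes 8-15; ANDing with the per-lane bit pattern and
		  // comparing != 0 yields the 0/-1 byte mask.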
3518		  const auto __bitmask = __to_intrin(
3519		    __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4,
3520					  8, 16, 32, 64, 128));
3521		  return __intrin_bitcast<_UV>(
3522		    __vector_bitcast<_Up>(
3523		      _mm_shuffle_epi8(__to_intrin(
3524					 __vector_type_t<_ULLong, 2>{__k}),
3525				       _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1,
3526						     1, 1, 1, 1, 1, 1, 1))
3527		      & __bitmask)
3528		    != 0);
3529		}
3530	      // else fall through
3531	    }
3532	  else if constexpr (sizeof(_UI) == 32)
3533	    {
3534	      if constexpr (__have_avx512bw_vl)
3535		return __vector_bitcast<_Up>(_mm256_movm_epi8(__k));
3536	      else if constexpr (__have_avx512bw)
3537		return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k)));
3538	      else if constexpr (__have_avx512f)
3539		{
3540		  auto __as16bits = // 0 16 1 17 ... 15 31
3541		    _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()),
3542				      16)
3543		    | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16,
3544							       ~__m512i()),
3545					16);
3546		  auto __0_16_1_17 = __xzyw(_mm256_packs_epi16(
3547		    __lo256(__as16bits),
3548		    __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ...
3549		  );
3550		  // deinterleave:
3551		  return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8(
3552		    __0_16_1_17, // 0 16 1 17 2 ...
3553		    _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9,
3554				     11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1,
3555				     3, 5, 7, 9, 11, 13,
3556				     15)))); // 0-7 16-23 8-15 24-31 -> xzyw
3557					     // 0-3  8-11 16-19 24-27
3558					     // 4-7 12-15 20-23 28-31
3559		}
3560	      else if constexpr (__have_avx2)
3561		{
3562		  const auto __bitmask
3563		    = _mm256_broadcastsi128_si256(__to_intrin(
3564		      __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
3565					    4, 8, 16, 32, 64, 128)));
3566		  return __vector_bitcast<_Up>(
3567		    __vector_bitcast<_Up>(
3568		      _mm256_shuffle_epi8(
3569			_mm256_broadcastsi128_si256(
3570			  __to_intrin(__vector_type_t<_ULLong, 2>{__k})),
3571			_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3572					 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
3573					 3, 3, 3, 3, 3, 3))
3574		      & __bitmask)
3575		    != 0);
3576		}
3577	      // else fall through
3578	    }
3579	  else if constexpr (sizeof(_UI) == 64)
3580	    return reinterpret_cast<_UV>(_mm512_movm_epi8(__k));
3581	  if constexpr (std::min(_ToN, _Np) <= 4)
3582	    {
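	      // Multiplying by 0x00204081 moves mask bit i (i < 4) to bit 8*i;
	      // & 0x01010101 isolates one bit per byte, and * 0xff widens each
	      // 0/1 byte to 0x00/0xff.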
3583	      if constexpr (_Np > 7) // avoid overflow
3584		__x &= _SanitizedBitMask<_Np>(0x0f);
3585	      const _UInt __char_mask
3586		= ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL)
3587		  * 0xff;
3588	      _UV __r = {};
3589	      __builtin_memcpy(&__r, &__char_mask,
3590			       std::min(sizeof(__r), sizeof(__char_mask)));
3591	      return __r;
3592	    }
3593	  else if constexpr (std::min(_ToN, _Np) <= 7)
3594	    {
3595	      if constexpr (_Np > 7) // avoid overflow
3596		__x &= _SanitizedBitMask<_Np>(0x7f);
3597	      const _ULLong __char_mask
3598		= ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL)
3599		  * 0xff;
3600	      _UV __r = {};
3601	      __builtin_memcpy(&__r, &__char_mask,
3602			       std::min(sizeof(__r), sizeof(__char_mask)));
3603	      return __r;
3604	    }
3605	}
3606      else if constexpr (sizeof(_Up) == 2)
3607	{
3608	  if constexpr (sizeof(_UI) == 16)
3609	    {
3610	      if constexpr (__have_avx512bw_vl)
3611		return __intrin_bitcast<_UV>(_mm_movm_epi16(__k));
3612	      else if constexpr (__have_avx512bw)
3613		return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k)));
3614	      else if constexpr (__have_avx512f)
3615		{
3616		  __m256i __as32bits = {};
3617		  if constexpr (__have_avx512vl)
3618		    __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i());
3619		  else
3620		    __as32bits
3621		      = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()));
3622		  return __intrin_bitcast<_UV>(
3623		    _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits)));
3624		}
3625	      // else fall through
3626	    }
3627	  else if constexpr (sizeof(_UI) == 32)
3628	    {
3629	      if constexpr (__have_avx512bw_vl)
3630		return __vector_bitcast<_Up>(_mm256_movm_epi16(__k));
3631	      else if constexpr (__have_avx512bw)
3632		return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k)));
3633	      else if constexpr (__have_avx512f)
3634		{
3635		  auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3636		  return __vector_bitcast<_Up>(
3637		    __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3638					      __hi256(__as32bits))));
3639		}
3640	      // else fall through
3641	    }
3642	  else if constexpr (sizeof(_UI) == 64)
3643	    return __vector_bitcast<_Up>(_mm512_movm_epi16(__k));
3644	}
3645      else if constexpr (sizeof(_Up) == 4)
3646	{
3647	  if constexpr (sizeof(_UI) == 16)
3648	    {
3649	      if constexpr (__have_avx512dq_vl)
3650		return __intrin_bitcast<_UV>(_mm_movm_epi32(__k));
3651	      else if constexpr (__have_avx512dq)
3652		return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k)));
3653	      else if constexpr (__have_avx512vl)
3654		return __intrin_bitcast<_UV>(
3655		  _mm_maskz_mov_epi32(__k, ~__m128i()));
3656	      else if constexpr (__have_avx512f)
3657		return __intrin_bitcast<_UV>(
3658		  __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3659	      // else fall through
3660	    }
3661	  else if constexpr (sizeof(_UI) == 32)
3662	    {
3663	      if constexpr (__have_avx512dq_vl)
3664		return __vector_bitcast<_Up>(_mm256_movm_epi32(__k));
3665	      else if constexpr (__have_avx512dq)
3666		return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k)));
3667	      else if constexpr (__have_avx512vl)
3668		return __vector_bitcast<_Up>(
3669		  _mm256_maskz_mov_epi32(__k, ~__m256i()));
3670	      else if constexpr (__have_avx512f)
3671		return __vector_bitcast<_Up>(
3672		  __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3673	      // else fall through
3674	    }
3675	  else if constexpr (sizeof(_UI) == 64)
3676	    return __vector_bitcast<_Up>(
3677	      __have_avx512dq ? _mm512_movm_epi32(__k)
3678			      : _mm512_maskz_mov_epi32(__k, ~__m512i()));
3679	}
3680      else if constexpr (sizeof(_Up) == 8)
3681	{
3682	  if constexpr (sizeof(_UI) == 16)
3683	    {
3684	      if constexpr (__have_avx512dq_vl)
3685		return __vector_bitcast<_Up>(_mm_movm_epi64(__k));
3686	      else if constexpr (__have_avx512dq)
3687		return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k)));
3688	      else if constexpr (__have_avx512vl)
3689		return __vector_bitcast<_Up>(
3690		  _mm_maskz_mov_epi64(__k, ~__m128i()));
3691	      else if constexpr (__have_avx512f)
3692		return __vector_bitcast<_Up>(
3693		  __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3694	      // else fall through
3695	    }
3696	  else if constexpr (sizeof(_UI) == 32)
3697	    {
3698	      if constexpr (__have_avx512dq_vl)
3699		return __vector_bitcast<_Up>(_mm256_movm_epi64(__k));
3700	      else if constexpr (__have_avx512dq)
3701		return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k)));
3702	      else if constexpr (__have_avx512vl)
3703		return __vector_bitcast<_Up>(
3704		  _mm256_maskz_mov_epi64(__k, ~__m256i()));
3705	      else if constexpr (__have_avx512f)
3706		return __vector_bitcast<_Up>(
3707		  __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3708	      // else fall through
3709	    }
3710	  else if constexpr (sizeof(_UI) == 64)
3711	    return __vector_bitcast<_Up>(
3712	      __have_avx512dq ? _mm512_movm_epi64(__k)
3713			      : _mm512_maskz_mov_epi64(__k, ~__m512i()));
3714	}
3715
3716      using _UpUInt = make_unsigned_t<_Up>;
3717      using _V = __vector_type_t<_UpUInt, _ToN>;
3718      constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__;
3719      if constexpr (_ToN == 2)
3720	{
3721	  return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])});
3722	}
3723      else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32)
3724	{
3725	  if constexpr (sizeof(_Up) == 4)
3726	    return __vector_bitcast<_Up>(_mm256_cmp_ps(
3727	      _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)),
3728			    _mm256_castsi256_ps(_mm256_setr_epi32(
3729			      0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))),
3730	      _mm256_setzero_ps(), _CMP_NEQ_UQ));
3731	  else if constexpr (sizeof(_Up) == 8)
3732	    return __vector_bitcast<_Up>(_mm256_cmp_pd(
3733	      _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)),
3734			    _mm256_castsi256_pd(
3735			      _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))),
3736	      _mm256_setzero_pd(), _CMP_NEQ_UQ));
3737	  else
3738	    __assert_unreachable<_Up>();
3739	}
3740      else if constexpr (__bits_per_element >= _ToN)
3741	{
3742	  constexpr auto __bitmask
3743	    = __generate_vector<_V>([](auto __i) constexpr->_UpUInt {
3744		return __i < _ToN ? 1ull << __i : 0;
3745	      });
3746	  const auto __bits
3747	    = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
3748	  if constexpr (__bits_per_element > _ToN)
3749	    return __vector_bitcast<_Up>(__bits) > 0;
3750	  else
3751	    return __vector_bitcast<_Up>(__bits != 0);
3752	}
3753      else
3754	{
3755	  const _V __tmp
3756	    = __generate_vector<_V>([&](auto __i) constexpr {
3757		return static_cast<_UpUInt>(
3758		  __k >> (__bits_per_element * (__i / __bits_per_element)));
3759	      })
3760	      & __generate_vector<_V>([](auto __i) constexpr {
3761		  return static_cast<_UpUInt>(1ull
3762					      << (__i % __bits_per_element));
3763		}); // mask bit index
3764	  return __intrin_bitcast<_UV>(__tmp != _V());
3765	}
3766    }
3767
3768  // }}}
3769  // _S_to_maskvector(_SimdWrapper) {{{
3770  template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
3771	    size_t _ToN = _UpN == 0 ? _Np : _UpN>
3772    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3773    _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
3774    {
3775      static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3776      using _TW = _SimdWrapper<_Tp, _Np>;
3777      using _UW = _SimdWrapper<_Up, _ToN>;
3778      using _UI = __intrinsic_type_t<_Up, _ToN>;
3779      if constexpr (is_same_v<_Tp, bool>) // bits -> vector
3780	return _S_to_maskvector<_Up, _ToN>(
3781	  _BitMask<_Np>(__x._M_data)._M_sanitized());
3782      // vector -> vector bitcast
3783      else if constexpr (sizeof(_Up) == sizeof(_Tp)
3784			 && sizeof(_TW) == sizeof(_UW))
3785	return __wrapper_bitcast<_Up, _ToN>(
3786	  _ToN <= _Np
3787	    ? __x
3788	    : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x));
3789      else // vector -> vector {{{
3790	{
3791	  if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3792	    {
3793	      const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
3794	      return __generate_from_n_evaluations<std::min(_ToN, _Np),
3795						   __vector_type_t<_Up, _ToN>>(
3796		[&](auto __i) -> _Up { return __y[__i.value]; });
3797	    }
3798	  using _To = __vector_type_t<_Up, _ToN>;
3799	  [[maybe_unused]] constexpr size_t _FromN = _Np;
3800	  constexpr int _FromBytes = sizeof(_Tp);
3801	  constexpr int _ToBytes = sizeof(_Up);
3802	  const auto __k = __x._M_data;
3803
3804	  if constexpr (_FromBytes == _ToBytes)
3805	    return __intrin_bitcast<_To>(__k);
3806	  else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16)
3807	    { // SSE -> SSE {{{
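	      // Mask elements are all-zeros or all-ones, so widening via
	      // self-interleave and narrowing via signed-saturating packs both
	      // preserve the 0/-1 pattern.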
3808	      if constexpr (_FromBytes == 4 && _ToBytes == 8)
3809		return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3810	      else if constexpr (_FromBytes == 2 && _ToBytes == 8)
3811		{
3812		  const auto __y
3813		    = __vector_bitcast<int>(__interleave128_lo(__k, __k));
3814		  return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3815		}
3816	      else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3817		{
3818		  auto __y
3819		    = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3820		  auto __z
3821		    = __vector_bitcast<int>(__interleave128_lo(__y, __y));
3822		  return __intrin_bitcast<_To>(__interleave128_lo(__z, __z));
3823		}
3824	      else if constexpr (_FromBytes == 8 && _ToBytes == 4
3825				 && __have_sse2)
3826		return __intrin_bitcast<_To>(
3827		  _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3828	      else if constexpr (_FromBytes == 8 && _ToBytes == 4)
3829		return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k),
3830						    _UI());
3831	      else if constexpr (_FromBytes == 2 && _ToBytes == 4)
3832		return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3833	      else if constexpr (_FromBytes == 1 && _ToBytes == 4)
3834		{
3835		  const auto __y
3836		    = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3837		  return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3838		}
3839	      else if constexpr (_FromBytes == 8 && _ToBytes == 2)
3840		{
3841		  if constexpr (__have_sse2 && !__have_ssse3)
3842		    return __intrin_bitcast<_To>(_mm_packs_epi32(
3843		      _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()),
3844		      __m128i()));
3845		  else
3846		    return __intrin_bitcast<_To>(
3847		      __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(
3848			__vector_bitcast<_Up>(__k)));
3849		}
3850	      else if constexpr (_FromBytes == 4 && _ToBytes == 2)
3851		return __intrin_bitcast<_To>(
3852		  _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3853	      else if constexpr (_FromBytes == 1 && _ToBytes == 2)
3854		return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3855	      else if constexpr (_FromBytes == 8 && _ToBytes == 1
3856				 && __have_ssse3)
3857		return __intrin_bitcast<_To>(
3858		  _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3859				   _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1,
3860						 -1, -1, -1, -1, -1, -1, -1,
3861						 -1)));
3862	      else if constexpr (_FromBytes == 8 && _ToBytes == 1)
3863		{
3864		  auto __y
3865		    = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3866		  __y = _mm_packs_epi32(__y, __m128i());
3867		  return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3868		}
3869	      else if constexpr (_FromBytes == 4 && _ToBytes == 1
3870				 && __have_ssse3)
3871		return __intrin_bitcast<_To>(
3872		  _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3873				   _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
3874						 -1, -1, -1, -1, -1, -1, -1,
3875						 -1)));
3876	      else if constexpr (_FromBytes == 4 && _ToBytes == 1)
3877		{
3878		  const auto __y
3879		    = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3880		  return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3881		}
3882	      else if constexpr (_FromBytes == 2 && _ToBytes == 1)
3883		return __intrin_bitcast<_To>(
3884		  _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()));
3885	      else
3886		__assert_unreachable<_Tp>();
3887	    } // }}}
3888	  else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32)
3889	    { // AVX -> AVX {{{
3890	      if constexpr (_FromBytes == _ToBytes)
3891		__assert_unreachable<_Tp>();
3892	      else if constexpr (_FromBytes == _ToBytes * 2)
3893		{
3894		  const auto __y = __vector_bitcast<_LLong>(__k);
3895		  return __intrin_bitcast<_To>(_mm256_castsi128_si256(
3896		    _mm_packs_epi16(__lo128(__y), __hi128(__y))));
3897		}
3898	      else if constexpr (_FromBytes == _ToBytes * 4)
3899		{
3900		  const auto __y = __vector_bitcast<_LLong>(__k);
3901		  return __intrin_bitcast<_To>(_mm256_castsi128_si256(
3902		    _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
3903				    __m128i())));
3904		}
3905	      else if constexpr (_FromBytes == _ToBytes * 8)
3906		{
3907		  const auto __y = __vector_bitcast<_LLong>(__k);
3908		  return __intrin_bitcast<_To>(
3909		    _mm256_castsi128_si256(_mm_shuffle_epi8(
3910		      _mm_packs_epi16(__lo128(__y), __hi128(__y)),
3911		      _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
3912				    -1, -1, -1, -1, -1))));
3913		}
3914	      else if constexpr (_FromBytes * 2 == _ToBytes)
3915		{
3916		  auto __y = __xzyw(__to_intrin(__k));
3917		  if constexpr (is_floating_point_v<
3918				  _Tp> || (!__have_avx2 && _FromBytes == 4))
3919		    {
3920		      const auto __yy = __vector_bitcast<float>(__y);
3921		      return __intrin_bitcast<_To>(
3922			_mm256_unpacklo_ps(__yy, __yy));
3923		    }
3924		  else
3925		    return __intrin_bitcast<_To>(
3926		      _mm256_unpacklo_epi8(__y, __y));
3927		}
3928	      else if constexpr (_FromBytes * 4 == _ToBytes)
3929		{
3930		  auto __y
3931		    = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
3932					__lo128(__vector_bitcast<_LLong>(
3933					  __k))); // drops 3/4 of input
3934		  return __intrin_bitcast<_To>(
3935		    __concat(_mm_unpacklo_epi16(__y, __y),
3936			     _mm_unpackhi_epi16(__y, __y)));
3937		}
3938	      else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3939		{
3940		  auto __y
3941		    = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
3942					__lo128(__vector_bitcast<_LLong>(
3943					  __k))); // drops 3/4 of input
3944		  __y
3945		    = _mm_unpacklo_epi16(__y,
3946					 __y); // drops another 1/2 => 7/8 total
3947		  return __intrin_bitcast<_To>(
3948		    __concat(_mm_unpacklo_epi32(__y, __y),
3949			     _mm_unpackhi_epi32(__y, __y)));
3950		}
3951	      else
3952		__assert_unreachable<_Tp>();
3953	    } // }}}
3954	  else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16)
3955	    { // SSE -> AVX {{{
3956	      if constexpr (_FromBytes == _ToBytes)
3957		return __intrin_bitcast<_To>(
3958		  __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>(
3959		    __zero_extend(__to_intrin(__k))));
3960	      else if constexpr (_FromBytes * 2 == _ToBytes)
3961		{ // keep all
3962		  return __intrin_bitcast<_To>(
3963		    __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k),
3964					       __vector_bitcast<_LLong>(__k)),
3965			     _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k),
3966					       __vector_bitcast<_LLong>(__k))));
3967		}
3968	      else if constexpr (_FromBytes * 4 == _ToBytes)
3969		{
3970		  if constexpr (__have_avx2)
3971		    {
3972		      return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
3973			__concat(__vector_bitcast<_LLong>(__k),
3974				 __vector_bitcast<_LLong>(__k)),
3975			_mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3,
3976					 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
3977					 6, 6, 7, 7, 7, 7)));
3978		    }
3979		  else
3980		    {
3981		      return __intrin_bitcast<_To>(__concat(
3982			_mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3983					 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1,
3984						       2, 2, 2, 2, 3, 3, 3, 3)),
3985			_mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3986					 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5,
3987						       6, 6, 6, 6, 7, 7, 7,
3988						       7))));
3989		    }
3990		}
3991	      else if constexpr (_FromBytes * 8 == _ToBytes)
3992		{
3993		  if constexpr (__have_avx2)
3994		    {
3995		      return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
3996			__concat(__vector_bitcast<_LLong>(__k),
3997				 __vector_bitcast<_LLong>(__k)),
3998			_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3999					 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
4000					 3, 3, 3, 3, 3, 3)));
4001		    }
4002		  else
4003		    {
4004		      return __intrin_bitcast<_To>(__concat(
4005			_mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4006					 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
4007						       1, 1, 1, 1, 1, 1, 1, 1)),
4008			_mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4009					 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2,
4010						       3, 3, 3, 3, 3, 3, 3,
4011						       3))));
4012		    }
4013		}
4014	      else if constexpr (_FromBytes == _ToBytes * 2)
4015		return __intrin_bitcast<_To>(__m256i(__zero_extend(
4016		  _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()))));
4017	      else if constexpr (_FromBytes == 8 && _ToBytes == 2)
4018		{
4019		  return __intrin_bitcast<_To>(__m256i(__zero_extend(
4020		    _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4021				     _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1,
4022						   -1, -1, -1, -1, -1, -1, -1,
4023						   -1)))));
4024		}
4025	      else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4026		{
4027		  return __intrin_bitcast<_To>(__m256i(__zero_extend(
4028		    _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4029				     _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4030						   -1, -1, -1, -1, -1, -1, -1,
4031						   -1)))));
4032		}
4033	      else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4034		{
4035		  return __intrin_bitcast<_To>(__m256i(__zero_extend(
4036		    _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4037				     _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1,
4038						   -1, -1, -1, -1, -1, -1, -1,
4039						   -1, -1)))));
4040		}
4041	      else
4042		static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4043	    } // }}}
4044	  else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32)
4045	    { // AVX -> SSE {{{
4046	      if constexpr (_FromBytes == _ToBytes)
4047		{ // keep low 1/2
4048		  return __intrin_bitcast<_To>(__lo128(__k));
4049		}
4050	      else if constexpr (_FromBytes == _ToBytes * 2)
4051		{ // keep all
4052		  auto __y = __vector_bitcast<_LLong>(__k);
4053		  return __intrin_bitcast<_To>(
4054		    _mm_packs_epi16(__lo128(__y), __hi128(__y)));
4055		}
4056	      else if constexpr (_FromBytes == _ToBytes * 4)
4057		{ // add 1/2 undef
4058		  auto __y = __vector_bitcast<_LLong>(__k);
4059		  return __intrin_bitcast<_To>(
4060		    _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4061				    __m128i()));
4062		}
4063	      else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4064		{ // add 3/4 undef
4065		  auto __y = __vector_bitcast<_LLong>(__k);
4066		  return __intrin_bitcast<_To>(_mm_shuffle_epi8(
4067		    _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4068		    _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
4069				  -1, -1, -1, -1)));
4070		}
4071	      else if constexpr (_FromBytes * 2 == _ToBytes)
4072		{ // keep low 1/4
4073		  auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4074		  return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4075		}
4076	      else if constexpr (_FromBytes * 4 == _ToBytes)
4077		{ // keep low 1/8
4078		  auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4079		  __y = _mm_unpacklo_epi8(__y, __y);
4080		  return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4081		}
4082	      else if constexpr (_FromBytes * 8 == _ToBytes)
4083		{ // keep low 1/16
4084		  auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4085		  __y = _mm_unpacklo_epi8(__y, __y);
4086		  __y = _mm_unpacklo_epi8(__y, __y);
4087		  return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4088		}
4089	      else
4090		static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4091	    } // }}}
4092	  else
4093	    return _Base::template _S_to_maskvector<_Up, _ToN>(__x);
4094	  /*
4095	  if constexpr (_FromBytes > _ToBytes) {
4096	      const _To     __y      = __vector_bitcast<_Up>(__k);
4097	      return [&] <size_t... _Is> (index_sequence<_Is...>) {
4098		constexpr int _Stride = _FromBytes / _ToBytes;
4099		return _To{__y[(_Is + 1) * _Stride - 1]...};
4100	      }(make_index_sequence<std::min(_ToN, _FromN)>());
4101	  } else {
4102	      // {0, 0, 1, 1} (_Dups = 2, _Is<4>)
4103	      // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>)
4104	      // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>)
4105	      // ...
4106	      return [&] <size_t... _Is> (index_sequence<_Is...>) {
4107		constexpr int __dup = _ToBytes / _FromBytes;
4108		return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...});
4109	      }(make_index_sequence<_FromN>());
4110	  }
4111	  */
4112	} // }}}
4113    }
4114
4115  // }}}
4116  // _S_to_bits {{{
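  // Convert a mask in _SimdWrapper representation (either an AVX-512 bitmask
  // when _Tp is bool, or a vector of 0/-1 elements) into a _SanitizedBitMask
  // with one bit per element.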
4117  template <typename _Tp, size_t _Np>
4118    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
4119    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
4120    {
4121      if constexpr (is_same_v<_Tp, bool>)
4122	return _BitMask<_Np>(__x._M_data)._M_sanitized();
4123      else
4124	{
4125	  static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4126	  if (__builtin_is_constant_evaluated()
4127	      || __builtin_constant_p(__x._M_data))
4128	    {
4129	      const auto __bools = -__x._M_data;
4130	      const _ULLong __k = __call_with_n_evaluations<_Np>(
4131		[](auto... __bits) { return (__bits | ...); },
4132		[&](auto __i) { return _ULLong(__bools[+__i]) << __i; });
4133	      if (__builtin_is_constant_evaluated()
4134		  || __builtin_constant_p(__k))
4135		return __k;
4136	    }
4137	  const auto __xi = __to_intrin(__x);
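	  // Runtime: dispatch on element size and vector width to a movemask,
	  // vpmov*2m, or compare-into-mask intrinsic.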
4138	  if constexpr (sizeof(_Tp) == 1)
4139	    if constexpr (sizeof(__xi) == 16)
4140	      if constexpr (__have_avx512bw_vl)
4141		return _BitMask<_Np>(_mm_movepi8_mask(__xi));
4142	      else // implies SSE2
4143		return _BitMask<_Np>(_mm_movemask_epi8(__xi));
4144	    else if constexpr (sizeof(__xi) == 32)
4145	      if constexpr (__have_avx512bw_vl)
4146		return _BitMask<_Np>(_mm256_movepi8_mask(__xi));
4147	      else // implies AVX2
4148		return _BitMask<_Np>(_mm256_movemask_epi8(__xi));
4149	    else // implies AVX512BW
4150	      return _BitMask<_Np>(_mm512_movepi8_mask(__xi));
4151
4152	  else if constexpr (sizeof(_Tp) == 2)
4153	    if constexpr (sizeof(__xi) == 16)
4154	      if constexpr (__have_avx512bw_vl)
4155		return _BitMask<_Np>(_mm_movepi16_mask(__xi));
4156	      else if constexpr (__have_avx512bw)
4157		return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4158	      else // implies SSE2
4159		return _BitMask<_Np>(
4160		  _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i())));
4161	    else if constexpr (sizeof(__xi) == 32)
4162	      if constexpr (__have_avx512bw_vl)
4163		return _BitMask<_Np>(_mm256_movepi16_mask(__xi));
4164	      else if constexpr (__have_avx512bw)
4165		return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4166	      else // implies SSE2
4167		return _BitMask<_Np>(_mm_movemask_epi8(
4168		  _mm_packs_epi16(__lo128(__xi), __hi128(__xi))));
4169	    else // implies AVX512BW
4170	      return _BitMask<_Np>(_mm512_movepi16_mask(__xi));
4171
4172	  else if constexpr (sizeof(_Tp) == 4)
4173	    if constexpr (sizeof(__xi) == 16)
4174	      if constexpr (__have_avx512dq_vl)
4175		return _BitMask<_Np>(_mm_movepi32_mask(__xi));
4176	      else if constexpr (__have_avx512vl)
4177		return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i()));
4178	      else if constexpr (__have_avx512dq)
4179		return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4180	      else if constexpr (__have_avx512f)
4181		return _BitMask<_Np>(
4182		  _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4183	      else // implies SSE
4184		return _BitMask<_Np>(
4185		  _mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
4186	    else if constexpr (sizeof(__xi) == 32)
4187	      if constexpr (__have_avx512dq_vl)
4188		return _BitMask<_Np>(_mm256_movepi32_mask(__xi));
4189	      else if constexpr (__have_avx512dq)
4190		return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4191	      else if constexpr (__have_avx512vl)
4192		return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i()));
4193	      else if constexpr (__have_avx512f)
4194		return _BitMask<_Np>(
4195		  _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4196	      else // implies AVX
4197		return _BitMask<_Np>(
4198		  _mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
	    else // sizeof(__xi) == 64 implies AVX512F
	      if constexpr (__have_avx512dq)
		return _BitMask<_Np>(_mm512_movepi32_mask(__xi));
	      else // AVX512F without DQ
		return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i()));
4204
4205	  else if constexpr (sizeof(_Tp) == 8)
4206	    if constexpr (sizeof(__xi) == 16)
4207	      if constexpr (__have_avx512dq_vl)
4208		return _BitMask<_Np>(_mm_movepi64_mask(__xi));
4209	      else if constexpr (__have_avx512dq)
4210		return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4211	      else if constexpr (__have_avx512vl)
4212		return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i()));
4213	      else if constexpr (__have_avx512f)
4214		return _BitMask<_Np>(
4215		  _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4216	      else // implies SSE2
4217		return _BitMask<_Np>(
4218		  _mm_movemask_pd(reinterpret_cast<__m128d>(__xi)));
4219	    else if constexpr (sizeof(__xi) == 32)
4220	      if constexpr (__have_avx512dq_vl)
4221		return _BitMask<_Np>(_mm256_movepi64_mask(__xi));
4222	      else if constexpr (__have_avx512dq)
4223		return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4224	      else if constexpr (__have_avx512vl)
4225		return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i()));
4226	      else if constexpr (__have_avx512f)
4227		return _BitMask<_Np>(
4228		  _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4229	      else // implies AVX
4230		return _BitMask<_Np>(
4231		  _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi)));
	    else // sizeof(__xi) == 64 implies AVX512F
	      if constexpr (__have_avx512dq)
		return _BitMask<_Np>(_mm512_movepi64_mask(__xi));
	      else // AVX512F without DQ
		return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i()));
4237
4238	  else
4239	    __assert_unreachable<_Tp>();
4240	}
4241    }
4242  // }}}
4243};
4244
4245// }}}
4246// _MaskImplX86 {{{
4247template <typename _Abi, typename>
4248  struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
4249  {
4250    using _MaskImplX86Mixin::_S_to_bits;
4251    using _MaskImplX86Mixin::_S_to_maskvector;
4252    using _MaskImplBuiltin<_Abi>::_S_convert;
4253
4254    // member types {{{
4255    template <typename _Tp>
4256      using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
4257
4258    template <typename _Tp>
4259      using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
4260
4261    template <typename _Tp>
4262      static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
4263
4264    using _Base = _MaskImplBuiltin<_Abi>;
4265
4266    // }}}
4267    // _S_broadcast {{{
4268    template <typename _Tp>
4269      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4270      _S_broadcast(bool __x)
4271      {
4272	if constexpr (__is_avx512_abi<_Abi>())
4273	  return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1))
4274		     : _MaskMember<_Tp>();
4275	else
4276	  return _Base::template _S_broadcast<_Tp>(__x);
4277      }
4278
4279    // }}}
4280    // _S_load {{{
4281    template <typename _Tp>
4282      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4283      _S_load(const bool* __mem)
4284      {
4285	static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4286	if constexpr (__have_avx512bw)
4287	  {
4288	    const auto __to_vec_or_bits = [](auto __bits) -> decltype(auto) {
4289	      if constexpr (__is_avx512_abi<_Abi>())
4290		return __bits;
4291	      else
4292		return _S_to_maskvector<_Tp>(
4293		  _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
4294	    };
4295
4296	    if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
4297	      {
4298		__m128i __a = {};
4299		__builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4300		return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a));
4301	      }
4302	    else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl)
4303	      {
4304		__m256i __a = {};
4305		__builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4306		return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a));
4307	      }
4308	    else if constexpr (_S_size<_Tp> <= 64)
4309	      {
4310		__m512i __a = {};
4311		__builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4312		return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a));
4313	      }
4314	  }
4315	else if constexpr (__is_avx512_abi<_Abi>())
4316	  {
4317	    if constexpr (_S_size<_Tp> <= 8)
4318	      {
4319		__m128i __a = {};
4320		__builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4321		const auto __b = _mm512_cvtepi8_epi64(__a);
4322		return _mm512_test_epi64_mask(__b, __b);
4323	      }
4324	    else if constexpr (_S_size<_Tp> <= 16)
4325	      {
4326		__m128i __a = {};
4327		__builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4328		const auto __b = _mm512_cvtepi8_epi32(__a);
4329		return _mm512_test_epi32_mask(__b, __b);
4330	      }
4331	    else if constexpr (_S_size<_Tp> <= 32)
4332	      {
4333		__m128i __a = {};
4334		__builtin_memcpy(&__a, __mem, 16);
4335		const auto __b = _mm512_cvtepi8_epi32(__a);
4336		__builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16);
4337		const auto __c = _mm512_cvtepi8_epi32(__a);
4338		return _mm512_test_epi32_mask(__b, __b)
4339		       | (_mm512_test_epi32_mask(__c, __c) << 16);
4340	      }
4341	    else if constexpr (_S_size<_Tp> <= 64)
4342	      {
4343		__m128i __a = {};
4344		__builtin_memcpy(&__a, __mem, 16);
4345		const auto __b = _mm512_cvtepi8_epi32(__a);
4346		__builtin_memcpy(&__a, __mem + 16, 16);
4347		const auto __c = _mm512_cvtepi8_epi32(__a);
4348		if constexpr (_S_size<_Tp> <= 48)
4349		  {
4350		    __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32);
4351		    const auto __d = _mm512_cvtepi8_epi32(__a);
4352		    return _mm512_test_epi32_mask(__b, __b)
4353			   | (_mm512_test_epi32_mask(__c, __c) << 16)
4354			   | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
4355		  }
4356		else
4357		  {
		    __builtin_memcpy(&__a, __mem + 32, 16);
		    const auto __d = _mm512_cvtepi8_epi32(__a);
		    __builtin_memcpy(&__a, __mem + 48, _S_size<_Tp> - 48);
		    const auto __e = _mm512_cvtepi8_epi32(__a);
4362		    return _mm512_test_epi32_mask(__b, __b)
4363			   | (_mm512_test_epi32_mask(__c, __c) << 16)
4364			   | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32)
4365			   | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48);
4366		  }
4367	      }
4368	    else
4369	      __assert_unreachable<_Tp>();
4370	  }
4371	else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2)
4372	  return __vector_bitcast<_Tp>(
4373	    __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]),
4374				   -int(__mem[1]), -int(__mem[1])});
4375	else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx)
4376	  {
4377	    int __bool4 = 0;
4378	    __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>);
4379	    const auto __k = __to_intrin(
4380	      (__vector_broadcast<4>(__bool4)
4381	       & __make_vector<int>(0x1, 0x100, 0x10000,
4382				    _S_size<_Tp> == 4 ? 0x1000000 : 0))
4383	      != 0);
4384	    return __vector_bitcast<_Tp>(
4385	      __concat(_mm_unpacklo_epi32(__k, __k),
4386		       _mm_unpackhi_epi32(__k, __k)));
4387	  }
4388	else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4)
4389	  {
4390	    int __bools = 0;
4391	    __builtin_memcpy(&__bools, __mem, _S_size<_Tp>);
4392	    if constexpr (__have_sse2)
4393	      {
4394		__m128i __k = _mm_cvtsi32_si128(__bools);
4395		__k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4396		return __vector_bitcast<_Tp, _S_size<_Tp>>(
4397		  _mm_unpacklo_epi16(__k, __k));
4398	      }
4399	    else
4400	      {
4401		__m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools));
4402		_mm_empty();
4403		return __vector_bitcast<_Tp, _S_size<_Tp>>(
4404		  _mm_cmpgt_ps(__k, __m128()));
4405	      }
4406	  }
4407	else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8)
4408	  {
4409	    __m128i __k = {};
4410	    __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4411	    __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4412	    return __vector_bitcast<_Tp>(
4413	      __concat(_mm_unpacklo_epi16(__k, __k),
4414		       _mm_unpackhi_epi16(__k, __k)));
4415	  }
4416	else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16)
4417	  {
4418	    __m128i __k = {};
4419	    __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4420	    __k = _mm_cmpgt_epi8(__k, __m128i());
4421	    if constexpr (_S_size<_Tp> <= 8)
4422	      return __vector_bitcast<_Tp, _S_size<_Tp>>(
4423		_mm_unpacklo_epi8(__k, __k));
4424	    else
4425	      return __concat(_mm_unpacklo_epi8(__k, __k),
4426			      _mm_unpackhi_epi8(__k, __k));
4427	  }
4428	else
4429	  return _Base::template _S_load<_Tp>(__mem);
4430      }
4431
4432    // }}}
4433    // _S_from_bitmask{{{
4434    template <size_t _Np, typename _Tp>
4435      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
4436      _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
4437      {
4438	static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4439	if constexpr (__is_avx512_abi<_Abi>())
4440	  return __bits._M_to_bits();
4441	else
4442	  return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits);
4443      }
4444
4445    // }}}
4446    // _S_masked_load {{{2
4447    template <typename _Tp, size_t _Np>
4448      static inline _SimdWrapper<_Tp, _Np>
4449      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
4450		     _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
4451      {
4452	if constexpr (__is_avx512_abi<_Abi>())
4453	  {
4454	    if constexpr (__have_avx512bw_vl)
4455	      {
4456		if constexpr (_Np <= 16)
4457		  {
4458		    const auto __a
4459		      = _mm_mask_loadu_epi8(__m128i(), __mask, __mem);
4460		    return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a);
4461		  }
4462		else if constexpr (_Np <= 32)
4463		  {
4464		    const auto __a
4465		      = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem);
4466		    return (__merge & ~__mask)
4467			   | _mm256_test_epi8_mask(__a, __a);
4468		  }
4469		else if constexpr (_Np <= 64)
4470		  {
4471		    const auto __a
4472		      = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem);
4473		    return (__merge & ~__mask)
4474			   | _mm512_test_epi8_mask(__a, __a);
4475		  }
4476		else
4477		  __assert_unreachable<_Tp>();
4478	      }
4479	    else
4480	      {
4481		_BitOps::_S_bit_iteration(__mask, [&](auto __i) {
4482		  __merge._M_set(__i, __mem[__i]);
4483		});
4484		return __merge;
4485	      }
4486	  }
4487	else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1)
4488	  {
4489	    const auto __k = _S_to_bits(__mask)._M_to_bits();
4490	    __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(),
4491					   _mm256_mask_loadu_epi8(__m256i(),
4492								  __k, __mem));
4493	  }
4494	else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1)
4495	  {
4496	    const auto __k = _S_to_bits(__mask)._M_to_bits();
4497	    __merge
4498	      = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k,
4499				  __m128i(),
4500				  _mm_mask_loadu_epi8(__m128i(), __k, __mem));
4501	  }
4502	else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2)
4503	  {
4504	    const auto __k = _S_to_bits(__mask)._M_to_bits();
4505	    __merge = _mm256_mask_sub_epi16(
4506	      __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4507	      _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4508	  }
4509	else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2)
4510	  {
4511	    const auto __k = _S_to_bits(__mask)._M_to_bits();
4512	    __merge = _mm_mask_sub_epi16(
4513	      __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4514	      _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4515	  }
4516	else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4)
4517	  {
4518	    const auto __k = _S_to_bits(__mask)._M_to_bits();
4519	    __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32(
4520	      __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4521	      _mm256_cvtepi8_epi32(
4522		_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4523	  }
4524	else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4)
4525	  {
4526	    const auto __k = _S_to_bits(__mask)._M_to_bits();
4527	    __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32(
4528	      __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4529	      _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4530	  }
4531	else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8)
4532	  {
4533	    const auto __k = _S_to_bits(__mask)._M_to_bits();
4534	    __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64(
4535	      __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4536	      _mm256_cvtepi8_epi64(
4537		_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4538	  }
4539	else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8)
4540	  {
4541	    const auto __k = _S_to_bits(__mask)._M_to_bits();
4542	    __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64(
4543	      __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4544	      _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4545	  }
4546	else
4547	  return _Base::_S_masked_load(__merge, __mask, __mem);
4548	return __merge;
4549      }
4550
4551    // _S_store {{{2
4552    template <typename _Tp, size_t _Np>
4553      _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __v,
4554						   bool* __mem) noexcept
4555      {
4556	if constexpr (__is_avx512_abi<_Abi>())
4557	  {
4558	    if constexpr (__have_avx512bw_vl)
4559	      _CommonImplX86::_S_store<_Np>(
4560		__vector_bitcast<char>([](auto __data) {
4561		  if constexpr (_Np <= 16)
4562		    return _mm_maskz_set1_epi8(__data, 1);
4563		  else if constexpr (_Np <= 32)
4564		    return _mm256_maskz_set1_epi8(__data, 1);
4565		  else
4566		    return _mm512_maskz_set1_epi8(__data, 1);
4567		}(__v._M_data)),
4568		__mem);
4569	    else if constexpr (_Np <= 8)
4570	      _CommonImplX86::_S_store<_Np>(
4571		__vector_bitcast<char>(
4572#if defined __x86_64__
4573		  __make_wrapper<_ULLong>(
4574		    _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull)
4575#else
4576		  __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U),
4577					_pdep_u32(__v._M_data >> 4,
4578						  0x01010101U))
4579#endif
4580		    ),
4581		__mem);
4582	    else if constexpr (_Np <= 16)
4583	      _mm512_mask_cvtepi32_storeu_epi8(
4584		__mem, 0xffffu >> (16 - _Np),
4585		_mm512_maskz_set1_epi32(__v._M_data, 1));
4586	    else
4587	      __assert_unreachable<_Tp>();
4588	  }
4589	else if constexpr (__is_sse_abi<_Abi>()) //{{{
4590	  {
4591	    if constexpr (_Np == 2 && sizeof(_Tp) == 8)
4592	      {
4593		const auto __k = __vector_bitcast<int>(__v);
4594		__mem[0] = -__k[1];
4595		__mem[1] = -__k[3];
4596	      }
4597	    else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
4598	      {
4599		if constexpr (__have_sse2)
4600		  {
4601		    const unsigned __bool4
4602		      = __vector_bitcast<_UInt>(_mm_packs_epi16(
4603			  _mm_packs_epi32(__intrin_bitcast<__m128i>(
4604					    __to_intrin(__v)),
4605					  __m128i()),
4606			  __m128i()))[0]
4607			& 0x01010101u;
4608		    __builtin_memcpy(__mem, &__bool4, _Np);
4609		  }
4610		else if constexpr (__have_mmx)
4611		  {
4612		    const __m64 __k = _mm_cvtps_pi8(
4613		      __and(__to_intrin(__v), _mm_set1_ps(1.f)));
4614		    __builtin_memcpy(__mem, &__k, _Np);
4615		    _mm_empty();
4616		  }
4617		else
4618		  return _Base::_S_store(__v, __mem);
4619	      }
4620	    else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
4621	      {
4622		_CommonImplX86::_S_store<_Np>(
4623		  __vector_bitcast<char>(_mm_packs_epi16(
4624		    __to_intrin(__vector_bitcast<_UShort>(__v) >> 15),
4625		    __m128i())),
4626		  __mem);
4627	      }
4628	    else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
4629	      _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem);
4630	    else
4631	      __assert_unreachable<_Tp>();
4632	  }                                      // }}}
4633	else if constexpr (__is_avx_abi<_Abi>()) // {{{
4634	  {
4635	    if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
4636	      {
4637		auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4638		int __bool4;
4639		if constexpr (__have_avx2)
4640		  __bool4 = _mm256_movemask_epi8(__k);
4641		else
4642		  __bool4 = (_mm_movemask_epi8(__lo128(__k))
4643			     | (_mm_movemask_epi8(__hi128(__k)) << 16));
4644		__bool4 &= 0x01010101;
4645		__builtin_memcpy(__mem, &__bool4, _Np);
4646	      }
4647	    else if constexpr (_Np <= 8 && sizeof(_Tp) == 4)
4648	      {
4649		const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4650		const auto __k2
4651		  = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)),
4652				   15);
4653		const auto __k3
4654		  = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i()));
4655		_CommonImplX86::_S_store<_Np>(__k3, __mem);
4656	      }
4657	    else if constexpr (_Np <= 16 && sizeof(_Tp) == 2)
4658	      {
4659		if constexpr (__have_avx2)
4660		  {
4661		    const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15);
4662		    const auto __bools = __vector_bitcast<char>(
4663		      _mm_packs_epi16(__lo128(__x), __hi128(__x)));
4664		    _CommonImplX86::_S_store<_Np>(__bools, __mem);
4665		  }
4666		else
4667		  {
4668		    const auto __bools
4669		      = 1
4670			& __vector_bitcast<_UChar>(
4671			  _mm_packs_epi16(__lo128(__to_intrin(__v)),
4672					  __hi128(__to_intrin(__v))));
4673		    _CommonImplX86::_S_store<_Np>(__bools, __mem);
4674		  }
4675	      }
4676	    else if constexpr (_Np <= 32 && sizeof(_Tp) == 1)
4677	      _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem);
4678	    else
4679	      __assert_unreachable<_Tp>();
4680	  } // }}}
4681	else
4682	  __assert_unreachable<_Tp>();
4683      }
4684
4685    // _S_masked_store {{{2
4686    template <typename _Tp, size_t _Np>
4687      static inline void
4688      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
4689		      const _SimdWrapper<_Tp, _Np> __k) noexcept
4690      {
4691	if constexpr (__is_avx512_abi<_Abi>())
4692	  {
4693	    static_assert(is_same_v<_Tp, bool>);
4694	    if constexpr (_Np <= 16 && __have_avx512bw_vl)
4695	      _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1));
4696	    else if constexpr (_Np <= 16)
4697	      _mm512_mask_cvtepi32_storeu_epi8(__mem, __k,
4698					       _mm512_maskz_set1_epi32(__v, 1));
4699	    else if constexpr (_Np <= 32 && __have_avx512bw_vl)
4700	      _mm256_mask_storeu_epi8(__mem, __k,
4701				      _mm256_maskz_set1_epi8(__v, 1));
4702	    else if constexpr (_Np <= 32 && __have_avx512bw)
4703	      _mm256_mask_storeu_epi8(__mem, __k,
4704				      __lo256(_mm512_maskz_set1_epi8(__v, 1)));
4705	    else if constexpr (_Np <= 64 && __have_avx512bw)
4706	      _mm512_mask_storeu_epi8(__mem, __k,
4707				      _mm512_maskz_set1_epi8(__v, 1));
4708	    else
4709	      __assert_unreachable<_Tp>();
4710	  }
4711	else
4712	  _Base::_S_masked_store(__v, __mem, __k);
4713      }
4714
4715    // logical and bitwise operators {{{2
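    // For bool (AVX-512 k-register) masks use the k* intrinsics, selecting
    // the mask width from _Np; vector masks fall back to _Base.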
4716    template <typename _Tp, size_t _Np>
4717      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4718      _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x,
4719		     const _SimdWrapper<_Tp, _Np>& __y)
4720      {
4721	if constexpr (is_same_v<_Tp, bool>)
4722	  {
4723	    if constexpr (__have_avx512dq && _Np <= 8)
4724	      return _kand_mask8(__x._M_data, __y._M_data);
4725	    else if constexpr (_Np <= 16)
4726	      return _kand_mask16(__x._M_data, __y._M_data);
4727	    else if constexpr (__have_avx512bw && _Np <= 32)
4728	      return _kand_mask32(__x._M_data, __y._M_data);
4729	    else if constexpr (__have_avx512bw && _Np <= 64)
4730	      return _kand_mask64(__x._M_data, __y._M_data);
4731	    else
4732	      __assert_unreachable<_Tp>();
4733	  }
4734	else
4735	  return _Base::_S_logical_and(__x, __y);
4736      }
4737
4738    template <typename _Tp, size_t _Np>
4739      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4740      _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x,
4741		    const _SimdWrapper<_Tp, _Np>& __y)
4742      {
4743	if constexpr (is_same_v<_Tp, bool>)
4744	  {
4745	    if constexpr (__have_avx512dq && _Np <= 8)
4746	      return _kor_mask8(__x._M_data, __y._M_data);
4747	    else if constexpr (_Np <= 16)
4748	      return _kor_mask16(__x._M_data, __y._M_data);
4749	    else if constexpr (__have_avx512bw && _Np <= 32)
4750	      return _kor_mask32(__x._M_data, __y._M_data);
4751	    else if constexpr (__have_avx512bw && _Np <= 64)
4752	      return _kor_mask64(__x._M_data, __y._M_data);
4753	    else
4754	      __assert_unreachable<_Tp>();
4755	  }
4756	else
4757	  return _Base::_S_logical_or(__x, __y);
4758      }
4759
4760    template <typename _Tp, size_t _Np>
4761      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4762      _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
4763      {
4764	if constexpr (is_same_v<_Tp, bool>)
4765	  {
4766	    if constexpr (__have_avx512dq && _Np <= 8)
4767	      return _kandn_mask8(__x._M_data,
4768				  _Abi::template __implicit_mask_n<_Np>());
4769	    else if constexpr (_Np <= 16)
4770	      return _kandn_mask16(__x._M_data,
4771				   _Abi::template __implicit_mask_n<_Np>());
4772	    else if constexpr (__have_avx512bw && _Np <= 32)
4773	      return _kandn_mask32(__x._M_data,
4774				   _Abi::template __implicit_mask_n<_Np>());
4775	    else if constexpr (__have_avx512bw && _Np <= 64)
4776	      return _kandn_mask64(__x._M_data,
4777				   _Abi::template __implicit_mask_n<_Np>());
4778	    else
4779	      __assert_unreachable<_Tp>();
4780	  }
4781	else
4782	  return _Base::_S_bit_not(__x);
4783      }
4784
4785    template <typename _Tp, size_t _Np>
4786      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4787      _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x,
4788		 const _SimdWrapper<_Tp, _Np>& __y)
4789      {
4790	if constexpr (is_same_v<_Tp, bool>)
4791	  {
4792	    if constexpr (__have_avx512dq && _Np <= 8)
4793	      return _kand_mask8(__x._M_data, __y._M_data);
4794	    else if constexpr (_Np <= 16)
4795	      return _kand_mask16(__x._M_data, __y._M_data);
4796	    else if constexpr (__have_avx512bw && _Np <= 32)
4797	      return _kand_mask32(__x._M_data, __y._M_data);
4798	    else if constexpr (__have_avx512bw && _Np <= 64)
4799	      return _kand_mask64(__x._M_data, __y._M_data);
4800	    else
4801	      __assert_unreachable<_Tp>();
4802	  }
4803	else
4804	  return _Base::_S_bit_and(__x, __y);
4805      }
4806
4807    template <typename _Tp, size_t _Np>
4808      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4809      _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x,
4810		const _SimdWrapper<_Tp, _Np>& __y)
4811      {
4812	if constexpr (is_same_v<_Tp, bool>)
4813	  {
4814	    if constexpr (__have_avx512dq && _Np <= 8)
4815	      return _kor_mask8(__x._M_data, __y._M_data);
4816	    else if constexpr (_Np <= 16)
4817	      return _kor_mask16(__x._M_data, __y._M_data);
4818	    else if constexpr (__have_avx512bw && _Np <= 32)
4819	      return _kor_mask32(__x._M_data, __y._M_data);
4820	    else if constexpr (__have_avx512bw && _Np <= 64)
4821	      return _kor_mask64(__x._M_data, __y._M_data);
4822	    else
4823	      __assert_unreachable<_Tp>();
4824	  }
4825	else
4826	  return _Base::_S_bit_or(__x, __y);
4827      }
4828
4829    template <typename _Tp, size_t _Np>
4830      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4831      _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x,
4832		 const _SimdWrapper<_Tp, _Np>& __y)
4833      {
4834	if constexpr (is_same_v<_Tp, bool>)
4835	  {
4836	    if constexpr (__have_avx512dq && _Np <= 8)
4837	      return _kxor_mask8(__x._M_data, __y._M_data);
4838	    else if constexpr (_Np <= 16)
4839	      return _kxor_mask16(__x._M_data, __y._M_data);
4840	    else if constexpr (__have_avx512bw && _Np <= 32)
4841	      return _kxor_mask32(__x._M_data, __y._M_data);
4842	    else if constexpr (__have_avx512bw && _Np <= 64)
4843	      return _kxor_mask64(__x._M_data, __y._M_data);
4844	    else
4845	      __assert_unreachable<_Tp>();
4846	  }
4847	else
4848	  return _Base::_S_bit_xor(__x, __y);
4849      }
4850
4851    //}}}2
4852    // _S_masked_assign{{{
4853    template <size_t _Np>
4854      _GLIBCXX_SIMD_INTRINSIC static void
4855      _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4856		       _SimdWrapper<bool, _Np>& __lhs,
4857		       _SimdWrapper<bool, _Np> __rhs)
4858      {
4859	__lhs._M_data
4860	  = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
4861      }
4862
4863    template <size_t _Np>
4864      _GLIBCXX_SIMD_INTRINSIC static void
4865      _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4866		       _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
4867      {
4868	if (__rhs)
4869	  __lhs._M_data = __k._M_data | __lhs._M_data;
4870	else
4871	  __lhs._M_data = ~__k._M_data & __lhs._M_data;
4872      }
4873
4874    using _MaskImplBuiltin<_Abi>::_S_masked_assign;
4875
4876    //}}}
4877    // _S_all_of {{{
4878    template <typename _Tp>
4879      _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
4880      {
4881	if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
4882	  {
4883	    constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
4884	    using _TI = __intrinsic_type_t<_Tp, _Np>;
4885	    const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
4886	    if constexpr (__have_sse4_1)
4887	      {
4888		_GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
4889		  = _Abi::template _S_implicit_mask_intrin<_Tp>();
4890		return 0 != __testc(__a, __b);
4891	      }
4892	    else if constexpr (is_same_v<_Tp, float>)
4893	      return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
4894		     == (1 << _Np) - 1;
4895	    else if constexpr (is_same_v<_Tp, double>)
4896	      return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
4897		     == (1 << _Np) - 1;
4898	    else
4899	      return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
4900		     == (1 << (_Np * sizeof(_Tp))) - 1;
4901	  }
4902	else if constexpr (__is_avx512_abi<_Abi>())
4903	  {
4904	    constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
4905	    const auto __kk = __k._M_data._M_data;
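	    // kortestc(__kk, ~_Mask) sets CF iff (__kk | ~_Mask) is all-ones,
	    // i.e. __kk covers every bit of the implicit mask.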
4906	    if constexpr (sizeof(__kk) == 1)
4907	      {
4908		if constexpr (__have_avx512dq)
4909		  return _kortestc_mask8_u8(__kk, _Mask == 0xff
4910						    ? __kk
4911						    : __mmask8(~_Mask));
4912		else
4913		  return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
4914	      }
4915	    else if constexpr (sizeof(__kk) == 2)
4916	      return _kortestc_mask16_u8(__kk, _Mask == 0xffff
4917						 ? __kk
4918						 : __mmask16(~_Mask));
4919	    else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
4920	      return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
4921						 ? __kk
4922						 : __mmask32(~_Mask));
4923	    else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
4924	      return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
4925						 ? __kk
4926						 : __mmask64(~_Mask));
4927	    else
4928	      __assert_unreachable<_Tp>();
4929	  }
4930      }
4931
4932    // }}}
4933    // _S_any_of {{{
4934    template <typename _Tp>
4935      _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
4936      {
4937	if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
4938	  {
4939	    constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
4940	    using _TI = __intrinsic_type_t<_Tp, _Np>;
4941	    const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
4942	    if constexpr (__have_sse4_1)
4943	      {
		if constexpr (_Abi::template _S_is_partial<_Tp>
			      || sizeof(__k) < 16)
4946		  {
4947		    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
4948		      = _Abi::template _S_implicit_mask_intrin<_Tp>();
4949		    return 0 == __testz(__a, __b);
4950		  }
4951		else
4952		  return 0 == __testz(__a, __a);
4953	      }
4954	    else if constexpr (is_same_v<_Tp, float>)
4955	      return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
4956	    else if constexpr (is_same_v<_Tp, double>)
4957	      return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
4958	    else
4959	      return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
4960		     != 0;
4961	  }
4962	else if constexpr (__is_avx512_abi<_Abi>())
4963	  return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
4964		 != 0;
4965      }
4966
4967    // }}}
4968    // _S_none_of {{{
4969    template <typename _Tp>
4970      _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
4971      {
4972	if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
4973	  {
4974	    constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
4975	    using _TI = __intrinsic_type_t<_Tp, _Np>;
4976	    const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
4977	    if constexpr (__have_sse4_1)
4978	      {
		if constexpr (_Abi::template _S_is_partial<_Tp>
			      || sizeof(__k) < 16)
4981		  {
4982		    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
4983		      = _Abi::template _S_implicit_mask_intrin<_Tp>();
4984		    return 0 != __testz(__a, __b);
4985		  }
4986		else
4987		  return 0 != __testz(__a, __a);
4988	      }
4989	    else if constexpr (is_same_v<_Tp, float>)
4990	      return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
4991	    else if constexpr (is_same_v<_Tp, double>)
4992	      return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
4993	    else
4994	      return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
4995		     == 0;
4996	  }
4997	else if constexpr (__is_avx512_abi<_Abi>())
4998	  return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
4999		 == 0;
5000      }
5001
5002    // }}}
5003    // _S_some_of {{{
5004    template <typename _Tp>
5005      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
5006      {
5007	if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5008	  {
5009	    constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5010	    using _TI = __intrinsic_type_t<_Tp, _Np>;
5011	    const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5012	    if constexpr (__have_sse4_1)
5013	      {
5014		_GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5015		  = _Abi::template _S_implicit_mask_intrin<_Tp>();
5016		return 0 != __testnzc(__a, __b);
5017	      }
5018	    else if constexpr (is_same_v<_Tp, float>)
5019	      {
5020		constexpr int __allbits = (1 << _Np) - 1;
5021		const auto __tmp = _mm_movemask_ps(__a) & __allbits;
5022		return __tmp > 0 && __tmp < __allbits;
5023	      }
5024	    else if constexpr (is_same_v<_Tp, double>)
5025	      {
5026		constexpr int __allbits = (1 << _Np) - 1;
5027		const auto __tmp = _mm_movemask_pd(__a) & __allbits;
5028		return __tmp > 0 && __tmp < __allbits;
5029	      }
5030	    else
5031	      {
5032		constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
5033		const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
5034		return __tmp > 0 && __tmp < __allbits;
5035	      }
5036	  }
5037	else if constexpr (__is_avx512_abi<_Abi>())
5038	  return _S_any_of(__k) && !_S_all_of(__k);
5039	else
5040	  __assert_unreachable<_Tp>();
5041      }
5042
5043    // }}}
5044    // _S_popcount {{{
5045    template <typename _Tp>
5046      _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
5047      {
5048	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5049	const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
5050	if constexpr (__is_avx512_abi<_Abi>())
5051	  {
5052	    if constexpr (_Np > 32)
5053	      return __builtin_popcountll(__kk);
5054	    else
5055	      return __builtin_popcount(__kk);
5056	  }
5057	else
5058	  {
5059	    if constexpr (__have_popcnt)
5060	      {
5061		int __bits
5062		  = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
5063		const int __count = __builtin_popcount(__bits);
5064		return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
5065	      }
5066	    else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
5067	      {
		const int __mask = _mm_movemask_pd(__auto_bitcast(__kk));
		return __mask - (__mask >> 1);
5070	      }
5071	    else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
5072	      {
5073		auto __x = -(__lo128(__kk) + __hi128(__kk));
5074		return __x[0] + __x[1];
5075	      }
5076	    else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
5077	      {
5078		if constexpr (__have_sse2)
5079		  {
5080		    __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
5081		    __x = _mm_add_epi32(
5082		      __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5083		    __x = _mm_add_epi32(
5084		      __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
5085		    return -_mm_cvtsi128_si32(__x);
5086		  }
5087		else
5088		  return __builtin_popcount(
5089		    _mm_movemask_ps(__auto_bitcast(__kk)));
5090	      }
5091	    else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
5092	      {
5093		auto __x = __to_intrin(__kk);
5094		__x = _mm_add_epi16(__x,
5095				    _mm_shuffle_epi32(__x,
5096						      _MM_SHUFFLE(0, 1, 2, 3)));
5097		__x = _mm_add_epi16(
5098		  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5099		__x = _mm_add_epi16(
5100		  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
5101		return -short(_mm_extract_epi16(__x, 0));
5102	      }
5103	    else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
5104	      {
5105		auto __x = __to_intrin(__kk);
5106		__x = _mm_add_epi8(__x,
5107				   _mm_shuffle_epi32(__x,
5108						     _MM_SHUFFLE(0, 1, 2, 3)));
5109		__x = _mm_add_epi8(__x,
5110				   _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2,
5111									3)));
5112		__x = _mm_add_epi8(__x,
5113				   _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0,
5114									1)));
5115		auto __y = -__vector_bitcast<_UChar>(__x);
5116		if constexpr (__have_sse4_1)
5117		  return __y[0] + __y[1];
5118		else
5119		  {
5120		    unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
5121		    return (__z & 0xff) + (__z >> 8);
5122		  }
5123	      }
5124	    else if constexpr (sizeof(__kk) == 32)
5125	      {
5126		// The following works only as long as the implementations above
5127		// use a summation
5128		using _I = __int_for_sizeof_t<_Tp>;
5129		const auto __as_int = __vector_bitcast<_I>(__kk);
		return _MaskImplX86<simd_abi::__sse>::_S_popcount(
		  simd_mask<_I, simd_abi::__sse>(__private_init,
						 __lo128(__as_int)
						   + __hi128(__as_int)));
5134	      }
5135	    else
5136	      __assert_unreachable<_Tp>();
5137	  }
5138      }
5139
5140    // }}}
5141    // _S_find_first_set {{{
5142    template <typename _Tp>
5143      _GLIBCXX_SIMD_INTRINSIC static int
5144      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
5145      {
5146	if constexpr (__is_avx512_abi<_Abi>())
5147	  return std::__countr_zero(__k._M_data._M_data);
5148	else
5149	  return _Base::_S_find_first_set(__k);
5150      }
5151
5152    // }}}
5153    // _S_find_last_set {{{
5154    template <typename _Tp>
5155      _GLIBCXX_SIMD_INTRINSIC static int
5156      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
5157      {
5158	if constexpr (__is_avx512_abi<_Abi>())
5159	  return std::__bit_width(__k._M_data._M_data) - 1;
5160	else
5161	  return _Base::_S_find_last_set(__k);
5162      }
5163
5164    // }}}
5165  };
5166
5167// }}}
5168
5169_GLIBCXX_SIMD_END_NAMESPACE
5170#endif // __cplusplus >= 201703L
5171#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
5172
5173// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
5174