// Simd Abi specific implementations -*- C++ -*-

// Copyright (C) 2020-2022 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_

#if __cplusplus >= 201703L

#include <array>
#include <cmath>
#include <cstdlib>

_GLIBCXX_SIMD_BEGIN_NAMESPACE
// _S_allbits{{{
template <typename _V>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
    = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>());

// }}}
// _S_signmask, _S_absmask{{{
template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
    = __xor(_V() + 1, _V() - 1);

template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
    = __andnot(_S_signmask<_V>, _S_allbits<_V>);

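// Illustrative note: for any IEEE-754 element type, _V() + 1 and _V() - 1
// (i.e. +1 and -1 in every lane) differ only in the sign bit, so their XOR
// leaves exactly the sign bit set in each lane; _S_absmask is its
// complement, i.e. all bits except the sign bit.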
//}}}
// __vector_permute<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
          typename = __detail::__odr_helper>
  _Tp
  __vector_permute(_Tp __x)
  {
    static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
    return __make_vector<typename _TVT::value_type>(
      (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...);
  }

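// Illustrative example: with __vector_type_t<int, 4> __v = {1, 2, 3, 4},
// __vector_permute<3, 2, 1, 0>(__v) yields {4, 3, 2, 1} and
// __vector_permute<-1, 0, -1, 1>(__v) yields {0, 1, 0, 2}. The nested
// conditional maps -1 to index 0 before subscripting, so no out-of-bounds
// vector subscript is formed for elements that are zeroed anyway.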
// }}}
// __vector_shuffle<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
          typename = __detail::__odr_helper>
  _Tp
  __vector_shuffle(_Tp __x, _Tp __y)
  {
    return _Tp{(_Indices == -1 ? 0
                : _Indices < _TVT::_S_full_size
                  ? __x[_Indices]
                  : __y[_Indices - _TVT::_S_full_size])...};
  }

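// Illustrative example: with __vector_type_t<int, 4> __x = {1, 2, 3, 4} and
// __y = {5, 6, 7, 8}, __vector_shuffle<0, 4, 1, 5>(__x, __y) yields
// {1, 5, 2, 6}; indices below _S_full_size select from __x, larger indices
// select from __y.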
// }}}
// __make_wrapper{{{
template <typename _Tp, typename... _Args>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
  __make_wrapper(const _Args&... __args)
  { return __make_vector<_Tp>(__args...); }

// }}}
// __wrapper_bitcast{{{
template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M,
          size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
  __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
  {
    static_assert(_Np > 1);
    return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
  }

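// Illustrative example: __wrapper_bitcast<short>(__w) with a
// _SimdWrapper<int, 4> __w reinterprets the same 16 bytes as a
// _SimdWrapper<short, 8>; only the representation is reinterpreted, no
// per-element value conversion takes place.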
// }}}
// __shift_elements_right{{{
// if (__shift % 2ⁿ == 0) => the low n Bytes are correct
template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _Tp
  __shift_elements_right(_Tp __v)
  {
    [[maybe_unused]] const auto __iv = __to_intrin(__v);
    static_assert(__shift <= sizeof(_Tp));
    if constexpr (__shift == 0)
      return __v;
    else if constexpr (__shift == sizeof(_Tp))
      return _Tp();
#if _GLIBCXX_SIMD_X86INTRIN // {{{
    else if constexpr (__have_sse && __shift == 8
                       && _TVT::template _S_is<float, 4>)
      return _mm_movehl_ps(__iv, __iv);
    else if constexpr (__have_sse2 && __shift == 8
                       && _TVT::template _S_is<double, 2>)
      return _mm_unpackhi_pd(__iv, __iv);
    else if constexpr (__have_sse2 && sizeof(_Tp) == 16)
      return reinterpret_cast<typename _TVT::type>(
        _mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift));
    else if constexpr (__shift == 16 && sizeof(_Tp) == 32)
      {
        /*if constexpr (__have_avx && _TVT::template _S_is<double, 4>)
          return _mm256_permute2f128_pd(__iv, __iv, 0x81);
        else if constexpr (__have_avx && _TVT::template _S_is<float, 8>)
          return _mm256_permute2f128_ps(__iv, __iv, 0x81);
        else if constexpr (__have_avx)
          return reinterpret_cast<typename _TVT::type>(
            _mm256_permute2f128_si256(__iv, __iv, 0x81));
        else*/
        return __zero_extend(__hi128(__v));
      }
    else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16)
      {
        const auto __vll = __vector_bitcast<_LLong>(__v);
        return reinterpret_cast<typename _TVT::type>(
          _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81),
                             __vll, __shift));
      }
    else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16)
      {
        const auto __vll = __vector_bitcast<_LLong>(__v);
        return reinterpret_cast<typename _TVT::type>(
          __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift),
                   _mm_srli_si128(__hi128(__vll), __shift)));
      }
    else if constexpr (sizeof(_Tp) == 32 && __shift > 16)
      return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v)));
    else if constexpr (sizeof(_Tp) == 64 && __shift == 32)
      return __zero_extend(__hi256(__v));
    else if constexpr (__have_avx512f && sizeof(_Tp) == 64)
      {
        if constexpr (__shift >= 48)
          return __zero_extend(
            __shift_elements_right<__shift - 48>(__extract<3, 4>(__v)));
        else if constexpr (__shift >= 32)
          return __zero_extend(
            __shift_elements_right<__shift - 32>(__hi256(__v)));
        else if constexpr (__shift % 8 == 0)
          return reinterpret_cast<typename _TVT::type>(
            _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v),
                                __shift / 8));
        else if constexpr (__shift % 4 == 0)
          return reinterpret_cast<typename _TVT::type>(
            _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v),
                                __shift / 4));
        else if constexpr (__have_avx512bw && __shift < 16)
          {
            const auto __vll = __vector_bitcast<_LLong>(__v);
            return reinterpret_cast<typename _TVT::type>(
              _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9),
                                 __vll, __shift));
          }
        else if constexpr (__have_avx512bw && __shift < 32)
          {
            const auto __vll = __vector_bitcast<_LLong>(__v);
            return reinterpret_cast<typename _TVT::type>(
              _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee),
                                 _mm512_shuffle_i32x4(__vll, __vll, 0xf9),
                                 __shift - 16));
          }
        else
          __assert_unreachable<_Tp>();
      }
  /*
      } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64)
          return __auto_bitcast(__extract<__shift / 16, 4>(__v));
  */
#endif // _GLIBCXX_SIMD_X86INTRIN }}}
    else
      {
        constexpr int __chunksize = __shift % 8 == 0   ? 8
                                    : __shift % 4 == 0 ? 4
                                    : __shift % 2 == 0 ? 2
                                                       : 1;
        auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v);
        using _Up = decltype(__w);
        return __intrin_bitcast<_Tp>(
          __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>(
            [](auto... __chunks) { return _Up{__chunks...}; },
            [&](auto __i) { return __w[__shift / __chunksize + __i]; }));
      }
  }

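// Illustrative example: __shift_elements_right<4> applied to the 4-int
// vector {1, 2, 3, 4} yields {2, 3, 4, 0}: bytes move towards element 0 and
// the vacated high bytes are zero-filled (cf. _mm_srli_si128).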
// }}}
// __extract_part(_SimdWrapper<_Tp, _Np>) {{{
template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST
  _SimdWrapper<_Tp, _Np / _Total * _Combine>
  __extract_part(const _SimdWrapper<_Tp, _Np> __x)
  {
    if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0)
      return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x);
    else
      {
        constexpr size_t __values_per_part = _Np / _Total;
        constexpr size_t __values_to_skip = _Index * __values_per_part;
        constexpr size_t __return_size = __values_per_part * _Combine;
        using _R = __vector_type_t<_Tp, __return_size>;
        static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp)
                        <= sizeof(__x),
                      "out of bounds __extract_part");
        // the following assertion would ensure no "padding" to be read
        // static_assert(_Total >= _Index + _Combine, "_Total must be greater
        // than _Index");

        // static_assert(__return_size * _Total == _Np, "_Np must be divisible
        // by _Total");
        if (__x._M_is_constprop())
          return __generate_from_n_evaluations<__return_size, _R>(
            [&](auto __i) { return __x[__values_to_skip + __i]; });
        if constexpr (_Index == 0 && _Total == 1)
          return __x;
        else if constexpr (_Index == 0)
          return __intrin_bitcast<_R>(__as_vector(__x));
#if _GLIBCXX_SIMD_X86INTRIN // {{{
        else if constexpr (sizeof(__x) == 32
                           && __return_size * sizeof(_Tp) <= 16)
          {
            constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp);
            if constexpr (__bytes_to_skip == 16)
              return __vector_bitcast<_Tp, __return_size>(
                __hi128(__as_vector(__x)));
            else
              return __vector_bitcast<_Tp, __return_size>(
                _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)),
                                __lo128(__vector_bitcast<_LLong>(__x)),
                                __bytes_to_skip));
          }
#endif // _GLIBCXX_SIMD_X86INTRIN }}}
        else if constexpr (_Index > 0
                           && (__values_to_skip % __return_size != 0
                               || sizeof(_R) >= 8)
                           && (__values_to_skip + __return_size) * sizeof(_Tp)
                                <= 64
                           && sizeof(__x) >= 16)
          return __intrin_bitcast<_R>(
            __shift_elements_right<__values_to_skip * sizeof(_Tp)>(
              __as_vector(__x)));
        else
          {
            _R __r = {};
            __builtin_memcpy(&__r,
                             reinterpret_cast<const char*>(&__x)
                               + sizeof(_Tp) * __values_to_skip,
                             __return_size * sizeof(_Tp));
            return __r;
          }
      }
  }

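// Illustrative example: for a _SimdWrapper<int, 16> __x,
// __extract_part<1, 4, 1>(__x) returns elements 4..7 (part 1 of 4 parts),
// and __extract_part<1, 4, 2>(__x) combines two adjacent parts, returning
// elements 4..11.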
// }}}
// __extract_part(_SimdWrapper<bool, _Np>) {{{
template <int _Index, int _Total, int _Combine = 1, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<bool, _Np / _Total * _Combine>
  __extract_part(const _SimdWrapper<bool, _Np> __x)
  {
    static_assert(_Combine == 1, "_Combine != 1 not implemented");
    static_assert(__have_avx512f && _Np == _Np);
    static_assert(_Total >= 2 && _Index + _Combine <= _Total && _Index >= 0);
    return __x._M_data >> (_Index * _Np / _Total);
  }

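// Illustrative example: for a _SimdWrapper<bool, 8> __x holding the bits
// 0b1011'0100, __extract_part<1, 2>(__x) shifts by 4 and thus returns the
// upper half 0b1011 in the low bits. The tautological `_Np == _Np` above
// makes the condition dependent on the template parameter, so the assertion
// is only evaluated when the template is instantiated.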
// }}}

// __vector_convert {{{
// implementation requires an index sequence
template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d,
                   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, _From __h, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, _From __h, _From __i,
                   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
               static_cast<_Tp>(__i[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, _From __h, _From __i, _From __j,
                   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
               static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, _From __h, _From __i, _From __j,
                   _From __k, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
               static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
               static_cast<_Tp>(__k[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, _From __h, _From __i, _From __j,
                   _From __k, _From __l, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
               static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
               static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, _From __h, _From __i, _From __j,
                   _From __k, _From __l, _From __m, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
               static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
               static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
               static_cast<_Tp>(__m[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, _From __h, _From __i, _From __j,
                   _From __k, _From __l, _From __m, _From __n,
                   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
               static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
               static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
               static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, _From __h, _From __i, _From __j,
                   _From __k, _From __l, _From __m, _From __n, _From __o,
                   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
               static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
               static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
               static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
               static_cast<_Tp>(__o[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
                   _From __f, _From __g, _From __h, _From __i, _From __j,
                   _From __k, _From __l, _From __m, _From __n, _From __o,
                   _From __p, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
               static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
               static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
               static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
               static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
               static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
               static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
               static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...};
  }

// Defer actual conversion to the overload that takes an index sequence. Note
// that this function adds zeros or drops values off the end if you don't ensure
// matching width.
template <typename _To, typename... _From, size_t _FromSize>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_SimdWrapper<_From, _FromSize>... __xs)
  {
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
    using _From0 = __first_of_pack_t<_From...>;
    using _FW = _SimdWrapper<_From0, _FromSize>;
    if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop()))
      {
        if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1))
                      == 0) // power-of-two number of arguments
          return __convert_x86<_To>(__as_vector(__xs)...);
        else // append zeros and recurse until the above branch is taken
          return __vector_convert<_To>(__xs..., _FW{});
      }
    else
#endif
      return __vector_convert<_To>(
        __as_vector(__xs)...,
        make_index_sequence<(sizeof...(__xs) == 1 ? std::min(
                               _VectorTraits<_To>::_S_full_size, int(_FromSize))
                                                  : _FromSize)>());
  }

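// Illustrative example: with two _SimdWrapper<int, 4> arguments __a and __b,
// __vector_convert<__vector_type_t<float, 8>>(__a, __b) concatenates the
// converted elements into {float(__a[0]), ..., float(__a[3]),
// float(__b[0]), ..., float(__b[3])}.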
// }}}
// __convert function{{{
template <typename _To, typename _From, typename... _More>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __convert(_From __v0, _More... __vs)
  {
    static_assert((true && ... && is_same_v<_From, _More>));
    if constexpr (__is_vectorizable_v<_From>)
      {
        using _V = typename _VectorTraits<_To>::type;
        using _Tp = typename _VectorTraits<_To>::value_type;
        return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...};
      }
    else if constexpr (__is_vector_type_v<_From>)
      return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...);
    else // _SimdWrapper arguments
      {
        constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More));
        if constexpr (__is_vectorizable_v<_To>)
          return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...);
        else if constexpr (!__is_vector_type_v<_To>)
          return _To(__convert<typename _To::_BuiltinType>(__v0, __vs...));
        else
          {
            static_assert(
              sizeof...(_More) == 0
                || _VectorTraits<_To>::_S_full_size >= __input_size,
              "__convert(...) requires the input to fit into the output");
            return __vector_convert<_To>(__v0, __vs...);
          }
      }
  }

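// Illustrative example: __convert<__vector_type_t<double, 4>>(__w) with a
// _SimdWrapper<int, 4> __w converts element-wise via __vector_convert,
// whereas passing four scalar ints builds the result directly as
// {double(__v0), double(__v1), double(__v2), double(__v3)}.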
// }}}
// __convert_all{{{
// Converts __v into array<_To, N>, where N is _NParts if non-zero or
// otherwise deduced from _To such that N * #elements(_To) <= #elements(__v).
// Note: this function may return fewer than all converted elements.
template <typename _To,
          size_t _NParts = 0, // allows converting fewer or more than all
                              // elements (only the last _To may be partially
                              // filled)
          size_t _Offset = 0, // where to start, in # of elements (not bytes
                              // or parts)
          typename _From, typename _FromVT = _VectorTraits<_From>>
  _GLIBCXX_SIMD_INTRINSIC auto
  __convert_all(_From __v)
  {
    if constexpr (is_arithmetic_v<_To> && _NParts != 1)
      {
        static_assert(_Offset < _FromVT::_S_full_size);
        constexpr auto _Np
          = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts;
        return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
          [&](auto __i) { return static_cast<_To>(__v[__i + _Offset]); });
      }
    else
      {
        static_assert(__is_vector_type_v<_To>);
        using _ToVT = _VectorTraits<_To>;
        if constexpr (__is_vector_type_v<_From>)
          return __convert_all<_To, _NParts>(__as_wrapper(__v));
        else if constexpr (_NParts == 1)
          {
            static_assert(_Offset % _ToVT::_S_full_size == 0);
            return array<_To, 1>{__vector_convert<_To>(
              __extract_part<_Offset / _ToVT::_S_full_size,
                             __div_roundup(_FromVT::_S_partial_width,
                                           _ToVT::_S_full_size)>(__v))};
          }
#if _GLIBCXX_SIMD_X86INTRIN // {{{
        else if constexpr (!__have_sse4_1 && _Offset == 0
          && is_integral_v<typename _FromVT::value_type>
          && sizeof(typename _FromVT::value_type)
              < sizeof(typename _ToVT::value_type)
          && !(sizeof(typename _FromVT::value_type) == 4
              && is_same_v<typename _ToVT::value_type, double>))
          {
            using _ToT = typename _ToVT::value_type;
            using _FromT = typename _FromVT::value_type;
            constexpr size_t _Np
              = _NParts != 0
                  ? _NParts
                  : (_FromVT::_S_partial_width / _ToVT::_S_full_size);
            using _R = array<_To, _Np>;
            // __adjust modifies its input to have _Np (use _SizeConstant)
            // entries so that no unnecessary intermediate conversions are
            // requested and, more importantly, no intermediate conversions are
            // missing
            [[maybe_unused]] auto __adjust
              = [](auto __n,
                   auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> {
              return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
            };
            [[maybe_unused]] const auto __vi = __to_intrin(__v);
            auto&& __make_array = [](auto __x0, [[maybe_unused]] auto __x1) {
              if constexpr (_Np == 1)
                return _R{__intrin_bitcast<_To>(__x0)};
              else
                return _R{__intrin_bitcast<_To>(__x0),
                          __intrin_bitcast<_To>(__x1)};
            };

            if constexpr (_Np == 0)
              return _R{};
            else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2)
              {
                static_assert(is_integral_v<_FromT>);
                static_assert(is_integral_v<_ToT>);
                if constexpr (is_unsigned_v<_FromT>)
                  return __make_array(_mm_unpacklo_epi8(__vi, __m128i()),
                                      _mm_unpackhi_epi8(__vi, __m128i()));
                else
                  return __make_array(
                    _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8),
                    _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8));
              }
            else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4)
              {
                static_assert(is_integral_v<_FromT>);
                if constexpr (is_floating_point_v<_ToT>)
                  {
                    const auto __ints
                      = __convert_all<__vector_type16_t<int>, _Np>(
                        __adjust(_SizeConstant<_Np * 4>(), __v));
                    return __generate_from_n_evaluations<_Np, _R>(
                      [&](auto __i) {
                        return __vector_convert<_To>(__as_wrapper(__ints[__i]));
                      });
                  }
                else if constexpr (is_unsigned_v<_FromT>)
                  return __make_array(_mm_unpacklo_epi16(__vi, __m128i()),
                                      _mm_unpackhi_epi16(__vi, __m128i()));
                else
                  return __make_array(
                    _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16),
                    _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16));
              }
            else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8
                               && is_integral_v<_FromT> && is_integral_v<_ToT>)
              {
                if constexpr (is_unsigned_v<_FromT>)
                  return __make_array(_mm_unpacklo_epi32(__vi, __m128i()),
                                      _mm_unpackhi_epi32(__vi, __m128i()));
                else
                  return __make_array(
                    _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)),
                    _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31)));
              }
            else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4
                               && is_signed_v<_FromT>)
              {
                const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi),
                                         _mm_unpackhi_epi8(__vi, __vi)};
                const __vector_type_t<int, 4> __vvvv[4] = {
                  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[0], __vv[0])),
                  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[0], __vv[0])),
                  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
                  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
                if constexpr (sizeof(_ToT) == 4)
                  return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
                    return __vector_convert<_To>(
                      _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
                  });
                else if constexpr (is_integral_v<_ToT>)
                  return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
                    const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
                    const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
                    return __vector_bitcast<_ToT>(
                      __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
                                   : _mm_unpackhi_epi32(__sx32, __signbits));
                  });
                else
                  return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
                    const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
                    return __vector_convert<_To>(
                      __i % 2 == 0 ? __int4
                                   : _SimdWrapper<int, 4>(
                                     _mm_unpackhi_epi64(__to_intrin(__int4),
                                                        __to_intrin(__int4))));
                  });
              }
            else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4)
              {
                const auto __shorts = __convert_all<__vector_type16_t<
                  conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
                  __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v));
                return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
                  return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
                });
              }
            else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8
                               && is_signed_v<_FromT> && is_integral_v<_ToT>)
              {
                const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi),
                                         _mm_unpackhi_epi16(__vi, __vi)};
                const __vector_type16_t<int> __vvvv[4]
                  = {__vector_bitcast<int>(
                       _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16),
                                          _mm_srai_epi32(__vv[0], 31))),
                     __vector_bitcast<int>(
                       _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16),
                                          _mm_srai_epi32(__vv[0], 31))),
                     __vector_bitcast<int>(
                       _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16),
                                          _mm_srai_epi32(__vv[1], 31))),
                     __vector_bitcast<int>(
                       _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
                                          _mm_srai_epi32(__vv[1], 31)))};
                return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
                  return __vector_bitcast<_ToT>(__vvvv[__i]);
                });
              }
            else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8)
              {
                const auto __ints
                  = __convert_all<__vector_type16_t<conditional_t<
                    is_signed_v<_FromT> || is_floating_point_v<_ToT>, int,
                    unsigned int>>>(
                    __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v));
                return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
                  return __convert_all<_To>(__ints[__i / 2])[__i % 2];
                });
              }
            else
              __assert_unreachable<_To>();
          }
#endif // _GLIBCXX_SIMD_X86INTRIN }}}
        else if constexpr ((_FromVT::_S_partial_width - _Offset)
                           > _ToVT::_S_full_size)
          {
            /*
            static_assert(
              (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1))
                == 0,
              "__convert_all only supports power-of-2 number of elements. "
              "Otherwise the return type cannot be array<_To, N>.");
             */
            constexpr size_t _NTotal
              = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size;
            constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts;
            static_assert(
              _Np <= _NTotal
              || (_Np == _NTotal + 1
                  && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size
                       > 0));
            using _R = array<_To, _Np>;
            if constexpr (_Np == 1)
              return _R{__vector_convert<_To>(
                __extract_part<_Offset, _FromVT::_S_partial_width,
                               _ToVT::_S_full_size>(__v))};
            else
              return __generate_from_n_evaluations<_Np, _R>([&](
                auto __i) constexpr {
                auto __part
                  = __extract_part<__i * _ToVT::_S_full_size + _Offset,
                                   _FromVT::_S_partial_width,
                                   _ToVT::_S_full_size>(__v);
                return __vector_convert<_To>(__part);
              });
          }
        else if constexpr (_Offset == 0)
          return array<_To, 1>{__vector_convert<_To>(__v)};
        else
          return array<_To, 1>{__vector_convert<_To>(
            __extract_part<_Offset, _FromVT::_S_partial_width,
                           _FromVT::_S_partial_width - _Offset>(__v))};
      }
  }

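// Illustrative example: if __v is a vector of 4 floats and _To is a 16-byte
// double vector (2 elements), __convert_all<_To>(__v) returns an
// array<_To, 2> holding {double(__v[0]), double(__v[1])} and
// {double(__v[2]), double(__v[3])}.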
// }}}

// _GnuTraits {{{
template <typename _Tp, typename _Mp, typename _Abi, size_t _Np>
  struct _GnuTraits
  {
    using _IsValid = true_type;
    using _SimdImpl = typename _Abi::_SimdImpl;
    using _MaskImpl = typename _Abi::_MaskImpl;

    // simd and simd_mask member types {{{
    using _SimdMember = _SimdWrapper<_Tp, _Np>;
    using _MaskMember = _SimdWrapper<_Mp, _Np>;
    static constexpr size_t _S_simd_align = alignof(_SimdMember);
    static constexpr size_t _S_mask_align = alignof(_MaskMember);

    // }}}
    // size metadata {{{
    static constexpr size_t _S_full_size = _SimdMember::_S_full_size;
    static constexpr bool _S_is_partial = _SimdMember::_S_is_partial;

    // }}}
    // _SimdBase / base class for simd, providing extra conversions {{{
    struct _SimdBase2
    {
      _GLIBCXX_SIMD_ALWAYS_INLINE
      explicit operator __intrinsic_type_t<_Tp, _Np>() const
      {
        return __to_intrin(static_cast<const simd<_Tp, _Abi>*>(this)->_M_data);
      }
      _GLIBCXX_SIMD_ALWAYS_INLINE
      explicit operator __vector_type_t<_Tp, _Np>() const
      {
        return static_cast<const simd<_Tp, _Abi>*>(this)->_M_data.__builtin();
      }
    };

    struct _SimdBase1
    {
      _GLIBCXX_SIMD_ALWAYS_INLINE
      explicit operator __intrinsic_type_t<_Tp, _Np>() const
      { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
    };

    using _SimdBase = conditional_t<
      is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
      _SimdBase1, _SimdBase2>;

    // }}}
    // _MaskBase {{{
    struct _MaskBase2
    {
      _GLIBCXX_SIMD_ALWAYS_INLINE
      explicit operator __intrinsic_type_t<_Tp, _Np>() const
      {
        return static_cast<const simd_mask<_Tp, _Abi>*>(this)
          ->_M_data.__intrin();
      }
      _GLIBCXX_SIMD_ALWAYS_INLINE
      explicit operator __vector_type_t<_Tp, _Np>() const
      {
        return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data._M_data;
      }
    };

    struct _MaskBase1
    {
      _GLIBCXX_SIMD_ALWAYS_INLINE
      explicit operator __intrinsic_type_t<_Tp, _Np>() const
      { return __data(*static_cast<const simd_mask<_Tp, _Abi>*>(this)); }
    };

    using _MaskBase = conditional_t<
      is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
      _MaskBase1, _MaskBase2>;

    // }}}
    // _MaskCastType {{{
    // parameter type of one explicit simd_mask constructor
    class _MaskCastType
    {
      using _Up = __intrinsic_type_t<_Tp, _Np>;
      _Up _M_data;

    public:
      _GLIBCXX_SIMD_ALWAYS_INLINE
      _MaskCastType(_Up __x) : _M_data(__x) {}
      _GLIBCXX_SIMD_ALWAYS_INLINE
      operator _MaskMember() const { return _M_data; }
    };

    // }}}
    // _SimdCastType {{{
    // parameter type of one explicit simd constructor
    class _SimdCastType1
    {
      using _Ap = __intrinsic_type_t<_Tp, _Np>;
      _SimdMember _M_data;

    public:
      _GLIBCXX_SIMD_ALWAYS_INLINE
      _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
      _GLIBCXX_SIMD_ALWAYS_INLINE
      operator _SimdMember() const { return _M_data; }
    };

    class _SimdCastType2
    {
      using _Ap = __intrinsic_type_t<_Tp, _Np>;
      using _Bp = __vector_type_t<_Tp, _Np>;
      _SimdMember _M_data;

    public:
      _GLIBCXX_SIMD_ALWAYS_INLINE
      _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
      _GLIBCXX_SIMD_ALWAYS_INLINE
      _SimdCastType2(_Bp __b) : _M_data(__b) {}
      _GLIBCXX_SIMD_ALWAYS_INLINE
      operator _SimdMember() const { return _M_data; }
    };

    using _SimdCastType = conditional_t<
      is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
      _SimdCastType1, _SimdCastType2>;
    //}}}
  };

// }}}
struct _CommonImplX86;
struct _CommonImplNeon;
struct _CommonImplBuiltin;
template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplBuiltin;
template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplBuiltin;
template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplX86;
template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplX86;
template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplNeon;
template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplNeon;
template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplPpc;
template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplPpc;

// simd_abi::_VecBuiltin {{{
template <int _UsedBytes>
  struct simd_abi::_VecBuiltin
  {
    template <typename _Tp>
      static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);

    // validity traits {{{
    struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};

    template <typename _Tp>
      struct _IsValidSizeFor
        : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
                           && _UsedBytes % sizeof(_Tp) == 0
                           && _UsedBytes <= __vectorized_sizeof<_Tp>()
                           && (!__have_avx512f || _UsedBytes <= 32))> {};

    template <typename _Tp>
      struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>,
                                    _IsValidSizeFor<_Tp>> {};

    template <typename _Tp>
      static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;

    // }}}
    // _SimdImpl/_MaskImpl {{{
#if _GLIBCXX_SIMD_X86INTRIN
    using _CommonImpl = _CommonImplX86;
    using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>;
    using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>;
#elif _GLIBCXX_SIMD_HAVE_NEON
    using _CommonImpl = _CommonImplNeon;
    using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>;
    using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>;
#else
    using _CommonImpl = _CommonImplBuiltin;
#ifdef __ALTIVEC__
    using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>;
    using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>;
#else
    using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>;
    using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>;
#endif
#endif

    // }}}
    // __traits {{{
    template <typename _Tp>
      using _MaskValueType = __int_for_sizeof_t<_Tp>;

    template <typename _Tp>
      using __traits
        = conditional_t<_S_is_valid_v<_Tp>,
                        _GnuTraits<_Tp, _MaskValueType<_Tp>,
                                   _VecBuiltin<_UsedBytes>, _S_size<_Tp>>,
                        _InvalidTraits>;

    //}}}
    // size metadata {{{
    template <typename _Tp>
      static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;

    template <typename _Tp>
      static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;

    // }}}
    // implicit masks {{{
    template <typename _Tp>
      using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>;

    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_implicit_mask()
      {
        using _UV = typename _MaskMember<_Tp>::_BuiltinType;
        if constexpr (!_MaskMember<_Tp>::_S_is_partial)
          return ~_UV();
        else
          {
            constexpr auto __size = _S_size<_Tp>;
            _GLIBCXX_SIMD_USE_CONSTEXPR auto __r = __generate_vector<_UV>(
              [](auto __i) constexpr { return __i < __size ? -1 : 0; });
            return __r;
          }
      }

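    // Illustrative: for a full _VecBuiltin<16> float simd the mask is all
    // ones; in a hypothetical partial case of 3 floats in a 4-element
    // vector, the int-based mask would be {-1, -1, -1, 0}.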
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp,
                                                                  _S_size<_Tp>>
      _S_implicit_mask_intrin()
      {
        return __to_intrin(
          __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data));
      }

    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _TW _S_masked(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (!_MaskMember<_Tp>::_S_is_partial)
          return __x;
        else
          return __and(__as_vector(__x),
                       __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()));
      }

    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr auto
      __make_padding_nonzero(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (!_S_is_partial<_Tp>)
          return __x;
        else
          {
            _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask
              = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>());
            if constexpr (is_integral_v<_Tp>)
              return __or(__x, ~__implicit_mask);
            else
              {
                _GLIBCXX_SIMD_USE_CONSTEXPR auto __one
                  = __andnot(__implicit_mask,
                             __vector_broadcast<_S_full_size<_Tp>>(_Tp(1)));
                // it's not enough to return `x | 1_in_padding` because the
                // padding in x might be inf or nan (independent of
                // __FINITE_MATH_ONLY__, because it's about padding bits)
                return __or(__and(__x, __implicit_mask), __one);
              }
          }
      }
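    // Illustrative: for a partial floating-point simd this forces the
    // padding lanes to exactly 1 instead of merely setting a bit, so that
    // whole-register operations (e.g. division) do not act on inf/nan
    // garbage in the padding.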
    // }}}
  };

// }}}
// simd_abi::_VecBltnBtmsk {{{
template <int _UsedBytes>
  struct simd_abi::_VecBltnBtmsk
  {
    template <typename _Tp>
      static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);

    // validity traits {{{
    struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};

    template <typename _Tp>
      struct _IsValidSizeFor
        : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
                           && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64
                           && (_UsedBytes > 32 || __have_avx512vl))> {};

    // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4, AVX512BW is
    // also required.
    template <typename _Tp>
      struct _IsValid
        : conjunction<
            _IsValidAbiTag, __bool_constant<__have_avx512f>,
            __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>,
            __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>,
            _IsValidSizeFor<_Tp>> {};

    template <typename _Tp>
      static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;

    // }}}
    // simd/_MaskImpl {{{
  #if _GLIBCXX_SIMD_X86INTRIN
    using _CommonImpl = _CommonImplX86;
    using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>;
    using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>;
  #else
    template <int>
      struct _MissingImpl;

    using _CommonImpl = _MissingImpl<_UsedBytes>;
    using _SimdImpl = _MissingImpl<_UsedBytes>;
    using _MaskImpl = _MissingImpl<_UsedBytes>;
  #endif

    // }}}
    // __traits {{{
    template <typename _Tp>
      using _MaskMember = _SimdWrapper<bool, _S_size<_Tp>>;

    template <typename _Tp>
      using __traits = conditional_t<
        _S_is_valid_v<_Tp>,
        _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>,
        _InvalidTraits>;

    //}}}
    // size metadata {{{
    template <typename _Tp>
      static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
    template <typename _Tp>
      static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;

    // }}}
    // implicit mask {{{
  private:
    template <typename _Tp>
      using _ImplicitMask = _SimdWrapper<bool, _S_size<_Tp>>;

  public:
    template <size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np>
      __implicit_mask_n()
      {
        using _Tp = __bool_storage_member_type_t<_Np>;
        return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp();
      }

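    // Illustrative example: __implicit_mask_n<5>() uses an 8-bit mask type
    // and yields 0b0001'1111; when _Np equals the bit width of the mask type
    // (e.g. 64), ~_Tp() is returned instead, avoiding an out-of-range shift.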
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp>
      _S_implicit_mask()
      { return __implicit_mask_n<_S_size<_Tp>>(); }

    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<
        _S_size<_Tp>>
      _S_implicit_mask_intrin()
      { return __implicit_mask_n<_S_size<_Tp>>(); }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_masked(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (is_same_v<_Tp, bool>)
          if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0)
            return _MaskImpl::_S_bit_and(
              __x, _SimdWrapper<_Tp, _Np>(
                     __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1)));
          else
            return __x;
        else
          return _S_masked(__x._M_data);
      }

    template <typename _TV>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
      _S_masked(_TV __x)
      {
        using _Tp = typename _VectorTraits<_TV>::value_type;
        static_assert(
          !__is_bitmask_v<_TV>,
          "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't "
          "know the number of elements. Use _SimdWrapper<bool, N> instead.");
        if constexpr (_S_is_partial<_Tp>)
          {
            constexpr size_t _Np = _S_size<_Tp>;
            return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
              _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(),
              _SimdWrapper<_Tp, _Np>(__x));
          }
        else
          return __x;
      }

    template <typename _TV, typename _TVT = _VectorTraits<_TV>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr auto
      __make_padding_nonzero(_TV __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (!_S_is_partial<_Tp>)
          return __x;
        else
          {
            constexpr size_t _Np = _S_size<_Tp>;
            if constexpr (is_integral_v<typename _TVT::value_type>)
              return __x
                     | __generate_vector<_Tp, _S_full_size<_Tp>>(
                       [](auto __i) -> _Tp {
                         if (__i < _Np)
                           return 0;
                         else
                           return 1;
                       });
            else
              return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
                       _S_implicit_mask<_Tp>(),
                       _SimdWrapper<_Tp, _Np>(
                         __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))),
                       _SimdWrapper<_Tp, _Np>(__x))
                ._M_data;
          }
      }

    // }}}
  };

//}}}
// _CommonImplBuiltin {{{
struct _CommonImplBuiltin
{
  // _S_converts_via_decomposition{{{
  // This lists all cases where a __vector_convert needs to fall back to
  // conversion of individual scalars (i.e. decompose the input vector into
  // scalars, convert, compose output vector). In those cases, _S_masked_load &
  // _S_masked_store prefer to use the _S_bit_iteration implementation.
  template <typename _From, typename _To, size_t _ToSize>
    static inline constexpr bool __converts_via_decomposition_v
      = sizeof(_From) != sizeof(_To);

  // }}}
  // _S_load{{{
  template <typename _Tp, size_t _Np, size_t _Bytes = _Np * sizeof(_Tp)>
    _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np>
    _S_load(const void* __p)
    {
      static_assert(_Np > 1);
      static_assert(_Bytes % sizeof(_Tp) == 0);
      using _Rp = __vector_type_t<_Tp, _Np>;
      if constexpr (sizeof(_Rp) == _Bytes)
        {
          _Rp __r;
          __builtin_memcpy(&__r, __p, _Bytes);
          return __r;
        }
      else
        {
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424
          using _Up = conditional_t<
            is_integral_v<_Tp>,
            conditional_t<_Bytes % 4 == 0,
                          conditional_t<_Bytes % 8 == 0, long long, int>,
                          conditional_t<_Bytes % 2 == 0, short, signed char>>,
            conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp,
                          double>>;
          using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>;
          if constexpr (sizeof(_V) != sizeof(_Rp))
            { // on i386 with 4 < _Bytes <= 8
              _Rp __r{};
              __builtin_memcpy(&__r, __p, _Bytes);
              return __r;
            }
          else
#else // _GLIBCXX_SIMD_WORKAROUND_PR90424
          using _V = _Rp;
#endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
            {
              _V __r{};
              static_assert(_Bytes <= sizeof(_V));
              __builtin_memcpy(&__r, __p, _Bytes);
              return reinterpret_cast<_Rp>(__r);
            }
        }
    }

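  // Illustrative example: _S_load<int, 4, 12>(__p) memcpys only 12 bytes
  // into a zero-initialized 16-byte vector, so the three source ints are
  // followed by a zero lane and no memory past the source is read. Going
  // through __builtin_memcpy avoids alignment and strict-aliasing
  // assumptions about __p.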
  // }}}
  // _S_store {{{
  template <size_t _ReqBytes = 0, typename _TV>
    _GLIBCXX_SIMD_INTRINSIC static void _S_store(_TV __x, void* __addr)
    {
      constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes;
      static_assert(sizeof(__x) >= _Bytes);

      if constexpr (__is_vector_type_v<_TV>)
        {
          using _Tp = typename _VectorTraits<_TV>::value_type;
          constexpr size_t _Np = _Bytes / sizeof(_Tp);
          static_assert(_Np * sizeof(_Tp) == _Bytes);

#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424
          using _Up = conditional_t<
            (is_integral_v<_Tp> || _Bytes < 4),
            conditional_t<(sizeof(__x) > sizeof(long long)), long long, _Tp>,
            float>;
          const auto __v = __vector_bitcast<_Up>(__x);
#else // _GLIBCXX_SIMD_WORKAROUND_PR90424
          const __vector_type_t<_Tp, _Np> __v = __x;
#endif // _GLIBCXX_SIMD_WORKAROUND_PR90424

          if constexpr ((_Bytes & (_Bytes - 1)) != 0)
            {
              constexpr size_t _MoreBytes = std::__bit_ceil(_Bytes);
              alignas(decltype(__v)) char __tmp[_MoreBytes];
              __builtin_memcpy(__tmp, &__v, _MoreBytes);
              __builtin_memcpy(__addr, __tmp, _Bytes);
            }
          else
            __builtin_memcpy(__addr, &__v, _Bytes);
        }
      else
        __builtin_memcpy(__addr, &__x, _Bytes);
    }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __x,
                                                 void* __addr)
    { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); }

  // }}}
  // _S_store_bool_array(_BitMask) {{{
  template <size_t _Np, bool _Sanitized>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
    {
      if constexpr (_Np == 1)
        __mem[0] = __x[0];
      else if constexpr (_Np == 2)
        {
          short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101;
          _S_store<_Np>(__bool2, __mem);
        }
      else if constexpr (_Np == 3)
        {
          int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101;
          _S_store<_Np>(__bool3, __mem);
        }
      else
        {
          __execute_n_times<__div_roundup(_Np, 4)>([&](auto __i) {
            constexpr int __offset = __i * 4;
            constexpr int __remaining = _Np - __offset;
            if constexpr (__remaining > 4 && __remaining <= 7)
              {
                const _ULLong __bool7
                  = (__x.template _M_extract<__offset>()._M_to_bits()
                     * 0x40810204081ULL)
                    & 0x0101010101010101ULL;
                _S_store<__remaining>(__bool7, __mem + __offset);
              }
            else if constexpr (__remaining >= 4)
              {
                int __bits = __x.template _M_extract<__offset>()._M_to_bits();
                if constexpr (__remaining > 7)
                  __bits &= 0xf;
                const int __bool4 = (__bits * 0x204081) & 0x01010101;
                _S_store<4>(__bool4, __mem + __offset);
              }
          });
        }
    }

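  // Illustrative: the multiply-and-mask trick above spreads mask bits into
  // bytes. For four bits b3..b0, multiplying by 0x204081
  // (2^21 + 2^14 + 2^7 + 2^0) places a copy of bit i at bit position 8*i,
  // and & 0x01010101 keeps exactly those positions; e.g. the bits 0b1010
  // become the bytes {0, 1, 0, 1} in memory order. The 2-, 3-, and 7-bit
  // constants 0x81, 0x4081, and 0x40810204081 work the same way.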
  // }}}
  // _S_blend{{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr auto
    _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
             _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
    { return __k._M_data ? __at1._M_data : __at0._M_data; }

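  // Illustrative: this relies on GCC's element-wise vector ?: operator;
  // lanes where the integer mask element is non-zero (-1 for true) select
  // from __at1, zero lanes select from __at0.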
1382   // }}}
1383 };
1384 
1385 // }}}
1386 // _SimdImplBuiltin {{{1
1387 template <typename _Abi, typename>
1388   struct _SimdImplBuiltin
1389   {
1390     // member types {{{2
1391     template <typename _Tp>
1392       static constexpr size_t _S_max_store_size = 16;
1393 
1394     using abi_type = _Abi;
1395 
1396     template <typename _Tp>
1397       using _TypeTag = _Tp*;
1398 
1399     template <typename _Tp>
1400       using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
1401 
1402     template <typename _Tp>
1403       using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
1404 
1405     template <typename _Tp>
1406       static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
1407 
1408     template <typename _Tp>
1409       static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
1410 
1411     using _CommonImpl = typename _Abi::_CommonImpl;
1412     using _SuperImpl = typename _Abi::_SimdImpl;
1413     using _MaskImpl = typename _Abi::_MaskImpl;
1414 
1415     // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2
1416     template <typename _Tp, size_t _Np>
1417       _GLIBCXX_SIMD_INTRINSIC static simd<_Tp, _Abi>
1418       _M_make_simd(_SimdWrapper<_Tp, _Np> __x)
1419       { return {__private_init, __x}; }
1420 
1421     template <typename _Tp, size_t _Np>
1422       _GLIBCXX_SIMD_INTRINSIC static simd<_Tp, _Abi>
1423       _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x)
1424       { return {__private_init, __vector_bitcast<_Tp>(__x)}; }
1425 
1426     // _S_broadcast {{{2
1427     template <typename _Tp>
1428       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1429       _S_broadcast(_Tp __x) noexcept
1430       { return __vector_broadcast<_S_full_size<_Tp>>(__x); }
1431 
1432     // _S_generator {{{2
1433     template <typename _Fp, typename _Tp>
1434       inline static constexpr _SimdMember<_Tp> _S_generator(_Fp&& __gen,
1435 							    _TypeTag<_Tp>)
1436       {
1437 	return __generate_vector<_Tp, _S_full_size<_Tp>>([&](
1438 	  auto __i) constexpr {
1439 	  if constexpr (__i < _S_size<_Tp>)
1440 	    return __gen(__i);
1441 	  else
1442 	    return 0;
1443 	});
1444       }
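    // Illustration (comment only): with _S_size == 3 and _S_full_size == 4,
    // __gen is invoked for indices 0, 1, 2 and the padding lane is zeroed;
    // e.g. __gen == [](auto __i) { return __i + 1; } yields the vector
    // {1, 2, 3, 0}.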
1445 
1446     // _S_load {{{2
1447     template <typename _Tp, typename _Up>
1448       _GLIBCXX_SIMD_INTRINSIC static _SimdMember<_Tp>
1449       _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept
1450       {
1451 	constexpr size_t _Np = _S_size<_Tp>;
1452 	constexpr size_t __max_load_size
1453 	  = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw   ? 64
1454 	    : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32
1455 								      : 16;
1456 	constexpr size_t __bytes_to_load = sizeof(_Up) * _Np;
1457 	if constexpr (sizeof(_Up) > 8)
1458 	  return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>([&](
1459 	    auto __i) constexpr {
1460 	    return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
1461 	  });
1462 	else if constexpr (is_same_v<_Up, _Tp>)
1463 	  return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>,
1464 					       _Np * sizeof(_Tp)>(__mem);
1465 	else if constexpr (__bytes_to_load <= __max_load_size)
1466 	  return __convert<_SimdMember<_Tp>>(
1467 	    _CommonImpl::template _S_load<_Up, _Np>(__mem));
1468 	else if constexpr (__bytes_to_load % __max_load_size == 0)
1469 	  {
1470 	    constexpr size_t __n_loads = __bytes_to_load / __max_load_size;
1471 	    constexpr size_t __elements_per_load = _Np / __n_loads;
1472 	    return __call_with_n_evaluations<__n_loads>(
1473 	      [](auto... __uncvted) {
1474 		return __convert<_SimdMember<_Tp>>(__uncvted...);
1475 	      },
1476 	      [&](auto __i) {
1477 		return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1478 		  __mem + __i * __elements_per_load);
1479 	      });
1480 	  }
1481 	else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0
1482 			   && __max_load_size > 16)
1483 	  { // e.g. int[] -> <char, 12> with AVX2
1484 	    constexpr size_t __n_loads
1485 	      = __bytes_to_load / (__max_load_size / 2);
1486 	    constexpr size_t __elements_per_load = _Np / __n_loads;
1487 	    return __call_with_n_evaluations<__n_loads>(
1488 	      [](auto... __uncvted) {
1489 		return __convert<_SimdMember<_Tp>>(__uncvted...);
1490 	      },
1491 	      [&](auto __i) {
1492 		return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1493 		  __mem + __i * __elements_per_load);
1494 	      });
1495 	  }
1496 	else // e.g. int[] -> <char, 9>
1497 	  return __call_with_subscripts(
1498 	    __mem, make_index_sequence<_Np>(), [](auto... __args) {
1499 	      return __vector_type_t<_Tp, _S_full_size<_Tp>>{
1500 		static_cast<_Tp>(__args)...};
1501 	    });
1502       }
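    // Illustration (comment only, x86 sizes): loading a 32-element
    // simd<char> from an int array under AVX2 gives __bytes_to_load == 128
    // and __max_load_size == 32, so the divisible branch above issues
    // 128 / 32 == 4 loads of 8 ints each and converts them into the
    // <char, 32> result.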
1503 
1504     // _S_masked_load {{{2
1505     template <typename _Tp, size_t _Np, typename _Up>
1506       static inline _SimdWrapper<_Tp, _Np>
1507       _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
1508 		     const _Up* __mem) noexcept
1509       {
1510 	_BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), [&](auto __i) {
1511 	  __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1512 	});
1513 	return __merge;
1514       }
1515 
1516     // _S_store {{{2
1517     template <typename _Tp, typename _Up>
1518       _GLIBCXX_SIMD_INTRINSIC static void
1519       _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept
1520       {
1521 	// TODO: converting int -> "smaller int" can be optimized with AVX512
1522 	constexpr size_t _Np = _S_size<_Tp>;
1523 	constexpr size_t __max_store_size
1524 	  = _SuperImpl::template _S_max_store_size<_Up>;
1525 	if constexpr (sizeof(_Up) > 8)
1526 	  __execute_n_times<_Np>([&](auto __i) constexpr {
1527 	    __mem[__i] = __v[__i];
1528 	  });
1529 	else if constexpr (is_same_v<_Up, _Tp>)
1530 	  _CommonImpl::_S_store(__v, __mem);
1531 	else if constexpr (sizeof(_Up) * _Np <= __max_store_size)
1532 	  _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)),
1533 				__mem);
1534 	else
1535 	  {
1536 	    constexpr size_t __vsize = __max_store_size / sizeof(_Up);
1537 	    // round up to convert the last partial vector as well:
1538 	    constexpr size_t __stores = __div_roundup(_Np, __vsize);
1539 	    constexpr size_t __full_stores = _Np / __vsize;
1540 	    using _V = __vector_type_t<_Up, __vsize>;
1541 	    const array<_V, __stores> __converted
1542 	      = __convert_all<_V, __stores>(__v);
1543 	    __execute_n_times<__full_stores>([&](auto __i) constexpr {
1544 	      _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
1545 	    });
1546 	    if constexpr (__full_stores < __stores)
1547 	      _CommonImpl::template _S_store<(_Np - __full_stores * __vsize)
1548 					     * sizeof(_Up)>(
1549 		__converted[__full_stores], __mem + __full_stores * __vsize);
1550 	  }
1551       }
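    // Illustration (comment only): with the default _S_max_store_size of
    // 16, a converting store of 16 ints to double has sizeof(_Up) * _Np ==
    // 128 > 16, so the last branch converts in chunks of __vsize == 2:
    // __convert_all yields 8 vectors of 2 doubles, written with 8 stores.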
1552 
1553     // _S_masked_store_nocvt {{{2
1554     template <typename _Tp, size_t _Np>
1555       _GLIBCXX_SIMD_INTRINSIC static void
1556       _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1557 			    _MaskMember<_Tp> __k)
1558       {
1559 	_BitOps::_S_bit_iteration(
1560 	  _MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr {
1561 	    __mem[__i] = __v[__i];
1562 	  });
1563       }
1564 
1565     // _S_masked_store {{{2
1566     template <typename _TW, typename _TVT = _VectorTraits<_TW>,
1567 	      typename _Tp = typename _TVT::value_type, typename _Up>
1568       static inline void
1569       _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k)
1570 	noexcept
1571       {
1572 	constexpr size_t _TV_size = _S_size<_Tp>;
1573 	[[maybe_unused]] const auto __vi = __to_intrin(__v);
1574 	constexpr size_t __max_store_size
1575 	  = _SuperImpl::template _S_max_store_size<_Up>;
1576 	if constexpr (is_same_v<_Tp, _Up>
1577 		      || (is_integral_v<_Tp> && is_integral_v<_Up>
1578 			  && sizeof(_Tp) == sizeof(_Up)))
1580 	  {
1581 	    // bitwise or no conversion, reinterpret:
1582 	    const _MaskMember<_Up> __kk = [&]() {
1583 	      if constexpr (__is_bitmask_v<decltype(__k)>)
1584 		return _MaskMember<_Up>(__k._M_data);
1585 	      else
1586 		return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k);
1587 	    }();
1588 	    _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v),
1589 					      __mem, __kk);
1590 	  }
1591 	else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up)
1592 			   && !_CommonImpl::
1593 				template __converts_via_decomposition_v<
1594 				  _Tp, _Up, __max_store_size>)
1595 	  { // conversion via decomposition is better handled via the
1596 	    // bit_iteration fallback below
1598 	    constexpr size_t _UW_size
1599 	      = std::min(_TV_size, __max_store_size / sizeof(_Up));
1600 	    static_assert(_UW_size <= _TV_size);
1601 	    using _UW = _SimdWrapper<_Up, _UW_size>;
1602 	    using _UV = __vector_type_t<_Up, _UW_size>;
1603 	    using _UAbi = simd_abi::deduce_t<_Up, _UW_size>;
1604 	    if constexpr (_UW_size == _TV_size) // one convert+store
1605 	      {
1606 		const _UW __converted = __convert<_UW>(__v);
1607 		_SuperImpl::_S_masked_store_nocvt(
1608 		  __converted, __mem,
1609 		  _UAbi::_MaskImpl::template _S_convert<
1610 		    __int_for_sizeof_t<_Up>>(__k));
1611 	      }
1612 	    else
1613 	      {
1614 		static_assert(_UW_size * sizeof(_Up) == __max_store_size);
1615 		constexpr size_t _NFullStores = _TV_size / _UW_size;
1616 		constexpr size_t _NAllStores
1617 		  = __div_roundup(_TV_size, _UW_size);
1618 		constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size;
1619 		const array<_UV, _NAllStores> __converted
1620 		  = __convert_all<_UV, _NAllStores>(__v);
1621 		__execute_n_times<_NFullStores>([&](auto __i) {
1622 		  _SuperImpl::_S_masked_store_nocvt(
1623 		    _UW(__converted[__i]), __mem + __i * _UW_size,
1624 		    _UAbi::_MaskImpl::template _S_convert<
1625 		      __int_for_sizeof_t<_Up>>(
1626 		      __extract_part<__i, _NParts>(__k.__as_full_vector())));
1627 		});
1628 		if constexpr (_NAllStores
1629 			      > _NFullStores) // one partial at the end
1630 		  _SuperImpl::_S_masked_store_nocvt(
1631 		    _UW(__converted[_NFullStores]),
1632 		    __mem + _NFullStores * _UW_size,
1633 		    _UAbi::_MaskImpl::template _S_convert<
1634 		      __int_for_sizeof_t<_Up>>(
1635 		      __extract_part<_NFullStores, _NParts>(
1636 			__k.__as_full_vector())));
1637 	      }
1638 	  }
1639 	else
1640 	  _BitOps::_S_bit_iteration(
1641 	    _MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr {
1642 	      __mem[__i] = static_cast<_Up>(__v[__i]);
1643 	    });
1644       }
1645 
1646     // _S_complement {{{2
1647     template <typename _Tp, size_t _Np>
1648       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1649       _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
1650       {
1651 	if constexpr (is_floating_point_v<_Tp>)
1652 	  return __vector_bitcast<_Tp>(~__vector_bitcast<__int_for_sizeof_t<_Tp>>(__x));
1653 	else
1654 	  return ~__x._M_data;
1655       }
1656 
1657     // _S_unary_minus {{{2
1658     template <typename _Tp, size_t _Np>
1659       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1660       _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept
1661       {
1662 	// GCC doesn't use the psign instructions, but pxor & psub seem to be
1663 	// just as good a choice as pcmpeqd & psign. So meh.
1664 	return -__x._M_data;
1665       }
1666 
1667     // arithmetic operators {{{2
1668     template <typename _Tp, size_t _Np>
1669       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1670       _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1671       { return __x._M_data + __y._M_data; }
1672 
1673     template <typename _Tp, size_t _Np>
1674       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1675       _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1676       { return __x._M_data - __y._M_data; }
1677 
1678     template <typename _Tp, size_t _Np>
1679       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1680       _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1681       { return __x._M_data * __y._M_data; }
1682 
1683     template <typename _Tp, size_t _Np>
1684       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1685       _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1686       {
1687 	// Note that division by 0 is always UB, so for partial registers we
1688 	// must ensure the padding elements of __y cannot be zero divisors
1689 	if constexpr (!_Abi::template _S_is_partial<_Tp>)
1690 	  return __x._M_data / __y._M_data;
1691 	else
1692 	  return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data);
1693       }
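    // Illustration (comment only): a 3-element simd<int> occupies a 4-lane
    // register whose last lane is padding with an arbitrary value. If that
    // lane of __y were 0, the full-width division would be UB, so
    // __make_padding_nonzero replaces the inactive lanes of the divisor
    // with a harmless non-zero value (e.g. 1) first.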
1694 
1695     template <typename _Tp, size_t _Np>
1696       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1697       _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1698       {
1699 	if constexpr (!_Abi::template _S_is_partial<_Tp>)
1700 	  return __x._M_data % __y._M_data;
1701 	else
1702 	  return __as_vector(__x)
1703 		 % _Abi::__make_padding_nonzero(__as_vector(__y));
1704       }
1705 
1706     template <typename _Tp, size_t _Np>
1707       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1708       _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1709       { return __and(__x, __y); }
1710 
1711     template <typename _Tp, size_t _Np>
1712       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1713       _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1714       { return __or(__x, __y); }
1715 
1716     template <typename _Tp, size_t _Np>
1717       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1718       _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1719       { return __xor(__x, __y); }
1720 
1721     template <typename _Tp, size_t _Np>
1722       _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1723       _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1724       { return __x._M_data << __y._M_data; }
1725 
1726     template <typename _Tp, size_t _Np>
1727       _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1728       _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1729       { return __x._M_data >> __y._M_data; }
1730 
1731     template <typename _Tp, size_t _Np>
1732       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1733       _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y)
1734       { return __x._M_data << __y; }
1735 
1736     template <typename _Tp, size_t _Np>
1737       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1738       _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
1739       { return __x._M_data >> __y; }
1740 
1741     // compares {{{2
1742     // _S_equal_to {{{3
1743     template <typename _Tp, size_t _Np>
1744       _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1745       _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1746       { return __x._M_data == __y._M_data; }
1747 
1748     // _S_not_equal_to {{{3
1749     template <typename _Tp, size_t _Np>
1750       _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1751       _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1752       { return __x._M_data != __y._M_data; }
1753 
1754     // _S_less {{{3
1755     template <typename _Tp, size_t _Np>
1756       _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1757       _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1758       { return __x._M_data < __y._M_data; }
1759 
1760     // _S_less_equal {{{3
1761     template <typename _Tp, size_t _Np>
1762       _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1763       _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1764       { return __x._M_data <= __y._M_data; }
1765 
1766     // _S_negate {{{2
1767     template <typename _Tp, size_t _Np>
1768       _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1769       _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
1770       { return !__x._M_data; }
1771 
1772     // _S_min, _S_max, _S_minmax {{{2
1773     template <typename _Tp, size_t _Np>
1774       _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1775       _SimdWrapper<_Tp, _Np>
1776       _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1777       { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; }
1778 
1779     template <typename _Tp, size_t _Np>
1780       _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1781       _SimdWrapper<_Tp, _Np>
1782       _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1783       { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; }
1784 
1785     template <typename _Tp, size_t _Np>
1786       _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1787       pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>>
1788       _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1789       {
1790 	return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data,
1791 		__a._M_data < __b._M_data ? __b._M_data : __a._M_data};
1792       }
1793 
1794     // reductions {{{2
1795     template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp,
1796 	      typename _BinaryOperation>
1797       _GLIBCXX_SIMD_INTRINSIC static _Tp
1798       _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>,
1799 			simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1800       {
1801 	using _V = __vector_type_t<_Tp, _Np / 2>;
1802 	static_assert(sizeof(_V) <= sizeof(__x));
1803 	// _S_full_size is the size of the smallest native SIMD register that
1804 	// can store _Np/2 elements:
1805 	using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>;
1806 	using _HalfSimd = __deduced_simd<_Tp, _Np / 2>;
1807 	const auto __xx = __as_vector(__x);
1808 	return _HalfSimd::abi_type::_SimdImpl::_S_reduce(
1809 	  static_cast<_HalfSimd>(__as_vector(__binary_op(
1810 	    static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)),
1811 	    static_cast<_FullSimd>(__intrin_bitcast<_V>(
1812 	      __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>(
1813 		__xx)))))),
1814 	  __binary_op);
1815       }
1816 
1817     template <typename _Tp, typename _BinaryOperation>
1818       _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1819       _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1820       {
1821 	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
1822 	if constexpr (_Np == 1)
1823 	  return __x[0];
1824 	else if constexpr (_Np == 2)
1825 	  return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1826 			     simd<_Tp, simd_abi::scalar>(__x[1]))[0];
1827 	else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{
1828 	  {
1829 	    [[maybe_unused]] constexpr auto __full_size
1830 	      = _Abi::template _S_full_size<_Tp>;
1831 	    if constexpr (_Np == 3)
1832 	      return __binary_op(
1833 		__binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1834 			    simd<_Tp, simd_abi::scalar>(__x[1])),
1835 		simd<_Tp, simd_abi::scalar>(__x[2]))[0];
1836 	    else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1837 					 plus<>>)
1838 	      {
1839 		using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1840 		return _Ap::_SimdImpl::_S_reduce(
1841 		  simd<_Tp, _Ap>(__private_init,
1842 				 _Abi::_S_masked(__as_vector(__x))),
1843 		  __binary_op);
1844 	      }
1845 	    else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1846 					 multiplies<>>)
1847 	      {
1848 		using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1849 		using _TW = _SimdWrapper<_Tp, __full_size>;
1850 		_GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full
1851 		  = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector();
1852 		_GLIBCXX_SIMD_USE_CONSTEXPR _TW __one
1853 		  = __vector_broadcast<__full_size>(_Tp(1));
1854 		const _TW __x_full = __data(__x).__as_full_vector();
1855 		const _TW __x_padded_with_ones
1856 		  = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one,
1857 					       __x_full);
1858 		return _Ap::_SimdImpl::_S_reduce(
1859 		  simd<_Tp, _Ap>(__private_init, __x_padded_with_ones),
1860 		  __binary_op);
1861 	      }
1862 	    else if constexpr (_Np & 1)
1863 	      {
1864 		using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>;
1865 		return __binary_op(
1866 		  simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce(
1867 		    simd<_Tp, _Ap>(
1868 		      __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>(
1869 			__as_vector(__x))),
1870 		    __binary_op)),
1871 		  simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0];
1872 	      }
1873 	    else
1874 	      return _S_reduce_partial<_Np>(
1875 		make_index_sequence<_Np / 2>(),
1876 		make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op);
1877 	  }                                   //}}}
1878 	else if constexpr (sizeof(__x) == 16) //{{{
1879 	  {
1880 	    if constexpr (_Np == 16)
1881 	      {
1882 		const auto __y = __data(__x);
1883 		__x = __binary_op(
1884 		  _M_make_simd<_Tp, _Np>(
1885 		    __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
1886 				     7, 7>(__y)),
1887 		  _M_make_simd<_Tp, _Np>(
1888 		    __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
1889 				     14, 14, 15, 15>(__y)));
1890 	      }
1891 	    if constexpr (_Np >= 8)
1892 	      {
1893 		const auto __y = __vector_bitcast<short>(__data(__x));
1894 		__x = __binary_op(
1895 		  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1896 		    __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))),
1897 		  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1898 		    __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y))));
1899 	      }
1900 	    if constexpr (_Np >= 4)
1901 	      {
1902 		using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>;
1903 		const auto __y = __vector_bitcast<_Up>(__data(__x));
1904 		__x = __binary_op(__x,
1905 				  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1906 				    __vector_permute<3, 2, 1, 0>(__y))));
1907 	      }
1908 	    using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>;
1909 	    const auto __y = __vector_bitcast<_Up>(__data(__x));
1910 	    __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1911 				     __vector_permute<1, 1>(__y))));
1912 	    return __x[0];
1913 	  } //}}}
1914 	else
1915 	  {
1916 	    static_assert(sizeof(__x) > __min_vector_size<_Tp>);
1917 	    static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2
1918 	    using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>;
1919 	    using _V = simd<_Tp, _Ap>;
1920 	    return _Ap::_SimdImpl::_S_reduce(
1921 	      __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))),
1922 			  _V(__private_init,
1923 			     __extract<1, 2>(__as_vector(__x)))),
1924 	      static_cast<_BinaryOperation&&>(__binary_op));
1925 	  }
1926       }
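    // Illustration (comment only): for four floats in a 16-byte register
    // with __binary_op == plus<>, the sizeof(__x) == 16 branch reduces in
    // log2 steps:
    //   {a, b, c, d} + permute<3, 2, 1, 0> -> {a+d, b+c, c+b, d+a}
    //   then (viewed as doubles) + permute<1, 1> leaves a+b+c+d in lane 0,
    // which the final __x[0] returns.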
1927 
1928     // math {{{2
1929     // frexp, modf and copysign implemented in simd_math.h
1930 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name)                                    \
1931     template <typename _Tp, typename... _More>                                 \
1932       static _Tp _S_##__name(const _Tp& __x, const _More&... __more)           \
1933       {                                                                        \
1934 	return __generate_vector<_Tp>(                                         \
1935 	  [&](auto __i) { return __name(__x[__i], __more[__i]...); });         \
1936       }
1937 
1938 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name)                            \
1939     template <typename _Tp, typename... _More>                                 \
1940       static typename _Tp::mask_type _S_##__name(const _Tp& __x,               \
1941 						 const _More&... __more)       \
1942       {                                                                        \
1943 	return __generate_vector<_Tp>(                                         \
1944 	  [&](auto __i) { return __name(__x[__i], __more[__i]...); });         \
1945       }
1946 
1947 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name)                   \
1948     template <typename _Tp, typename... _More>                                 \
1949       static auto _S_##__name(const _Tp& __x, const _More&... __more)          \
1950       {                                                                        \
1951 	return __fixed_size_storage_t<_RetTp,                                  \
1952 				      _VectorTraits<_Tp>::_S_partial_width>::  \
1953 	  _S_generate([&](auto __meta) constexpr {                             \
1954 	    return __meta._S_generator(                                        \
1955 	      [&](auto __i) {                                                  \
1956 		return __name(__x[__meta._S_offset + __i],                     \
1957 			      __more[__meta._S_offset + __i]...);              \
1958 	      },                                                               \
1959 	      static_cast<_RetTp*>(nullptr));                                  \
1960 	  });                                                                  \
1961       }
1962 
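    // Illustration (comment only): each _GLIBCXX_SIMD_MATH_FALLBACK(name)
    // below expands to a per-element loop over the scalar function, e.g.
    // for acos:
    //   template <typename _Tp, typename... _More>
    //     static _Tp _S_acos(const _Tp& __x, const _More&... __more)
    //     {
    //       return __generate_vector<_Tp>(
    //         [&](auto __i) { return acos(__x[__i], __more[__i]...); });
    //     }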
1963     _GLIBCXX_SIMD_MATH_FALLBACK(acos)
1964     _GLIBCXX_SIMD_MATH_FALLBACK(asin)
1965     _GLIBCXX_SIMD_MATH_FALLBACK(atan)
1966     _GLIBCXX_SIMD_MATH_FALLBACK(atan2)
1967     _GLIBCXX_SIMD_MATH_FALLBACK(cos)
1968     _GLIBCXX_SIMD_MATH_FALLBACK(sin)
1969     _GLIBCXX_SIMD_MATH_FALLBACK(tan)
1970     _GLIBCXX_SIMD_MATH_FALLBACK(acosh)
1971     _GLIBCXX_SIMD_MATH_FALLBACK(asinh)
1972     _GLIBCXX_SIMD_MATH_FALLBACK(atanh)
1973     _GLIBCXX_SIMD_MATH_FALLBACK(cosh)
1974     _GLIBCXX_SIMD_MATH_FALLBACK(sinh)
1975     _GLIBCXX_SIMD_MATH_FALLBACK(tanh)
1976     _GLIBCXX_SIMD_MATH_FALLBACK(exp)
1977     _GLIBCXX_SIMD_MATH_FALLBACK(exp2)
1978     _GLIBCXX_SIMD_MATH_FALLBACK(expm1)
1979     _GLIBCXX_SIMD_MATH_FALLBACK(ldexp)
1980     _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb)
1981     _GLIBCXX_SIMD_MATH_FALLBACK(log)
1982     _GLIBCXX_SIMD_MATH_FALLBACK(log10)
1983     _GLIBCXX_SIMD_MATH_FALLBACK(log1p)
1984     _GLIBCXX_SIMD_MATH_FALLBACK(log2)
1985     _GLIBCXX_SIMD_MATH_FALLBACK(logb)
1986 
1987     // modf implemented in simd_math.h
1988     _GLIBCXX_SIMD_MATH_FALLBACK(scalbn)
1989     _GLIBCXX_SIMD_MATH_FALLBACK(scalbln)
1990     _GLIBCXX_SIMD_MATH_FALLBACK(cbrt)
1991     _GLIBCXX_SIMD_MATH_FALLBACK(fabs)
1992     _GLIBCXX_SIMD_MATH_FALLBACK(pow)
1993     _GLIBCXX_SIMD_MATH_FALLBACK(sqrt)
1994     _GLIBCXX_SIMD_MATH_FALLBACK(erf)
1995     _GLIBCXX_SIMD_MATH_FALLBACK(erfc)
1996     _GLIBCXX_SIMD_MATH_FALLBACK(lgamma)
1997     _GLIBCXX_SIMD_MATH_FALLBACK(tgamma)
1998 
1999     _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint)
2000     _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint)
2001 
2002     _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround)
2003     _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround)
2004 
2005     _GLIBCXX_SIMD_MATH_FALLBACK(fmod)
2006     _GLIBCXX_SIMD_MATH_FALLBACK(remainder)
2007 
2008     template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2009       static _Tp
2010       _S_remquo(const _Tp __x, const _Tp __y,
2011 		__fixed_size_storage_t<int, _TVT::_S_partial_width>* __z)
2012       {
2013 	return __generate_vector<_Tp>([&](auto __i) {
2014 	  int __tmp;
2015 	  auto __r = remquo(__x[__i], __y[__i], &__tmp);
2016 	  __z->_M_set(__i, __tmp);
2017 	  return __r;
2018 	});
2019       }
2020 
2021     // copysign in simd_math.h
2022     _GLIBCXX_SIMD_MATH_FALLBACK(nextafter)
2023     _GLIBCXX_SIMD_MATH_FALLBACK(fdim)
2024     _GLIBCXX_SIMD_MATH_FALLBACK(fmax)
2025     _GLIBCXX_SIMD_MATH_FALLBACK(fmin)
2026     _GLIBCXX_SIMD_MATH_FALLBACK(fma)
2027 
2028     template <typename _Tp, size_t _Np>
2029       static constexpr _MaskMember<_Tp>
2030       _S_isgreater(_SimdWrapper<_Tp, _Np> __x,
2031 		   _SimdWrapper<_Tp, _Np> __y) noexcept
2032       {
2033 	using _Ip = __int_for_sizeof_t<_Tp>;
2034 	const auto __xn = __vector_bitcast<_Ip>(__x);
2035 	const auto __yn = __vector_bitcast<_Ip>(__y);
2036 	const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2037 	const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2038 	return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2039 			__xp > __yp);
2040       }
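    // Illustration (comment only): the integer transformation above is
    // order-preserving for non-NaN floats. bits(1.0f) == 0x3F800000 stays
    // positive; bits(-1.0f) == 0xBF800000 is negative and maps to
    // -0x3F800000, so it still compares below 1.0f; and -0.0f (0x80000000)
    // maps to -0 == 0, comparing equal to +0.0f as required.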
2041 
2042     template <typename _Tp, size_t _Np>
2043       static constexpr _MaskMember<_Tp>
2044       _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x,
2045 			_SimdWrapper<_Tp, _Np> __y) noexcept
2046       {
2047 	using _Ip = __int_for_sizeof_t<_Tp>;
2048 	const auto __xn = __vector_bitcast<_Ip>(__x);
2049 	const auto __yn = __vector_bitcast<_Ip>(__y);
2050 	const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2051 	const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2052 	return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2053 			__xp >= __yp);
2054       }
2055 
2056     template <typename _Tp, size_t _Np>
2057       static constexpr _MaskMember<_Tp>
2058       _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept
2059       {
2060 	using _Ip = __int_for_sizeof_t<_Tp>;
2061 	const auto __xn = __vector_bitcast<_Ip>(__x);
2062 	const auto __yn = __vector_bitcast<_Ip>(__y);
2063 	const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2064 	const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2065 	return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2066 			__xp < __yp);
2067       }
2068 
2069     template <typename _Tp, size_t _Np>
2070       static constexpr _MaskMember<_Tp>
2071       _S_islessequal(_SimdWrapper<_Tp, _Np> __x,
2072 		     _SimdWrapper<_Tp, _Np> __y) noexcept
2073       {
2074 	using _Ip = __int_for_sizeof_t<_Tp>;
2075 	const auto __xn = __vector_bitcast<_Ip>(__x);
2076 	const auto __yn = __vector_bitcast<_Ip>(__y);
2077 	const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2078 	const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2079 	return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2080 			__xp <= __yp);
2081       }
2082 
2083     template <typename _Tp, size_t _Np>
2084       static constexpr _MaskMember<_Tp>
2085       _S_islessgreater(_SimdWrapper<_Tp, _Np> __x,
2086 		       _SimdWrapper<_Tp, _Np> __y) noexcept
2087       {
2088 	return __andnot(_SuperImpl::_S_isunordered(__x, __y),
2089 			_SuperImpl::_S_not_equal_to(__x, __y));
2090       }
2091 
2092 #undef _GLIBCXX_SIMD_MATH_FALLBACK
2093 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET
2094 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET
2095     // _S_abs {{{3
2096     template <typename _Tp, size_t _Np>
2097     _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2098     _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept
2099     {
2100       // if (__builtin_is_constant_evaluated())
2101       //  {
2102       //    return __x._M_data < 0 ? -__x._M_data : __x._M_data;
2103       //  }
2104       if constexpr (is_floating_point_v<_Tp>)
2105 	// `v < 0 ? -v : v` cannot compile to the efficient implementation of
2106 	// masking the signbit off because it must consider v == -0
2107 
2108 	// ~(-0.) & v would be easy, but breaks with -fno-signed-zeros
2109 	return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data);
2110       else
2111 	return __x._M_data < 0 ? -__x._M_data : __x._M_data;
2112     }
2113 
2114     // }}}3
2115     // _S_plus_minus {{{
2116     // Returns __x + __y - __y without -fassociative-math optimizing to __x.
2117     // - _TV must be __vector_type_t<floating-point type, N>.
2118     // - _UV must be _TV or floating-point type.
2119     template <typename _TV, typename _UV>
2120     _GLIBCXX_SIMD_INTRINSIC static constexpr _TV _S_plus_minus(_TV __x,
2121 							       _UV __y) noexcept
2122     {
2123   #if defined __i386__ && !defined __SSE_MATH__
2124       if constexpr (sizeof(__x) == 8)
2125 	{ // operations on __x would use the FPU
2126 	  static_assert(is_same_v<_TV, __vector_type_t<float, 2>>);
2127 	  const auto __x4 = __vector_bitcast<float, 4>(__x);
2128 	  if constexpr (is_same_v<_TV, _UV>)
2129 	    return __vector_bitcast<float, 2>(
2130 	      _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y)));
2131 	  else
2132 	    return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y));
2133 	}
2134   #endif
2135   #if !defined __clang__ && __GCC_IEC_559 == 0
2136       if (__builtin_is_constant_evaluated()
2137 	  || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
2138 	return (__x + __y) - __y;
2139       else
2140 	return [&] {
2141 	  __x += __y;
2142 	  if constexpr(__have_sse)
2143 	    {
2144 	      if constexpr (sizeof(__x) >= 16)
2145 		asm("" : "+x"(__x));
2146 	      else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2147 		asm("" : "+x"(__x[0]), "+x"(__x[1]));
2148 	      else
2149 		__assert_unreachable<_TV>();
2150 	    }
2151 	  else if constexpr(__have_neon)
2152 	    asm("" : "+w"(__x));
2153 	  else if constexpr (__have_power_vmx)
2154 	    {
2155 	      if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2156 		asm("" : "+fgr"(__x[0]), "+fgr"(__x[1]));
2157 	      else
2158 		asm("" : "+v"(__x));
2159 	    }
2160 	  else
2161 	    asm("" : "+g"(__x));
2162 	  return __x - __y;
2163 	}();
2164   #else
2165       return (__x + __y) - __y;
2166   #endif
2167     }
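    // Illustration (comment only): the empty asm statements above emit no
    // instructions; they only make the value of __x opaque to the
    // optimizer so the expression cannot be reassociated. Without that
    // barrier, -fassociative-math (implied by -ffast-math, in which case
    // __GCC_IEC_559 == 0) would fold
    //   __x += __y; return __x - __y;
    // back into plain __x and lose the intended rounding step.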
2168 
2169     // }}}
2170     // _S_nearbyint {{{3
2171     template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2172     _GLIBCXX_SIMD_INTRINSIC static _Tp _S_nearbyint(_Tp __x_) noexcept
2173     {
2174       using value_type = typename _TVT::value_type;
2175       using _V = typename _TVT::type;
2176       const _V __x = __x_;
2177       const _V __absx = __and(__x, _S_absmask<_V>);
2178       static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>);
2179       _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
2180 	= _V() + (1ull << (__digits_v<value_type> - 1));
2181       const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
2182       const _V __shifted = _S_plus_minus(__x, __shifter);
2183       return __absx < __shifter_abs ? __shifted : __x;
2184     }
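    // Worked example (comment only, float, round-to-nearest): __digits_v
    // == 24, so __shifter_abs == 2^23 == 8388608. For __x == 3.7f:
    //   3.7f + 8388608.f rounds to 8388612.f (the ULP there is 1.f), and
    //   8388612.f - 8388608.f == 4.f == nearbyint(3.7f).
    // Inputs with |__x| >= 2^23 are already integral and returned as-is.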
2185 
2186     // _S_rint {{{3
2187     template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2188     _GLIBCXX_SIMD_INTRINSIC static _Tp _S_rint(_Tp __x) noexcept
2189     {
2190       return _SuperImpl::_S_nearbyint(__x);
2191     }
2192 
2193     // _S_trunc {{{3
2194     template <typename _Tp, size_t _Np>
2195     _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2196     _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2197     {
2198       using _V = __vector_type_t<_Tp, _Np>;
2199       const _V __absx = __and(__x._M_data, _S_absmask<_V>);
2200       static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>);
2201       constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1);
2202       _V __truncated = _S_plus_minus(__absx, __shifter);
2203       __truncated -= __truncated > __absx ? _V() + 1 : _V();
2204       return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated)
2205 				: __x._M_data;
2206     }
2207 
2208     // _S_round {{{3
2209     template <typename _Tp, size_t _Np>
2210     _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2211     _S_round(_SimdWrapper<_Tp, _Np> __x)
2212     {
2213       const auto __abs_x = _SuperImpl::_S_abs(__x);
2214       const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data;
2215       const auto __r_abs // round(abs(x)) =
2216 	= __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0);
2217       return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs);
2218     }
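    // Worked example (comment only): halfway cases round away from zero.
    // For __x == -2.5f: __abs_x == 2.5f, __t_abs == 2.f, and since
    // 2.5f - 2.f >= .5f, __r_abs == 3.f; __xor(__abs_x, __x) recovers the
    // sign bit, so the result is -3.f.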
2219 
2220     // _S_floor {{{3
2221     template <typename _Tp, size_t _Np>
2222     _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2223     _S_floor(_SimdWrapper<_Tp, _Np> __x)
2224     {
2225       const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2226       const auto __negative_input
2227 	= __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2228       const auto __mask
2229 	= __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2230       return __or(__andnot(__mask, __y),
2231 		  __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1)));
2232     }
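    // Worked example (comment only): for __x == -1.25f, __y == -1.f. The
    // input is negative and __y != __x, so __mask selects __y - 1 == -2.f
    // == floor(-1.25f); exact integers and positive inputs keep the
    // truncated value unchanged.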
2233 
2234     // _S_ceil {{{3
2235     template <typename _Tp, size_t _Np>
2236     _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2237     _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2238     {
2239       const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2240       const auto __negative_input
2241 	= __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2242       const auto __inv_mask
2243 	= __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2244       return __or(__and(__inv_mask, __y),
2245 		  __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1)));
2246     }
2247 
2248     // _S_isnan {{{3
2249     template <typename _Tp, size_t _Np>
2250     _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2251     _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2252     {
2253   #if __FINITE_MATH_ONLY__
2254       return {}; // false
2255   #elif !defined __SUPPORT_SNAN__
2256       return ~(__x._M_data == __x._M_data);
2257   #elif defined __STDC_IEC_559__
2258       using _Ip = __int_for_sizeof_t<_Tp>;
2259       const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2260       const auto __infn
2261 	= __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
2262       return __infn < __absn;
2263   #else
2264   #error "Not implemented: how to support SNaN but non-IEC559 floating-point?"
2265   #endif
2266     }
2267 
2268     // _S_isfinite {{{3
2269     template <typename _Tp, size_t _Np>
2270     _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2271     _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2272     {
2273   #if __FINITE_MATH_ONLY__
2274       using _UV = typename _MaskMember<_Tp>::_BuiltinType;
2275       _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV();
2276       return __alltrue;
2277   #else
2278       // if all exponent bits are set, __x is either inf or NaN
2279       using _Ip = __int_for_sizeof_t<_Tp>;
2280       const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2281       const auto __maxn
2282 	= __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2283       return __absn <= __maxn;
2284   #endif
2285     }
2286 
2287     // _S_isunordered {{{3
2288     template <typename _Tp, size_t _Np>
2289     _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2290     _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2291     {
2292       return __or(_S_isnan(__x), _S_isnan(__y));
2293     }
2294 
2295     // _S_signbit {{{3
2296     template <typename _Tp, size_t _Np>
2297     _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2298     _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2299     {
2300       using _Ip = __int_for_sizeof_t<_Tp>;
2301       return __vector_bitcast<_Ip>(__x) < 0;
2302       // Arithmetic right shift (SRA) would also work (instead of compare), but
2303       // 64-bit SRA isn't available on x86 before AVX512. And in general,
2304       // compares are more likely to be efficient than SRA.
2305     }
2306 
2307     // _S_isinf {{{3
2308     template <typename _Tp, size_t _Np>
2309     _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2310     _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2311     {
2312   #if __FINITE_MATH_ONLY__
2313       return {}; // false
2314   #else
2315       return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x),
2316 							__vector_broadcast<_Np>(
2317 							  __infinity_v<_Tp>));
2318       // alternative:
2319       // compare to inf using the corresponding integer type
2320       /*
2321 	 return __vector_bitcast<_Tp>(
2322 	   __vector_bitcast<__int_for_sizeof_t<_Tp>>(_S_abs(__x)._M_data)
2323 	   == __vector_bitcast<__int_for_sizeof_t<_Tp>>(
2324 		__vector_broadcast<_Np>(__infinity_v<_Tp>)));
2327 	 */
2328   #endif
2329     }
2330 
2331     // _S_isnormal {{{3
2332     template <typename _Tp, size_t _Np>
2333     _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2334     _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
2335     {
2336       using _Ip = __int_for_sizeof_t<_Tp>;
2337       const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2338       const auto __minn
2339 	= __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>));
2340   #if __FINITE_MATH_ONLY__
2341       return __absn >= __minn;
2342   #else
2343       const auto __maxn
2344 	= __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2345       return __minn <= __absn && __absn <= __maxn;
2346   #endif
2347     }
2348 
2349     // _S_fpclassify {{{3
2350     template <typename _Tp, size_t _Np>
2351     _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
2352     _S_fpclassify(_SimdWrapper<_Tp, _Np> __x)
2353     {
2354       using _I = __int_for_sizeof_t<_Tp>;
2355       const auto __xn
2356 	= __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x)));
2357       constexpr size_t _NI = sizeof(__xn) / sizeof(_I);
2358       _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn
2359 	= __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>));
2360       _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn
2361 	= __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>));
2362 
2363       _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal
2364 	= __vector_broadcast<_NI, _I>(FP_NORMAL);
2365   #if !__FINITE_MATH_ONLY__
2366       _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan
2367 	= __vector_broadcast<_NI, _I>(FP_NAN);
2368       _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite
2369 	= __vector_broadcast<_NI, _I>(FP_INFINITE);
2370   #endif
2371   #ifndef __FAST_MATH__
2372       _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal
2373 	= __vector_broadcast<_NI, _I>(FP_SUBNORMAL);
2374   #endif
2375       _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero
2376 	= __vector_broadcast<_NI, _I>(FP_ZERO);
2377 
2378       __vector_type_t<_I, _NI>
2379 	__tmp = __xn < __minn
2380   #ifdef __FAST_MATH__
2381 		  ? __fp_zero
2382   #else
2383 		  ? (__xn == 0 ? __fp_zero : __fp_subnormal)
2384   #endif
2385   #if __FINITE_MATH_ONLY__
2386 		  : __fp_normal;
2387   #else
2388 		  : (__xn < __infn ? __fp_normal
2389 				   : (__xn == __infn ? __fp_infinite : __fp_nan));
2390   #endif
2391 
2392       if constexpr (sizeof(_I) == sizeof(int))
2393 	{
2394 	  using _FixedInt = __fixed_size_storage_t<int, _Np>;
2395 	  const auto __as_int = __vector_bitcast<int, _Np>(__tmp);
2396 	  if constexpr (_FixedInt::_S_tuple_size == 1)
2397 	    return {__as_int};
2398 	  else if constexpr (_FixedInt::_S_tuple_size == 2
2399 			     && is_same_v<
2400 			       typename _FixedInt::_SecondType::_FirstAbi,
2401 			       simd_abi::scalar>)
2402 	    return {__extract<0, 2>(__as_int), __as_int[_Np - 1]};
2403 	  else if constexpr (_FixedInt::_S_tuple_size == 2)
2404 	    return {__extract<0, 2>(__as_int),
2405 		    __auto_bitcast(__extract<1, 2>(__as_int))};
2406 	  else
2407 	    __assert_unreachable<_Tp>();
2408 	}
2409       else if constexpr (_Np == 2 && sizeof(_I) == 8
2410 			 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 2)
2411 	{
2412 	  const auto __aslong = __vector_bitcast<_LLong>(__tmp);
2413 	  return {int(__aslong[0]), {int(__aslong[1])}};
2414 	}
2415   #if _GLIBCXX_SIMD_X86INTRIN
2416       else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32
2417 			 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2418 	return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
2419 				__to_intrin(__hi128(__tmp)))};
2420       else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64
2421 			 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2422 	return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
2423   #endif // _GLIBCXX_SIMD_X86INTRIN
2424       else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2425 	return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
2426 					    [](auto... __l) {
2427 					      return __make_wrapper<int>(__l...);
2428 					    })};
2429       else
2430 	__assert_unreachable<_Tp>();
2431     }
2432 
2433     // _S_increment & _S_decrement{{{2
2434     template <typename _Tp, size_t _Np>
2435       _GLIBCXX_SIMD_INTRINSIC static void
2436       _S_increment(_SimdWrapper<_Tp, _Np>& __x)
2437       { __x = __x._M_data + 1; }
2438 
2439     template <typename _Tp, size_t _Np>
2440       _GLIBCXX_SIMD_INTRINSIC static void
2441       _S_decrement(_SimdWrapper<_Tp, _Np>& __x)
2442       { __x = __x._M_data - 1; }
2443 
2444     // smart_reference access {{{2
2445     template <typename _Tp, size_t _Np, typename _Up>
2446       _GLIBCXX_SIMD_INTRINSIC constexpr static void
2447       _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
2448       { __v._M_set(__i, static_cast<_Up&&>(__x)); }
2449 
2450     // _S_masked_assign{{{2
2451     template <typename _Tp, typename _K, size_t _Np>
2452       _GLIBCXX_SIMD_INTRINSIC static void
2453       _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2454 		       __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2455       {
2456 	if (__k._M_is_constprop_none_of())
2457 	  return;
2458 	else if (__k._M_is_constprop_all_of())
2459 	  __lhs = __rhs;
2460 	else
2461 	  __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs);
2462       }
2463 
2464     template <typename _Tp, typename _K, size_t _Np>
2465       _GLIBCXX_SIMD_INTRINSIC static void
2466       _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2467 		       __type_identity_t<_Tp> __rhs)
2468       {
2469 	if (__k._M_is_constprop_none_of())
2470 	  return;
2471 	else if (__k._M_is_constprop_all_of())
2472 	  __lhs = __vector_broadcast<_Np>(__rhs);
2473 	else if (__builtin_constant_p(__rhs) && __rhs == 0)
2474 	  {
2475 	    if constexpr (!is_same_v<bool, _K>)
2476 	      // the __andnot optimization only makes sense if __k._M_data is a
2477 	      // vector register
2478 	      __lhs._M_data
2479 		= __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data);
2480 	    else
2481 	      // for AVX512/__mmask, a _mm512_maskz_mov is best
2482 	      __lhs
2483 		= _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>());
2484 	  }
2485 	else
2486 	  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2487 					_SimdWrapper<_Tp, _Np>(
2488 					  __vector_broadcast<_Np>(__rhs)));
2489       }
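    // Illustration (comment only): the constant-zero special case above
    // lets an expression like
    //   where(__k, __v) = 0;
    // lower to a single andnot(mask, value) for vector masks, instead of a
    // broadcast followed by a blend.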
2490 
2491     // _S_masked_cassign {{{2
2492     template <typename _Op, typename _Tp, typename _K, size_t _Np>
2493       _GLIBCXX_SIMD_INTRINSIC static void
2494       _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2495 			_SimdWrapper<_Tp, _Np>& __lhs,
2496 			const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs,
2497 			_Op __op)
2498       {
2499 	if (__k._M_is_constprop_none_of())
2500 	  return;
2501 	else if (__k._M_is_constprop_all_of())
2502 	  __lhs = __op(_SuperImpl{}, __lhs, __rhs);
2503 	else
2504 	  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2505 					__op(_SuperImpl{}, __lhs, __rhs));
2506       }
2507 
2508     template <typename _Op, typename _Tp, typename _K, size_t _Np>
2509       _GLIBCXX_SIMD_INTRINSIC static void
2510       _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2511 			_SimdWrapper<_Tp, _Np>& __lhs,
2512 			const __type_identity_t<_Tp> __rhs, _Op __op)
2513       { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); }
2514 
2515     // _S_masked_unary {{{2
2516     template <template <typename> class _Op, typename _Tp, typename _K,
2517 	      size_t _Np>
2518       _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2519       _S_masked_unary(const _SimdWrapper<_K, _Np> __k,
2520 		      const _SimdWrapper<_Tp, _Np> __v)
2521       {
2522 	if (__k._M_is_constprop_none_of())
2523 	  return __v;
2524 	auto __vv = _M_make_simd(__v);
2525 	_Op<decltype(__vv)> __op;
2526 	if (__k._M_is_constprop_all_of())
2527 	  return __data(__op(__vv));
2528 	else
2529 	  return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv)));
2530       }
2531 
2532     //}}}2
2533   };
2534 
2535 // _MaskImplBuiltinMixin {{{1
2536 struct _MaskImplBuiltinMixin
2537 {
2538   template <typename _Tp>
2539     using _TypeTag = _Tp*;
2540 
2541   // _S_to_maskvector {{{
2542   template <typename _Up, size_t _ToN = 1>
2543     _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2544     _S_to_maskvector(bool __x)
2545     {
2546       static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2547       return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
2548 		 : __vector_type_t<_Up, _ToN>{};
2549     }
2550 
2551   template <typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized,
2552 	    size_t _ToN = _UpN == 0 ? _Np : _UpN>
2553     _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2554     _S_to_maskvector(_BitMask<_Np, _Sanitized> __x)
2555     {
2556       static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2557       return __generate_vector<__vector_type_t<_Up, _ToN>>([&](
2558 	auto __i) constexpr {
2559 	if constexpr (__i < _Np)
2560 	  return __x[__i] ? ~_Up() : _Up();
2561 	else
2562 	  return _Up();
2563       });
2564     }
2565 
2566   template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
2567 	    size_t _ToN = _UpN == 0 ? _Np : _UpN>
2568     _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2569     _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
2570     {
2571       static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2572       using _TW = _SimdWrapper<_Tp, _Np>;
2573       using _UW = _SimdWrapper<_Up, _ToN>;
2574       if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW))
2575 	return __wrapper_bitcast<_Up, _ToN>(__x);
2576       else if constexpr (is_same_v<_Tp, bool>) // bits -> vector
2577 	return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data));
2578       else
2579 	{ // vector -> vector
2580 	  /*
2581 	  [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data);
2582 	  if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) == 16)
2583 	    return __vector_permute<1, 3, -1, -1>(__y);
2584 	  else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 && sizeof(__y) == 16)
2585 	    return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y);
2586 	  else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 && sizeof(__y) == 16)
2587 	    return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y);
2588 	  else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 && sizeof(__y) == 16)
2589 	    return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1,
2590 				    -1, -1, -1, -1>(__y);
2591 	  else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 && sizeof(__y) == 16)
2592 	    return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
2593 				    -1, -1, -1, -1>(__y);
2594 	  else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 && sizeof(__y) == 16)
2595 	    return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2596 				    -1, -1, -1, -1>(__y);
2597 	  else
2598 	  */
2603 	  {
2604 	    return __generate_vector<__vector_type_t<_Up, _ToN>>([&](
2605 	      auto __i) constexpr {
2606 	      if constexpr (__i < _Np)
2607 		return _Up(__x[__i.value]);
2608 	      else
2609 		return _Up();
2610 	    });
2611 	  }
2612 	}
2613     }
2614 
2615   // }}}
2616   // _S_to_bits {{{
2617   template <typename _Tp, size_t _Np>
2618     _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
2619     _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
2620     {
2621       static_assert(!is_same_v<_Tp, bool>);
2622       static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong));
2623       using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
2624       const auto __bools
2625 	= __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1);
2626       _ULLong __r = 0;
2627       __execute_n_times<_Np>(
2628 	[&](auto __i) { __r |= _ULLong(__bools[__i.value]) << __i; });
2629       return __r;
2630     }
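  // Worked example (comment only): four int mask lanes {-1, 0, -1, -1}
  // shift down to __bools == {1, 0, 1, 1}; the loop ORs lane __i into bit
  // __i, producing the bitmask 0b1101.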
2631 
2632   // }}}
2633 };
2634 
2635 // _MaskImplBuiltin {{{1
2636 template <typename _Abi, typename>
2637   struct _MaskImplBuiltin : _MaskImplBuiltinMixin
2638   {
2639     using _MaskImplBuiltinMixin::_S_to_bits;
2640     using _MaskImplBuiltinMixin::_S_to_maskvector;
2641 
2642     // member types {{{
2643     template <typename _Tp>
2644       using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
2645 
2646     template <typename _Tp>
2647       using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
2648 
2649     using _SuperImpl = typename _Abi::_MaskImpl;
2650     using _CommonImpl = typename _Abi::_CommonImpl;
2651 
2652     template <typename _Tp>
2653       static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
2654 
2655     // }}}
2656     // _S_broadcast {{{
2657     template <typename _Tp>
2658       _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2659       _S_broadcast(bool __x)
2660       {
2661 	return __x ? _Abi::template _S_implicit_mask<_Tp>()
2662 		   : _MaskMember<_Tp>();
2663       }
2664 
2665     // }}}
2666     // _S_load {{{
2667     template <typename _Tp>
2668       _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2669       _S_load(const bool* __mem)
2670       {
2671 	using _I = __int_for_sizeof_t<_Tp>;
2672 	if constexpr (sizeof(_Tp) == sizeof(bool))
2673 	  {
2674 	    const auto __bools
2675 	      = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem);
2676 	    // bool is {0, 1}, everything else is UB
2677 	    return __bools > 0;
2678 	  }
2679 	else
2680 	  return __generate_vector<_I, _S_size<_Tp>>([&](auto __i) constexpr {
2681 	    return __mem[__i] ? ~_I() : _I();
2682 	  });
2683       }
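    // Illustration (comment only): when sizeof(_Tp) == sizeof(bool), the
    // bool bytes are loaded as integers and `__bools > 0` maps the valid
    // values {0, 1} onto the mask representation {0, -1}; wider _Tp
    // generates the mask lanes one by one instead.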
2684 
2685     // }}}
2686     // _S_convert {{{
2687     template <typename _Tp, size_t _Np, bool _Sanitized>
2688       _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2689       _S_convert(_BitMask<_Np, _Sanitized> __x)
2690       {
2691 	if constexpr (__is_builtin_bitmask_abi<_Abi>())
2692 	  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits());
2693 	else
2694 	  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2695 						       _S_size<_Tp>>(
2696 	    __x._M_sanitized());
2697       }
2698 
2699     template <typename _Tp, size_t _Np>
2700       _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2701       _S_convert(_SimdWrapper<bool, _Np> __x)
2702       {
2703 	if constexpr (__is_builtin_bitmask_abi<_Abi>())
2704 	  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data);
2705 	else
2706 	  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2707 						       _S_size<_Tp>>(
2708 	    _BitMask<_Np>(__x._M_data)._M_sanitized());
2709       }
2710 
2711     template <typename _Tp, typename _Up, size_t _Np>
2712       _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2713       _S_convert(_SimdWrapper<_Up, _Np> __x)
2714       {
2715 	if constexpr (__is_builtin_bitmask_abi<_Abi>())
2716 	  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(
2717 	    _SuperImpl::_S_to_bits(__x));
2718 	else
2719 	  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2720 						       _S_size<_Tp>>(__x);
2721       }
2722 
2723     template <typename _Tp, typename _Up, typename _UAbi>
2724       _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2725       _S_convert(simd_mask<_Up, _UAbi> __x)
2726       {
2727 	if constexpr (__is_builtin_bitmask_abi<_Abi>())
2728 	  {
2729 	    using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>;
2730 	    if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits
2731 	      return _R(__data(__x));
2732 	    else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits
2733 	      return _R(__data(__x));
2734 	    else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits
2735 	      return _R(__data(__x)._M_to_bits());
2736 	    else // vector -> bits
2737 	      return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits());
2738 	  }
2739 	else
2740 	  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2741 						       _S_size<_Tp>>(
2742 	    __data(__x));
2743       }
2744 
2745     // }}}
2746     // _S_masked_load {{{2
2747     template <typename _Tp, size_t _Np>
2748       static inline _SimdWrapper<_Tp, _Np>
2749       _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
2750 		     _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
2751       {
2752 	// AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity
2753 	auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge);
2754 	_BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask),
2755 				  [&](auto __i) {
2756 				    __tmp._M_set(__i, -__mem[__i]);
2757 				  });
2758 	__merge = __wrapper_bitcast<_Tp>(__tmp);
2759 	return __merge;
2760       }
2761 
2762     // _S_store {{{2
2763     template <typename _Tp, size_t _Np>
2764       _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __v,
2765 						   bool* __mem) noexcept
2766       {
2767 	__execute_n_times<_Np>([&](auto __i) constexpr {
2768 	  __mem[__i] = __v[__i];
2769 	});
2770       }
2771 
2772     // _S_masked_store {{{2
2773     template <typename _Tp, size_t _Np>
2774       static inline void
2775       _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
2776 		      const _SimdWrapper<_Tp, _Np> __k) noexcept
2777       {
2778 	_BitOps::_S_bit_iteration(
2779 	  _SuperImpl::_S_to_bits(__k), [&](auto __i) constexpr {
2780 	    __mem[__i] = __v[__i];
2781 	  });
2782       }
2783 
2784     // _S_from_bitmask{{{2
2785     template <size_t _Np, typename _Tp>
2786       _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2787       _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
2788       {
2789 	return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits);
2790       }
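    // Expands a compressed bitmask into the mask member type via
    // _S_to_maskvector; _TypeTag<_Tp> only transports the element type for
    // overload resolution.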
2791 
2792     // logical and bitwise operators {{{2
2793     template <typename _Tp, size_t _Np>
2794       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2795       _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x,
2796 		     const _SimdWrapper<_Tp, _Np>& __y)
2797       { return __and(__x._M_data, __y._M_data); }
2798 
2799     template <typename _Tp, size_t _Np>
2800       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2801       _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x,
2802 		    const _SimdWrapper<_Tp, _Np>& __y)
2803       { return __or(__x._M_data, __y._M_data); }
2804 
2805     template <typename _Tp, size_t _Np>
2806       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2807       _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
2808       {
2809 	if constexpr (_Abi::template _S_is_partial<_Tp>)
2810 	  return __andnot(__x, __wrapper_bitcast<_Tp>(
2811 				 _Abi::template _S_implicit_mask<_Tp>()));
2812 	else
2813 	  return __not(__x._M_data);
2814       }
2815 
2816     template <typename _Tp, size_t _Np>
2817       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2818       _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x,
2819 		 const _SimdWrapper<_Tp, _Np>& __y)
2820       { return __and(__x._M_data, __y._M_data); }
2821 
2822     template <typename _Tp, size_t _Np>
2823       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2824       _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x,
2825 		const _SimdWrapper<_Tp, _Np>& __y)
2826       { return __or(__x._M_data, __y._M_data); }
2827 
2828     template <typename _Tp, size_t _Np>
2829       _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2830       _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x,
2831 		 const _SimdWrapper<_Tp, _Np>& __y)
2832       { return __xor(__x._M_data, __y._M_data); }
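    // For vector masks the logical and bitwise operators coincide: every
    // element is either all-zeros or all-ones, so elementwise __and/__or
    // implement && and || exactly.  Only _S_bit_not needs care: on partial
    // ABIs the padding elements must stay zero, hence the __andnot with the
    // implicit mask instead of a plain complement.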
2833 
2834     // smart_reference access {{{2
2835     template <typename _Tp, size_t _Np>
2836       static constexpr void _S_set(_SimdWrapper<_Tp, _Np>& __k, int __i,
2837 				   bool __x) noexcept
2838       {
2839 	if constexpr (is_same_v<_Tp, bool>)
2840 	  __k._M_set(__i, __x);
2841 	else
2842 	  {
2843 	    static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
2844 	    if (__builtin_is_constant_evaluated())
2845 	      {
2846 		__k = __generate_from_n_evaluations<_Np,
2847 						    __vector_type_t<_Tp, _Np>>(
2848 		  [&](auto __j) {
2849 		    if (__i == static_cast<int>(__j))
2850 		      return _Tp(-__x);
2851 		    else
2852 		      return __k[+__j];
2853 		  });
2854 	      }
2855 	    else
2856 	      __k._M_data[__i] = -__x;
2857 	  }
2858       }
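    // In a constant-evaluated context the whole vector is regenerated,
    // presumably because writing a single element of a vector builtin is
    // not usable in a constant expression; at run time a single element
    // store suffices.  -__x again encodes true as an all-ones element.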
2859 
2860     // _S_masked_assign{{{2
2861     template <typename _Tp, size_t _Np>
2862       _GLIBCXX_SIMD_INTRINSIC static void
2863       _S_masked_assign(_SimdWrapper<_Tp, _Np> __k,
2864 		       _SimdWrapper<_Tp, _Np>& __lhs,
2865 		       __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2866       { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); }
2867 
2868     template <typename _Tp, size_t _Np>
2869       _GLIBCXX_SIMD_INTRINSIC static void
2870       _S_masked_assign(_SimdWrapper<_Tp, _Np> __k,
2871 		       _SimdWrapper<_Tp, _Np>& __lhs, bool __rhs)
2872       {
2873 	if (__builtin_constant_p(__rhs))
2874 	  {
2875 	    if (!__rhs)
2876 	      __lhs = __andnot(__k, __lhs);
2877 	    else
2878 	      __lhs = __or(__k, __lhs);
2879 	    return;
2880 	  }
2881 	__lhs = _CommonImpl::_S_blend(__k, __lhs,
2882 				      __data(simd_mask<_Tp, _Abi>(__rhs)));
2883       }
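    // If __rhs is a compile-time constant, the blend collapses into a
    // single mask operation: assigning false clears the selected lanes
    // (__andnot), assigning true sets them (__or).  Only a runtime bool
    // pays for broadcasting to a full mask and blending.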
2884 
2885     //}}}2
2886     // _S_all_of {{{
2887     template <typename _Tp>
2888       _GLIBCXX_SIMD_INTRINSIC static bool
2889       _S_all_of(simd_mask<_Tp, _Abi> __k)
2890       {
2891 	return __call_with_subscripts(
2892 	  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2893 	  [](const auto... __ent) constexpr { return (... && !(__ent == 0)); });
2894       }
2895 
2896     // }}}
2897     // _S_any_of {{{
2898     template <typename _Tp>
2899       _GLIBCXX_SIMD_INTRINSIC static bool
2900       _S_any_of(simd_mask<_Tp, _Abi> __k)
2901       {
2902 	return __call_with_subscripts(
2903 	  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2904 	  [](const auto... __ent) constexpr { return (... || !(__ent == 0)); });
2905       }
2906 
2907     // }}}
2908     // _S_none_of {{{
2909     template <typename _Tp>
2910       _GLIBCXX_SIMD_INTRINSIC static bool
2911       _S_none_of(simd_mask<_Tp, _Abi> __k)
2912       {
2913 	return __call_with_subscripts(
2914 	  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2915 	  [](const auto... __ent) constexpr { return (... && (__ent == 0)); });
2916       }
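    // The three predicates above fold directly over the mask elements
    // (a zero element is false, all-ones is true); e.g. _S_none_of tests
    // (__k[0] == 0) && ... && (__k[N-1] == 0).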
2917 
2918     // }}}
2919     // _S_some_of {{{
2920     template <typename _Tp>
2921       _GLIBCXX_SIMD_INTRINSIC static bool
2922       _S_some_of(simd_mask<_Tp, _Abi> __k)
2923       {
2924 	const int __n_true = _SuperImpl::_S_popcount(__k);
2925 	return __n_true > 0 && __n_true < int(_S_size<_Tp>);
2926       }
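    // "Some" means a strictly mixed mask: at least one, but not all, of
    // the elements are true.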
2927 
2928     // }}}
2929     // _S_popcount {{{
2930     template <typename _Tp>
2931       _GLIBCXX_SIMD_INTRINSIC static int
2932       _S_popcount(simd_mask<_Tp, _Abi> __k)
2933       {
2934 	using _I = __int_for_sizeof_t<_Tp>;
2935 	if constexpr (is_default_constructible_v<simd<_I, _Abi>>)
2936 	  return -reduce(
2937 	    simd<_I, _Abi>(__private_init, __wrapper_bitcast<_I>(__data(__k))));
2938 	else
2939 	  return -reduce(__bit_cast<rebind_simd_t<_I, simd<_Tp, _Abi>>>(
2940 	    simd<_Tp, _Abi>(__private_init, __data(__k))));
2941       }
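    // Every true element contributes -1 to the integer reduction, so
    // reduce(...) returns -(number of set elements) and the leading
    // negation yields the popcount; e.g. a 4-element mask
    // {true, false, true, true} reduces to -3, giving 3.  The __bit_cast
    // fallback presumably covers element types for which simd<_I, _Abi>
    // is not a valid (default-constructible) specialization.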
2942 
2943     // }}}
2944     // _S_find_first_set {{{
2945     template <typename _Tp>
2946       _GLIBCXX_SIMD_INTRINSIC static int
2947       _S_find_first_set(simd_mask<_Tp, _Abi> __k)
2948       {
2949 	return std::__countr_zero(
2950 	  _SuperImpl::_S_to_bits(__data(__k))._M_to_bits());
2951       }
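    // Compresses the mask to bits and counts trailing zeros, i.e. the
    // index of the lowest true element.  As for find_first_set, at least
    // one element of __k must be true.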
2952 
2953     // }}}
2954     // _S_find_last_set {{{
2955     template <typename _Tp>
2956       _GLIBCXX_SIMD_INTRINSIC static int
2957       _S_find_last_set(simd_mask<_Tp, _Abi> __k)
2958       {
2959 	return std::__bit_width(
2960 	  _SuperImpl::_S_to_bits(__data(__k))._M_to_bits()) - 1;
2961       }
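    // __bit_width(__x) is one plus the index of the highest set bit, so
    // subtracting one yields the index of the last true element (callers
    // must pass a mask with at least one element set).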
2962 
2963     // }}}
2964   };
2965 
2966 //}}}1
2967 _GLIBCXX_SIMD_END_NAMESPACE
2968 #endif // __cplusplus >= 201703L
2969 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
2970 
2971 // vim: foldmethod=marker foldmarker={{{,}}} sw=2 noet ts=8 sts=2 tw=100
2972