// Simd Abi specific implementations -*- C++ -*-

// Copyright (C) 2020-2022 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_

#if __cplusplus >= 201703L

#include <array>
#include <cmath>
#include <cstdlib>

_GLIBCXX_SIMD_BEGIN_NAMESPACE
// _S_allbits{{{
template <typename _V>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
    = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>());

// }}}
// _S_signmask, _S_absmask{{{
template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
    = __xor(_V() + 1, _V() - 1);

template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
    = __andnot(_S_signmask<_V>, _S_allbits<_V>);

//}}}
// __vector_permute<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
	  typename = __detail::__odr_helper>
  constexpr _Tp
  __vector_permute(_Tp __x)
  {
    static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
    return __make_vector<typename _TVT::value_type>(
	     (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...);
  }

// }}}
// __vector_shuffle<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
	  typename = __detail::__odr_helper>
  constexpr _Tp
  __vector_shuffle(_Tp __x, _Tp __y)
  {
    return _Tp{(_Indices == -1 ? 0
		: _Indices < _TVT::_S_full_size
		  ? __x[_Indices]
		  : __y[_Indices - _TVT::_S_full_size])...};
  }

// }}}
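// Illustrative usage (not part of the library): both helpers rearrange
// elements at compile time; assuming 4-element int vectors __x = {1, 2, 3, 4}
// and __y = {5, 6, 7, 8}:
//   __vector_permute<3, 2, 1, 0>(__x);      // {4, 3, 2, 1}
//   __vector_permute<-1, 0, 1, 2>(__x);     // {0, 1, 2, 3}; -1 zeroes the lane
//   __vector_shuffle<0, 4, 1, 5>(__x, __y); // {1, 5, 2, 6}; indices >= 4 pick from __y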
// __make_wrapper{{{
template <typename _Tp, typename... _Args>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
  __make_wrapper(const _Args&... __args)
  { return __make_vector<_Tp>(__args...); }

// }}}
// __wrapper_bitcast{{{
template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M,
	  size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
  __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
  {
    static_assert(_Np > 1);
    return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
  }

// }}}
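// For illustration only: __wrapper_bitcast reinterprets the bytes of a wrapper
// under a new element type, e.g. a _SimdWrapper<int, 4> viewed as
// _SimdWrapper<short, 8> via __wrapper_bitcast<short>(__x); _Np is deduced
// from the byte sizes unless _ToN is given explicitly.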
// __shift_elements_right{{{
// if (__shift % 2ⁿ == 0) => the low n Bytes are correct
template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _Tp
  __shift_elements_right(_Tp __v)
  {
    [[maybe_unused]] const auto __iv = __to_intrin(__v);
    static_assert(__shift <= sizeof(_Tp));
    if constexpr (__shift == 0)
      return __v;
    else if constexpr (__shift == sizeof(_Tp))
      return _Tp();
#if _GLIBCXX_SIMD_X86INTRIN // {{{
    else if constexpr (__have_sse && __shift == 8
		       && _TVT::template _S_is<float, 4>)
      return _mm_movehl_ps(__iv, __iv);
    else if constexpr (__have_sse2 && __shift == 8
		       && _TVT::template _S_is<double, 2>)
      return _mm_unpackhi_pd(__iv, __iv);
    else if constexpr (__have_sse2 && sizeof(_Tp) == 16)
      return reinterpret_cast<typename _TVT::type>(
	       _mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift));
    else if constexpr (__shift == 16 && sizeof(_Tp) == 32)
      {
	/*if constexpr (__have_avx && _TVT::template _S_is<double, 4>)
	  return _mm256_permute2f128_pd(__iv, __iv, 0x81);
	else if constexpr (__have_avx && _TVT::template _S_is<float, 8>)
	  return _mm256_permute2f128_ps(__iv, __iv, 0x81);
	else if constexpr (__have_avx)
	  return reinterpret_cast<typename _TVT::type>(
		   _mm256_permute2f128_si256(__iv, __iv, 0x81));
	else*/
	return __zero_extend(__hi128(__v));
      }
    else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16)
      {
	const auto __vll = __vector_bitcast<_LLong>(__v);
	return reinterpret_cast<typename _TVT::type>(
		 _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81),
				    __vll, __shift));
      }
    else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16)
      {
	const auto __vll = __vector_bitcast<_LLong>(__v);
	return reinterpret_cast<typename _TVT::type>(
		 __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift),
			  _mm_srli_si128(__hi128(__vll), __shift)));
      }
    else if constexpr (sizeof(_Tp) == 32 && __shift > 16)
      return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v)));
    else if constexpr (sizeof(_Tp) == 64 && __shift == 32)
      return __zero_extend(__hi256(__v));
    else if constexpr (__have_avx512f && sizeof(_Tp) == 64)
      {
	if constexpr (__shift >= 48)
	  return __zero_extend(
		   __shift_elements_right<__shift - 48>(__extract<3, 4>(__v)));
	else if constexpr (__shift >= 32)
	  return __zero_extend(
		   __shift_elements_right<__shift - 32>(__hi256(__v)));
	else if constexpr (__shift % 8 == 0)
	  return reinterpret_cast<typename _TVT::type>(
		   _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v),
				       __shift / 8));
	else if constexpr (__shift % 4 == 0)
	  return reinterpret_cast<typename _TVT::type>(
		   _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v),
				       __shift / 4));
	else if constexpr (__have_avx512bw && __shift < 16)
	  {
	    const auto __vll = __vector_bitcast<_LLong>(__v);
	    return reinterpret_cast<typename _TVT::type>(
		     _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9),
					__vll, __shift));
	  }
	else if constexpr (__have_avx512bw && __shift < 32)
	  {
	    const auto __vll = __vector_bitcast<_LLong>(__v);
	    return reinterpret_cast<typename _TVT::type>(
		     _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee),
					_mm512_shuffle_i32x4(__vll, __vll, 0xf9),
					__shift - 16));
	  }
	else
	  __assert_unreachable<_Tp>();
      }
    /*
    } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64)
      return __auto_bitcast(__extract<__shift / 16, 4>(__v));
    */
#endif // _GLIBCXX_SIMD_X86INTRIN }}}
    else
      {
	constexpr int __chunksize = __shift % 8 == 0   ? 8
				    : __shift % 4 == 0 ? 4
				    : __shift % 2 == 0 ? 2
						       : 1;
	auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v);
	using _Up = decltype(__w);
	return __intrin_bitcast<_Tp>(
		 __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>(
		   [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		     return _Up{__chunks...};
		   }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		     return __w[__shift / __chunksize + __i];
		   }));
      }
  }

// }}}
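// Illustrative example, not part of the library: the shift count is in bytes,
// not elements. E.g. for __vector_type_t<int, 4> __x = {1, 2, 3, 4},
//   __shift_elements_right<8>(__x);  // {3, 4, 0, 0}
// i.e. the 16-byte register is shifted right by 8 bytes and zero-filled.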
// __extract_part(_SimdWrapper<_Tp, _Np>) {{{
template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
  _SimdWrapper<_Tp, _Np / _Total * _Combine>
  __extract_part(const _SimdWrapper<_Tp, _Np> __x)
  {
    if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0)
      return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x);
    else
      {
	constexpr size_t __values_per_part = _Np / _Total;
	constexpr size_t __values_to_skip = _Index * __values_per_part;
	constexpr size_t __return_size = __values_per_part * _Combine;
	using _R = __vector_type_t<_Tp, __return_size>;
	static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp)
			<= sizeof(__x),
		      "out of bounds __extract_part");
	// the following assertion would ensure that no "padding" is read
	// static_assert(_Total >= _Index + _Combine, "_Total must be greater
	// than _Index");

	// static_assert(__return_size * _Total == _Np, "_Np must be divisible
	// by _Total");
	if (__x._M_is_constprop())
	  return __generate_from_n_evaluations<__return_size, _R>(
		   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		     return __x[__values_to_skip + __i];
		   });
	if constexpr (_Index == 0 && _Total == 1)
	  return __x;
	else if constexpr (_Index == 0)
	  return __intrin_bitcast<_R>(__as_vector(__x));
#if _GLIBCXX_SIMD_X86INTRIN // {{{
	else if constexpr (sizeof(__x) == 32
			   && __return_size * sizeof(_Tp) <= 16)
	  {
	    constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp);
	    if constexpr (__bytes_to_skip == 16)
	      return __vector_bitcast<_Tp, __return_size>(
		       __hi128(__as_vector(__x)));
	    else
	      return __vector_bitcast<_Tp, __return_size>(
		       _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)),
				       __lo128(__vector_bitcast<_LLong>(__x)),
				       __bytes_to_skip));
	  }
#endif // _GLIBCXX_SIMD_X86INTRIN }}}
	else if constexpr (_Index > 0
			   && (__values_to_skip % __return_size != 0
			       || sizeof(_R) >= 8)
			   && (__values_to_skip + __return_size) * sizeof(_Tp)
				<= 64
			   && sizeof(__x) >= 16)
	  return __intrin_bitcast<_R>(
		   __shift_elements_right<__values_to_skip * sizeof(_Tp)>(
		     __as_vector(__x)));
	else
	  {
	    _R __r = {};
	    __builtin_memcpy(&__r,
			     reinterpret_cast<const char*>(&__x)
			       + sizeof(_Tp) * __values_to_skip,
			     __return_size * sizeof(_Tp));
	    return __r;
	  }
      }
  }

// }}}
// __extract_part(_SimdWrapper<bool, _Np>) {{{
template <int _Index, int _Total, int _Combine = 1, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<bool, _Np / _Total * _Combine>
  __extract_part(const _SimdWrapper<bool, _Np> __x)
  {
    static_assert(_Combine == 1, "_Combine != 1 not implemented");
    static_assert(__have_avx512f && _Total >= 2 && _Index + _Combine <= _Total
		  && _Index >= 0);
    return __x._M_data >> (_Index * _Np / _Total);
  }

// }}}
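// Illustrative example, not part of the library: __extract_part<_Index,
// _Total, _Combine> views its argument as _Total equal parts and returns
// _Combine adjacent parts starting at _Index. For a _SimdWrapper<float, 8>
// __x holding {0, 1, 2, 3, 4, 5, 6, 7}:
//   __extract_part<1, 4, 1>(__x);  // {2, 3}
//   __extract_part<1, 4, 2>(__x);  // {2, 3, 4, 5}
// The _SimdWrapper<bool, _Np> overload above does the same on bitmasks by
// shifting the selected part down to bit 0.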

// __vector_convert {{{
// implementation requires an index sequence
template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d,
		   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, _From __h, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, _From __h, _From __i,
		   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
	       static_cast<_Tp>(__i[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, _From __h, _From __i, _From __j,
		   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
	       static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, _From __h, _From __i, _From __j,
		   _From __k, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
	       static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
	       static_cast<_Tp>(__k[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, _From __h, _From __i, _From __j,
		   _From __k, _From __l, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
	       static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
	       static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, _From __h, _From __i, _From __j,
		   _From __k, _From __l, _From __m, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
	       static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
	       static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
	       static_cast<_Tp>(__m[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, _From __h, _From __i, _From __j,
		   _From __k, _From __l, _From __m, _From __n,
		   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
	       static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
	       static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
	       static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, _From __h, _From __i, _From __j,
		   _From __k, _From __l, _From __m, _From __n, _From __o,
		   index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
	       static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
	       static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
	       static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
	       static_cast<_Tp>(__o[_I])...};
  }

template <typename _To, typename _From, size_t... _I>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
		   _From __f, _From __g, _From __h, _From __i, _From __j,
		   _From __k, _From __l, _From __m, _From __n, _From __o,
		   _From __p, index_sequence<_I...>)
  {
    using _Tp = typename _VectorTraits<_To>::value_type;
    return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
	       static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
	       static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
	       static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
	       static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
	       static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
	       static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
	       static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...};
  }

// Defer actual conversion to the overload that takes an index sequence. Note
// that this function adds zeros or drops values off the end if you don't
// ensure matching width.
template <typename _To, typename... _From, size_t _FromSize>
  _GLIBCXX_SIMD_INTRINSIC constexpr _To
  __vector_convert(_SimdWrapper<_From, _FromSize>... __xs)
  {
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
    using _From0 = __first_of_pack_t<_From...>;
    using _FW = _SimdWrapper<_From0, _FromSize>;
    if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop()))
      {
	if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1))
		      == 0) // power-of-two number of arguments
	  return __convert_x86<_To>(__as_vector(__xs)...);
	else // append zeros and recurse until the above branch is taken
	  return __vector_convert<_To>(__xs..., _FW{});
      }
    else
#endif
      return __vector_convert<_To>(
	       __as_vector(__xs)...,
	       make_index_sequence<(sizeof...(__xs) == 1
				      ? std::min(_VectorTraits<_To>::_S_full_size,
						 int(_FromSize))
				      : _FromSize)>());
  }

// }}}
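// Illustrative (not part of the library): the overload above only dispatches;
// the index-sequence overloads concatenate and convert. Assuming 4-element
// int and float vector types,
//   __vector_convert<__vector_type_t<float, 4>>(_SimdWrapper<int, 4>(__x));
// converts element-wise, and passing two _SimdWrapper<int, 2> arguments fills
// the 4-element destination from both inputs. As noted above, a too-narrow
// destination drops trailing values and a too-wide one is zero-filled.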
// __convert function{{{
template <typename _To, typename _From, typename... _More>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __convert(_From __v0, _More... __vs)
  {
    static_assert((true && ... && is_same_v<_From, _More>));
    if constexpr (__is_vectorizable_v<_From>)
      {
	using _V = typename _VectorTraits<_To>::type;
	using _Tp = typename _VectorTraits<_To>::value_type;
	return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...};
      }
    else if constexpr (__is_vector_type_v<_From>)
      return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...);
    else // _SimdWrapper arguments
      {
	constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More));
	if constexpr (__is_vectorizable_v<_To>)
	  return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...);
	else if constexpr (!__is_vector_type_v<_To>)
	  return _To(__convert<typename _To::_BuiltinType>(__v0, __vs...));
	else
	  {
	    static_assert(
	      sizeof...(_More) == 0
		|| _VectorTraits<_To>::_S_full_size >= __input_size,
	      "__convert(...) requires the input to fit into the output");
	    return __vector_convert<_To>(__v0, __vs...);
	  }
      }
  }

// }}}
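// For illustration only: __convert is the type-driven front end used by the
// load and store implementations below, e.g.
//   __convert<_SimdWrapper<short, 4>>(_SimdWrapper<int, 4>(__x));
// It accepts scalars, raw vector types and _SimdWrapper alike and forwards to
// __vector_convert once the destination is known to be wide enough.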
// __convert_all{{{
// Converts __v into array<_To, N>, where N is _NParts if non-zero or
// otherwise deduced from _To such that N * #elements(_To) <= #elements(__v).
// Note: this function may return fewer than all converted elements
template <typename _To,
	  size_t _NParts = 0, // allows converting fewer or more than all
			      // (only the last _To may be partially filled)
	  size_t _Offset = 0, // where to start, # of elements (not Bytes or
			      // Parts)
	  typename _From, typename _FromVT = _VectorTraits<_From>>
  _GLIBCXX_SIMD_INTRINSIC auto
  __convert_all(_From __v)
  {
    if constexpr (is_arithmetic_v<_To> && _NParts != 1)
      {
	static_assert(_Offset < _FromVT::_S_full_size);
	constexpr auto _Np
	  = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts;
	return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
		 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		   return static_cast<_To>(__v[__i + _Offset]);
		 });
      }
    else
      {
	static_assert(__is_vector_type_v<_To>);
	using _ToVT = _VectorTraits<_To>;
	if constexpr (__is_vector_type_v<_From>)
	  return __convert_all<_To, _NParts>(__as_wrapper(__v));
	else if constexpr (_NParts == 1)
	  {
	    static_assert(_Offset % _ToVT::_S_full_size == 0);
	    return array<_To, 1>{__vector_convert<_To>(
	      __extract_part<_Offset / _ToVT::_S_full_size,
			     __div_roundup(_FromVT::_S_partial_width,
					   _ToVT::_S_full_size)>(__v))};
	  }
#if _GLIBCXX_SIMD_X86INTRIN // {{{
	else if constexpr (!__have_sse4_1 && _Offset == 0
			   && is_integral_v<typename _FromVT::value_type>
			   && sizeof(typename _FromVT::value_type)
				< sizeof(typename _ToVT::value_type)
			   && !(sizeof(typename _FromVT::value_type) == 4
				&& is_same_v<typename _ToVT::value_type, double>))
	  {
	    using _ToT = typename _ToVT::value_type;
	    using _FromT = typename _FromVT::value_type;
	    constexpr size_t _Np
	      = _NParts != 0
		  ? _NParts
		  : (_FromVT::_S_partial_width / _ToVT::_S_full_size);
	    using _R = array<_To, _Np>;
	    // __adjust modifies its input to have _Np (use _SizeConstant)
	    // entries so that no unnecessary intermediate conversions are
	    // requested and, more importantly, no intermediate conversions are
	    // missing
	    [[maybe_unused]] auto __adjust
	      = [](auto __n,
		   auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> {
	      return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
	    };
	    [[maybe_unused]] const auto __vi = __to_intrin(__v);
	    auto&& __make_array
	      = [](auto __x0, [[maybe_unused]] auto __x1)
		  _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		  if constexpr (_Np == 1)
		    return _R{__intrin_bitcast<_To>(__x0)};
		  else
		    return _R{__intrin_bitcast<_To>(__x0),
			      __intrin_bitcast<_To>(__x1)};
		};

	    if constexpr (_Np == 0)
	      return _R{};
	    else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2)
	      {
		static_assert(is_integral_v<_FromT>);
		static_assert(is_integral_v<_ToT>);
		if constexpr (is_unsigned_v<_FromT>)
		  return __make_array(_mm_unpacklo_epi8(__vi, __m128i()),
				      _mm_unpackhi_epi8(__vi, __m128i()));
		else
		  return __make_array(
			   _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8),
			   _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8));
	      }
	    else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4)
	      {
		static_assert(is_integral_v<_FromT>);
		if constexpr (is_floating_point_v<_ToT>)
		  {
		    const auto __ints
		      = __convert_all<__vector_type16_t<int>, _Np>(
			  __adjust(_SizeConstant<_Np * 4>(), __v));
		    return __generate_from_n_evaluations<_Np, _R>(
			     [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
			       return __vector_convert<_To>(__as_wrapper(__ints[__i]));
			     });
		  }
		else if constexpr (is_unsigned_v<_FromT>)
		  return __make_array(_mm_unpacklo_epi16(__vi, __m128i()),
				      _mm_unpackhi_epi16(__vi, __m128i()));
		else
		  return __make_array(
			   _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16),
			   _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16));
	      }
	    else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8
			       && is_integral_v<_FromT> && is_integral_v<_ToT>)
	      {
		if constexpr (is_unsigned_v<_FromT>)
		  return __make_array(_mm_unpacklo_epi32(__vi, __m128i()),
				      _mm_unpackhi_epi32(__vi, __m128i()));
		else
		  return __make_array(
			   _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)),
			   _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31)));
	      }
	    else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8
			       && is_integral_v<_FromT> && is_integral_v<_ToT>)
	      {
		if constexpr (is_unsigned_v<_FromT>)
		  return __make_array(_mm_unpacklo_epi32(__vi, __m128i()),
				      _mm_unpackhi_epi32(__vi, __m128i()));
		else
		  return __make_array(
			   _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)),
			   _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31)));
	      }
	    else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4
			       && is_signed_v<_FromT>)
	      {
		const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi),
					 _mm_unpackhi_epi8(__vi, __vi)};
		const __vector_type_t<int, 4> __vvvv[4] = {
		  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[0], __vv[0])),
		  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[0], __vv[0])),
		  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
		  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
		if constexpr (sizeof(_ToT) == 4)
		  return __generate_from_n_evaluations<_Np, _R>(
			   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
			     return __vector_convert<_To>(
				      _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
			   });
		else if constexpr (is_integral_v<_ToT>)
		  return __generate_from_n_evaluations<_Np, _R>(
			   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
			     const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
			     const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
			     return __vector_bitcast<_ToT>(
				      __i % 2 == 0
					? _mm_unpacklo_epi32(__sx32, __signbits)
					: _mm_unpackhi_epi32(__sx32, __signbits));
			   });
		else
		  return __generate_from_n_evaluations<_Np, _R>(
			   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
			     const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
			     return __vector_convert<_To>(
				      __i % 2 == 0
					? __int4
					: _SimdWrapper<int, 4>(
					    _mm_unpackhi_epi64(__to_intrin(__int4),
							       __to_intrin(__int4))));
			   });
	      }
	    else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4)
	      {
		const auto __shorts = __convert_all<__vector_type16_t<
		  conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
		  __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v));
		return __generate_from_n_evaluations<_Np, _R>(
			 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
			   return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
			 });
	      }
	    else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8
			       && is_signed_v<_FromT> && is_integral_v<_ToT>)
	      {
		const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi),
					 _mm_unpackhi_epi16(__vi, __vi)};
		const __vector_type16_t<int> __vvvv[4]
		  = {__vector_bitcast<int>(
		       _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16),
					  _mm_srai_epi32(__vv[0], 31))),
		     __vector_bitcast<int>(
		       _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16),
					  _mm_srai_epi32(__vv[0], 31))),
		     __vector_bitcast<int>(
		       _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16),
					  _mm_srai_epi32(__vv[1], 31))),
		     __vector_bitcast<int>(
		       _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
					  _mm_srai_epi32(__vv[1], 31)))};
		return __generate_from_n_evaluations<_Np, _R>(
			 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
			   return __vector_bitcast<_ToT>(__vvvv[__i]);
			 });
	      }
	    else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8)
	      {
		const auto __ints
		  = __convert_all<__vector_type16_t<conditional_t<
		      is_signed_v<_FromT> || is_floating_point_v<_ToT>, int,
		      unsigned int>>>(
		    __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v));
		return __generate_from_n_evaluations<_Np, _R>(
			 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
			   return __convert_all<_To>(__ints[__i / 2])[__i % 2];
			 });
	      }
	    else
	      __assert_unreachable<_To>();
	  }
#endif // _GLIBCXX_SIMD_X86INTRIN }}}
	else if constexpr ((_FromVT::_S_partial_width - _Offset)
			     > _ToVT::_S_full_size)
	  {
	    /*
	    static_assert(
	      (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) == 0,
	      "__convert_all only supports power-of-2 number of elements. "
	      "Otherwise the return type cannot be array<_To, N>.");
	    */
	    constexpr size_t _NTotal
	      = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size;
	    constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts;
	    static_assert(
	      _Np <= _NTotal
	      || (_Np == _NTotal + 1
		  && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size
		       > 0));
	    using _R = array<_To, _Np>;
	    if constexpr (_Np == 1)
	      return _R{__vector_convert<_To>(
		__extract_part<_Offset, _FromVT::_S_partial_width,
			       _ToVT::_S_full_size>(__v))};
	    else
	      return __generate_from_n_evaluations<_Np, _R>(
		       [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
			 auto __part
			   = __extract_part<__i * _ToVT::_S_full_size + _Offset,
					    _FromVT::_S_partial_width,
					    _ToVT::_S_full_size>(__v);
			 return __vector_convert<_To>(__part);
		       });
	  }
	else if constexpr (_Offset == 0)
	  return array<_To, 1>{__vector_convert<_To>(__v)};
	else
	  return array<_To, 1>{__vector_convert<_To>(
	    __extract_part<_Offset, _FromVT::_S_partial_width,
			   _FromVT::_S_partial_width - _Offset>(__v))};
      }
  }

// }}}

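// Illustrative (not part of the library): __convert_all splits a wide
// conversion into an array of destination vectors. Assuming a
// _SimdWrapper<char, 16> __v and _V4 = __vector_type_t<int, 4>:
//   __convert_all<_V4>(__v);     // array<_V4, 4>, one part per 4 chars
//   __convert_all<_V4, 2>(__v);  // only the first two parts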
// _GnuTraits {{{
template <typename _Tp, typename _Mp, typename _Abi, size_t _Np>
  struct _GnuTraits
  {
    using _IsValid = true_type;
    using _SimdImpl = typename _Abi::_SimdImpl;
    using _MaskImpl = typename _Abi::_MaskImpl;

    // simd and simd_mask member types {{{
    using _SimdMember = _SimdWrapper<_Tp, _Np>;
    using _MaskMember = _SimdWrapper<_Mp, _Np>;
    static constexpr size_t _S_simd_align = alignof(_SimdMember);
    static constexpr size_t _S_mask_align = alignof(_MaskMember);

    // }}}
    // size metadata {{{
    static constexpr size_t _S_full_size = _SimdMember::_S_full_size;
    static constexpr bool _S_is_partial = _SimdMember::_S_is_partial;

    // }}}
    // _SimdBase / base class for simd, providing extra conversions {{{
    struct _SimdBase2
    {
      _GLIBCXX_SIMD_ALWAYS_INLINE explicit
      operator __intrinsic_type_t<_Tp, _Np>() const
      { return __to_intrin(static_cast<const simd<_Tp, _Abi>*>(this)->_M_data); }

      _GLIBCXX_SIMD_ALWAYS_INLINE explicit
      operator __vector_type_t<_Tp, _Np>() const
      { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
    };

    struct _SimdBase1
    {
      _GLIBCXX_SIMD_ALWAYS_INLINE explicit
      operator __intrinsic_type_t<_Tp, _Np>() const
      { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
    };

    using _SimdBase = conditional_t<
      is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
      _SimdBase1, _SimdBase2>;

    // }}}
    // _MaskBase {{{
    struct _MaskBase2
    {
      _GLIBCXX_SIMD_ALWAYS_INLINE explicit
      operator __intrinsic_type_t<_Tp, _Np>() const
      { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data.__intrin(); }

      _GLIBCXX_SIMD_ALWAYS_INLINE explicit
      operator __vector_type_t<_Tp, _Np>() const
      { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data._M_data; }
    };

    struct _MaskBase1
    {
      _GLIBCXX_SIMD_ALWAYS_INLINE explicit
      operator __intrinsic_type_t<_Tp, _Np>() const
      { return __data(*static_cast<const simd_mask<_Tp, _Abi>*>(this)); }
    };

    using _MaskBase = conditional_t<
      is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
      _MaskBase1, _MaskBase2>;

    // }}}
    // _MaskCastType {{{
    // parameter type of one explicit simd_mask constructor
    class _MaskCastType
    {
      using _Up = __intrinsic_type_t<_Tp, _Np>;
      _Up _M_data;

    public:
      _GLIBCXX_SIMD_ALWAYS_INLINE
      _MaskCastType(_Up __x) : _M_data(__x) {}

      _GLIBCXX_SIMD_ALWAYS_INLINE
      operator _MaskMember() const { return _M_data; }
    };

    // }}}
    // _SimdCastType {{{
    // parameter type of one explicit simd constructor
    class _SimdCastType1
    {
      using _Ap = __intrinsic_type_t<_Tp, _Np>;
      _SimdMember _M_data;

    public:
      _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
      _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}

      _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
      operator _SimdMember() const { return _M_data; }
    };

    class _SimdCastType2
    {
      using _Ap = __intrinsic_type_t<_Tp, _Np>;
      using _Bp = __vector_type_t<_Tp, _Np>;
      _SimdMember _M_data;

    public:
      _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
      _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}

      _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
      _SimdCastType2(_Bp __b) : _M_data(__b) {}

      _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
      operator _SimdMember() const { return _M_data; }
    };

    using _SimdCastType = conditional_t<
      is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
      _SimdCastType1, _SimdCastType2>;
    //}}}
  };

// }}}
struct _CommonImplX86;
struct _CommonImplNeon;
struct _CommonImplBuiltin;
template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplBuiltin;
template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplBuiltin;
template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplX86;
template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplX86;
template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplNeon;
template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplNeon;
template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplPpc;
template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplPpc;

// simd_abi::_VecBuiltin {{{
template <int _UsedBytes>
  struct simd_abi::_VecBuiltin
  {
    template <typename _Tp>
      static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);

    // validity traits {{{
    struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};

    template <typename _Tp>
      struct _IsValidSizeFor
	: __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
			   && _UsedBytes % sizeof(_Tp) == 0
			   && _UsedBytes <= __vectorized_sizeof<_Tp>()
			   && (!__have_avx512f || _UsedBytes <= 32))> {};

    template <typename _Tp>
      struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>,
				    _IsValidSizeFor<_Tp>> {};

    template <typename _Tp>
      static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;

    // }}}
    // _SimdImpl/_MaskImpl {{{
#if _GLIBCXX_SIMD_X86INTRIN
    using _CommonImpl = _CommonImplX86;
    using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>;
    using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>;
#elif _GLIBCXX_SIMD_HAVE_NEON
    using _CommonImpl = _CommonImplNeon;
    using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>;
    using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>;
#else
    using _CommonImpl = _CommonImplBuiltin;
#ifdef __ALTIVEC__
    using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>;
    using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>;
#else
    using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>;
    using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>;
#endif
#endif

    // }}}
    // __traits {{{
    template <typename _Tp>
      using _MaskValueType = __int_for_sizeof_t<_Tp>;

    template <typename _Tp>
      using __traits
	= conditional_t<_S_is_valid_v<_Tp>,
			_GnuTraits<_Tp, _MaskValueType<_Tp>,
				   _VecBuiltin<_UsedBytes>, _S_size<_Tp>>,
			_InvalidTraits>;

    //}}}
    // size metadata {{{
    template <typename _Tp>
      static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;

    template <typename _Tp>
      static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;

    // }}}
    // implicit masks {{{
    template <typename _Tp>
      using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>;

    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_implicit_mask()
      {
	using _UV = typename _MaskMember<_Tp>::_BuiltinType;
	if constexpr (!_MaskMember<_Tp>::_S_is_partial)
	  return ~_UV();
	else
	  {
	    constexpr auto __size = _S_size<_Tp>;
	    _GLIBCXX_SIMD_USE_CONSTEXPR auto __r
	      = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
				       { return __i < __size ? -1 : 0; });
	    return __r;
	  }
      }

    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>>
      _S_implicit_mask_intrin()
      { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); }

    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _TW
      _S_masked(_TW __x)
      {
	using _Tp = typename _TVT::value_type;
	if constexpr (!_MaskMember<_Tp>::_S_is_partial)
	  return __x;
	else
	  return __and(__as_vector(__x),
		       __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()));
      }

    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr auto
      __make_padding_nonzero(_TW __x)
      {
	using _Tp = typename _TVT::value_type;
	if constexpr (!_S_is_partial<_Tp>)
	  return __x;
	else
	  {
	    _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask
	      = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>());
	    if constexpr (is_integral_v<_Tp>)
	      return __or(__x, ~__implicit_mask);
	    else
	      {
		_GLIBCXX_SIMD_USE_CONSTEXPR auto __one
		  = __andnot(__implicit_mask,
			     __vector_broadcast<_S_full_size<_Tp>>(_Tp(1)));
		// it's not enough to return `x | 1_in_padding` because the
		// padding in x might be inf or nan (independent of
		// __FINITE_MATH_ONLY__, because it's about padding bits)
		return __or(__and(__x, __implicit_mask), __one);
	      }
	  }
      }
    // }}}
  };

// }}}
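// For illustration only: for a partial ABI such as _VecBuiltin<12> with
// _Tp = int (3 used elements in a 4-element register), _S_implicit_mask<int>()
// is {-1, -1, -1, 0}. __make_padding_nonzero replaces the unused lane with 1
// so that full-register operations (e.g. division) cannot fault or raise FP
// exceptions on padding that might hold 0, inf or NaN.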
// simd_abi::_VecBltnBtmsk {{{
template <int _UsedBytes>
  struct simd_abi::_VecBltnBtmsk
  {
    template <typename _Tp>
      static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);

    // validity traits {{{
    struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};

    template <typename _Tp>
      struct _IsValidSizeFor
	: __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
			   && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64
			   && (_UsedBytes > 32 || __have_avx512vl))> {};

    // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4, AVX512BW is also
    // required.
    template <typename _Tp>
      struct _IsValid
	: conjunction<
	    _IsValidAbiTag, __bool_constant<__have_avx512f>,
	    __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>,
	    __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>,
	    _IsValidSizeFor<_Tp>> {};

    template <typename _Tp>
      static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;

    // }}}
    // simd/_MaskImpl {{{
#if _GLIBCXX_SIMD_X86INTRIN
    using _CommonImpl = _CommonImplX86;
    using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>;
    using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>;
#else
    template <int>
      struct _MissingImpl;

    using _CommonImpl = _MissingImpl<_UsedBytes>;
    using _SimdImpl = _MissingImpl<_UsedBytes>;
    using _MaskImpl = _MissingImpl<_UsedBytes>;
#endif

    // }}}
    // __traits {{{
    template <typename _Tp>
      using _MaskMember = _SimdWrapper<bool, _S_size<_Tp>>;

    template <typename _Tp>
      using __traits = conditional_t<
	_S_is_valid_v<_Tp>,
	_GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>,
	_InvalidTraits>;

    //}}}
    // size metadata {{{
    template <typename _Tp>
      static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
    template <typename _Tp>
      static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;

    // }}}
    // implicit mask {{{
  private:
    template <typename _Tp>
      using _ImplicitMask = _SimdWrapper<bool, _S_size<_Tp>>;

  public:
    template <size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np>
      __implicit_mask_n()
      {
	using _Tp = __bool_storage_member_type_t<_Np>;
	return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp();
      }

    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp>
      _S_implicit_mask()
      { return __implicit_mask_n<_S_size<_Tp>>(); }

    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>>
      _S_implicit_mask_intrin()
      { return __implicit_mask_n<_S_size<_Tp>>(); }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_masked(_SimdWrapper<_Tp, _Np> __x)
      {
	if constexpr (is_same_v<_Tp, bool>)
	  if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0)
	    return _MaskImpl::_S_bit_and(
		     __x, _SimdWrapper<_Tp, _Np>(
			    __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1)));
	  else
	    return __x;
	else
	  return _S_masked(__x._M_data);
      }

    template <typename _TV>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
      _S_masked(_TV __x)
      {
	using _Tp = typename _VectorTraits<_TV>::value_type;
	static_assert(
	  !__is_bitmask_v<_TV>,
	  "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't "
	  "know the number of elements. Use _SimdWrapper<bool, N> instead.");
	if constexpr (_S_is_partial<_Tp>)
	  {
	    constexpr size_t _Np = _S_size<_Tp>;
	    return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
		     _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(),
		     _SimdWrapper<_Tp, _Np>(__x));
	  }
	else
	  return __x;
      }

    template <typename _TV, typename _TVT = _VectorTraits<_TV>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr auto
      __make_padding_nonzero(_TV __x)
      {
	using _Tp = typename _TVT::value_type;
	if constexpr (!_S_is_partial<_Tp>)
	  return __x;
	else
	  {
	    constexpr size_t _Np = _S_size<_Tp>;
	    if constexpr (is_integral_v<typename _TVT::value_type>)
	      return __x
		     | __generate_vector<_Tp, _S_full_size<_Tp>>(
			 [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp {
			   if (__i < _Np)
			     return 0;
			   else
			     return 1;
			 });
	    else
	      return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
		       _S_implicit_mask<_Tp>(),
		       _SimdWrapper<_Tp, _Np>(
			 __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))),
		       _SimdWrapper<_Tp, _Np>(__x))
		     ._M_data;
	  }
      }

    // }}}
  };

//}}}
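// Illustrative (not part of the library): with the bitmask ABI the implicit
// mask is simply the low _S_size bits set, e.g.
//   __implicit_mask_n<3>();  // 0b0000'0111 in a __mmask8
//   __implicit_mask_n<8>();  // 0xff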
// _CommonImplBuiltin {{{
struct _CommonImplBuiltin
{
  // _S_converts_via_decomposition{{{
  // This lists all cases where a __vector_convert needs to fall back to
  // conversion of individual scalars (i.e. decompose the input vector into
  // scalars, convert, compose output vector). In those cases, _S_masked_load &
  // _S_masked_store prefer to use the _S_bit_iteration implementation.
  template <typename _From, typename _To, size_t _ToSize>
    static inline constexpr bool __converts_via_decomposition_v
      = sizeof(_From) != sizeof(_To);

  // }}}
  // _S_load{{{
  template <typename _Tp, size_t _Np, size_t _Bytes = _Np * sizeof(_Tp)>
    _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np>
    _S_load(const void* __p)
    {
      static_assert(_Np > 1);
      static_assert(_Bytes % sizeof(_Tp) == 0);
      using _Rp = __vector_type_t<_Tp, _Np>;
      if constexpr (sizeof(_Rp) == _Bytes)
	{
	  _Rp __r;
	  __builtin_memcpy(&__r, __p, _Bytes);
	  return __r;
	}
      else
	{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424
	  using _Up = conditional_t<
	    is_integral_v<_Tp>,
	    conditional_t<_Bytes % 4 == 0,
			  conditional_t<_Bytes % 8 == 0, long long, int>,
			  conditional_t<_Bytes % 2 == 0, short, signed char>>,
	    conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp,
			  double>>;
	  using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>;
	  if constexpr (sizeof(_V) != sizeof(_Rp))
	    { // on i386 with 4 < _Bytes <= 8
	      _Rp __r{};
	      __builtin_memcpy(&__r, __p, _Bytes);
	      return __r;
	    }
	  else
#else // _GLIBCXX_SIMD_WORKAROUND_PR90424
	  using _V = _Rp;
#endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
	    {
	      _V __r{};
	      static_assert(_Bytes <= sizeof(_V));
	      __builtin_memcpy(&__r, __p, _Bytes);
	      return reinterpret_cast<_Rp>(__r);
	    }
	}
    }

  // }}}
  // _S_store {{{
  template <size_t _Bytes>
    _GLIBCXX_SIMD_INTRINSIC static void
    _S_memcpy(char* __dst, const char* __src)
    {
      if constexpr (_Bytes > 0)
	{
	  constexpr size_t _Ns = std::__bit_floor(_Bytes);
	  __builtin_memcpy(__dst, __src, _Ns);
	  _S_memcpy<_Bytes - _Ns>(__dst + _Ns, __src + _Ns);
	}
    }
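  // Illustrative (not part of the library): _S_memcpy decomposes an odd byte
  // count into power-of-two chunks so that each memcpy can become a single
  // load/store, e.g. _S_memcpy<7> copies 4 + 2 + 1 bytes.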

  template <size_t _ReqBytes = 0, typename _TV>
    _GLIBCXX_SIMD_INTRINSIC static void
    _S_store(_TV __x, void* __addr)
    {
      constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes;
      static_assert(sizeof(__x) >= _Bytes);

#if !defined __clang__ && _GLIBCXX_SIMD_WORKAROUND_PR90424
      if constexpr (__is_vector_type_v<_TV>)
	_S_memcpy<_Bytes>(reinterpret_cast<char*>(__addr),
			  reinterpret_cast<const char*>(&__x));
      else
#endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
	__builtin_memcpy(__addr, &__x, _Bytes);
    }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static void
    _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
    { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); }

  // }}}
  // _S_store_bool_array(_BitMask) {{{
  template <size_t _Np, bool _Sanitized>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
    {
      if constexpr (_Np == 1)
	__mem[0] = __x[0];
      else if (__builtin_is_constant_evaluated())
	{
	  for (size_t __i = 0; __i < _Np; ++__i)
	    __mem[__i] = __x[__i];
	}
      else if constexpr (_Np == 2)
	{
	  short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101;
	  _S_store<_Np>(__bool2, __mem);
	}
      else if constexpr (_Np == 3)
	{
	  int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101;
	  _S_store<_Np>(__bool3, __mem);
	}
      else
	{
	  __execute_n_times<__div_roundup(_Np, 4)>(
	    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	      constexpr int __offset = __i * 4;
	      constexpr int __remaining = _Np - __offset;
	      if constexpr (__remaining > 4 && __remaining <= 7)
		{
		  const _ULLong __bool7
		    = (__x.template _M_extract<__offset>()._M_to_bits()
			 * 0x40810204081ULL)
			& 0x0101010101010101ULL;
		  _S_store<__remaining>(__bool7, __mem + __offset);
		}
	      else if constexpr (__remaining >= 4)
		{
		  int __bits = __x.template _M_extract<__offset>()._M_to_bits();
		  if constexpr (__remaining > 7)
		    __bits &= 0xf;
		  const int __bool4 = (__bits * 0x204081) & 0x01010101;
		  _S_store<4>(__bool4, __mem + __offset);
		}
	    });
	}
    }

  // }}}
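  // Illustrative (not part of the library): the multiplications above spread
  // mask bits into bytes. For 4 bits, __bits * 0x204081 places bit k at bit
  // position 8 * k; masking with 0x01010101 then leaves one 0/1 byte per
  // element, which is written back with a single 4-byte store.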
  // _S_blend{{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr auto
    _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
	     _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
    { return __k._M_data ? __at1._M_data : __at0._M_data; }

  // }}}
};

// }}}
// _SimdImplBuiltin {{{1
template <typename _Abi, typename>
  struct _SimdImplBuiltin
  {
    // member types {{{2
    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    using abi_type = _Abi;

    template <typename _Tp>
      using _TypeTag = _Tp*;

    template <typename _Tp>
      using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;

    template <typename _Tp>
      using _MaskMember = typename _Abi::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;

    using _CommonImpl = typename _Abi::_CommonImpl;
    using _SuperImpl = typename _Abi::_SimdImpl;
    using _MaskImpl = typename _Abi::_MaskImpl;

    // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
      _M_make_simd(_SimdWrapper<_Tp, _Np> __x)
      { return {__private_init, __x}; }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
      _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x)
      { return {__private_init, __vector_bitcast<_Tp>(__x)}; }

    // _S_broadcast {{{2
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
      _S_broadcast(_Tp __x) noexcept
      { return __vector_broadcast<_S_full_size<_Tp>>(__x); }

    // _S_generator {{{2
    template <typename _Fp, typename _Tp>
      inline static constexpr _SimdMember<_Tp>
      _S_generator(_Fp&& __gen, _TypeTag<_Tp>)
      {
	return __generate_vector<_Tp, _S_full_size<_Tp>>(
		 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		   if constexpr (__i < _S_size<_Tp>)
		     return __gen(__i);
		   else
		     return 0;
		 });
      }

__mem[__i] : 0); 1470 }); 1471 else if constexpr (is_same_v<_Up, _Tp>) 1472 return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>, 1473 _Np * sizeof(_Tp)>(__mem); 1474 else if constexpr (__bytes_to_load <= __max_load_size) 1475 return __convert<_SimdMember<_Tp>>( 1476 _CommonImpl::template _S_load<_Up, _Np>(__mem)); 1477 else if constexpr (__bytes_to_load % __max_load_size == 0) 1478 { 1479 constexpr size_t __n_loads = __bytes_to_load / __max_load_size; 1480 constexpr size_t __elements_per_load = _Np / __n_loads; 1481 return __call_with_n_evaluations<__n_loads>( 1482 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1483 return __convert<_SimdMember<_Tp>>(__uncvted...); 1484 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1485 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1486 __mem + __i * __elements_per_load); 1487 }); 1488 } 1489 else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0 1490 && __max_load_size > 16) 1491 { // e.g. int[] -> <char, 12> with AVX2 1492 constexpr size_t __n_loads 1493 = __bytes_to_load / (__max_load_size / 2); 1494 constexpr size_t __elements_per_load = _Np / __n_loads; 1495 return __call_with_n_evaluations<__n_loads>( 1496 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1497 return __convert<_SimdMember<_Tp>>(__uncvted...); 1498 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1499 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1500 __mem + __i * __elements_per_load); 1501 }); 1502 } 1503 else // e.g. int[] -> <char, 9> 1504 return __call_with_subscripts( 1505 __mem, make_index_sequence<_Np>(), 1506 [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1507 return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...}; 1508 }); 1509 } 1510 1511 // _S_masked_load {{{2 1512 template <typename _Tp, size_t _Np, typename _Up> 1513 static constexpr inline _SimdWrapper<_Tp, _Np> _S_masked_load_SimdImplBuiltin1514 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, 1515 const _Up* __mem) noexcept 1516 { 1517 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1518 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1519 __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); 1520 }); 1521 return __merge; 1522 } 1523 1524 // _S_store {{{2 1525 template <typename _Tp, typename _Up> 1526 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_store_SimdImplBuiltin1527 _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept 1528 { 1529 // TODO: converting int -> "smaller int" can be optimized with AVX512 1530 constexpr size_t _Np = _S_size<_Tp>; 1531 constexpr size_t __max_store_size 1532 = _SuperImpl::template _S_max_store_size<_Up>; 1533 if (__builtin_is_constant_evaluated()) 1534 { 1535 for (size_t __i = 0; __i < _Np; ++__i) 1536 __mem[__i] = __v[__i]; 1537 } 1538 else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up)) 1539 __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1540 __mem[__i] = __v[__i]; 1541 }); 1542 else if constexpr (is_same_v<_Up, _Tp>) 1543 _CommonImpl::_S_store(__v, __mem); 1544 else if constexpr (sizeof(_Up) * _Np <= __max_store_size) 1545 _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)), 1546 __mem); 1547 else 1548 { 1549 constexpr size_t __vsize = __max_store_size / sizeof(_Up); 1550 // round up to convert the last partial vector as well: 1551 constexpr size_t __stores = __div_roundup(_Np, __vsize); 1552 constexpr size_t __full_stores = _Np / __vsize; 1553 
using _V = __vector_type_t<_Up, __vsize>; 1554 const array<_V, __stores> __converted 1555 = __convert_all<_V, __stores>(__v); 1556 __execute_n_times<__full_stores>( 1557 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1558 _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize); 1559 }); 1560 if constexpr (__full_stores < __stores) 1561 _CommonImpl::template _S_store<(_Np - __full_stores * __vsize) 1562 * sizeof(_Up)>( 1563 __converted[__full_stores], __mem + __full_stores * __vsize); 1564 } 1565 } 1566 1567 // _S_masked_store_nocvt {{{2 1568 template <typename _Tp, size_t _Np> 1569 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_masked_store_nocvt_SimdImplBuiltin1570 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k) 1571 { 1572 _BitOps::_S_bit_iteration( 1573 _MaskImpl::_S_to_bits(__k), 1574 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1575 __mem[__i] = __v[__i]; 1576 }); 1577 } 1578 1579 // _S_masked_store {{{2 1580 template <typename _TW, typename _TVT = _VectorTraits<_TW>, 1581 typename _Tp = typename _TVT::value_type, typename _Up> 1582 static constexpr inline void _S_masked_store_SimdImplBuiltin1583 _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept 1584 { 1585 constexpr size_t _TV_size = _S_size<_Tp>; 1586 [[maybe_unused]] const auto __vi = __to_intrin(__v); 1587 constexpr size_t __max_store_size 1588 = _SuperImpl::template _S_max_store_size<_Up>; 1589 if constexpr ( 1590 is_same_v< 1591 _Tp, 1592 _Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up))) 1593 { 1594 // bitwise or no conversion, reinterpret: 1595 const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1596 if constexpr (__is_bitmask_v<decltype(__k)>) 1597 return _MaskMember<_Up>(__k._M_data); 1598 else 1599 return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k); 1600 }(); 1601 _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v), 1602 __mem, __kk); 1603 } 1604 else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up) 1605 && !_CommonImpl:: 1606 template __converts_via_decomposition_v< 1607 _Tp, _Up, __max_store_size>) 1608 { // conversion via decomposition is better handled via the 1609 // bit_iteration 1610 // fallback below 1611 constexpr size_t _UW_size 1612 = std::min(_TV_size, __max_store_size / sizeof(_Up)); 1613 static_assert(_UW_size <= _TV_size); 1614 using _UW = _SimdWrapper<_Up, _UW_size>; 1615 using _UV = __vector_type_t<_Up, _UW_size>; 1616 using _UAbi = simd_abi::deduce_t<_Up, _UW_size>; 1617 if constexpr (_UW_size == _TV_size) // one convert+store 1618 { 1619 const _UW __converted = __convert<_UW>(__v); 1620 _UAbi::_SimdImpl::_S_masked_store_nocvt( 1621 __converted, __mem, 1622 _UAbi::_MaskImpl::template _S_convert< 1623 __int_for_sizeof_t<_Up>>(__k)); 1624 } 1625 else 1626 { 1627 static_assert(_UW_size * sizeof(_Up) == __max_store_size); 1628 constexpr size_t _NFullStores = _TV_size / _UW_size; 1629 constexpr size_t _NAllStores 1630 = __div_roundup(_TV_size, _UW_size); 1631 constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size; 1632 const array<_UV, _NAllStores> __converted 1633 = __convert_all<_UV, _NAllStores>(__v); 1634 __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1635 _UAbi::_SimdImpl::_S_masked_store_nocvt( 1636 _UW(__converted[__i]), __mem + __i * _UW_size, 1637 _UAbi::_MaskImpl::template _S_convert< 1638 __int_for_sizeof_t<_Up>>( 1639 __extract_part<__i, _NParts>(__k.__as_full_vector()))); 1640 }); 1641 if 
constexpr (_NAllStores 1642 > _NFullStores) // one partial at the end 1643 _UAbi::_SimdImpl::_S_masked_store_nocvt( 1644 _UW(__converted[_NFullStores]), 1645 __mem + _NFullStores * _UW_size, 1646 _UAbi::_MaskImpl::template _S_convert< 1647 __int_for_sizeof_t<_Up>>( 1648 __extract_part<_NFullStores, _NParts>( 1649 __k.__as_full_vector()))); 1650 } 1651 } 1652 else 1653 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1654 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1655 __mem[__i] = static_cast<_Up>(__v[__i]); 1656 }); 1657 } 1658 1659 // _S_complement {{{2 1660 template <typename _Tp, size_t _Np> 1661 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_complement_SimdImplBuiltin1662 _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept 1663 { 1664 if constexpr (is_floating_point_v<_Tp>) 1665 return __vector_bitcast<_Tp>(~__vector_bitcast<__int_for_sizeof_t<_Tp>>(__x)); 1666 else 1667 return ~__x._M_data; 1668 } 1669 1670 // _S_unary_minus {{{2 1671 template <typename _Tp, size_t _Np> 1672 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_unary_minus_SimdImplBuiltin1673 _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept 1674 { 1675 // GCC doesn't use the psign instructions, but pxor & psub seem to be 1676 // just as good a choice as pcmpeqd & psign. So meh. 1677 return -__x._M_data; 1678 } 1679 1680 // arithmetic operators {{{2 1681 template <typename _Tp, size_t _Np> 1682 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_plus_SimdImplBuiltin1683 _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1684 { return __x._M_data + __y._M_data; } 1685 1686 template <typename _Tp, size_t _Np> 1687 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_minus_SimdImplBuiltin1688 _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1689 { return __x._M_data - __y._M_data; } 1690 1691 template <typename _Tp, size_t _Np> 1692 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_multiplies_SimdImplBuiltin1693 _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1694 { return __x._M_data * __y._M_data; } 1695 1696 template <typename _Tp, size_t _Np> 1697 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_divides_SimdImplBuiltin1698 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1699 { 1700 // Note that division by 0 is always UB, so we must ensure we avoid the 1701 // case for partial registers 1702 if constexpr (!_Abi::template _S_is_partial<_Tp>) 1703 return __x._M_data / __y._M_data; 1704 else 1705 return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data); 1706 } 1707 1708 template <typename _Tp, size_t _Np> 1709 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_modulus_SimdImplBuiltin1710 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1711 { 1712 if constexpr (!_Abi::template _S_is_partial<_Tp>) 1713 return __x._M_data % __y._M_data; 1714 else 1715 return __as_vector(__x) 1716 % _Abi::__make_padding_nonzero(__as_vector(__y)); 1717 } 1718 1719 template <typename _Tp, size_t _Np> 1720 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_bit_and_SimdImplBuiltin1721 _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1722 { return __and(__x, __y); } 1723 1724 template <typename _Tp, size_t _Np> 1725 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_bit_or_SimdImplBuiltin1726 _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1727 { 
return __or(__x, __y); } 1728 1729 template <typename _Tp, size_t _Np> 1730 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_bit_xor_SimdImplBuiltin1731 _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1732 { return __xor(__x, __y); } 1733 1734 template <typename _Tp, size_t _Np> 1735 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_bit_shift_left_SimdImplBuiltin1736 _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1737 { return __x._M_data << __y._M_data; } 1738 1739 template <typename _Tp, size_t _Np> 1740 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_bit_shift_right_SimdImplBuiltin1741 _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1742 { return __x._M_data >> __y._M_data; } 1743 1744 template <typename _Tp, size_t _Np> 1745 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_bit_shift_left_SimdImplBuiltin1746 _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y) 1747 { return __x._M_data << __y; } 1748 1749 template <typename _Tp, size_t _Np> 1750 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_bit_shift_right_SimdImplBuiltin1751 _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y) 1752 { return __x._M_data >> __y; } 1753 1754 // compares {{{2 1755 // _S_equal_to {{{3 1756 template <typename _Tp, size_t _Np> 1757 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_equal_to_SimdImplBuiltin1758 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1759 { return __x._M_data == __y._M_data; } 1760 1761 // _S_not_equal_to {{{3 1762 template <typename _Tp, size_t _Np> 1763 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_not_equal_to_SimdImplBuiltin1764 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1765 { return __x._M_data != __y._M_data; } 1766 1767 // _S_less {{{3 1768 template <typename _Tp, size_t _Np> 1769 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_less_SimdImplBuiltin1770 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1771 { return __x._M_data < __y._M_data; } 1772 1773 // _S_less_equal {{{3 1774 template <typename _Tp, size_t _Np> 1775 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_less_equal_SimdImplBuiltin1776 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1777 { return __x._M_data <= __y._M_data; } 1778 1779 // _S_negate {{{2 1780 template <typename _Tp, size_t _Np> 1781 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_negate_SimdImplBuiltin1782 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept 1783 { return !__x._M_data; } 1784 1785 // _S_min, _S_max, _S_minmax {{{2 1786 template <typename _Tp, size_t _Np> 1787 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr 1788 _SimdWrapper<_Tp, _Np> _S_min_SimdImplBuiltin1789 _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b) 1790 { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; } 1791 1792 template <typename _Tp, size_t _Np> 1793 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr 1794 _SimdWrapper<_Tp, _Np> _S_max_SimdImplBuiltin1795 _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b) 1796 { return __a._M_data > __b._M_data ? 
__a._M_data : __b._M_data; } 1797 1798 template <typename _Tp, size_t _Np> 1799 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr 1800 pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>> _S_minmax_SimdImplBuiltin1801 _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b) 1802 { 1803 return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data, 1804 __a._M_data < __b._M_data ? __b._M_data : __a._M_data}; 1805 } 1806 1807 // reductions {{{2 1808 template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp, 1809 typename _BinaryOperation> 1810 _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp _S_reduce_partial_SimdImplBuiltin1811 _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>, 1812 simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 1813 { 1814 using _V = __vector_type_t<_Tp, _Np / 2>; 1815 static_assert(sizeof(_V) <= sizeof(__x)); 1816 // _S_full_size is the size of the smallest native SIMD register that 1817 // can store _Np/2 elements: 1818 using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>; 1819 using _HalfSimd = __deduced_simd<_Tp, _Np / 2>; 1820 const auto __xx = __as_vector(__x); 1821 return _HalfSimd::abi_type::_SimdImpl::_S_reduce( 1822 static_cast<_HalfSimd>(__as_vector(__binary_op( 1823 static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)), 1824 static_cast<_FullSimd>(__intrin_bitcast<_V>( 1825 __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>( 1826 __xx)))))), 1827 __binary_op); 1828 } 1829 1830 template <typename _Tp, typename _BinaryOperation> 1831 _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp _S_reduce_SimdImplBuiltin1832 _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 1833 { 1834 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 1835 if constexpr (_Np == 1) 1836 return __x[0]; 1837 else if constexpr (_Np == 2) 1838 return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1839 simd<_Tp, simd_abi::scalar>(__x[1]))[0]; 1840 else if (__builtin_is_constant_evaluated()) 1841 { 1842 simd<_Tp, simd_abi::scalar> __acc = __x[0]; 1843 for (size_t __i = 1; __i < _Np; ++__i) 1844 __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i])); 1845 return __acc[0]; 1846 } 1847 else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{ 1848 { 1849 [[maybe_unused]] constexpr auto __full_size 1850 = _Abi::template _S_full_size<_Tp>; 1851 if constexpr (_Np == 3) 1852 return __binary_op( 1853 __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1854 simd<_Tp, simd_abi::scalar>(__x[1])), 1855 simd<_Tp, simd_abi::scalar>(__x[2]))[0]; 1856 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1857 plus<>>) 1858 { 1859 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1860 return _Ap::_SimdImpl::_S_reduce( 1861 simd<_Tp, _Ap>(__private_init, 1862 _Abi::_S_masked(__as_vector(__x))), 1863 __binary_op); 1864 } 1865 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1866 multiplies<>>) 1867 { 1868 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1869 using _TW = _SimdWrapper<_Tp, __full_size>; 1870 _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full 1871 = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector(); 1872 _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one 1873 = __vector_broadcast<__full_size>(_Tp(1)); 1874 const _TW __x_full = __data(__x).__as_full_vector(); 1875 const _TW __x_padded_with_ones 1876 = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one, 1877 __x_full); 1878 return _Ap::_SimdImpl::_S_reduce( 1879 simd<_Tp, _Ap>(__private_init, __x_padded_with_ones), 1880 
__binary_op); 1881 } 1882 else if constexpr (_Np & 1) 1883 { 1884 using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>; 1885 return __binary_op( 1886 simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce( 1887 simd<_Tp, _Ap>( 1888 __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>( 1889 __as_vector(__x))), 1890 __binary_op)), 1891 simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0]; 1892 } 1893 else 1894 return _S_reduce_partial<_Np>( 1895 make_index_sequence<_Np / 2>(), 1896 make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op); 1897 } //}}} 1898 else if constexpr (sizeof(__x) == 16) //{{{ 1899 { 1900 if constexpr (_Np == 16) 1901 { 1902 const auto __y = __data(__x); 1903 __x = __binary_op( 1904 _M_make_simd<_Tp, _Np>( 1905 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 1906 7, 7>(__y)), 1907 _M_make_simd<_Tp, _Np>( 1908 __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 1909 14, 14, 15, 15>(__y))); 1910 } 1911 if constexpr (_Np >= 8) 1912 { 1913 const auto __y = __vector_bitcast<short>(__data(__x)); 1914 __x = __binary_op( 1915 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1916 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))), 1917 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1918 __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y)))); 1919 } 1920 if constexpr (_Np >= 4) 1921 { 1922 using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>; 1923 const auto __y = __vector_bitcast<_Up>(__data(__x)); 1924 __x = __binary_op(__x, 1925 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1926 __vector_permute<3, 2, 1, 0>(__y)))); 1927 } 1928 using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>; 1929 const auto __y = __vector_bitcast<_Up>(__data(__x)); 1930 __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1931 __vector_permute<1, 1>(__y)))); 1932 return __x[0]; 1933 } //}}} 1934 else 1935 { 1936 static_assert(sizeof(__x) > __min_vector_size<_Tp>); 1937 static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2 1938 using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>; 1939 using _V = simd<_Tp, _Ap>; 1940 return _Ap::_SimdImpl::_S_reduce( 1941 __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))), 1942 _V(__private_init, 1943 __extract<1, 2>(__as_vector(__x)))), 1944 static_cast<_BinaryOperation&&>(__binary_op)); 1945 } 1946 } 1947 1948 // math {{{2 1949 // frexp, modf and copysign implemented in simd_math.h 1950 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \ 1951 template <typename _Tp, typename... _More> \ 1952 static _Tp \ 1953 _S_##__name(const _Tp& __x, const _More&... __more) \ 1954 { \ 1955 return __generate_vector<_Tp>( \ 1956 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1957 return __name(__x[__i], __more[__i]...); \ 1958 }); \ 1959 } 1960 1961 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \ 1962 template <typename _Tp, typename... _More> \ 1963 static typename _Tp::mask_type \ 1964 _S_##__name(const _Tp& __x, const _More&... __more) \ 1965 { \ 1966 return __generate_vector<_Tp>( \ 1967 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1968 return __name(__x[__i], __more[__i]...); \ 1969 }); \ 1970 } 1971 1972 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \ 1973 template <typename _Tp, typename... _More> \ 1974 static auto \ 1975 _S_##__name(const _Tp& __x, const _More&... 
__more) \ 1976 { \ 1977 return __fixed_size_storage_t<_RetTp, \ 1978 _VectorTraits<_Tp>::_S_partial_width>:: \ 1979 _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1980 return __meta._S_generator( \ 1981 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1982 return __name(__x[__meta._S_offset + __i], \ 1983 __more[__meta._S_offset + __i]...); \ 1984 }, \ 1985 static_cast<_RetTp*>(nullptr)); \ 1986 }); \ 1987 } 1988 1989 _GLIBCXX_SIMD_MATH_FALLBACK(acos) _GLIBCXX_SIMD_MATH_FALLBACK_SimdImplBuiltin1990 _GLIBCXX_SIMD_MATH_FALLBACK(asin) 1991 _GLIBCXX_SIMD_MATH_FALLBACK(atan) 1992 _GLIBCXX_SIMD_MATH_FALLBACK(atan2) 1993 _GLIBCXX_SIMD_MATH_FALLBACK(cos) 1994 _GLIBCXX_SIMD_MATH_FALLBACK(sin) 1995 _GLIBCXX_SIMD_MATH_FALLBACK(tan) 1996 _GLIBCXX_SIMD_MATH_FALLBACK(acosh) 1997 _GLIBCXX_SIMD_MATH_FALLBACK(asinh) 1998 _GLIBCXX_SIMD_MATH_FALLBACK(atanh) 1999 _GLIBCXX_SIMD_MATH_FALLBACK(cosh) 2000 _GLIBCXX_SIMD_MATH_FALLBACK(sinh) 2001 _GLIBCXX_SIMD_MATH_FALLBACK(tanh) 2002 _GLIBCXX_SIMD_MATH_FALLBACK(exp) 2003 _GLIBCXX_SIMD_MATH_FALLBACK(exp2) 2004 _GLIBCXX_SIMD_MATH_FALLBACK(expm1) 2005 _GLIBCXX_SIMD_MATH_FALLBACK(ldexp) 2006 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb) 2007 _GLIBCXX_SIMD_MATH_FALLBACK(log) 2008 _GLIBCXX_SIMD_MATH_FALLBACK(log10) 2009 _GLIBCXX_SIMD_MATH_FALLBACK(log1p) 2010 _GLIBCXX_SIMD_MATH_FALLBACK(log2) 2011 _GLIBCXX_SIMD_MATH_FALLBACK(logb) 2012 2013 // modf implemented in simd_math.h 2014 _GLIBCXX_SIMD_MATH_FALLBACK(scalbn) 2015 _GLIBCXX_SIMD_MATH_FALLBACK(scalbln) 2016 _GLIBCXX_SIMD_MATH_FALLBACK(cbrt) 2017 _GLIBCXX_SIMD_MATH_FALLBACK(fabs) 2018 _GLIBCXX_SIMD_MATH_FALLBACK(pow) 2019 _GLIBCXX_SIMD_MATH_FALLBACK(sqrt) 2020 _GLIBCXX_SIMD_MATH_FALLBACK(erf) 2021 _GLIBCXX_SIMD_MATH_FALLBACK(erfc) 2022 _GLIBCXX_SIMD_MATH_FALLBACK(lgamma) 2023 _GLIBCXX_SIMD_MATH_FALLBACK(tgamma) 2024 2025 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint) 2026 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint) 2027 2028 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround) 2029 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround) 2030 2031 _GLIBCXX_SIMD_MATH_FALLBACK(fmod) 2032 _GLIBCXX_SIMD_MATH_FALLBACK(remainder) 2033 2034 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2035 static _Tp 2036 _S_remquo(const _Tp __x, const _Tp __y, 2037 __fixed_size_storage_t<int, _TVT::_S_partial_width>* __z) 2038 { 2039 return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2040 int __tmp; 2041 auto __r = remquo(__x[__i], __y[__i], &__tmp); 2042 __z->_M_set(__i, __tmp); 2043 return __r; 2044 }); 2045 } 2046 2047 // copysign in simd_math.h 2048 _GLIBCXX_SIMD_MATH_FALLBACK(nextafter) _GLIBCXX_SIMD_MATH_FALLBACK_SimdImplBuiltin2049 _GLIBCXX_SIMD_MATH_FALLBACK(fdim) 2050 _GLIBCXX_SIMD_MATH_FALLBACK(fmax) 2051 _GLIBCXX_SIMD_MATH_FALLBACK(fmin) 2052 _GLIBCXX_SIMD_MATH_FALLBACK(fma) 2053 2054 template <typename _Tp, size_t _Np> 2055 static constexpr _MaskMember<_Tp> 2056 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, 2057 _SimdWrapper<_Tp, _Np> __y) noexcept 2058 { 2059 using _Ip = __int_for_sizeof_t<_Tp>; 2060 const auto __xn = __vector_bitcast<_Ip>(__x); 2061 const auto __yn = __vector_bitcast<_Ip>(__y); 2062 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2063 const auto __yp = __yn < 0 ? 
-(__yn & __finite_max_v<_Ip>) : __yn; 2064 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2065 __xp > __yp); 2066 } 2067 2068 template <typename _Tp, size_t _Np> 2069 static constexpr _MaskMember<_Tp> _S_isgreaterequal_SimdImplBuiltin2070 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, 2071 _SimdWrapper<_Tp, _Np> __y) noexcept 2072 { 2073 using _Ip = __int_for_sizeof_t<_Tp>; 2074 const auto __xn = __vector_bitcast<_Ip>(__x); 2075 const auto __yn = __vector_bitcast<_Ip>(__y); 2076 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2077 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2078 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2079 __xp >= __yp); 2080 } 2081 2082 template <typename _Tp, size_t _Np> 2083 static constexpr _MaskMember<_Tp> _S_isless_SimdImplBuiltin2084 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept 2085 { 2086 using _Ip = __int_for_sizeof_t<_Tp>; 2087 const auto __xn = __vector_bitcast<_Ip>(__x); 2088 const auto __yn = __vector_bitcast<_Ip>(__y); 2089 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2090 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2091 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2092 __xp < __yp); 2093 } 2094 2095 template <typename _Tp, size_t _Np> 2096 static constexpr _MaskMember<_Tp> _S_islessequal_SimdImplBuiltin2097 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, 2098 _SimdWrapper<_Tp, _Np> __y) noexcept 2099 { 2100 using _Ip = __int_for_sizeof_t<_Tp>; 2101 const auto __xn = __vector_bitcast<_Ip>(__x); 2102 const auto __yn = __vector_bitcast<_Ip>(__y); 2103 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2104 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2105 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2106 __xp <= __yp); 2107 } 2108 2109 template <typename _Tp, size_t _Np> 2110 static constexpr _MaskMember<_Tp> _S_islessgreater_SimdImplBuiltin2111 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, 2112 _SimdWrapper<_Tp, _Np> __y) noexcept 2113 { 2114 return __andnot(_SuperImpl::_S_isunordered(__x, __y), 2115 _SuperImpl::_S_not_equal_to(__x, __y)); 2116 } 2117 2118 #undef _GLIBCXX_SIMD_MATH_FALLBACK 2119 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET 2120 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET 2121 // _S_abs {{{3 2122 template <typename _Tp, size_t _Np> 2123 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_abs_SimdImplBuiltin2124 _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept 2125 { 2126 // if (__builtin_is_constant_evaluated()) 2127 // { 2128 // return __x._M_data < 0 ? -__x._M_data : __x._M_data; 2129 // } 2130 if constexpr (is_floating_point_v<_Tp>) 2131 // `v < 0 ? -v : v` cannot compile to the efficient implementation of 2132 // masking the signbit off because it must consider v == -0 2133 2134 // ~(-0.) & v would be easy, but breaks with fno-signed-zeros 2135 return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data); 2136 else 2137 return __x._M_data < 0 ? -__x._M_data : __x._M_data; 2138 } 2139 2140 // }}}3 2141 // _S_plus_minus {{{ 2142 // Returns __x + __y - __y without -fassociative-math optimizing to __x. 2143 // - _TV must be __vector_type_t<floating-point type, N>. 2144 // - _UV must be _TV or floating-point type. 
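    // Example (float, 24 significand bits, default rounding): with __y = 0x1p23f,
    //   (2.5f + 0x1p23f) - 0x1p23f == 2.0f
    // because the addition already rounds away the fraction bits; reassociating
    // the expression to plain __x would skip that rounding.  The empty asm
    // statements below only act as an optimization barrier on the intermediate
    // sum when the compiler does not guarantee IEC 559 semantics
    // (__GCC_IEC_559 == 0, e.g. under -fassociative-math).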
    template <typename _TV, typename _UV>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
      _S_plus_minus(_TV __x, _UV __y) noexcept
      {
#if defined __i386__ && !defined __SSE_MATH__
	if constexpr (sizeof(__x) == 8)
	  { // operations on __x would use the FPU
	    static_assert(is_same_v<_TV, __vector_type_t<float, 2>>);
	    const auto __x4 = __vector_bitcast<float, 4>(__x);
	    if constexpr (is_same_v<_TV, _UV>)
	      return __vector_bitcast<float, 2>(
		       _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y)));
	    else
	      return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y));
	  }
#endif
#if !defined __clang__ && __GCC_IEC_559 == 0
	if (__builtin_is_constant_evaluated()
	      || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
	  return (__x + __y) - __y;
	else
	  return [&] {
	    __x += __y;
	    if constexpr(__have_sse)
	      {
		if constexpr (sizeof(__x) >= 16)
		  asm("" : "+x"(__x));
		else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
		  asm("" : "+x"(__x[0]), "+x"(__x[1]));
		else
		  __assert_unreachable<_TV>();
	      }
	    else if constexpr(__have_neon)
	      asm("" : "+w"(__x));
	    else if constexpr (__have_power_vmx)
	      {
		if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
		  asm("" : "+fgr"(__x[0]), "+fgr"(__x[1]));
		else
		  asm("" : "+v"(__x));
	      }
	    else
	      asm("" : "+g"(__x));
	    return __x - __y;
	  }();
#else
	return (__x + __y) - __y;
#endif
      }

      // }}}
      // _S_nearbyint {{{3
      template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
	_GLIBCXX_SIMD_INTRINSIC static _Tp
	_S_nearbyint(_Tp __x_) noexcept
	{
	  using value_type = typename _TVT::value_type;
	  using _V = typename _TVT::type;
	  const _V __x = __x_;
	  const _V __absx = __and(__x, _S_absmask<_V>);
	  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>);
	  _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
	    = _V() + (1ull << (__digits_v<value_type> - 1));
	  const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
	  const _V __shifted = _S_plus_minus(__x, __shifter);
	  return __absx < __shifter_abs ? __shifted : __x;
	}

      // _S_rint {{{3
      template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
	_GLIBCXX_SIMD_INTRINSIC static _Tp
	_S_rint(_Tp __x) noexcept
	{ return _SuperImpl::_S_nearbyint(__x); }

      // _S_trunc {{{3
      template <typename _Tp, size_t _Np>
	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
	_S_trunc(_SimdWrapper<_Tp, _Np> __x)
	{
	  using _V = __vector_type_t<_Tp, _Np>;
	  const _V __absx = __and(__x._M_data, _S_absmask<_V>);
	  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>);
	  constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1);
	  _V __truncated = _S_plus_minus(__absx, __shifter);
	  __truncated -= __truncated > __absx ? _V() + 1 : _V();
	  return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated)
				    : __x._M_data;
	}

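      // Note on the __shifter trick used by _S_nearbyint and _S_trunc above:
      // for |__x| < 2^(__digits_v<_Tp> - 1) the sum __x + __shifter has a
      // magnitude of at least 2^(__digits_v<_Tp> - 1), so its ULP is >= 1 and
      // the addition itself rounds away all fraction bits (in the current
      // rounding mode).  Subtracting __shifter again — via _S_plus_minus, so
      // that the intermediate sum cannot be optimized away — yields __x rounded
      // to an integer.  Inputs with |__x| >= 2^(__digits_v<_Tp> - 1) are
      // already integral and are returned unchanged.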
      // _S_round {{{3
      template <typename _Tp, size_t _Np>
	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
	_S_round(_SimdWrapper<_Tp, _Np> __x)
	{
	  const auto __abs_x = _SuperImpl::_S_abs(__x);
	  const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data;
	  const auto __r_abs // round(abs(x)) =
	    = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0);
	  return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs);
	}

      // _S_floor {{{3
      template <typename _Tp, size_t _Np>
	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
	_S_floor(_SimdWrapper<_Tp, _Np> __x)
	{
	  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
	  const auto __negative_input
	    = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
	  const auto __mask
	    = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
	  return __or(__andnot(__mask, __y),
		      __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1)));
	}

      // _S_ceil {{{3
      template <typename _Tp, size_t _Np>
	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
	_S_ceil(_SimdWrapper<_Tp, _Np> __x)
	{
	  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
	  const auto __negative_input
	    = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
	  const auto __inv_mask
	    = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
	  return __or(__and(__inv_mask, __y),
		      __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1)));
	}

      // _S_isnan {{{3
      template <typename _Tp, size_t _Np>
	_GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
	_S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
	{
#if __FINITE_MATH_ONLY__
	  return {}; // false
#elif !defined __SUPPORT_SNAN__
	  return ~(__x._M_data == __x._M_data);
#elif defined __STDC_IEC_559__
	  using _Ip = __int_for_sizeof_t<_Tp>;
	  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
	  const auto __infn
	    = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
	  return __infn < __absn;
#else
#error "Not implemented: how to support SNaN but non-IEC559 floating-point?"
2291 #endif 2292 } 2293 2294 // _S_isfinite {{{3 2295 template <typename _Tp, size_t _Np> 2296 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_isfinite_SimdImplBuiltin2297 _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2298 { 2299 #if __FINITE_MATH_ONLY__ 2300 using _UV = typename _MaskMember<_Tp>::_BuiltinType; 2301 _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV(); 2302 return __alltrue; 2303 #else 2304 // if all exponent bits are set, __x is either inf or NaN 2305 using _Ip = __int_for_sizeof_t<_Tp>; 2306 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2307 const auto __maxn 2308 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2309 return __absn <= __maxn; 2310 #endif 2311 } 2312 2313 // _S_isunordered {{{3 2314 template <typename _Tp, size_t _Np> 2315 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_isunordered_SimdImplBuiltin2316 _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2317 { return __or(_S_isnan(__x), _S_isnan(__y)); } 2318 2319 // _S_signbit {{{3 2320 template <typename _Tp, size_t _Np> 2321 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_signbit_SimdImplBuiltin2322 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2323 { 2324 using _Ip = __int_for_sizeof_t<_Tp>; 2325 return __vector_bitcast<_Ip>(__x) < 0; 2326 // Arithmetic right shift (SRA) would also work (instead of compare), but 2327 // 64-bit SRA isn't available on x86 before AVX512. And in general, 2328 // compares are more likely to be efficient than SRA. 2329 } 2330 2331 // _S_isinf {{{3 2332 template <typename _Tp, size_t _Np> 2333 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_isinf_SimdImplBuiltin2334 _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2335 { 2336 #if __FINITE_MATH_ONLY__ 2337 return {}; // false 2338 #else 2339 return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x), 2340 __vector_broadcast<_Np>( 2341 __infinity_v<_Tp>)); 2342 // alternative: 2343 // compare to inf using the corresponding integer type 2344 /* 2345 return 2346 __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>( 2347 _S_abs(__x)._M_data) 2348 == 2349 __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>( 2350 __infinity_v<_Tp>))); 2351 */ 2352 #endif 2353 } 2354 2355 // _S_isnormal {{{3 2356 template <typename _Tp, size_t _Np> 2357 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_isnormal_SimdImplBuiltin2358 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 2359 { 2360 using _Ip = __int_for_sizeof_t<_Tp>; 2361 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2362 const auto __minn 2363 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>)); 2364 #if __FINITE_MATH_ONLY__ 2365 return __absn >= __minn; 2366 #else 2367 const auto __maxn 2368 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2369 return __minn <= __absn && __absn <= __maxn; 2370 #endif 2371 } 2372 2373 // _S_fpclassify {{{3 2374 template <typename _Tp, size_t _Np> 2375 _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np> _S_fpclassify_SimdImplBuiltin2376 _S_fpclassify(_SimdWrapper<_Tp, _Np> __x) 2377 { 2378 using _I = __int_for_sizeof_t<_Tp>; 2379 const auto __xn 2380 = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x))); 2381 constexpr size_t _NI = sizeof(__xn) / sizeof(_I); 2382 _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn 2383 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>)); 2384 2385 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal 2386 = __vector_broadcast<_NI, 
_I>(FP_NORMAL); 2387 #if !__FINITE_MATH_ONLY__ 2388 _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn 2389 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>)); 2390 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan 2391 = __vector_broadcast<_NI, _I>(FP_NAN); 2392 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite 2393 = __vector_broadcast<_NI, _I>(FP_INFINITE); 2394 #endif 2395 #ifndef __FAST_MATH__ 2396 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal 2397 = __vector_broadcast<_NI, _I>(FP_SUBNORMAL); 2398 #endif 2399 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero 2400 = __vector_broadcast<_NI, _I>(FP_ZERO); 2401 2402 __vector_type_t<_I, _NI> 2403 __tmp = __xn < __minn 2404 #ifdef __FAST_MATH__ 2405 ? __fp_zero 2406 #else 2407 ? (__xn == 0 ? __fp_zero : __fp_subnormal) 2408 #endif 2409 #if __FINITE_MATH_ONLY__ 2410 : __fp_normal; 2411 #else 2412 : (__xn < __infn ? __fp_normal 2413 : (__xn == __infn ? __fp_infinite : __fp_nan)); 2414 #endif 2415 2416 if constexpr (sizeof(_I) == sizeof(int)) 2417 { 2418 using _FixedInt = __fixed_size_storage_t<int, _Np>; 2419 const auto __as_int = __vector_bitcast<int, _Np>(__tmp); 2420 if constexpr (_FixedInt::_S_tuple_size == 1) 2421 return {__as_int}; 2422 else if constexpr (_FixedInt::_S_tuple_size == 2 2423 && is_same_v< 2424 typename _FixedInt::_SecondType::_FirstAbi, 2425 simd_abi::scalar>) 2426 return {__extract<0, 2>(__as_int), __as_int[_Np - 1]}; 2427 else if constexpr (_FixedInt::_S_tuple_size == 2) 2428 return {__extract<0, 2>(__as_int), 2429 __auto_bitcast(__extract<1, 2>(__as_int))}; 2430 else 2431 __assert_unreachable<_Tp>(); 2432 } 2433 else if constexpr (_Np == 2 && sizeof(_I) == 8 2434 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 2) 2435 { 2436 const auto __aslong = __vector_bitcast<_LLong>(__tmp); 2437 return {int(__aslong[0]), {int(__aslong[1])}}; 2438 } 2439 #if _GLIBCXX_SIMD_X86INTRIN 2440 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32 2441 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1) 2442 return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)), 2443 __to_intrin(__hi128(__tmp)))}; 2444 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64 2445 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1) 2446 return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))}; 2447 #endif // _GLIBCXX_SIMD_X86INTRIN 2448 else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1) 2449 return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp), 2450 [](auto... 
__l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2451 return __make_wrapper<int>(__l...); 2452 })}; 2453 else 2454 __assert_unreachable<_Tp>(); 2455 } 2456 2457 // _S_increment & _S_decrement{{{2 2458 template <typename _Tp, size_t _Np> 2459 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_increment_SimdImplBuiltin2460 _S_increment(_SimdWrapper<_Tp, _Np>& __x) 2461 { __x = __x._M_data + 1; } 2462 2463 template <typename _Tp, size_t _Np> 2464 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_decrement_SimdImplBuiltin2465 _S_decrement(_SimdWrapper<_Tp, _Np>& __x) 2466 { __x = __x._M_data - 1; } 2467 2468 // smart_reference access {{{2 2469 template <typename _Tp, size_t _Np, typename _Up> 2470 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_set_SimdImplBuiltin2471 _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept 2472 { __v._M_set(__i, static_cast<_Up&&>(__x)); } 2473 2474 // _S_masked_assign{{{2 2475 template <typename _Tp, typename _K, size_t _Np> 2476 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_masked_assign_SimdImplBuiltin2477 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, 2478 __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs) 2479 { 2480 if (__k._M_is_constprop_none_of()) 2481 return; 2482 else if (__k._M_is_constprop_all_of()) 2483 __lhs = __rhs; 2484 else 2485 __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); 2486 } 2487 2488 template <typename _Tp, typename _K, size_t _Np> 2489 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_masked_assign_SimdImplBuiltin2490 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, 2491 __type_identity_t<_Tp> __rhs) 2492 { 2493 if (__k._M_is_constprop_none_of()) 2494 return; 2495 else if (__k._M_is_constprop_all_of()) 2496 __lhs = __vector_broadcast<_Np>(__rhs); 2497 else if (__builtin_constant_p(__rhs) && __rhs == 0) 2498 { 2499 if constexpr (!is_same_v<bool, _K>) 2500 // the __andnot optimization only makes sense if __k._M_data is a 2501 // vector register 2502 __lhs._M_data 2503 = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data); 2504 else 2505 // for AVX512/__mmask, a _mm512_maskz_mov is best 2506 __lhs 2507 = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>()); 2508 } 2509 else 2510 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2511 _SimdWrapper<_Tp, _Np>( 2512 __vector_broadcast<_Np>(__rhs))); 2513 } 2514 2515 // _S_masked_cassign {{{2 2516 template <typename _Op, typename _Tp, typename _K, size_t _Np> 2517 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_masked_cassign_SimdImplBuiltin2518 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, 2519 _SimdWrapper<_Tp, _Np>& __lhs, 2520 const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs, 2521 _Op __op) 2522 { 2523 if (__k._M_is_constprop_none_of()) 2524 return; 2525 else if (__k._M_is_constprop_all_of()) 2526 __lhs = __op(_SuperImpl{}, __lhs, __rhs); 2527 else 2528 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2529 __op(_SuperImpl{}, __lhs, __rhs)); 2530 } 2531 2532 template <typename _Op, typename _Tp, typename _K, size_t _Np> 2533 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_masked_cassign_SimdImplBuiltin2534 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, 2535 _SimdWrapper<_Tp, _Np>& __lhs, 2536 const __type_identity_t<_Tp> __rhs, _Op __op) 2537 { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); } 2538 2539 // _S_masked_unary {{{2 2540 template <template <typename> class _Op, typename _Tp, typename _K, 2541 size_t _Np> 2542 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 
_S_masked_unary_SimdImplBuiltin2543 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, 2544 const _SimdWrapper<_Tp, _Np> __v) 2545 { 2546 if (__k._M_is_constprop_none_of()) 2547 return __v; 2548 auto __vv = _M_make_simd(__v); 2549 _Op<decltype(__vv)> __op; 2550 if (__k._M_is_constprop_all_of()) 2551 return __data(__op(__vv)); 2552 else if constexpr (is_same_v<_Op<void>, __increment<void>>) 2553 { 2554 static_assert(not std::is_same_v<_K, bool>); 2555 if constexpr (is_integral_v<_Tp>) 2556 // Take a shortcut knowing that __k is an integer vector with values -1 or 0. 2557 return __v._M_data - __vector_bitcast<_Tp>(__k._M_data); 2558 else if constexpr (not __have_avx2) 2559 return __v._M_data 2560 + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( 2561 _K, _Tp(1))); 2562 // starting with AVX2 it is more efficient to blend after add 2563 } 2564 else if constexpr (is_same_v<_Op<void>, __decrement<void>>) 2565 { 2566 static_assert(not std::is_same_v<_K, bool>); 2567 if constexpr (is_integral_v<_Tp>) 2568 // Take a shortcut knowing that __k is an integer vector with values -1 or 0. 2569 return __v._M_data + __vector_bitcast<_Tp>(__k._M_data); 2570 else if constexpr (not __have_avx2) 2571 return __v._M_data 2572 - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( 2573 _K, _Tp(1))); 2574 // starting with AVX2 it is more efficient to blend after sub 2575 } 2576 return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv))); 2577 } 2578 2579 //}}}2 2580 }; 2581 2582 // _MaskImplBuiltinMixin {{{1 2583 struct _MaskImplBuiltinMixin 2584 { 2585 template <typename _Tp> 2586 using _TypeTag = _Tp*; 2587 2588 // _S_to_maskvector {{{ 2589 template <typename _Up, size_t _ToN = 1> 2590 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> _S_to_maskvector_MaskImplBuiltinMixin2591 _S_to_maskvector(bool __x) 2592 { 2593 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2594 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 2595 : __vector_type_t<_Up, _ToN>{}; 2596 } 2597 2598 template <typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized, 2599 size_t _ToN = _UpN == 0 ? _Np : _UpN> 2600 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> _S_to_maskvector_MaskImplBuiltinMixin2601 _S_to_maskvector(_BitMask<_Np, _Sanitized> __x) 2602 { 2603 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2604 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2605 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2606 if constexpr (__i < _Np) 2607 return __x[__i] ? ~_Up() : _Up(); 2608 else 2609 return _Up(); 2610 }); 2611 } 2612 2613 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, 2614 size_t _ToN = _UpN == 0 ? 
_Np : _UpN> 2615 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> _S_to_maskvector_MaskImplBuiltinMixin2616 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 2617 { 2618 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2619 using _TW = _SimdWrapper<_Tp, _Np>; 2620 using _UW = _SimdWrapper<_Up, _ToN>; 2621 if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW)) 2622 return __wrapper_bitcast<_Up, _ToN>(__x); 2623 else if constexpr (is_same_v<_Tp, bool>) // bits -> vector 2624 return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data)); 2625 else 2626 { // vector -> vector 2627 /* 2628 [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data); 2629 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) == 2630 16) return __vector_permute<1, 3, -1, -1>(__y); else if constexpr 2631 (sizeof(_Tp) == 4 && sizeof(_Up) == 2 2632 && sizeof(__y) == 16) 2633 return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y); 2634 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 2635 && sizeof(__y) == 16) 2636 return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y); 2637 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 2638 && sizeof(__y) == 16) 2639 return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, 2640 -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 4 && 2641 sizeof(_Up) == 1 2642 && sizeof(__y) == 16) 2643 return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 2644 -1, -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 8 && 2645 sizeof(_Up) == 1 2646 && sizeof(__y) == 16) 2647 return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2648 -1, -1, -1, -1, -1>(__y); else 2649 */ 2650 { 2651 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2652 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2653 if constexpr (__i < _Np) 2654 return _Up(__x[__i.value]); 2655 else 2656 return _Up(); 2657 }); 2658 } 2659 } 2660 } 2661 2662 // }}} 2663 // _S_to_bits {{{ 2664 template <typename _Tp, size_t _Np> 2665 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> _S_to_bits_MaskImplBuiltinMixin2666 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 2667 { 2668 static_assert(!is_same_v<_Tp, bool>); 2669 static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong)); 2670 using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>; 2671 const auto __bools 2672 = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1); 2673 _ULLong __r = 0; 2674 __execute_n_times<_Np>( 2675 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2676 __r |= _ULLong(__bools[__i.value]) << __i; 2677 }); 2678 return __r; 2679 } 2680 2681 // }}} 2682 }; 2683 2684 // _MaskImplBuiltin {{{1 2685 template <typename _Abi, typename> 2686 struct _MaskImplBuiltin : _MaskImplBuiltinMixin 2687 { 2688 using _MaskImplBuiltinMixin::_S_to_bits; 2689 using _MaskImplBuiltinMixin::_S_to_maskvector; 2690 2691 // member types {{{ 2692 template <typename _Tp> 2693 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 2694 2695 template <typename _Tp> 2696 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 2697 2698 using _SuperImpl = typename _Abi::_MaskImpl; 2699 using _CommonImpl = typename _Abi::_CommonImpl; 2700 2701 template <typename _Tp> 2702 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 2703 2704 // }}} 2705 // _S_broadcast {{{ 2706 template <typename _Tp> 2707 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_broadcast_MaskImplBuiltin2708 _S_broadcast(bool __x) 2709 { return __x ? 
_Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); } 2710 2711 // }}} 2712 // _S_load {{{ 2713 template <typename _Tp> 2714 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> _S_load_MaskImplBuiltin2715 _S_load(const bool* __mem) 2716 { 2717 using _I = __int_for_sizeof_t<_Tp>; 2718 if (not __builtin_is_constant_evaluated()) 2719 if constexpr (sizeof(_Tp) == sizeof(bool)) 2720 { 2721 const auto __bools 2722 = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem); 2723 // bool is {0, 1}, everything else is UB 2724 return __bools > 0; 2725 } 2726 return __generate_vector<_I, _S_size<_Tp>>( 2727 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2728 return __mem[__i] ? ~_I() : _I(); 2729 }); 2730 } 2731 2732 // }}} 2733 // _S_convert {{{ 2734 template <typename _Tp, size_t _Np, bool _Sanitized> 2735 _GLIBCXX_SIMD_INTRINSIC static constexpr auto _S_convert_MaskImplBuiltin2736 _S_convert(_BitMask<_Np, _Sanitized> __x) 2737 { 2738 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2739 return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits()); 2740 else 2741 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2742 _S_size<_Tp>>( 2743 __x._M_sanitized()); 2744 } 2745 2746 template <typename _Tp, size_t _Np> 2747 _GLIBCXX_SIMD_INTRINSIC static constexpr auto _S_convert_MaskImplBuiltin2748 _S_convert(_SimdWrapper<bool, _Np> __x) 2749 { 2750 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2751 return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data); 2752 else 2753 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2754 _S_size<_Tp>>( 2755 _BitMask<_Np>(__x._M_data)._M_sanitized()); 2756 } 2757 2758 template <typename _Tp, typename _Up, size_t _Np> 2759 _GLIBCXX_SIMD_INTRINSIC static constexpr auto _S_convert_MaskImplBuiltin2760 _S_convert(_SimdWrapper<_Up, _Np> __x) 2761 { 2762 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2763 return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>( 2764 _SuperImpl::_S_to_bits(__x)); 2765 else 2766 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2767 _S_size<_Tp>>(__x); 2768 } 2769 2770 template <typename _Tp, typename _Up, typename _UAbi> 2771 _GLIBCXX_SIMD_INTRINSIC static constexpr auto _S_convert_MaskImplBuiltin2772 _S_convert(simd_mask<_Up, _UAbi> __x) 2773 { 2774 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2775 { 2776 using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>; 2777 if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits 2778 return _R(__data(__x)); 2779 else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits 2780 return _R(__data(__x)); 2781 else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits 2782 return _R(__data(__x)._M_to_bits()); 2783 else // vector -> bits 2784 return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits()); 2785 } 2786 else 2787 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2788 _S_size<_Tp>>( 2789 __data(__x)); 2790 } 2791 2792 // }}} 2793 // _S_masked_load {{{2 2794 template <typename _Tp, size_t _Np> 2795 static inline _SimdWrapper<_Tp, _Np> _S_masked_load_MaskImplBuiltin2796 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 2797 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 2798 { 2799 // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity 2800 auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge); 2801 _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask), 2802 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2803 
__tmp._M_set(__i, -__mem[__i]); 2804 }); 2805 __merge = __wrapper_bitcast<_Tp>(__tmp); 2806 return __merge; 2807 } 2808 2809 // _S_store {{{2 2810 template <typename _Tp, size_t _Np> 2811 _GLIBCXX_SIMD_INTRINSIC static constexpr void _S_store_MaskImplBuiltin2812 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept 2813 { 2814 __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2815 __mem[__i] = __v[__i]; 2816 }); 2817 } 2818 2819 // _S_masked_store {{{2 2820 template <typename _Tp, size_t _Np> 2821 static inline void _S_masked_store_MaskImplBuiltin2822 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem, 2823 const _SimdWrapper<_Tp, _Np> __k) noexcept 2824 { 2825 _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k), 2826 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2827 __mem[__i] = __v[__i]; 2828 }); 2829 } 2830 2831 // _S_from_bitmask{{{2 2832 template <size_t _Np, typename _Tp> 2833 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> _S_from_bitmask_MaskImplBuiltin2834 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) 2835 { return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); } 2836 2837 // logical and bitwise operators {{{2 2838 template <typename _Tp, size_t _Np> 2839 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_logical_and_MaskImplBuiltin2840 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2841 { return __and(__x._M_data, __y._M_data); } 2842 2843 template <typename _Tp, size_t _Np> 2844 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_logical_or_MaskImplBuiltin2845 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2846 { return __or(__x._M_data, __y._M_data); } 2847 2848 template <typename _Tp, size_t _Np> 2849 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_bit_not_MaskImplBuiltin2850 _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x) 2851 { 2852 if constexpr (_Abi::template _S_is_partial<_Tp>) 2853 return __andnot(__x, __wrapper_bitcast<_Tp>( 2854 _Abi::template _S_implicit_mask<_Tp>())); 2855 else 2856 return __not(__x._M_data); 2857 } 2858 2859 template <typename _Tp, size_t _Np> 2860 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_bit_and_MaskImplBuiltin2861 _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2862 { return __and(__x._M_data, __y._M_data); } 2863 2864 template <typename _Tp, size_t _Np> 2865 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_bit_or_MaskImplBuiltin2866 _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2867 { return __or(__x._M_data, __y._M_data); } 2868 2869 template <typename _Tp, size_t _Np> 2870 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> _S_bit_xor_MaskImplBuiltin2871 _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2872 { return __xor(__x._M_data, __y._M_data); } 2873 2874 // smart_reference access {{{2 2875 template <typename _Tp, size_t _Np> 2876 static constexpr void _S_set_MaskImplBuiltin2877 _S_set(_SimdWrapper<_Tp, _Np>& __k, int __i, bool __x) noexcept 2878 { 2879 if constexpr (is_same_v<_Tp, bool>) 2880 __k._M_set(__i, __x); 2881 else 2882 { 2883 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 2884 if (__builtin_is_constant_evaluated()) 2885 { 2886 __k = __generate_from_n_evaluations<_Np, 2887 __vector_type_t<_Tp, _Np>>( 2888 [&](auto __j) 
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2889 if (__i == static_cast<int>(__j)) 2890 return _Tp(-__x); 2891 else 2892 return __k[+__j]; 2893 }); 2894 } 2895 else 2896 __k._M_data[__i] = -__x; 2897 } 2898 } 2899 2900 // _S_masked_assign{{{2 2901 template <typename _Tp, size_t _Np> 2902 _GLIBCXX_SIMD_INTRINSIC static void _S_masked_assign_MaskImplBuiltin2903 _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, 2904 __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs) 2905 { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); } 2906 2907 template <typename _Tp, size_t _Np> 2908 _GLIBCXX_SIMD_INTRINSIC static void _S_masked_assign_MaskImplBuiltin2909 _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, bool __rhs) 2910 { 2911 if (__builtin_constant_p(__rhs)) 2912 { 2913 if (__rhs == false) 2914 __lhs = __andnot(__k, __lhs); 2915 else 2916 __lhs = __or(__k, __lhs); 2917 return; 2918 } 2919 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2920 __data(simd_mask<_Tp, _Abi>(__rhs))); 2921 } 2922 2923 //}}}2 2924 // _S_all_of {{{ 2925 template <typename _Tp> 2926 _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of_MaskImplBuiltin2927 _S_all_of(simd_mask<_Tp, _Abi> __k) 2928 { 2929 return __call_with_subscripts( 2930 __data(__k), make_index_sequence<_S_size<_Tp>>(), 2931 [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 2932 { return (... && !(__ent == 0)); }); 2933 } 2934 2935 // }}} 2936 // _S_any_of {{{ 2937 template <typename _Tp> 2938 _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of_MaskImplBuiltin2939 _S_any_of(simd_mask<_Tp, _Abi> __k) 2940 { 2941 return __call_with_subscripts( 2942 __data(__k), make_index_sequence<_S_size<_Tp>>(), 2943 [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 2944 { return (... || !(__ent == 0)); }); 2945 } 2946 2947 // }}} 2948 // _S_none_of {{{ 2949 template <typename _Tp> 2950 _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of_MaskImplBuiltin2951 _S_none_of(simd_mask<_Tp, _Abi> __k) 2952 { 2953 return __call_with_subscripts( 2954 __data(__k), make_index_sequence<_S_size<_Tp>>(), 2955 [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 2956 { return (... 
		     && (__ent == 0)); });
      }

    // }}}
    // _S_some_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
	const int __n_true = _SuperImpl::_S_popcount(__k);
	return __n_true > 0 && __n_true < int(_S_size<_Tp>);
      }

    // }}}
    // _S_popcount {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
	using _I = __int_for_sizeof_t<_Tp>;
	if constexpr (is_default_constructible_v<simd<_I, _Abi>>)
	  return -reduce(
		   simd<_I, _Abi>(__private_init, __wrapper_bitcast<_I>(__data(__k))));
	else
	  return -reduce(__bit_cast<rebind_simd_t<_I, simd<_Tp, _Abi>>>(
		   simd<_Tp, _Abi>(__private_init, __data(__k))));
      }

    // }}}
    // _S_find_first_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      { return std::__countr_zero(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()); }

    // }}}
    // _S_find_last_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      { return std::__bit_width(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()) - 1; }

    // }}}
  };

//}}}1
_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_

// vim: foldmethod=marker foldmarker={{{,}}} sw=2 noet ts=8 sts=2 tw=100
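// Usage sketch (illustrative only): this header is an implementation detail of
// <experimental/simd>; user code such as the following ends up in the
// _SimdImpl*/_MaskImpl* backends defined above.
//
//   #include <experimental/simd>
//   namespace stdx = std::experimental;
//
//   float dot(const float* a, const float* b, std::size_t n)
//   {
//     using V = stdx::native_simd<float>;
//     V acc = 0;
//     for (std::size_t i = 0; i + V::size() <= n; i += V::size())
//       {
//         V x(a + i, stdx::element_aligned);   // -> _S_load
//         V y(b + i, stdx::element_aligned);
//         acc += x * y;                        // -> _S_multiplies, _S_plus
//       }
//     return stdx::reduce(acc);                // -> _S_reduce
//   }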