// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2022 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.
24b1e83836Smrg 25b1e83836Smrg #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_ 26b1e83836Smrg #define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_ 27b1e83836Smrg 28b1e83836Smrg #if __cplusplus >= 201703L 29b1e83836Smrg 30b1e83836Smrg #if !_GLIBCXX_SIMD_HAVE_NEON 31b1e83836Smrg #error "simd_neon.h may only be included when NEON on ARM is available" 32b1e83836Smrg #endif 33b1e83836Smrg 34b1e83836Smrg _GLIBCXX_SIMD_BEGIN_NAMESPACE 35b1e83836Smrg 36b1e83836Smrg // _CommonImplNeon {{{ 37b1e83836Smrg struct _CommonImplNeon : _CommonImplBuiltin 38b1e83836Smrg { 39b1e83836Smrg // _S_store {{{ 40b1e83836Smrg using _CommonImplBuiltin::_S_store; 41b1e83836Smrg 42b1e83836Smrg // }}} 43b1e83836Smrg }; 44b1e83836Smrg 45b1e83836Smrg // }}} 46b1e83836Smrg // _SimdImplNeon {{{ 47b1e83836Smrg template <typename _Abi, typename> 48b1e83836Smrg struct _SimdImplNeon : _SimdImplBuiltin<_Abi> 49b1e83836Smrg { 50b1e83836Smrg using _Base = _SimdImplBuiltin<_Abi>; 51b1e83836Smrg 52b1e83836Smrg template <typename _Tp> 53b1e83836Smrg using _MaskMember = typename _Base::template _MaskMember<_Tp>; 54b1e83836Smrg 55b1e83836Smrg template <typename _Tp> 56b1e83836Smrg static constexpr size_t _S_max_store_size = 16; 57b1e83836Smrg 58b1e83836Smrg // _S_masked_load {{{ 59b1e83836Smrg template <typename _Tp, size_t _Np, typename _Up> 60b1e83836Smrg static inline _SimdWrapper<_Tp, _Np> _S_masked_load_SimdImplNeon61b1e83836Smrg _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, 62b1e83836Smrg const _Up* __mem) noexcept 63b1e83836Smrg { 64*0a307195Smrg __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 65b1e83836Smrg if (__k[__i] != 0) 66b1e83836Smrg __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); 67b1e83836Smrg }); 68b1e83836Smrg return __merge; 69b1e83836Smrg } 70b1e83836Smrg 71b1e83836Smrg // }}} 72b1e83836Smrg // _S_masked_store_nocvt {{{ 73b1e83836Smrg template <typename _Tp, size_t _Np> 74b1e83836Smrg _GLIBCXX_SIMD_INTRINSIC static void 
_S_masked_store_nocvt_SimdImplNeon75b1e83836Smrg _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, 76b1e83836Smrg _MaskMember<_Tp> __k) 77b1e83836Smrg { 78*0a307195Smrg __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 79b1e83836Smrg if (__k[__i] != 0) 80b1e83836Smrg __mem[__i] = __v[__i]; 81b1e83836Smrg }); 82b1e83836Smrg } 83b1e83836Smrg 84b1e83836Smrg // }}} 85b1e83836Smrg // _S_reduce {{{ 86b1e83836Smrg template <typename _Tp, typename _BinaryOperation> 87*0a307195Smrg _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp _S_reduce_SimdImplNeon88b1e83836Smrg _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 89b1e83836Smrg { 90*0a307195Smrg if (not __builtin_is_constant_evaluated()) 91*0a307195Smrg { 92b1e83836Smrg constexpr size_t _Np = __x.size(); 93b1e83836Smrg if constexpr (sizeof(__x) == 16 && _Np >= 4 94b1e83836Smrg && !_Abi::template _S_is_partial<_Tp>) 95b1e83836Smrg { 96b1e83836Smrg const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x); 97b1e83836Smrg const auto __y = __binary_op(__halves[0], __halves[1]); 98b1e83836Smrg return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce( 99b1e83836Smrg __y, static_cast<_BinaryOperation&&>(__binary_op)); 100b1e83836Smrg } 101b1e83836Smrg else if constexpr (_Np == 8) 102b1e83836Smrg { 103b1e83836Smrg __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>( 104*0a307195Smrg __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(__x._M_data))); 105b1e83836Smrg __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>( 106*0a307195Smrg __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(__x._M_data))); 107b1e83836Smrg __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>( 108*0a307195Smrg __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(__x._M_data))); 109b1e83836Smrg return __x[0]; 110b1e83836Smrg } 111b1e83836Smrg else if constexpr (_Np == 4) 112b1e83836Smrg { 113*0a307195Smrg __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>( 114b1e83836Smrg __vector_permute<1, 0, 3, 
2>(__x._M_data))); 115*0a307195Smrg __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>( 116b1e83836Smrg __vector_permute<3, 2, 1, 0>(__x._M_data))); 117b1e83836Smrg return __x[0]; 118b1e83836Smrg } 119b1e83836Smrg else if constexpr (_Np == 2) 120b1e83836Smrg { 121b1e83836Smrg __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>( 122b1e83836Smrg __vector_permute<1, 0>(__x._M_data))); 123b1e83836Smrg return __x[0]; 124b1e83836Smrg } 125*0a307195Smrg } 126*0a307195Smrg return _Base::_S_reduce(__x, static_cast<_BinaryOperation&&>(__binary_op)); 127b1e83836Smrg } 128b1e83836Smrg 129b1e83836Smrg // }}} 130b1e83836Smrg // math {{{ 131b1e83836Smrg // _S_sqrt {{{ 132b1e83836Smrg template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 133*0a307195Smrg _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt_SimdImplNeon134*0a307195Smrg _S_sqrt(_Tp __x) 135b1e83836Smrg { 136b1e83836Smrg if constexpr (__have_neon_a64) 137b1e83836Smrg { 138b1e83836Smrg const auto __intrin = __to_intrin(__x); 139b1e83836Smrg if constexpr (_TVT::template _S_is<float, 2>) 140b1e83836Smrg return vsqrt_f32(__intrin); 141b1e83836Smrg else if constexpr (_TVT::template _S_is<float, 4>) 142b1e83836Smrg return vsqrtq_f32(__intrin); 143b1e83836Smrg else if constexpr (_TVT::template _S_is<double, 1>) 144b1e83836Smrg return vsqrt_f64(__intrin); 145b1e83836Smrg else if constexpr (_TVT::template _S_is<double, 2>) 146b1e83836Smrg return vsqrtq_f64(__intrin); 147b1e83836Smrg else 148b1e83836Smrg __assert_unreachable<_Tp>(); 149b1e83836Smrg } 150b1e83836Smrg else 151b1e83836Smrg return _Base::_S_sqrt(__x); 152b1e83836Smrg } 153b1e83836Smrg 154b1e83836Smrg // }}} 155b1e83836Smrg // _S_trunc {{{ 156b1e83836Smrg template <typename _TW, typename _TVT = _VectorTraits<_TW>> 157*0a307195Smrg _GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc_SimdImplNeon158*0a307195Smrg _S_trunc(_TW __x) 159b1e83836Smrg { 160b1e83836Smrg using _Tp = typename _TVT::value_type; 161b1e83836Smrg if constexpr (__have_neon_a32) 
162b1e83836Smrg { 163b1e83836Smrg const auto __intrin = __to_intrin(__x); 164b1e83836Smrg if constexpr (_TVT::template _S_is<float, 2>) 165b1e83836Smrg return vrnd_f32(__intrin); 166b1e83836Smrg else if constexpr (_TVT::template _S_is<float, 4>) 167b1e83836Smrg return vrndq_f32(__intrin); 168b1e83836Smrg else if constexpr (_TVT::template _S_is<double, 1>) 169b1e83836Smrg return vrnd_f64(__intrin); 170b1e83836Smrg else if constexpr (_TVT::template _S_is<double, 2>) 171b1e83836Smrg return vrndq_f64(__intrin); 172b1e83836Smrg else 173b1e83836Smrg __assert_unreachable<_Tp>(); 174b1e83836Smrg } 175b1e83836Smrg else if constexpr (is_same_v<_Tp, float>) 176b1e83836Smrg { 177b1e83836Smrg auto __intrin = __to_intrin(__x); 178b1e83836Smrg if constexpr (sizeof(__x) == 16) 179b1e83836Smrg __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin)); 180b1e83836Smrg else 181b1e83836Smrg __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin)); 182b1e83836Smrg return _Base::_S_abs(__x)._M_data < 0x1p23f 183b1e83836Smrg ? 
__vector_bitcast<float>(__intrin) 184b1e83836Smrg : __x._M_data; 185b1e83836Smrg } 186b1e83836Smrg else 187b1e83836Smrg return _Base::_S_trunc(__x); 188b1e83836Smrg } 189b1e83836Smrg 190b1e83836Smrg // }}} 191b1e83836Smrg // _S_round {{{ 192b1e83836Smrg template <typename _Tp, size_t _Np> 193b1e83836Smrg _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> _S_round_SimdImplNeon194b1e83836Smrg _S_round(_SimdWrapper<_Tp, _Np> __x) 195b1e83836Smrg { 196b1e83836Smrg if constexpr (__have_neon_a32) 197b1e83836Smrg { 198b1e83836Smrg const auto __intrin = __to_intrin(__x); 199b1e83836Smrg if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8) 200b1e83836Smrg return vrnda_f32(__intrin); 201b1e83836Smrg else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16) 202b1e83836Smrg return vrndaq_f32(__intrin); 203b1e83836Smrg else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8) 204b1e83836Smrg return vrnda_f64(__intrin); 205b1e83836Smrg else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16) 206b1e83836Smrg return vrndaq_f64(__intrin); 207b1e83836Smrg else 208b1e83836Smrg __assert_unreachable<_Tp>(); 209b1e83836Smrg } 210b1e83836Smrg else 211b1e83836Smrg return _Base::_S_round(__x); 212b1e83836Smrg } 213b1e83836Smrg 214b1e83836Smrg // }}} 215b1e83836Smrg // _S_floor {{{ 216b1e83836Smrg template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 217*0a307195Smrg _GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor_SimdImplNeon218*0a307195Smrg _S_floor(_Tp __x) 219b1e83836Smrg { 220b1e83836Smrg if constexpr (__have_neon_a32) 221b1e83836Smrg { 222b1e83836Smrg const auto __intrin = __to_intrin(__x); 223b1e83836Smrg if constexpr (_TVT::template _S_is<float, 2>) 224b1e83836Smrg return vrndm_f32(__intrin); 225b1e83836Smrg else if constexpr (_TVT::template _S_is<float, 4>) 226b1e83836Smrg return vrndmq_f32(__intrin); 227b1e83836Smrg else if constexpr (_TVT::template _S_is<double, 1>) 228b1e83836Smrg return vrndm_f64(__intrin); 229b1e83836Smrg else if constexpr (_TVT::template _S_is<double, 2>) 
230b1e83836Smrg return vrndmq_f64(__intrin); 231b1e83836Smrg else 232b1e83836Smrg __assert_unreachable<_Tp>(); 233b1e83836Smrg } 234b1e83836Smrg else 235b1e83836Smrg return _Base::_S_floor(__x); 236b1e83836Smrg } 237b1e83836Smrg 238b1e83836Smrg // }}} 239b1e83836Smrg // _S_ceil {{{ 240b1e83836Smrg template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 241*0a307195Smrg _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil_SimdImplNeon242*0a307195Smrg _S_ceil(_Tp __x) 243b1e83836Smrg { 244b1e83836Smrg if constexpr (__have_neon_a32) 245b1e83836Smrg { 246b1e83836Smrg const auto __intrin = __to_intrin(__x); 247b1e83836Smrg if constexpr (_TVT::template _S_is<float, 2>) 248b1e83836Smrg return vrndp_f32(__intrin); 249b1e83836Smrg else if constexpr (_TVT::template _S_is<float, 4>) 250b1e83836Smrg return vrndpq_f32(__intrin); 251b1e83836Smrg else if constexpr (_TVT::template _S_is<double, 1>) 252b1e83836Smrg return vrndp_f64(__intrin); 253b1e83836Smrg else if constexpr (_TVT::template _S_is<double, 2>) 254b1e83836Smrg return vrndpq_f64(__intrin); 255b1e83836Smrg else 256b1e83836Smrg __assert_unreachable<_Tp>(); 257b1e83836Smrg } 258b1e83836Smrg else 259b1e83836Smrg return _Base::_S_ceil(__x); 260b1e83836Smrg } 261b1e83836Smrg 262b1e83836Smrg //}}} }}} 263b1e83836Smrg }; // }}} 264b1e83836Smrg // _MaskImplNeonMixin {{{ 265b1e83836Smrg struct _MaskImplNeonMixin 266b1e83836Smrg { 267b1e83836Smrg using _Base = _MaskImplBuiltinMixin; 268b1e83836Smrg 269b1e83836Smrg template <typename _Tp, size_t _Np> 270b1e83836Smrg _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> _S_to_bits_MaskImplNeonMixin271b1e83836Smrg _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 272b1e83836Smrg { 273b1e83836Smrg if (__builtin_is_constant_evaluated()) 274b1e83836Smrg return _Base::_S_to_bits(__x); 275b1e83836Smrg 276b1e83836Smrg using _I = __int_for_sizeof_t<_Tp>; 277b1e83836Smrg if constexpr (sizeof(__x) == 16) 278b1e83836Smrg { 279b1e83836Smrg auto __asint = __vector_bitcast<_I>(__x); 280b1e83836Smrg 
#ifdef __aarch64__ 281b1e83836Smrg [[maybe_unused]] constexpr auto __zero = decltype(__asint)(); 282b1e83836Smrg #else 283b1e83836Smrg [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))(); 284b1e83836Smrg #endif 285b1e83836Smrg if constexpr (sizeof(_Tp) == 1) 286b1e83836Smrg { 287b1e83836Smrg constexpr auto __bitsel 288b1e83836Smrg = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>( 289*0a307195Smrg [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 290b1e83836Smrg return static_cast<_I>( 291b1e83836Smrg __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0); 292b1e83836Smrg }); 293b1e83836Smrg __asint &= __bitsel; 294b1e83836Smrg #ifdef __aarch64__ 295b1e83836Smrg return __vector_bitcast<_UShort>( 296b1e83836Smrg vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero), 297b1e83836Smrg __zero))[0]; 298b1e83836Smrg #else 299b1e83836Smrg return __vector_bitcast<_UShort>( 300b1e83836Smrg vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)), 301b1e83836Smrg __zero), 302b1e83836Smrg __zero))[0]; 303b1e83836Smrg #endif 304b1e83836Smrg } 305b1e83836Smrg else if constexpr (sizeof(_Tp) == 2) 306b1e83836Smrg { 307b1e83836Smrg constexpr auto __bitsel 308b1e83836Smrg = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>( 309*0a307195Smrg [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 310b1e83836Smrg return static_cast<_I>(__i < _Np ? 
1 << __i : 0); 311b1e83836Smrg }); 312b1e83836Smrg __asint &= __bitsel; 313b1e83836Smrg #ifdef __aarch64__ 314b1e83836Smrg return vaddvq_s16(__asint); 315b1e83836Smrg #else 316b1e83836Smrg return vpadd_s16( 317b1e83836Smrg vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero), 318b1e83836Smrg __zero)[0]; 319b1e83836Smrg #endif 320b1e83836Smrg } 321b1e83836Smrg else if constexpr (sizeof(_Tp) == 4) 322b1e83836Smrg { 323b1e83836Smrg constexpr auto __bitsel 324b1e83836Smrg = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>( 325*0a307195Smrg [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 326b1e83836Smrg return static_cast<_I>(__i < _Np ? 1 << __i : 0); 327b1e83836Smrg }); 328b1e83836Smrg __asint &= __bitsel; 329b1e83836Smrg #ifdef __aarch64__ 330b1e83836Smrg return vaddvq_s32(__asint); 331b1e83836Smrg #else 332b1e83836Smrg return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)), 333b1e83836Smrg __zero)[0]; 334b1e83836Smrg #endif 335b1e83836Smrg } 336b1e83836Smrg else if constexpr (sizeof(_Tp) == 8) 337b1e83836Smrg return (__asint[0] & 1) | (__asint[1] & 2); 338b1e83836Smrg else 339b1e83836Smrg __assert_unreachable<_Tp>(); 340b1e83836Smrg } 341b1e83836Smrg else if constexpr (sizeof(__x) == 8) 342b1e83836Smrg { 343b1e83836Smrg auto __asint = __vector_bitcast<_I>(__x); 344b1e83836Smrg [[maybe_unused]] constexpr auto __zero = decltype(__asint)(); 345b1e83836Smrg if constexpr (sizeof(_Tp) == 1) 346b1e83836Smrg { 347b1e83836Smrg constexpr auto __bitsel 348b1e83836Smrg = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>( 349*0a307195Smrg [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 350b1e83836Smrg return static_cast<_I>(__i < _Np ? 
1 << __i : 0); 351b1e83836Smrg }); 352b1e83836Smrg __asint &= __bitsel; 353b1e83836Smrg #ifdef __aarch64__ 354b1e83836Smrg return vaddv_s8(__asint); 355b1e83836Smrg #else 356b1e83836Smrg return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero), 357b1e83836Smrg __zero)[0]; 358b1e83836Smrg #endif 359b1e83836Smrg } 360b1e83836Smrg else if constexpr (sizeof(_Tp) == 2) 361b1e83836Smrg { 362b1e83836Smrg constexpr auto __bitsel 363b1e83836Smrg = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>( 364*0a307195Smrg [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 365b1e83836Smrg return static_cast<_I>(__i < _Np ? 1 << __i : 0); 366b1e83836Smrg }); 367b1e83836Smrg __asint &= __bitsel; 368b1e83836Smrg #ifdef __aarch64__ 369b1e83836Smrg return vaddv_s16(__asint); 370b1e83836Smrg #else 371b1e83836Smrg return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0]; 372b1e83836Smrg #endif 373b1e83836Smrg } 374b1e83836Smrg else if constexpr (sizeof(_Tp) == 4) 375b1e83836Smrg { 376b1e83836Smrg __asint &= __make_vector<_I>(0x1, 0x2); 377b1e83836Smrg #ifdef __aarch64__ 378b1e83836Smrg return vaddv_s32(__asint); 379b1e83836Smrg #else 380b1e83836Smrg return vpadd_s32(__asint, __zero)[0]; 381b1e83836Smrg #endif 382b1e83836Smrg } 383b1e83836Smrg else 384b1e83836Smrg __assert_unreachable<_Tp>(); 385b1e83836Smrg } 386b1e83836Smrg else 387b1e83836Smrg return _Base::_S_to_bits(__x); 388b1e83836Smrg } 389b1e83836Smrg }; 390b1e83836Smrg 391b1e83836Smrg // }}} 392b1e83836Smrg // _MaskImplNeon {{{ 393b1e83836Smrg template <typename _Abi, typename> 394b1e83836Smrg struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi> 395b1e83836Smrg { 396b1e83836Smrg using _MaskImplBuiltinMixin::_S_to_maskvector; 397b1e83836Smrg using _MaskImplNeonMixin::_S_to_bits; 398b1e83836Smrg using _Base = _MaskImplBuiltin<_Abi>; 399b1e83836Smrg using _Base::_S_convert; 400b1e83836Smrg 401b1e83836Smrg // _S_all_of {{{ 402b1e83836Smrg template <typename _Tp> 403*0a307195Smrg _GLIBCXX_SIMD_INTRINSIC static 
bool _S_all_of_MaskImplNeon404*0a307195Smrg _S_all_of(simd_mask<_Tp, _Abi> __k) 405b1e83836Smrg { 406b1e83836Smrg const auto __kk 407b1e83836Smrg = __vector_bitcast<char>(__k._M_data) 408b1e83836Smrg | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>()); 409b1e83836Smrg if constexpr (sizeof(__k) == 16) 410b1e83836Smrg { 411b1e83836Smrg const auto __x = __vector_bitcast<long long>(__kk); 412b1e83836Smrg return __x[0] + __x[1] == -2; 413b1e83836Smrg } 414b1e83836Smrg else if constexpr (sizeof(__k) <= 8) 415b1e83836Smrg return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1; 416b1e83836Smrg else 417b1e83836Smrg __assert_unreachable<_Tp>(); 418b1e83836Smrg } 419b1e83836Smrg 420b1e83836Smrg // }}} 421b1e83836Smrg // _S_any_of {{{ 422b1e83836Smrg template <typename _Tp> 423*0a307195Smrg _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of_MaskImplNeon424*0a307195Smrg _S_any_of(simd_mask<_Tp, _Abi> __k) 425b1e83836Smrg { 426b1e83836Smrg const auto __kk 427b1e83836Smrg = __vector_bitcast<char>(__k._M_data) 428b1e83836Smrg | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>()); 429b1e83836Smrg if constexpr (sizeof(__k) == 16) 430b1e83836Smrg { 431b1e83836Smrg const auto __x = __vector_bitcast<long long>(__kk); 432b1e83836Smrg return (__x[0] | __x[1]) != 0; 433b1e83836Smrg } 434b1e83836Smrg else if constexpr (sizeof(__k) <= 8) 435b1e83836Smrg return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0; 436b1e83836Smrg else 437b1e83836Smrg __assert_unreachable<_Tp>(); 438b1e83836Smrg } 439b1e83836Smrg 440b1e83836Smrg // }}} 441b1e83836Smrg // _S_none_of {{{ 442b1e83836Smrg template <typename _Tp> 443*0a307195Smrg _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of_MaskImplNeon444*0a307195Smrg _S_none_of(simd_mask<_Tp, _Abi> __k) 445b1e83836Smrg { 446b1e83836Smrg const auto __kk = _Abi::_S_masked(__k._M_data); 447b1e83836Smrg if constexpr (sizeof(__k) == 16) 448b1e83836Smrg { 449b1e83836Smrg const auto __x = __vector_bitcast<long long>(__kk); 
450b1e83836Smrg return (__x[0] | __x[1]) == 0; 451b1e83836Smrg } 452b1e83836Smrg else if constexpr (sizeof(__k) <= 8) 453b1e83836Smrg return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0; 454b1e83836Smrg else 455b1e83836Smrg __assert_unreachable<_Tp>(); 456b1e83836Smrg } 457b1e83836Smrg 458b1e83836Smrg // }}} 459b1e83836Smrg // _S_some_of {{{ 460b1e83836Smrg template <typename _Tp> _S_some_of_MaskImplNeon461b1e83836Smrg _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k) 462b1e83836Smrg { 463b1e83836Smrg if constexpr (sizeof(__k) <= 8) 464b1e83836Smrg { 465b1e83836Smrg const auto __kk = __vector_bitcast<char>(__k._M_data) 466b1e83836Smrg | ~__vector_bitcast<char>( 467b1e83836Smrg _Abi::template _S_implicit_mask<_Tp>()); 468b1e83836Smrg using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>; 469b1e83836Smrg return __bit_cast<_Up>(__kk) + 1 > 1; 470b1e83836Smrg } 471b1e83836Smrg else 472b1e83836Smrg return _Base::_S_some_of(__k); 473b1e83836Smrg } 474b1e83836Smrg 475b1e83836Smrg // }}} 476b1e83836Smrg // _S_popcount {{{ 477b1e83836Smrg template <typename _Tp> 478*0a307195Smrg _GLIBCXX_SIMD_INTRINSIC static int _S_popcount_MaskImplNeon479*0a307195Smrg _S_popcount(simd_mask<_Tp, _Abi> __k) 480b1e83836Smrg { 481b1e83836Smrg if constexpr (sizeof(_Tp) == 1) 482b1e83836Smrg { 483b1e83836Smrg const auto __s8 = __vector_bitcast<_SChar>(__k._M_data); 484b1e83836Smrg int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8); 485b1e83836Smrg return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()), 486b1e83836Smrg int8x8_t())[0]; 487b1e83836Smrg } 488b1e83836Smrg else if constexpr (sizeof(_Tp) == 2) 489b1e83836Smrg { 490b1e83836Smrg const auto __s16 = __vector_bitcast<short>(__k._M_data); 491b1e83836Smrg int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16); 492b1e83836Smrg return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0]; 493b1e83836Smrg } 494b1e83836Smrg else if constexpr (sizeof(_Tp) == 4) 495b1e83836Smrg { 496b1e83836Smrg 
const auto __s32 = __vector_bitcast<int>(__k._M_data); 497b1e83836Smrg int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32); 498b1e83836Smrg return -vpadd_s32(__tmp, int32x2_t())[0]; 499b1e83836Smrg } 500b1e83836Smrg else if constexpr (sizeof(_Tp) == 8) 501b1e83836Smrg { 502b1e83836Smrg static_assert(sizeof(__k) == 16); 503b1e83836Smrg const auto __s64 = __vector_bitcast<long>(__k._M_data); 504b1e83836Smrg return -(__s64[0] + __s64[1]); 505b1e83836Smrg } 506b1e83836Smrg } 507b1e83836Smrg 508b1e83836Smrg // }}} 509b1e83836Smrg // _S_find_first_set {{{ 510b1e83836Smrg template <typename _Tp> 511b1e83836Smrg _GLIBCXX_SIMD_INTRINSIC static int _S_find_first_set_MaskImplNeon512b1e83836Smrg _S_find_first_set(simd_mask<_Tp, _Abi> __k) 513b1e83836Smrg { 514b1e83836Smrg // TODO: the _Base implementation is not optimal for NEON 515b1e83836Smrg return _Base::_S_find_first_set(__k); 516b1e83836Smrg } 517b1e83836Smrg 518b1e83836Smrg // }}} 519b1e83836Smrg // _S_find_last_set {{{ 520b1e83836Smrg template <typename _Tp> 521b1e83836Smrg _GLIBCXX_SIMD_INTRINSIC static int _S_find_last_set_MaskImplNeon522b1e83836Smrg _S_find_last_set(simd_mask<_Tp, _Abi> __k) 523b1e83836Smrg { 524b1e83836Smrg // TODO: the _Base implementation is not optimal for NEON 525b1e83836Smrg return _Base::_S_find_last_set(__k); 526b1e83836Smrg } 527b1e83836Smrg 528b1e83836Smrg // }}} 529b1e83836Smrg }; // }}} 530b1e83836Smrg 531b1e83836Smrg _GLIBCXX_SIMD_END_NAMESPACE 532b1e83836Smrg #endif // __cplusplus >= 201703L 533b1e83836Smrg #endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_ 534b1e83836Smrg // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80 535