xref: /netbsd-src/external/gpl3/gcc/dist/libstdc++-v3/include/experimental/bits/simd_neon.h (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2022 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

25b1e83836Smrg #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
26b1e83836Smrg #define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
27b1e83836Smrg 
28b1e83836Smrg #if __cplusplus >= 201703L
29b1e83836Smrg 
30b1e83836Smrg #if !_GLIBCXX_SIMD_HAVE_NEON
31b1e83836Smrg #error "simd_neon.h may only be included when NEON on ARM is available"
32b1e83836Smrg #endif
33b1e83836Smrg 
34b1e83836Smrg _GLIBCXX_SIMD_BEGIN_NAMESPACE
35b1e83836Smrg 
36b1e83836Smrg // _CommonImplNeon {{{
37b1e83836Smrg struct _CommonImplNeon : _CommonImplBuiltin
38b1e83836Smrg {
39b1e83836Smrg   // _S_store {{{
40b1e83836Smrg   using _CommonImplBuiltin::_S_store;
41b1e83836Smrg 
42b1e83836Smrg   // }}}
43b1e83836Smrg };
44b1e83836Smrg 
45b1e83836Smrg // }}}
46b1e83836Smrg // _SimdImplNeon {{{
47b1e83836Smrg template <typename _Abi, typename>
48b1e83836Smrg   struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
49b1e83836Smrg   {
50b1e83836Smrg     using _Base = _SimdImplBuiltin<_Abi>;
51b1e83836Smrg 
52b1e83836Smrg     template <typename _Tp>
53b1e83836Smrg       using _MaskMember = typename _Base::template _MaskMember<_Tp>;
54b1e83836Smrg 
55b1e83836Smrg     template <typename _Tp>
56b1e83836Smrg       static constexpr size_t _S_max_store_size = 16;
57b1e83836Smrg 
58b1e83836Smrg     // _S_masked_load {{{
59b1e83836Smrg     template <typename _Tp, size_t _Np, typename _Up>
60b1e83836Smrg       static inline _SimdWrapper<_Tp, _Np>
_S_masked_load_SimdImplNeon61b1e83836Smrg       _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
62b1e83836Smrg 		     const _Up* __mem) noexcept
63b1e83836Smrg       {
64*0a307195Smrg 	__execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
65b1e83836Smrg 	  if (__k[__i] != 0)
66b1e83836Smrg 	    __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
67b1e83836Smrg 	});
68b1e83836Smrg 	return __merge;
69b1e83836Smrg       }
70b1e83836Smrg 
71b1e83836Smrg     // }}}
72b1e83836Smrg     // _S_masked_store_nocvt {{{
73b1e83836Smrg     template <typename _Tp, size_t _Np>
74b1e83836Smrg       _GLIBCXX_SIMD_INTRINSIC static void
_S_masked_store_nocvt_SimdImplNeon75b1e83836Smrg       _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
76b1e83836Smrg 			    _MaskMember<_Tp> __k)
77b1e83836Smrg       {
78*0a307195Smrg 	__execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
79b1e83836Smrg 	  if (__k[__i] != 0)
80b1e83836Smrg 	    __mem[__i] = __v[__i];
81b1e83836Smrg 	});
82b1e83836Smrg       }
83b1e83836Smrg 
84b1e83836Smrg     // }}}
85b1e83836Smrg     // _S_reduce {{{
86b1e83836Smrg     template <typename _Tp, typename _BinaryOperation>
87*0a307195Smrg       _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
_S_reduce_SimdImplNeon88b1e83836Smrg       _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
89b1e83836Smrg       {
90*0a307195Smrg 	if (not __builtin_is_constant_evaluated())
91*0a307195Smrg 	  {
92b1e83836Smrg 	    constexpr size_t _Np = __x.size();
93b1e83836Smrg 	    if constexpr (sizeof(__x) == 16 && _Np >= 4
94b1e83836Smrg 			    && !_Abi::template _S_is_partial<_Tp>)
95b1e83836Smrg 	      {
96b1e83836Smrg 		const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
97b1e83836Smrg 		const auto __y = __binary_op(__halves[0], __halves[1]);
98b1e83836Smrg 		return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
99b1e83836Smrg 			 __y, static_cast<_BinaryOperation&&>(__binary_op));
100b1e83836Smrg 	      }
101b1e83836Smrg 	    else if constexpr (_Np == 8)
102b1e83836Smrg 	      {
103b1e83836Smrg 		__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
104*0a307195Smrg 					 __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(__x._M_data)));
105b1e83836Smrg 		__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
106*0a307195Smrg 					 __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(__x._M_data)));
107b1e83836Smrg 		__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
108*0a307195Smrg 					 __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(__x._M_data)));
109b1e83836Smrg 		return __x[0];
110b1e83836Smrg 	      }
111b1e83836Smrg 	    else if constexpr (_Np == 4)
112b1e83836Smrg 	      {
113*0a307195Smrg 		__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
114b1e83836Smrg 					 __vector_permute<1, 0, 3, 2>(__x._M_data)));
115*0a307195Smrg 		__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
116b1e83836Smrg 					 __vector_permute<3, 2, 1, 0>(__x._M_data)));
117b1e83836Smrg 		return __x[0];
118b1e83836Smrg 	      }
119b1e83836Smrg 	    else if constexpr (_Np == 2)
120b1e83836Smrg 	      {
121b1e83836Smrg 		__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
122b1e83836Smrg 					 __vector_permute<1, 0>(__x._M_data)));
123b1e83836Smrg 		return __x[0];
124b1e83836Smrg 	      }
125*0a307195Smrg 	  }
126*0a307195Smrg 	return _Base::_S_reduce(__x, static_cast<_BinaryOperation&&>(__binary_op));
127b1e83836Smrg       }
128b1e83836Smrg 
129b1e83836Smrg     // }}}
130b1e83836Smrg     // math {{{
131b1e83836Smrg     // _S_sqrt {{{
132b1e83836Smrg     template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
133*0a307195Smrg       _GLIBCXX_SIMD_INTRINSIC static _Tp
_S_sqrt_SimdImplNeon134*0a307195Smrg       _S_sqrt(_Tp __x)
135b1e83836Smrg       {
136b1e83836Smrg 	if constexpr (__have_neon_a64)
137b1e83836Smrg 	  {
138b1e83836Smrg 	    const auto __intrin = __to_intrin(__x);
139b1e83836Smrg 	    if constexpr (_TVT::template _S_is<float, 2>)
140b1e83836Smrg 	      return vsqrt_f32(__intrin);
141b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<float, 4>)
142b1e83836Smrg 	      return vsqrtq_f32(__intrin);
143b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<double, 1>)
144b1e83836Smrg 	      return vsqrt_f64(__intrin);
145b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<double, 2>)
146b1e83836Smrg 	      return vsqrtq_f64(__intrin);
147b1e83836Smrg 	    else
148b1e83836Smrg 	      __assert_unreachable<_Tp>();
149b1e83836Smrg 	  }
150b1e83836Smrg 	else
151b1e83836Smrg 	  return _Base::_S_sqrt(__x);
152b1e83836Smrg       }
153b1e83836Smrg 
154b1e83836Smrg     // }}}
155b1e83836Smrg     // _S_trunc {{{
156b1e83836Smrg     template <typename _TW, typename _TVT = _VectorTraits<_TW>>
157*0a307195Smrg       _GLIBCXX_SIMD_INTRINSIC static _TW
_S_trunc_SimdImplNeon158*0a307195Smrg       _S_trunc(_TW __x)
159b1e83836Smrg       {
160b1e83836Smrg 	using _Tp = typename _TVT::value_type;
161b1e83836Smrg 	if constexpr (__have_neon_a32)
162b1e83836Smrg 	  {
163b1e83836Smrg 	    const auto __intrin = __to_intrin(__x);
164b1e83836Smrg 	    if constexpr (_TVT::template _S_is<float, 2>)
165b1e83836Smrg 	      return vrnd_f32(__intrin);
166b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<float, 4>)
167b1e83836Smrg 	      return vrndq_f32(__intrin);
168b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<double, 1>)
169b1e83836Smrg 	      return vrnd_f64(__intrin);
170b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<double, 2>)
171b1e83836Smrg 	      return vrndq_f64(__intrin);
172b1e83836Smrg 	    else
173b1e83836Smrg 	      __assert_unreachable<_Tp>();
174b1e83836Smrg 	  }
175b1e83836Smrg 	else if constexpr (is_same_v<_Tp, float>)
176b1e83836Smrg 	  {
177b1e83836Smrg 	    auto __intrin = __to_intrin(__x);
178b1e83836Smrg 	    if constexpr (sizeof(__x) == 16)
179b1e83836Smrg 	      __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
180b1e83836Smrg 	    else
181b1e83836Smrg 	      __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
182b1e83836Smrg 	    return _Base::_S_abs(__x)._M_data < 0x1p23f
183b1e83836Smrg 		     ? __vector_bitcast<float>(__intrin)
184b1e83836Smrg 		     : __x._M_data;
185b1e83836Smrg 	  }
186b1e83836Smrg 	else
187b1e83836Smrg 	  return _Base::_S_trunc(__x);
188b1e83836Smrg       }
189b1e83836Smrg 
190b1e83836Smrg     // }}}
191b1e83836Smrg     // _S_round {{{
192b1e83836Smrg     template <typename _Tp, size_t _Np>
193b1e83836Smrg       _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
_S_round_SimdImplNeon194b1e83836Smrg       _S_round(_SimdWrapper<_Tp, _Np> __x)
195b1e83836Smrg       {
196b1e83836Smrg 	if constexpr (__have_neon_a32)
197b1e83836Smrg 	  {
198b1e83836Smrg 	    const auto __intrin = __to_intrin(__x);
199b1e83836Smrg 	    if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
200b1e83836Smrg 	      return vrnda_f32(__intrin);
201b1e83836Smrg 	    else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
202b1e83836Smrg 	      return vrndaq_f32(__intrin);
203b1e83836Smrg 	    else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
204b1e83836Smrg 	      return vrnda_f64(__intrin);
205b1e83836Smrg 	    else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
206b1e83836Smrg 	      return vrndaq_f64(__intrin);
207b1e83836Smrg 	    else
208b1e83836Smrg 	      __assert_unreachable<_Tp>();
209b1e83836Smrg 	  }
210b1e83836Smrg 	else
211b1e83836Smrg 	  return _Base::_S_round(__x);
212b1e83836Smrg       }
213b1e83836Smrg 
214b1e83836Smrg     // }}}
215b1e83836Smrg     // _S_floor {{{
216b1e83836Smrg     template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
217*0a307195Smrg       _GLIBCXX_SIMD_INTRINSIC static _Tp
_S_floor_SimdImplNeon218*0a307195Smrg       _S_floor(_Tp __x)
219b1e83836Smrg       {
220b1e83836Smrg 	if constexpr (__have_neon_a32)
221b1e83836Smrg 	  {
222b1e83836Smrg 	    const auto __intrin = __to_intrin(__x);
223b1e83836Smrg 	    if constexpr (_TVT::template _S_is<float, 2>)
224b1e83836Smrg 	      return vrndm_f32(__intrin);
225b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<float, 4>)
226b1e83836Smrg 	      return vrndmq_f32(__intrin);
227b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<double, 1>)
228b1e83836Smrg 	      return vrndm_f64(__intrin);
229b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<double, 2>)
230b1e83836Smrg 	      return vrndmq_f64(__intrin);
231b1e83836Smrg 	    else
232b1e83836Smrg 	      __assert_unreachable<_Tp>();
233b1e83836Smrg 	  }
234b1e83836Smrg 	else
235b1e83836Smrg 	  return _Base::_S_floor(__x);
236b1e83836Smrg       }
237b1e83836Smrg 
238b1e83836Smrg     // }}}
239b1e83836Smrg     // _S_ceil {{{
240b1e83836Smrg     template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
241*0a307195Smrg       _GLIBCXX_SIMD_INTRINSIC static _Tp
_S_ceil_SimdImplNeon242*0a307195Smrg       _S_ceil(_Tp __x)
243b1e83836Smrg       {
244b1e83836Smrg 	if constexpr (__have_neon_a32)
245b1e83836Smrg 	  {
246b1e83836Smrg 	    const auto __intrin = __to_intrin(__x);
247b1e83836Smrg 	    if constexpr (_TVT::template _S_is<float, 2>)
248b1e83836Smrg 	      return vrndp_f32(__intrin);
249b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<float, 4>)
250b1e83836Smrg 	      return vrndpq_f32(__intrin);
251b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<double, 1>)
252b1e83836Smrg 	      return vrndp_f64(__intrin);
253b1e83836Smrg 	    else if constexpr (_TVT::template _S_is<double, 2>)
254b1e83836Smrg 	      return vrndpq_f64(__intrin);
255b1e83836Smrg 	    else
256b1e83836Smrg 	      __assert_unreachable<_Tp>();
257b1e83836Smrg 	  }
258b1e83836Smrg 	else
259b1e83836Smrg 	  return _Base::_S_ceil(__x);
260b1e83836Smrg       }
261b1e83836Smrg 
262b1e83836Smrg     //}}} }}}
263b1e83836Smrg   }; // }}}
264b1e83836Smrg // _MaskImplNeonMixin {{{
265b1e83836Smrg struct _MaskImplNeonMixin
266b1e83836Smrg {
267b1e83836Smrg   using _Base = _MaskImplBuiltinMixin;
268b1e83836Smrg 
269b1e83836Smrg   template <typename _Tp, size_t _Np>
270b1e83836Smrg     _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
_S_to_bits_MaskImplNeonMixin271b1e83836Smrg     _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
272b1e83836Smrg     {
273b1e83836Smrg       if (__builtin_is_constant_evaluated())
274b1e83836Smrg 	return _Base::_S_to_bits(__x);
275b1e83836Smrg 
276b1e83836Smrg       using _I = __int_for_sizeof_t<_Tp>;
277b1e83836Smrg       if constexpr (sizeof(__x) == 16)
278b1e83836Smrg 	{
279b1e83836Smrg 	  auto __asint = __vector_bitcast<_I>(__x);
280b1e83836Smrg #ifdef __aarch64__
281b1e83836Smrg 	  [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
282b1e83836Smrg #else
283b1e83836Smrg 	  [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
284b1e83836Smrg #endif
285b1e83836Smrg 	  if constexpr (sizeof(_Tp) == 1)
286b1e83836Smrg 	    {
287b1e83836Smrg 	      constexpr auto __bitsel
288b1e83836Smrg 		= __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
289*0a307195Smrg 		  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
290b1e83836Smrg 		    return static_cast<_I>(
291b1e83836Smrg 		      __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
292b1e83836Smrg 		  });
293b1e83836Smrg 	      __asint &= __bitsel;
294b1e83836Smrg #ifdef __aarch64__
295b1e83836Smrg 	      return __vector_bitcast<_UShort>(
296b1e83836Smrg 		vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
297b1e83836Smrg 			  __zero))[0];
298b1e83836Smrg #else
299b1e83836Smrg 	      return __vector_bitcast<_UShort>(
300b1e83836Smrg 		vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
301b1e83836Smrg 				  __zero),
302b1e83836Smrg 			 __zero))[0];
303b1e83836Smrg #endif
304b1e83836Smrg 	    }
305b1e83836Smrg 	  else if constexpr (sizeof(_Tp) == 2)
306b1e83836Smrg 	    {
307b1e83836Smrg 	      constexpr auto __bitsel
308b1e83836Smrg 		= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
309*0a307195Smrg 		  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
310b1e83836Smrg 		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
311b1e83836Smrg 		  });
312b1e83836Smrg 	      __asint &= __bitsel;
313b1e83836Smrg #ifdef __aarch64__
314b1e83836Smrg 	      return vaddvq_s16(__asint);
315b1e83836Smrg #else
316b1e83836Smrg 	      return vpadd_s16(
317b1e83836Smrg 		vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
318b1e83836Smrg 		__zero)[0];
319b1e83836Smrg #endif
320b1e83836Smrg 	    }
321b1e83836Smrg 	  else if constexpr (sizeof(_Tp) == 4)
322b1e83836Smrg 	    {
323b1e83836Smrg 	      constexpr auto __bitsel
324b1e83836Smrg 		= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
325*0a307195Smrg 		  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
326b1e83836Smrg 		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
327b1e83836Smrg 		  });
328b1e83836Smrg 	      __asint &= __bitsel;
329b1e83836Smrg #ifdef __aarch64__
330b1e83836Smrg 	      return vaddvq_s32(__asint);
331b1e83836Smrg #else
332b1e83836Smrg 	      return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
333b1e83836Smrg 			       __zero)[0];
334b1e83836Smrg #endif
335b1e83836Smrg 	    }
336b1e83836Smrg 	  else if constexpr (sizeof(_Tp) == 8)
337b1e83836Smrg 	    return (__asint[0] & 1) | (__asint[1] & 2);
338b1e83836Smrg 	  else
339b1e83836Smrg 	    __assert_unreachable<_Tp>();
340b1e83836Smrg 	}
341b1e83836Smrg       else if constexpr (sizeof(__x) == 8)
342b1e83836Smrg 	{
343b1e83836Smrg 	  auto __asint = __vector_bitcast<_I>(__x);
344b1e83836Smrg 	  [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
345b1e83836Smrg 	  if constexpr (sizeof(_Tp) == 1)
346b1e83836Smrg 	    {
347b1e83836Smrg 	      constexpr auto __bitsel
348b1e83836Smrg 		= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
349*0a307195Smrg 		  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
350b1e83836Smrg 		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
351b1e83836Smrg 		  });
352b1e83836Smrg 	      __asint &= __bitsel;
353b1e83836Smrg #ifdef __aarch64__
354b1e83836Smrg 	      return vaddv_s8(__asint);
355b1e83836Smrg #else
356b1e83836Smrg 	      return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
357b1e83836Smrg 			      __zero)[0];
358b1e83836Smrg #endif
359b1e83836Smrg 	    }
360b1e83836Smrg 	  else if constexpr (sizeof(_Tp) == 2)
361b1e83836Smrg 	    {
362b1e83836Smrg 	      constexpr auto __bitsel
363b1e83836Smrg 		= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
364*0a307195Smrg 		  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
365b1e83836Smrg 		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
366b1e83836Smrg 		  });
367b1e83836Smrg 	      __asint &= __bitsel;
368b1e83836Smrg #ifdef __aarch64__
369b1e83836Smrg 	      return vaddv_s16(__asint);
370b1e83836Smrg #else
371b1e83836Smrg 	      return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
372b1e83836Smrg #endif
373b1e83836Smrg 	    }
374b1e83836Smrg 	  else if constexpr (sizeof(_Tp) == 4)
375b1e83836Smrg 	    {
376b1e83836Smrg 	      __asint &= __make_vector<_I>(0x1, 0x2);
377b1e83836Smrg #ifdef __aarch64__
378b1e83836Smrg 	      return vaddv_s32(__asint);
379b1e83836Smrg #else
380b1e83836Smrg 	      return vpadd_s32(__asint, __zero)[0];
381b1e83836Smrg #endif
382b1e83836Smrg 	    }
383b1e83836Smrg 	  else
384b1e83836Smrg 	    __assert_unreachable<_Tp>();
385b1e83836Smrg 	}
386b1e83836Smrg       else
387b1e83836Smrg 	return _Base::_S_to_bits(__x);
388b1e83836Smrg     }
389b1e83836Smrg };
390b1e83836Smrg 
391b1e83836Smrg // }}}
392b1e83836Smrg // _MaskImplNeon {{{
393b1e83836Smrg template <typename _Abi, typename>
394b1e83836Smrg   struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
395b1e83836Smrg   {
396b1e83836Smrg     using _MaskImplBuiltinMixin::_S_to_maskvector;
397b1e83836Smrg     using _MaskImplNeonMixin::_S_to_bits;
398b1e83836Smrg     using _Base = _MaskImplBuiltin<_Abi>;
399b1e83836Smrg     using _Base::_S_convert;
400b1e83836Smrg 
401b1e83836Smrg     // _S_all_of {{{
402b1e83836Smrg     template <typename _Tp>
403*0a307195Smrg       _GLIBCXX_SIMD_INTRINSIC static bool
_S_all_of_MaskImplNeon404*0a307195Smrg       _S_all_of(simd_mask<_Tp, _Abi> __k)
405b1e83836Smrg       {
406b1e83836Smrg 	const auto __kk
407b1e83836Smrg 	  = __vector_bitcast<char>(__k._M_data)
408b1e83836Smrg 	    | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
409b1e83836Smrg 	if constexpr (sizeof(__k) == 16)
410b1e83836Smrg 	  {
411b1e83836Smrg 	    const auto __x = __vector_bitcast<long long>(__kk);
412b1e83836Smrg 	    return __x[0] + __x[1] == -2;
413b1e83836Smrg 	  }
414b1e83836Smrg 	else if constexpr (sizeof(__k) <= 8)
415b1e83836Smrg 	  return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
416b1e83836Smrg 	else
417b1e83836Smrg 	  __assert_unreachable<_Tp>();
418b1e83836Smrg       }
419b1e83836Smrg 
420b1e83836Smrg     // }}}
421b1e83836Smrg     // _S_any_of {{{
422b1e83836Smrg     template <typename _Tp>
423*0a307195Smrg       _GLIBCXX_SIMD_INTRINSIC static bool
_S_any_of_MaskImplNeon424*0a307195Smrg       _S_any_of(simd_mask<_Tp, _Abi> __k)
425b1e83836Smrg       {
426b1e83836Smrg 	const auto __kk
427b1e83836Smrg 	  = __vector_bitcast<char>(__k._M_data)
428b1e83836Smrg 	    | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
429b1e83836Smrg 	if constexpr (sizeof(__k) == 16)
430b1e83836Smrg 	  {
431b1e83836Smrg 	    const auto __x = __vector_bitcast<long long>(__kk);
432b1e83836Smrg 	    return (__x[0] | __x[1]) != 0;
433b1e83836Smrg 	  }
434b1e83836Smrg 	else if constexpr (sizeof(__k) <= 8)
435b1e83836Smrg 	  return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
436b1e83836Smrg 	else
437b1e83836Smrg 	  __assert_unreachable<_Tp>();
438b1e83836Smrg       }
439b1e83836Smrg 
440b1e83836Smrg     // }}}
441b1e83836Smrg     // _S_none_of {{{
442b1e83836Smrg     template <typename _Tp>
443*0a307195Smrg       _GLIBCXX_SIMD_INTRINSIC static bool
_S_none_of_MaskImplNeon444*0a307195Smrg       _S_none_of(simd_mask<_Tp, _Abi> __k)
445b1e83836Smrg       {
446b1e83836Smrg 	const auto __kk = _Abi::_S_masked(__k._M_data);
447b1e83836Smrg 	if constexpr (sizeof(__k) == 16)
448b1e83836Smrg 	  {
449b1e83836Smrg 	    const auto __x = __vector_bitcast<long long>(__kk);
450b1e83836Smrg 	    return (__x[0] | __x[1]) == 0;
451b1e83836Smrg 	  }
452b1e83836Smrg 	else if constexpr (sizeof(__k) <= 8)
453b1e83836Smrg 	  return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
454b1e83836Smrg 	else
455b1e83836Smrg 	  __assert_unreachable<_Tp>();
456b1e83836Smrg       }
457b1e83836Smrg 
458b1e83836Smrg     // }}}
459b1e83836Smrg     // _S_some_of {{{
460b1e83836Smrg     template <typename _Tp>
_S_some_of_MaskImplNeon461b1e83836Smrg       _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
462b1e83836Smrg       {
463b1e83836Smrg 	if constexpr (sizeof(__k) <= 8)
464b1e83836Smrg 	  {
465b1e83836Smrg 	    const auto __kk = __vector_bitcast<char>(__k._M_data)
466b1e83836Smrg 			      | ~__vector_bitcast<char>(
467b1e83836Smrg 				_Abi::template _S_implicit_mask<_Tp>());
468b1e83836Smrg 	    using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
469b1e83836Smrg 	    return __bit_cast<_Up>(__kk) + 1 > 1;
470b1e83836Smrg 	  }
471b1e83836Smrg 	else
472b1e83836Smrg 	  return _Base::_S_some_of(__k);
473b1e83836Smrg       }
474b1e83836Smrg 
475b1e83836Smrg     // }}}
476b1e83836Smrg     // _S_popcount {{{
477b1e83836Smrg     template <typename _Tp>
478*0a307195Smrg       _GLIBCXX_SIMD_INTRINSIC static int
_S_popcount_MaskImplNeon479*0a307195Smrg       _S_popcount(simd_mask<_Tp, _Abi> __k)
480b1e83836Smrg       {
481b1e83836Smrg 	if constexpr (sizeof(_Tp) == 1)
482b1e83836Smrg 	  {
483b1e83836Smrg 	    const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
484b1e83836Smrg 	    int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
485b1e83836Smrg 	    return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
486b1e83836Smrg 			     int8x8_t())[0];
487b1e83836Smrg 	  }
488b1e83836Smrg 	else if constexpr (sizeof(_Tp) == 2)
489b1e83836Smrg 	  {
490b1e83836Smrg 	    const auto __s16 = __vector_bitcast<short>(__k._M_data);
491b1e83836Smrg 	    int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
492b1e83836Smrg 	    return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
493b1e83836Smrg 	  }
494b1e83836Smrg 	else if constexpr (sizeof(_Tp) == 4)
495b1e83836Smrg 	  {
496b1e83836Smrg 	    const auto __s32 = __vector_bitcast<int>(__k._M_data);
497b1e83836Smrg 	    int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
498b1e83836Smrg 	    return -vpadd_s32(__tmp, int32x2_t())[0];
499b1e83836Smrg 	  }
500b1e83836Smrg 	else if constexpr (sizeof(_Tp) == 8)
501b1e83836Smrg 	  {
502b1e83836Smrg 	    static_assert(sizeof(__k) == 16);
503b1e83836Smrg 	    const auto __s64 = __vector_bitcast<long>(__k._M_data);
504b1e83836Smrg 	    return -(__s64[0] + __s64[1]);
505b1e83836Smrg 	  }
506b1e83836Smrg       }
507b1e83836Smrg 
508b1e83836Smrg     // }}}
509b1e83836Smrg     // _S_find_first_set {{{
510b1e83836Smrg     template <typename _Tp>
511b1e83836Smrg       _GLIBCXX_SIMD_INTRINSIC static int
_S_find_first_set_MaskImplNeon512b1e83836Smrg       _S_find_first_set(simd_mask<_Tp, _Abi> __k)
513b1e83836Smrg       {
514b1e83836Smrg 	// TODO: the _Base implementation is not optimal for NEON
515b1e83836Smrg 	return _Base::_S_find_first_set(__k);
516b1e83836Smrg       }
517b1e83836Smrg 
518b1e83836Smrg     // }}}
519b1e83836Smrg     // _S_find_last_set {{{
520b1e83836Smrg     template <typename _Tp>
521b1e83836Smrg       _GLIBCXX_SIMD_INTRINSIC static int
_S_find_last_set_MaskImplNeon522b1e83836Smrg       _S_find_last_set(simd_mask<_Tp, _Abi> __k)
523b1e83836Smrg       {
524b1e83836Smrg 	// TODO: the _Base implementation is not optimal for NEON
525b1e83836Smrg 	return _Base::_S_find_last_set(__k);
526b1e83836Smrg       }
527b1e83836Smrg 
528b1e83836Smrg     // }}}
529b1e83836Smrg   }; // }}}
530b1e83836Smrg 
531b1e83836Smrg _GLIBCXX_SIMD_END_NAMESPACE
532b1e83836Smrg #endif // __cplusplus >= 201703L
533b1e83836Smrg #endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
534b1e83836Smrg // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
535