10fca6ea1SDimitry Andric //===----------------------------------------------------------------------===// 20fca6ea1SDimitry Andric // 30fca6ea1SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40fca6ea1SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50fca6ea1SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60fca6ea1SDimitry Andric // 70fca6ea1SDimitry Andric //===----------------------------------------------------------------------===// 80fca6ea1SDimitry Andric 90fca6ea1SDimitry Andric #ifndef _LIBCPP___ALGORITHM_SIMD_UTILS_H 100fca6ea1SDimitry Andric #define _LIBCPP___ALGORITHM_SIMD_UTILS_H 110fca6ea1SDimitry Andric 120fca6ea1SDimitry Andric #include <__algorithm/min.h> 130fca6ea1SDimitry Andric #include <__bit/bit_cast.h> 140fca6ea1SDimitry Andric #include <__bit/countl.h> 150fca6ea1SDimitry Andric #include <__bit/countr.h> 160fca6ea1SDimitry Andric #include <__config> 170fca6ea1SDimitry Andric #include <__type_traits/is_arithmetic.h> 180fca6ea1SDimitry Andric #include <__type_traits/is_same.h> 190fca6ea1SDimitry Andric #include <__utility/integer_sequence.h> 200fca6ea1SDimitry Andric #include <cstddef> 210fca6ea1SDimitry Andric #include <cstdint> 220fca6ea1SDimitry Andric 230fca6ea1SDimitry Andric #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 240fca6ea1SDimitry Andric # pragma GCC system_header 250fca6ea1SDimitry Andric #endif 260fca6ea1SDimitry Andric 270fca6ea1SDimitry Andric _LIBCPP_PUSH_MACROS 280fca6ea1SDimitry Andric #include <__undef_macros> 290fca6ea1SDimitry Andric 300fca6ea1SDimitry Andric // TODO: Find out how altivec changes things and allow vectorizations there too. 31*6846ab2fSDimitry Andric #if _LIBCPP_STD_VER >= 14 && (defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 1500) && !defined(__ALTIVEC__) 320fca6ea1SDimitry Andric # define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 1 330fca6ea1SDimitry Andric #else 340fca6ea1SDimitry Andric # define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 0 350fca6ea1SDimitry Andric #endif 360fca6ea1SDimitry Andric 370fca6ea1SDimitry Andric #if _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS && !defined(__OPTIMIZE_SIZE__) 380fca6ea1SDimitry Andric # define _LIBCPP_VECTORIZE_ALGORITHMS 1 390fca6ea1SDimitry Andric #else 400fca6ea1SDimitry Andric # define _LIBCPP_VECTORIZE_ALGORITHMS 0 410fca6ea1SDimitry Andric #endif 420fca6ea1SDimitry Andric 430fca6ea1SDimitry Andric #if _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 440fca6ea1SDimitry Andric 450fca6ea1SDimitry Andric _LIBCPP_BEGIN_NAMESPACE_STD 460fca6ea1SDimitry Andric 470fca6ea1SDimitry Andric template <class _Tp> 480fca6ea1SDimitry Andric inline constexpr bool __can_map_to_integer_v = 490fca6ea1SDimitry Andric sizeof(_Tp) == alignof(_Tp) && (sizeof(_Tp) == 1 || sizeof(_Tp) == 2 || sizeof(_Tp) == 4 || sizeof(_Tp) == 8); 500fca6ea1SDimitry Andric 510fca6ea1SDimitry Andric template <size_t _TypeSize> 520fca6ea1SDimitry Andric struct __get_as_integer_type_impl; 530fca6ea1SDimitry Andric 540fca6ea1SDimitry Andric template <> 550fca6ea1SDimitry Andric struct __get_as_integer_type_impl<1> { 560fca6ea1SDimitry Andric using type = uint8_t; 570fca6ea1SDimitry Andric }; 580fca6ea1SDimitry Andric 590fca6ea1SDimitry Andric template <> 600fca6ea1SDimitry Andric struct __get_as_integer_type_impl<2> { 610fca6ea1SDimitry Andric using type = uint16_t; 620fca6ea1SDimitry Andric }; 630fca6ea1SDimitry Andric template <> 640fca6ea1SDimitry Andric struct __get_as_integer_type_impl<4> { 650fca6ea1SDimitry Andric using type = uint32_t; 660fca6ea1SDimitry Andric }; 670fca6ea1SDimitry Andric template <> 680fca6ea1SDimitry Andric struct __get_as_integer_type_impl<8> { 690fca6ea1SDimitry Andric using type = uint64_t; 700fca6ea1SDimitry Andric }; 710fca6ea1SDimitry Andric 720fca6ea1SDimitry Andric template <class _Tp> 730fca6ea1SDimitry Andric using __get_as_integer_type_t = typename __get_as_integer_type_impl<sizeof(_Tp)>::type; 740fca6ea1SDimitry Andric 750fca6ea1SDimitry Andric // This isn't specialized for 64 byte vectors on purpose. They have the potential to significantly reduce performance 760fca6ea1SDimitry Andric // in mixed simd/non-simd workloads and don't provide any performance improvement for currently vectorized algorithms 770fca6ea1SDimitry Andric // as far as benchmarks are concerned. 780fca6ea1SDimitry Andric # if defined(__AVX__) || defined(__MVS__) 790fca6ea1SDimitry Andric template <class _Tp> 800fca6ea1SDimitry Andric inline constexpr size_t __native_vector_size = 32 / sizeof(_Tp); 810fca6ea1SDimitry Andric # elif defined(__SSE__) || defined(__ARM_NEON__) 820fca6ea1SDimitry Andric template <class _Tp> 830fca6ea1SDimitry Andric inline constexpr size_t __native_vector_size = 16 / sizeof(_Tp); 840fca6ea1SDimitry Andric # elif defined(__MMX__) 850fca6ea1SDimitry Andric template <class _Tp> 860fca6ea1SDimitry Andric inline constexpr size_t __native_vector_size = 8 / sizeof(_Tp); 870fca6ea1SDimitry Andric # else 880fca6ea1SDimitry Andric template <class _Tp> 890fca6ea1SDimitry Andric inline constexpr size_t __native_vector_size = 1; 900fca6ea1SDimitry Andric # endif 910fca6ea1SDimitry Andric 920fca6ea1SDimitry Andric template <class _ArithmeticT, size_t _Np> 930fca6ea1SDimitry Andric using __simd_vector __attribute__((__ext_vector_type__(_Np))) = _ArithmeticT; 940fca6ea1SDimitry Andric 950fca6ea1SDimitry Andric template <class _VecT> 960fca6ea1SDimitry Andric inline constexpr size_t __simd_vector_size_v = []<bool _False = false>() -> size_t { 970fca6ea1SDimitry Andric static_assert(_False, "Not a vector!"); 980fca6ea1SDimitry Andric }(); 990fca6ea1SDimitry Andric 1000fca6ea1SDimitry Andric template <class _Tp, size_t _Np> 1010fca6ea1SDimitry Andric inline constexpr size_t __simd_vector_size_v<__simd_vector<_Tp, _Np>> = _Np; 1020fca6ea1SDimitry Andric 1030fca6ea1SDimitry Andric template <class _Tp, size_t _Np> 1040fca6ea1SDimitry Andric _LIBCPP_HIDE_FROM_ABI _Tp __simd_vector_underlying_type_impl(__simd_vector<_Tp, _Np>) { 1050fca6ea1SDimitry Andric return _Tp{}; 1060fca6ea1SDimitry Andric } 1070fca6ea1SDimitry Andric 1080fca6ea1SDimitry Andric template <class _VecT> 1090fca6ea1SDimitry Andric using __simd_vector_underlying_type_t = decltype(std::__simd_vector_underlying_type_impl(_VecT{})); 1100fca6ea1SDimitry Andric 1110fca6ea1SDimitry Andric // This isn't inlined without always_inline when loading chars. 1120fca6ea1SDimitry Andric template <class _VecT, class _Iter> 1130fca6ea1SDimitry Andric _LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(_Iter __iter) noexcept { 1140fca6ea1SDimitry Andric return [=]<size_t... _Indices>(index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept { 1150fca6ea1SDimitry Andric return _VecT{__iter[_Indices]...}; 1160fca6ea1SDimitry Andric }(make_index_sequence<__simd_vector_size_v<_VecT>>{}); 1170fca6ea1SDimitry Andric } 1180fca6ea1SDimitry Andric 1190fca6ea1SDimitry Andric template <class _Tp, size_t _Np> 1200fca6ea1SDimitry Andric _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept { 1210fca6ea1SDimitry Andric return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>)); 1220fca6ea1SDimitry Andric } 1230fca6ea1SDimitry Andric 1240fca6ea1SDimitry Andric template <class _Tp, size_t _Np> 1250fca6ea1SDimitry Andric _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept { 1260fca6ea1SDimitry Andric using __mask_vec = __simd_vector<bool, _Np>; 1270fca6ea1SDimitry Andric 1280fca6ea1SDimitry Andric // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876 1290fca6ea1SDimitry Andric auto __impl = [&]<class _MaskT>(_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept { 1300fca6ea1SDimitry Andric # if defined(_LIBCPP_BIG_ENDIAN) 1310fca6ea1SDimitry Andric return std::min<size_t>( 1320fca6ea1SDimitry Andric _Np, std::__countl_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec)))); 1330fca6ea1SDimitry Andric # else 1340fca6ea1SDimitry Andric return std::min<size_t>( 1350fca6ea1SDimitry Andric _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec)))); 1360fca6ea1SDimitry Andric # endif 1370fca6ea1SDimitry Andric }; 1380fca6ea1SDimitry Andric 1390fca6ea1SDimitry Andric if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) { 1400fca6ea1SDimitry Andric return __impl(uint8_t{}); 1410fca6ea1SDimitry Andric } else if constexpr (sizeof(__mask_vec) == sizeof(uint16_t)) { 1420fca6ea1SDimitry Andric return __impl(uint16_t{}); 1430fca6ea1SDimitry Andric } else if constexpr (sizeof(__mask_vec) == sizeof(uint32_t)) { 1440fca6ea1SDimitry Andric return __impl(uint32_t{}); 1450fca6ea1SDimitry Andric } else if constexpr (sizeof(__mask_vec) == sizeof(uint64_t)) { 1460fca6ea1SDimitry Andric return __impl(uint64_t{}); 1470fca6ea1SDimitry Andric } else { 1480fca6ea1SDimitry Andric static_assert(sizeof(__mask_vec) == 0, "unexpected required size for mask integer type"); 1490fca6ea1SDimitry Andric return 0; 1500fca6ea1SDimitry Andric } 1510fca6ea1SDimitry Andric } 1520fca6ea1SDimitry Andric 1530fca6ea1SDimitry Andric template <class _Tp, size_t _Np> 1540fca6ea1SDimitry Andric _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_not_set(__simd_vector<_Tp, _Np> __vec) noexcept { 1550fca6ea1SDimitry Andric return std::__find_first_set(~__vec); 1560fca6ea1SDimitry Andric } 1570fca6ea1SDimitry Andric 1580fca6ea1SDimitry Andric _LIBCPP_END_NAMESPACE_STD 1590fca6ea1SDimitry Andric 1600fca6ea1SDimitry Andric #endif // _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 1610fca6ea1SDimitry Andric 1620fca6ea1SDimitry Andric _LIBCPP_POP_MACROS 1630fca6ea1SDimitry Andric 1640fca6ea1SDimitry Andric #endif // _LIBCPP___ALGORITHM_SIMD_UTILS_H 165