//===-- Generic implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks.
// Depending on the requested size, the block operation uses unsigned integral
// types, vector types or an array of the type with the maximum size.
//
// The maximum size is passed as a template argument. For instance, on x86
// platforms that only support integral types the maximum size would be 8
// (corresponding to uint64_t). On this platform, if we request the size 32, it
// would be treated as a cpp::array<uint64_t, 4>.
//
// On the other hand, if the platform is x86 with support for AVX the maximum
// size is 32 and the operation can be handled with a single native operation.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/common.h"
#include "src/__support/endian_internal.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h"
#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT64
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/utils.h"

#include <stdint.h>

static_assert((UINTPTR_MAX == 4294967295U) ||
                  (UINTPTR_MAX == 18446744073709551615UL),
              "We currently only support 32- or 64-bit platforms");

namespace LIBC_NAMESPACE_DECL {
// Compiler types using the vector attributes.
using generic_v128 = uint8_t __attribute__((__vector_size__(16)));
using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE_DECL

namespace LIBC_NAMESPACE_DECL {
namespace generic {

// We accept three types of values as elements for generic operations:
// - scalar : unsigned integral types,
// - vector : compiler types using the vector attributes or platform builtins,
// - array  : a cpp::array<T, N> where T is itself either a scalar or a vector.
// The following traits help discriminate between these cases.

template <typename T> struct is_scalar : cpp::false_type {};
template <> struct is_scalar<uint8_t> : cpp::true_type {};
template <> struct is_scalar<uint16_t> : cpp::true_type {};
template <> struct is_scalar<uint32_t> : cpp::true_type {};
#ifdef LIBC_TYPES_HAS_INT64
template <> struct is_scalar<uint64_t> : cpp::true_type {};
#endif // LIBC_TYPES_HAS_INT64
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
template <typename T> constexpr bool is_scalar_v = is_scalar<T>::value;

template <typename T> struct is_vector : cpp::false_type {};
template <> struct is_vector<generic_v128> : cpp::true_type {};
template <> struct is_vector<generic_v256> : cpp::true_type {};
template <> struct is_vector<generic_v512> : cpp::true_type {};
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
template <typename T> constexpr bool is_vector_v = is_vector<T>::value;

template <class T> struct is_array : cpp::false_type {};
template <class T, size_t N> struct is_array<cpp::array<T, N>> {
  // Meant to match std::numeric_limits interface.
  // NOLINTNEXTLINE(readability-identifier-naming)
  static constexpr bool value = is_scalar_v<T> || is_vector_v<T>;
};
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
template <typename T> constexpr bool is_array_v = is_array<T>::value;

// Meant to match std::numeric_limits interface.
// NOLINTBEGIN(readability-identifier-naming)
template <typename T>
constexpr bool is_element_type_v =
    is_scalar_v<T> || is_vector_v<T> || is_array_v<T>;
// NOLINTEND(readability-identifier-naming)

// Helper struct to retrieve the number of elements of an array.
template <class T> struct array_size {};
template <class T, size_t N>
struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {};
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
template <typename T> constexpr size_t array_size_v = array_size<T>::value;
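
// Illustrative sanity checks for the three categories above. These are
// examples only (added for clarity); uint32_t, generic_v128 and
// cpp::array<uint32_t, 4> are sample types, not an exhaustive list.
static_assert(is_scalar_v<uint32_t> && !is_vector_v<uint32_t>);
static_assert(is_vector_v<generic_v128> && !is_scalar_v<generic_v128>);
static_assert(is_array_v<cpp::array<uint32_t, 4>>);
static_assert(array_size_v<cpp::array<uint32_t, 4>> == 4);
static_assert(is_element_type_v<uint32_t> && is_element_type_v<generic_v128> &&
              is_element_type_v<cpp::array<generic_v128, 2>>);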

// Generic operations for the above type categories.

template <typename T> T load(CPtr src) {
  static_assert(is_element_type_v<T>);
  if constexpr (is_scalar_v<T> || is_vector_v<T>) {
    return ::LIBC_NAMESPACE::load<T>(src);
  } else if constexpr (is_array_v<T>) {
    using value_type = typename T::value_type;
    T value;
    for (size_t i = 0; i < array_size_v<T>; ++i)
      value[i] = load<value_type>(src + (i * sizeof(value_type)));
    return value;
  }
}

template <typename T> void store(Ptr dst, T value) {
  static_assert(is_element_type_v<T>);
  if constexpr (is_scalar_v<T> || is_vector_v<T>) {
    ::LIBC_NAMESPACE::store<T>(dst, value);
  } else if constexpr (is_array_v<T>) {
    using value_type = typename T::value_type;
    for (size_t i = 0; i < array_size_v<T>; ++i)
      store<value_type>(dst + (i * sizeof(value_type)), value[i]);
  }
}

template <typename T> T splat(uint8_t value) {
  static_assert(is_scalar_v<T> || is_vector_v<T>);
  if constexpr (is_scalar_v<T>)
    return T(~0) / T(0xFF) * T(value);
  else if constexpr (is_vector_v<T>) {
    T out;
    // This for loop is optimized out for vector types.
    for (size_t i = 0; i < sizeof(T); ++i)
      out[i] = value;
    return out;
  }
}

///////////////////////////////////////////////////////////////////////////////
// Memset
///////////////////////////////////////////////////////////////////////////////

template <typename T> struct Memset {
  static_assert(is_element_type_v<T>);
  static constexpr size_t SIZE = sizeof(T);

  LIBC_INLINE static void block(Ptr dst, uint8_t value) {
    if constexpr (is_scalar_v<T> || is_vector_v<T>) {
      store<T>(dst, splat<T>(value));
    } else if constexpr (is_array_v<T>) {
      using value_type = typename T::value_type;
      const auto Splat = splat<value_type>(value);
      for (size_t i = 0; i < array_size_v<T>; ++i)
        store<value_type>(dst + (i * sizeof(value_type)), Splat);
    }
  }

  LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }

  LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) {
    block(dst, value);
    tail(dst, value, count);
  }

  LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
                                               size_t count, size_t offset) {
    static_assert(SIZE > 1, "a loop of size 1 does not need tail");
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, value, count);
  }

  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    return loop_and_tail_offset(dst, value, count, 0);
  }
};
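
// Example (illustrative only): with T = uint64_t (SIZE == 8), a call such as
// Memset<uint64_t>::head_tail(dst, value, 11) issues two possibly overlapping
// 8-byte stores covering [0, 8) and [3, 11), so any count in [SIZE, 2 * SIZE]
// is handled without a loop. A dispatcher built on top of this header might
// contain a hypothetical snippet like the following (assuming counts below
// SIZE were already handled):
//   if (count <= 2 * sizeof(uint64_t))
//     return Memset<uint64_t>::head_tail(dst, value, count);
//   return Memset<uint64_t>::loop_and_tail(dst, value, count);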

template <typename T, typename... TS> struct MemsetSequence {
  static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
  LIBC_INLINE static void block(Ptr dst, uint8_t value) {
    Memset<T>::block(dst, value);
    if constexpr (sizeof...(TS) > 0)
      return MemsetSequence<TS...>::block(dst + sizeof(T), value);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memmove
///////////////////////////////////////////////////////////////////////////////

template <typename T> struct Memmove {
  static_assert(is_element_type_v<T>);
  static constexpr size_t SIZE = sizeof(T);

  LIBC_INLINE static void block(Ptr dst, CPtr src) {
    store<T>(dst, load<T>(src));
  }

  LIBC_INLINE static void head_tail(Ptr dst, CPtr src, size_t count) {
    const size_t offset = count - SIZE;
    // The load and store operations can be performed in any order as long as
    // they are not interleaved. More investigations are needed to determine
    // the best order.
    const auto head = load<T>(src);
    const auto tail = load<T>(src + offset);
    store<T>(dst, head);
    store<T>(dst + offset, tail);
  }

  // Aligning forward is suitable when dst < src. The alignment is performed
  // with a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes forward, we make sure src is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [___________LLLLLLLA______________]
  // [_SSSSSSSS________________________]
  // [________SSSSSSSS_________________]
  //
  // e.g. Moving two bytes forward, we make sure dst is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [______LLLLLLLL___________________]
  // [_SSSSSSSS________________________]
  // [___SSSSSSSA______________________]
  template <Arg AlignOn>
  LIBC_INLINE static void align_forward(Ptr &dst, CPtr &src, size_t &count) {
    Ptr prev_dst = dst;
    CPtr prev_src = src;
    size_t prev_count = count;
    align_to_next_boundary<SIZE, AlignOn>(dst, src, count);
    adjust(SIZE, dst, src, count);
    head_tail(prev_dst, prev_src, prev_count - count);
  }

  // Aligning backward is suitable when dst > src. The alignment is performed
  // with a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes backward, we make sure src is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [ _________________ALLLLLLL_______]
  // [ ___________________LLLLLLLL_____]
  // [____________________SSSSSSSS_____]
  // [______________________SSSSSSSS___]
  //
  // e.g. Moving two bytes backward, we make sure dst is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [ _______________LLLLLLLL_________]
  // [ ___________________LLLLLLLL_____]
  // [__________________ASSSSSSS_______]
  // [______________________SSSSSSSS___]
  template <Arg AlignOn>
  LIBC_INLINE static void align_backward(Ptr &dst, CPtr &src, size_t &count) {
    Ptr headtail_dst = dst + count;
    CPtr headtail_src = src + count;
    size_t headtail_size = 0;
    align_to_next_boundary<SIZE, AlignOn>(headtail_dst, headtail_src,
                                          headtail_size);
    adjust(-2 * SIZE, headtail_dst, headtail_src, headtail_size);
    head_tail(headtail_dst, headtail_src, headtail_size);
    count -= headtail_size;
  }
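
  // Example (illustrative only): a copy routine built on these blocks would
  // typically pick the direction from the pointer order, align one of the two
  // pointers, then run the matching loop below. Assuming Arg::Src selects
  // alignment on the source pointer, a hypothetical caller could do:
  //   if (dst < src) {
  //     align_forward<Arg::Src>(dst, src, count);
  //     return loop_and_tail_forward(dst, src, count);
  //   } else {
  //     align_backward<Arg::Src>(dst, src, count);
  //     return loop_and_tail_backward(dst, src, count);
  //   }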

  // Moving forward is suitable when dst < src. We load the tail bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [   |       |       |       |       |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [_________________________LLLLLLLL___]
  // [___LLLLLLLL_________________________]
  // [_SSSSSSSS___________________________]
  // [___________LLLLLLLL_________________]
  // [_________SSSSSSSS___________________]
  // [___________________LLLLLLLL_________]
  // [_________________SSSSSSSS___________]
  // [_______________________SSSSSSSS_____]
  LIBC_INLINE static void loop_and_tail_forward(Ptr dst, CPtr src,
                                                size_t count) {
    static_assert(SIZE > 1, "a loop of size 1 does not need tail");
    const size_t tail_offset = count - SIZE;
    const auto tail_value = load<T>(src + tail_offset);
    size_t offset = 0;
    LIBC_LOOP_NOUNROLL
    do {
      block(dst + offset, src + offset);
      offset += SIZE;
    } while (offset < count - SIZE);
    store<T>(dst + tail_offset, tail_value);
  }

  // Moving backward is suitable when dst > src. We load the head bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [   |       |       |       |       |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [___LLLLLLLL_________________________]
  // [_________________________LLLLLLLL___]
  // [___________________________SSSSSSSS_]
  // [_________________LLLLLLLL___________]
  // [___________________SSSSSSSS_________]
  // [_________LLLLLLLL___________________]
  // [___________SSSSSSSS_________________]
  // [_____SSSSSSSS_______________________]
  LIBC_INLINE static void loop_and_tail_backward(Ptr dst, CPtr src,
                                                 size_t count) {
    static_assert(SIZE > 1, "a loop of size 1 does not need tail");
    const auto head_value = load<T>(src);
    ptrdiff_t offset = count - SIZE;
    LIBC_LOOP_NOUNROLL
    do {
      block(dst + offset, src + offset);
      offset -= SIZE;
    } while (offset >= 0);
    store<T>(dst, head_value);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Low level operations for Bcmp and Memcmp that operate on memory locations.
///////////////////////////////////////////////////////////////////////////////

// Same as load above but with an offset to the pointer.
// Making the offset explicit encourages the compiler to use the relevant
// addressing mode consistently.
template <typename T> LIBC_INLINE T load(CPtr ptr, size_t offset) {
  return ::LIBC_NAMESPACE::load<T>(ptr + offset);
}

// Same as above but also makes sure the loaded value is in big endian format.
// This is useful when implementing lexicographic comparisons as big endian
// scalar comparison directly maps to lexicographic byte comparisons.
template <typename T> LIBC_INLINE T load_be(CPtr ptr, size_t offset) {
  return Endian::to_big_endian(load<T>(ptr, offset));
}

// Equality: returns true iff values at locations (p1 + offset) and (p2 +
// offset) compare equal.
template <typename T> LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset);

// Not equals: returns non-zero iff values at locations (p1 + offset) and (p2 +
// offset) differ.
template <typename T> LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset);

// Lexicographic comparison:
// - returns 0 iff values at locations (p1 + offset) and (p2 + offset) compare
//   equal.
// - returns a negative value if value at location (p1 + offset) is
//   lexicographically less than value at (p2 + offset).
// - returns a positive value if value at location (p1 + offset) is
//   lexicographically greater than value at (p2 + offset).
template <typename T>
LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset);

// Lexicographic comparison of non-equal values:
// - returns a negative value if value at location (p1 + offset) is
//   lexicographically less than value at (p2 + offset).
// - returns a positive value if value at location (p1 + offset) is
//   lexicographically greater than value at (p2 + offset).
template <typename T>
LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset);
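
// Example (illustrative only): on a little-endian machine the byte sequence
// {0x01, 0x02, 0x03, 0x04} loads as the uint32_t 0x04030201, so a plain
// integer comparison does not follow byte-wise (lexicographic) order.
// load_be<uint32_t> byte-swaps the value to 0x01020304, after which an
// ordinary unsigned comparison of load_be<uint32_t>(p1, offset) and
// load_be<uint32_t>(p2, offset) agrees with memcmp ordering for those four
// bytes. Equality tests (eq / neq) do not need the byte swap, which is why
// wide types below test for equality first and only fall back to the more
// expensive cmp_neq when the values differ.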

///////////////////////////////////////////////////////////////////////////////
// Memcmp implementation
//
// When building memcmp, not all types are considered equal.
//
// For instance, the lexicographic comparison of two uint8_t can be implemented
// as a simple subtraction, but for wider operations the logic can be much more
// involved, especially on little endian platforms.
//
// For such wider types it is a good strategy to test for equality first and
// only do the expensive lexicographic comparison if necessary.
//
// Decomposing the algorithm like this for wider types allows us to have
// efficient implementations of higher order functions like 'head_tail' or
// 'loop_and_tail'.
///////////////////////////////////////////////////////////////////////////////

// Type traits to decide whether we can use 'cmp' directly or if we need to
// split the computation.
template <typename T> struct cmp_is_expensive;

template <typename T> struct Memcmp {
  static_assert(is_element_type_v<T>);
  static constexpr size_t SIZE = sizeof(T);

private:
  LIBC_INLINE static MemcmpReturnType block_offset(CPtr p1, CPtr p2,
                                                   size_t offset) {
    if constexpr (cmp_is_expensive<T>::value) {
      if (!eq<T>(p1, p2, offset))
        return cmp_neq<T>(p1, p2, offset);
      return MemcmpReturnType::zero();
    } else {
      return cmp<T>(p1, p2, offset);
    }
  }

public:
  LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
    return block_offset(p1, p2, 0);
  }

  LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block_offset(p1, p2, count - SIZE);
  }

  LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
                                                size_t count) {
    if constexpr (cmp_is_expensive<T>::value) {
      if (!eq<T>(p1, p2, 0))
        return cmp_neq<T>(p1, p2, 0);
    } else {
      if (const auto value = cmp<T>(p1, p2, 0))
        return value;
    }
    return tail(p1, p2, count);
  }

  LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                    size_t count) {
    return loop_and_tail_offset(p1, p2, count, 0);
  }

  LIBC_INLINE static MemcmpReturnType
  loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) {
    if constexpr (SIZE > 1) {
      const size_t limit = count - SIZE;
      LIBC_LOOP_NOUNROLL
      for (; offset < limit; offset += SIZE) {
        if constexpr (cmp_is_expensive<T>::value) {
          if (!eq<T>(p1, p2, offset))
            return cmp_neq<T>(p1, p2, offset);
        } else {
          if (const auto value = cmp<T>(p1, p2, offset))
            return value;
        }
      }
      return block_offset(p1, p2, limit); // tail
    } else {
      // No need for a tail operation when SIZE == 1.
      LIBC_LOOP_NOUNROLL
      for (; offset < count; offset += SIZE)
        if (auto value = cmp<T>(p1, p2, offset))
          return value;
      return MemcmpReturnType::zero();
    }
  }

  LIBC_INLINE static MemcmpReturnType
  loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) {
    const AlignHelper<sizeof(T)> helper(p1);
    if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) {
      if (auto value = block(p1, p2))
        return value;
      adjust(helper.offset, p1, p2, count);
    }
    return loop_and_tail(p1, p2, count);
  }
};

template <typename T, typename... TS> struct MemcmpSequence {
  static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
  LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
    // TODO: test suggestion in
    // https://reviews.llvm.org/D148717?id=515724#inline-1446890
    // once we have a proper way to check memory operation latency.
    if constexpr (cmp_is_expensive<T>::value) {
      if (!eq<T>(p1, p2, 0))
        return cmp_neq<T>(p1, p2, 0);
    } else {
      if (auto value = cmp<T>(p1, p2, 0))
        return value;
    }
    if constexpr (sizeof...(TS) > 0)
      return MemcmpSequence<TS...>::block(p1 + sizeof(T), p2 + sizeof(T));
    else
      return MemcmpReturnType::zero();
  }
};
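
// Example (illustrative only): fixed sizes that do not match a single scalar
// type can be composed from a sequence of elements compared in order, e.g. a
// 3-byte block as a uint16_t followed by a uint8_t:
static_assert(MemcmpSequence<uint16_t, uint8_t>::SIZE == 3);
// MemcmpSequence<uint16_t, uint8_t>::block(p1, p2) then compares the first two
// bytes and, only if they are equal, the remaining byte.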

///////////////////////////////////////////////////////////////////////////////
// Bcmp
///////////////////////////////////////////////////////////////////////////////
template <typename T> struct Bcmp {
  static_assert(is_element_type_v<T>);
  static constexpr size_t SIZE = sizeof(T);

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    return neq<T>(p1, p2, 0);
  }

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    const size_t tail_offset = count - SIZE;
    return neq<T>(p1, p2, tail_offset);
  }

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if (const auto value = neq<T>(p1, p2, 0))
      return value;
    return tail(p1, p2, count);
  }

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    return loop_and_tail_offset(p1, p2, count, 0);
  }

  LIBC_INLINE static BcmpReturnType
  loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) {
    if constexpr (SIZE > 1) {
      const size_t limit = count - SIZE;
      LIBC_LOOP_NOUNROLL
      for (; offset < limit; offset += SIZE)
        if (const auto value = neq<T>(p1, p2, offset))
          return value;
      return tail(p1, p2, count);
    } else {
      // No need for a tail operation when SIZE == 1.
      LIBC_LOOP_NOUNROLL
      for (; offset < count; offset += SIZE)
        if (const auto value = neq<T>(p1, p2, offset))
          return value;
      return BcmpReturnType::zero();
    }
  }

  LIBC_INLINE static BcmpReturnType
  loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) {
    static_assert(SIZE > 1,
                  "No need to align when processing one byte at a time");
    const AlignHelper<sizeof(T)> helper(p1);
    if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) {
      if (auto value = block(p1, p2))
        return value;
      adjust(helper.offset, p1, p2, count);
    }
    return loop_and_tail(p1, p2, count);
  }
};

template <typename T, typename... TS> struct BcmpSequence {
  static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if (auto value = neq<T>(p1, p2, 0))
      return value;
    if constexpr (sizeof...(TS) > 0)
      return BcmpSequence<TS...>::block(p1 + sizeof(T), p2 + sizeof(T));
    else
      return BcmpReturnType::zero();
  }
};

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8_t
template <> struct cmp_is_expensive<uint8_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint8_t>(p1, offset) == load<uint8_t>(p2, offset);
}
template <> LIBC_INLINE uint32_t neq<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint8_t>(p1, offset) ^ load<uint8_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load<uint8_t>(p1, offset)) -
         static_cast<int32_t>(load<uint8_t>(p2, offset));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint8_t>(CPtr p1, CPtr p2, size_t offset);

} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H