//===-- Memory utils --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H

#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/cstddef.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/endian_internal.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t / INT32_MAX / INT32_MIN

namespace LIBC_NAMESPACE_DECL {

// Returns the number of bytes to subtract from ptr to get to the previous
// multiple of alignment. If ptr is already aligned, returns 0.
template <size_t alignment>
LIBC_INLINE uintptr_t distance_to_align_down(const void *ptr) {
  static_assert(cpp::has_single_bit(alignment),
                "alignment must be a power of 2");
  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned, returns 0.
template <size_t alignment>
LIBC_INLINE uintptr_t distance_to_align_up(const void *ptr) {
  static_assert(cpp::has_single_bit(alignment),
                "alignment must be a power of 2");
  // The logic is not straightforward and involves unsigned modulo arithmetic
  // but the generated code is as fast as it can be.
  return -reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned, returns alignment.
template <size_t alignment>
LIBC_INLINE uintptr_t distance_to_next_aligned(const void *ptr) {
  return alignment - distance_to_align_down<alignment>(ptr);
}

// Returns the same pointer but notifies the compiler that it is aligned.
template <size_t alignment, typename T> LIBC_INLINE T *assume_aligned(T *ptr) {
  return reinterpret_cast<T *>(__builtin_assume_aligned(ptr, alignment));
}
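
// Worked example (illustrative, not part of the original header): assuming a
// pointer whose address is 0x1005 and alignment == 8:
//   distance_to_align_down<8>(ptr)   == 0x1005 & 7  == 5  // previous multiple is 0x1000
//   distance_to_align_up<8>(ptr)     == -0x1005 & 7 == 3  // next multiple is 0x1008
//   distance_to_next_aligned<8>(ptr) == 8 - 5       == 3  // returns 8 when already aligned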

// Returns true iff memory regions [p1, p1 + size) and [p2, p2 + size) are
// disjoint.
LIBC_INLINE bool is_disjoint(const void *p1, const void *p2, size_t size) {
  const ptrdiff_t sdiff =
      static_cast<const char *>(p1) - static_cast<const char *>(p2);
  // We use bit_cast to make sure that we don't run into accidental integer
  // promotion. Notably the unary minus operator goes through integer promotion
  // at the expression level. We assume arithmetic to be two's complement
  // (i.e., bit_cast has the same behavior as a regular signed to unsigned
  // cast).
  static_assert(-1 == ~0, "not 2's complement");
  const size_t udiff = cpp::bit_cast<size_t>(sdiff);
  // Integer promotion would be caught here.
  const size_t neg_udiff = cpp::bit_cast<size_t>(-sdiff);
  // This is expected to compile to a conditional move.
  return sdiff >= 0 ? size <= udiff : size <= neg_udiff;
}

#if __has_builtin(__builtin_memcpy_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif

#if __has_builtin(__builtin_memset_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
#endif

// Performs a constant-count copy.
template <size_t Size>
LIBC_INLINE void memcpy_inline(void *__restrict dst,
                               const void *__restrict src) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
  __builtin_memcpy_inline(dst, src, Size);
#else
  // In memory functions `memcpy_inline` is instantiated several times with
  // different values of the Size parameter. This doesn't play well with GCC's
  // Value Range Analysis that wrongly detects out of bounds accesses. We
  // disable these warnings for the purpose of this function.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#pragma GCC diagnostic ignored "-Wstringop-overread"
#pragma GCC diagnostic ignored "-Wstringop-overflow"
  for (size_t i = 0; i < Size; ++i)
    static_cast<char *>(dst)[i] = static_cast<const char *>(src)[i];
#pragma GCC diagnostic pop
#endif
}

using Ptr = cpp::byte *;        // Pointer to raw data.
using CPtr = const cpp::byte *; // Const pointer to raw data.

// This type makes sure that we don't accidentally promote an integral type to
// another one. It is only constructible from the exact T type.
template <typename T> struct StrictIntegralType {
  static_assert(cpp::is_integral_v<T>);

  // Can only be constructed from a T.
  template <typename U, cpp::enable_if_t<cpp::is_same_v<U, T>, bool> = 0>
  LIBC_INLINE StrictIntegralType(U value) : value(value) {}

  // Allows using the type in an if statement.
  LIBC_INLINE explicit operator bool() const { return value; }

  // If the type is unsigned (bcmp) we allow bitwise OR operations.
  LIBC_INLINE StrictIntegralType
  operator|(const StrictIntegralType &Rhs) const {
    static_assert(!cpp::is_signed_v<T>);
    return value | Rhs.value;
  }

  // For interaction with the C API we allow explicit conversion back to the
  // `int` type.
  LIBC_INLINE explicit operator int() const {
    // bit_cast makes sure that T and int have the same size.
    return cpp::bit_cast<int>(value);
  }

  // Helper to get the zero value.
  LIBC_INLINE static constexpr StrictIntegralType zero() { return {T(0)}; }
  LIBC_INLINE static constexpr StrictIntegralType nonzero() { return {T(1)}; }

private:
  T value;
};

using MemcmpReturnType = StrictIntegralType<int32_t>;
using BcmpReturnType = StrictIntegralType<uint32_t>;
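
// Illustrative usage (not part of the original header): the strict constructor
// only accepts the exact underlying type, which prevents accidental promotions.
//   MemcmpReturnType ok = int32_t(-1);  // compiles: U is exactly int32_t
//   MemcmpReturnType bad = uint8_t(1);  // rejected: U must be exactly int32_t
//   if (ok)                             // explicit operator bool
//     (void)static_cast<int>(ok);       // explicit conversion for the C API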

// This implements the semantics of 'memcmp': returning a negative value when
// 'a' is less than 'b', '0' when 'a' equals 'b', and a positive value
// otherwise.
LIBC_INLINE MemcmpReturnType cmp_uint32_t(uint32_t a, uint32_t b) {
  // We perform the difference as an int64_t.
  const int64_t diff = static_cast<int64_t>(a) - static_cast<int64_t>(b);
  // For the int64_t to int32_t conversion we want the following properties:
  // - int32_t[31:31] == 1 iff diff < 0
  // - int32_t[31:0] == 0 iff diff == 0

  // We also observe that:
  // - When diff < 0: diff[63:32] == 0xffffffff and diff[31:0] != 0
  // - When diff > 0: diff[63:32] == 0 and diff[31:0] != 0
  // - When diff == 0: diff[63:32] == 0 and diff[31:0] == 0
  // - https://godbolt.org/z/8W7qWP6e5
  // - This implies that we need only look at diff[32:32] to determine the
  //   sign bit for the returned int32_t.

  // So, we do the following:
  // - int32_t[31:31] = diff[32:32]
  // - int32_t[30:0] = diff[31:0] == 0 ? 0 : non-0.

  // And, we can achieve the above by the expression below. We could have also
  // used (diff >> 1) | (diff & 0x1) but (diff & 0xFFFF) is faster than
  // (diff & 0x1). https://godbolt.org/z/j3b569rW1
  return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
}
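
// Worked example (illustrative, not part of the original header):
//   a == 1, b == 2: diff == -1, (diff >> 1) | (diff & 0xFFFF) == -1, the
//     truncated int32_t is negative.
//   a == 2, b == 1: diff == 1, (1 >> 1) | (1 & 0xFFFF) == 1, positive.
//   a == b:         diff == 0, result is 0.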

// Returns a negative value if 'a' is less than 'b' and a positive value
// otherwise. This implements the semantics of 'memcmp' when we know that 'a'
// and 'b' differ.
LIBC_INLINE MemcmpReturnType cmp_neq_uint64_t(uint64_t a, uint64_t b) {
#if defined(LIBC_TARGET_ARCH_IS_X86)
  // On x86, the best strategy would be to use 'INT32_MAX' and 'INT32_MIN' for
  // positive and negative values respectively as they are one value apart:
  //   xor eax, eax         <- free
  //   cmp rdi, rsi         <- serializing
  //   adc eax, 2147483647  <- serializing

  // Unfortunately we found instances of client code that negate the result of
  // 'memcmp' to reverse ordering. Because signed integers are not symmetric
  // (e.g., int8_t ∈ [-128, 127]) returning 'INT_MIN' would break such code as
  // `-INT_MIN` is not representable as an int32_t.

  // As a consequence, we use 5 and -5, which is still fine in terms of
  // latency:
  //   cmp rdi, rsi    <- serializing
  //   mov ecx, -5     <- can be done in parallel
  //   mov eax, 5      <- can be done in parallel
  //   cmovb eax, ecx  <- serializing
  static constexpr int32_t POSITIVE = 5;
  static constexpr int32_t NEGATIVE = -5;
#else
  // On RISC-V we simply use '1' and '-1' as it leads to branchless code.
  // On ARMv8, both strategies lead to the same performance.
  static constexpr int32_t POSITIVE = 1;
  static constexpr int32_t NEGATIVE = -1;
#endif
  static_assert(POSITIVE > 0);
  static_assert(NEGATIVE < 0);
  return a < b ? NEGATIVE : POSITIVE;
}

// Loads bytes from memory (possibly unaligned) and materializes them as a
// value of type T.
template <typename T> LIBC_INLINE T load(CPtr ptr) {
  T out;
  memcpy_inline<sizeof(T)>(&out, ptr);
  return out;
}

// Stores a value of type T in memory (possibly unaligned).
template <typename T> LIBC_INLINE void store(Ptr ptr, T value) {
  memcpy_inline<sizeof(T)>(ptr, &value);
}

// On architectures that do not allow for unaligned access we perform several
// aligned accesses and recombine them through shifts and logical operations.
// For instance, if we know that the pointer is 2-byte aligned we can decompose
// a 64-bit operation into four 16-bit operations.

// Loads a 'ValueType' by decomposing it into several loads that are assumed to
// be aligned.
// e.g. load_aligned<uint32_t, uint16_t, uint16_t>(ptr);
template <typename ValueType, typename T, typename... TS>
LIBC_INLINE ValueType load_aligned(CPtr src) {
  static_assert(sizeof(ValueType) >= (sizeof(T) + ... + sizeof(TS)));
  const ValueType value = load<T>(assume_aligned<sizeof(T)>(src));
  if constexpr (sizeof...(TS) > 0) {
    constexpr size_t SHIFT = sizeof(T) * 8;
    const ValueType next = load_aligned<ValueType, TS...>(src + sizeof(T));
    if constexpr (Endian::IS_LITTLE)
      return value | (next << SHIFT);
    else if constexpr (Endian::IS_BIG)
      return (value << SHIFT) | next;
    else
      static_assert(cpp::always_false<T>, "Invalid endianness");
  } else {
    return value;
  }
}

// Alias for loading a 'uint32_t'.
template <typename T, typename... TS>
LIBC_INLINE auto load32_aligned(CPtr src, size_t offset) {
  static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint32_t));
  return load_aligned<uint32_t, T, TS...>(src + offset);
}

// Alias for loading a 'uint64_t'.
template <typename T, typename... TS>
LIBC_INLINE auto load64_aligned(CPtr src, size_t offset) {
  static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint64_t));
  return load_aligned<uint64_t, T, TS...>(src + offset);
}
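
// Illustrative expansion (not part of the original header): on a little-endian
// target, load32_aligned<uint16_t, uint16_t>(src, 0) performs two aligned
// 16-bit loads and recombines them as
//   uint32_t(load<uint16_t>(src)) | (uint32_t(load<uint16_t>(src + 2)) << 16);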

// Stores a 'ValueType' by decomposing it into several stores that are assumed
// to be aligned.
// e.g. store_aligned<uint32_t, uint16_t, uint16_t>(value, ptr);
template <typename ValueType, typename T, typename... TS>
LIBC_INLINE void store_aligned(ValueType value, Ptr dst) {
  static_assert(sizeof(ValueType) >= (sizeof(T) + ... + sizeof(TS)));
  constexpr size_t SHIFT = sizeof(T) * 8;
  if constexpr (Endian::IS_LITTLE) {
    store<T>(assume_aligned<sizeof(T)>(dst), value & ~T(0));
    if constexpr (sizeof...(TS) > 0)
      store_aligned<ValueType, TS...>(value >> SHIFT, dst + sizeof(T));
  } else if constexpr (Endian::IS_BIG) {
    constexpr size_t OFFSET = (0 + ... + sizeof(TS));
    store<T>(assume_aligned<sizeof(T)>(dst + OFFSET), value & ~T(0));
    if constexpr (sizeof...(TS) > 0)
      store_aligned<ValueType, TS...>(value >> SHIFT, dst);
  } else {
    static_assert(cpp::always_false<T>, "Invalid endianness");
  }
}

// Alias for storing a 'uint32_t'.
template <typename T, typename... TS>
LIBC_INLINE void store32_aligned(uint32_t value, Ptr dst, size_t offset) {
  static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint32_t));
  store_aligned<uint32_t, T, TS...>(value, dst + offset);
}

// Alias for storing a 'uint64_t'.
template <typename T, typename... TS>
LIBC_INLINE void store64_aligned(uint64_t value, Ptr dst, size_t offset) {
  static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint64_t));
  store_aligned<uint64_t, T, TS...>(value, dst + offset);
}

// Advances the pointers p1 and p2 by offset bytes and decreases count by the
// same amount.
template <typename T1, typename T2>
LIBC_INLINE void adjust(ptrdiff_t offset, T1 *__restrict &p1,
                        T2 *__restrict &p2, size_t &count) {
  p1 += offset;
  p2 += offset;
  count -= offset;
}

// Advances p1 and p2 so that p1 gets aligned to the next SIZE-byte boundary
// and decreases count by the same amount.
// We make sure the compiler knows about the adjusted pointer alignment.
template <size_t SIZE, typename T1, typename T2>
LIBC_INLINE void align_p1_to_next_boundary(T1 *__restrict &p1,
                                           T2 *__restrict &p2, size_t &count) {
  adjust(distance_to_next_aligned<SIZE>(p1), p1, p2, count);
  p1 = assume_aligned<SIZE>(p1);
}

// Same as align_p1_to_next_boundary above but with a single pointer instead.
template <size_t SIZE, typename T>
LIBC_INLINE void align_to_next_boundary(T *&p1, size_t &count) {
  const T *dummy = p1;
  align_p1_to_next_boundary<SIZE>(p1, dummy, count);
}

// An enum class that discriminates between the first and second pointer.
enum class Arg { P1, P2, Dst = P1, Src = P2 };

// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1.
// Precondition: &p1 != &p2
template <size_t SIZE, Arg AlignOn, typename T1, typename T2>
LIBC_INLINE void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
                                        size_t &count) {
  if constexpr (AlignOn == Arg::P1)
    align_p1_to_next_boundary<SIZE>(p1, p2, count);
  else if constexpr (AlignOn == Arg::P2)
    align_p1_to_next_boundary<SIZE>(p2, p1, count); // swapping p1 and p2.
  else
    static_assert(cpp::always_false<T1>,
                  "AlignOn must be either Arg::P1 or Arg::P2");
}

template <size_t SIZE> struct AlignHelper {
  LIBC_INLINE AlignHelper(CPtr ptr)
      : offset(distance_to_next_aligned<SIZE>(ptr)) {}

  LIBC_INLINE bool not_aligned() const { return offset != SIZE; }
  uintptr_t offset;
};

LIBC_INLINE void prefetch_for_write(CPtr dst) {
  __builtin_prefetch(dst, /*write*/ 1, /*max locality*/ 3);
}

LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
  __builtin_prefetch(dst, /*read*/ 0, /*max locality*/ 3);
}

} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H