1 //===-- Portable string hash function ---------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_LIBC_SRC___SUPPORT_HASH_H 10 #define LLVM_LIBC_SRC___SUPPORT_HASH_H 11 12 #include "src/__support/CPP/bit.h" // rotl 13 #include "src/__support/CPP/limits.h" // numeric_limits 14 #include "src/__support/macros/attributes.h" // LIBC_INLINE 15 #include "src/__support/macros/config.h" 16 #include "src/__support/uint128.h" // UInt128 17 #include <stdint.h> // For uint64_t 18 19 namespace LIBC_NAMESPACE_DECL { 20 namespace internal { 21 22 // Folded multiplication. 23 // This function multiplies two 64-bit integers and xor the high and 24 // low 64-bit parts of the result. 25 LIBC_INLINE uint64_t folded_multiply(uint64_t x, uint64_t y) { 26 UInt128 p = static_cast<UInt128>(x) * static_cast<UInt128>(y); 27 uint64_t low = static_cast<uint64_t>(p); 28 uint64_t high = static_cast<uint64_t>(p >> 64); 29 return low ^ high; 30 } 31 32 // Read as little endian. 33 // Shift-and-or implementation does not give a satisfactory code on aarch64. 34 // Therefore, we use a union to read the value. 35 template <typename T> LIBC_INLINE T read_little_endian(const void *ptr) { 36 const uint8_t *bytes = static_cast<const uint8_t *>(ptr); 37 uint8_t buffer[sizeof(T)]; 38 #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ 39 // Compiler should able to optimize this as a load followed by a byte 40 // swap. On aarch64 (-mbig-endian), this compiles to the following for 41 // int: 42 // ldr w0, [x0] 43 // rev w0, w0 44 // ret 45 for (size_t i = 0; i < sizeof(T); ++i) { 46 buffer[i] = bytes[sizeof(T) - i - 1]; 47 } 48 #else 49 for (size_t i = 0; i < sizeof(T); ++i) { 50 buffer[i] = bytes[i]; 51 } 52 #endif 53 return cpp::bit_cast<T>(buffer); 54 } 55 56 // Specialized read functions for small values. size must be <= 8. 57 LIBC_INLINE void read_small_values(const void *ptr, size_t size, uint64_t &low, 58 uint64_t &high) { 59 const uint8_t *bytes = static_cast<const uint8_t *>(ptr); 60 if (size >= 2) { 61 if (size >= 4) { 62 low = static_cast<uint64_t>(read_little_endian<uint32_t>(&bytes[0])); 63 high = 64 static_cast<uint64_t>(read_little_endian<uint32_t>(&bytes[size - 4])); 65 } else { 66 low = static_cast<uint64_t>(read_little_endian<uint16_t>(&bytes[0])); 67 high = static_cast<uint64_t>(bytes[size - 1]); 68 } 69 } else { 70 if (size > 0) { 71 low = static_cast<uint64_t>(bytes[0]); 72 high = static_cast<uint64_t>(bytes[0]); 73 } else { 74 low = 0; 75 high = 0; 76 } 77 } 78 } 79 80 // This constant comes from Kunth's prng (it empirically works well). 81 LIBC_INLINE_VAR constexpr uint64_t MULTIPLE = 6364136223846793005; 82 // Rotation amount for mixing. 83 LIBC_INLINE_VAR constexpr uint64_t ROTATE = 23; 84 85 // Randomly generated values. For now, we use the same values as in aHash as 86 // they are widely tested. 87 // https://github.com/tkaitchuck/aHash/blob/9f6a2ad8b721fd28da8dc1d0b7996677b374357c/src/random_state.rs#L38 88 LIBC_INLINE_VAR constexpr uint64_t RANDOMNESS[2][4] = { 89 {0x243f6a8885a308d3, 0x13198a2e03707344, 0xa4093822299f31d0, 90 0x082efa98ec4e6c89}, 91 {0x452821e638d01377, 0xbe5466cf34e90c6c, 0xc0ac29b7c97c50dd, 92 0x3f84d5b5b5470917}, 93 }; 94 95 // This is a portable string hasher. It is not cryptographically secure. 96 // The quality of the hash is good enough to pass all tests in SMHasher. 97 // The implementation is derived from the generic routine of aHash. 98 class HashState { 99 uint64_t buffer; 100 uint64_t pad; 101 uint64_t extra_keys[2]; 102 LIBC_INLINE void update(uint64_t low, uint64_t high) { 103 uint64_t combined = 104 folded_multiply(low ^ extra_keys[0], high ^ extra_keys[1]); 105 buffer = (buffer + pad) ^ combined; 106 buffer = cpp::rotl(buffer, ROTATE); 107 } 108 LIBC_INLINE static uint64_t mix(uint64_t seed) { 109 HashState mixer{RANDOMNESS[0][0], RANDOMNESS[0][1], RANDOMNESS[0][2], 110 RANDOMNESS[0][3]}; 111 mixer.update(seed, 0); 112 return mixer.finish(); 113 } 114 115 public: 116 LIBC_INLINE constexpr HashState(uint64_t a, uint64_t b, uint64_t c, 117 uint64_t d) 118 : buffer(a), pad(b), extra_keys{c, d} {} 119 LIBC_INLINE HashState(uint64_t seed) { 120 // Mix one more round of the seed to make it stronger. 121 uint64_t mixed = mix(seed); 122 buffer = RANDOMNESS[1][0] ^ mixed; 123 pad = RANDOMNESS[1][1] ^ mixed; 124 extra_keys[0] = RANDOMNESS[1][2] ^ mixed; 125 extra_keys[1] = RANDOMNESS[1][3] ^ mixed; 126 } 127 LIBC_INLINE void update(const void *ptr, size_t size) { 128 uint8_t const *bytes = static_cast<const uint8_t *>(ptr); 129 buffer = (buffer + size) * MULTIPLE; 130 uint64_t low, high; 131 if (size > 8) { 132 if (size > 16) { 133 // update tail 134 low = read_little_endian<uint64_t>(&bytes[size - 16]); 135 high = read_little_endian<uint64_t>(&bytes[size - 8]); 136 update(low, high); 137 while (size > 16) { 138 low = read_little_endian<uint64_t>(&bytes[0]); 139 high = read_little_endian<uint64_t>(&bytes[8]); 140 update(low, high); 141 bytes += 16; 142 size -= 16; 143 } 144 } else { 145 low = read_little_endian<uint64_t>(&bytes[0]); 146 high = read_little_endian<uint64_t>(&bytes[size - 8]); 147 update(low, high); 148 } 149 } else { 150 read_small_values(ptr, size, low, high); 151 update(low, high); 152 } 153 } 154 LIBC_INLINE uint64_t finish() { 155 int rot = buffer & 63; 156 uint64_t folded = folded_multiply(buffer, pad); 157 return cpp::rotl(folded, rot); 158 } 159 }; 160 161 } // namespace internal 162 } // namespace LIBC_NAMESPACE_DECL 163 164 #endif // LLVM_LIBC_SRC___SUPPORT_HASH_H 165