//===-- Resizable Monotonic HashTable --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H
#define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H

#include "include/llvm-libc-types/ENTRY.h"
#include "src/__support/CPP/bit.h" // bit_ceil
#include "src/__support/CPP/new.h"
#include "src/__support/HashTable/bitmask.h"
#include "src/__support/hash.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h"
#include "src/__support/memory_size.h"
#include "src/string/memset.h"
#include "src/string/strcmp.h"
#include "src/string/strlen.h"
#include <stddef.h>
#include <stdint.h>

namespace LIBC_NAMESPACE_DECL {
namespace internal {

LIBC_INLINE uint8_t secondary_hash(uint64_t hash) {
  // top 7 bits of the hash.
  return static_cast<uint8_t>(hash >> 57);
}

// Probe sequence based on triangular numbers, which is guaranteed (since our
// table size is a power of two) to visit every group of elements exactly once.
//
// A triangular probe has us jump by 1 more group every time. So first we
// jump by 1 group (meaning we just continue our linear scan), then 2 groups
// (skipping over 1 group), then 3 groups (skipping over 2 groups), and so on.
//
// If we set sizeof(Group) to be one unit:
//    T[k] = sum {1 + 2 + ... + k} = k * (k + 1) / 2
// It is provable that T[k] mod 2^m generates a permutation of
//    0, 1, 2, 3, ..., 2^m - 2, 2^m - 1
// For example, with 4 groups the first four probe offsets are 0, 1, 3, 2
// (mod 4), covering every group exactly once.
// Detailed proof is available at:
// https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
struct ProbeSequence {
  size_t position;
  size_t stride;
  size_t entries_mask;

  LIBC_INLINE size_t next() {
    position += stride;
    position &= entries_mask;
    stride += sizeof(Group);
    return position;
  }
};

// The number of entries is at least the group width: we do not
// need to do the fixup when we set the control bytes.
// The number of entries is at least 8: we don't have to worry
// about special sizes when checking the fullness of the table.
LIBC_INLINE size_t capacity_to_entries(size_t cap) {
  if (8 >= sizeof(Group) && cap < 8)
    return 8;
  if (16 >= sizeof(Group) && cap < 15)
    return 16;
  if (cap < sizeof(Group))
    cap = sizeof(Group);
  // overflow is always checked in allocate()
  return cpp::bit_ceil(cap * 8 / 7);
}

// The heap memory layout for a HashTable with N buckets is as follows:
//
//   ============================
//   |        N * Entry         |
//   ============================ <- align boundary
//   |          Header          |
//   ============================ <- align boundary (for fast resize)
//   | (N + Group Width) * Byte |
//   ============================
//
// The trailing group part is to make sure we can always load
// a whole group of control bytes.

struct HashTable {
  HashState state;
  size_t entries_mask;    // number of buckets - 1
  size_t available_slots; // less than capacity
private:
  // How many entries are there in the table.
  LIBC_INLINE size_t num_of_entries() const { return entries_mask + 1; }
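
  // Worked example (illustrative; assumes a 16-byte Group): a requested
  // capacity of 20 maps to capacity_to_entries(20) == bit_ceil(20 * 8 / 7)
  // == bit_ceil(22) == 32 buckets, and full_capacity() == 32 / 8 * 7 == 28,
  // so up to 28 entries can be stored before the table has to grow.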

  // How many entries can we store in the table before resizing.
  LIBC_INLINE size_t full_capacity() const { return num_of_entries() / 8 * 7; }

  // The alignment of the whole memory area is the maximum of the alignment
  // among the following types:
  // - HashTable
  // - ENTRY
  // - Group
  LIBC_INLINE constexpr static size_t table_alignment() {
    size_t left_align = alignof(HashTable) > alignof(ENTRY) ? alignof(HashTable)
                                                            : alignof(ENTRY);
    return left_align > alignof(Group) ? left_align : alignof(Group);
  }

  LIBC_INLINE bool is_full() const { return available_slots == 0; }

  LIBC_INLINE size_t offset_from_entries() const {
    size_t entries_size = num_of_entries() * sizeof(ENTRY);
    return entries_size +
           SafeMemSize::offset_to(entries_size, table_alignment());
  }

  LIBC_INLINE constexpr static size_t offset_to_groups() {
    size_t header_size = sizeof(HashTable);
    return header_size +
           SafeMemSize::offset_to(header_size, table_alignment());
  }

  LIBC_INLINE ENTRY &entry(size_t i) {
    return reinterpret_cast<ENTRY *>(this)[-i - 1];
  }

  LIBC_INLINE const ENTRY &entry(size_t i) const {
    return reinterpret_cast<const ENTRY *>(this)[-i - 1];
  }

  LIBC_INLINE uint8_t &control(size_t i) {
    uint8_t *ptr = reinterpret_cast<uint8_t *>(this) + offset_to_groups();
    return ptr[i];
  }

  LIBC_INLINE const uint8_t &control(size_t i) const {
    const uint8_t *ptr =
        reinterpret_cast<const uint8_t *>(this) + offset_to_groups();
    return ptr[i];
  }

  // We duplicate a group of control bytes to the end. Thus, it is possible
  // that we need to set two control bytes at the same time.
  LIBC_INLINE void set_ctrl(size_t index, uint8_t value) {
    size_t index2 = ((index - sizeof(Group)) & entries_mask) + sizeof(Group);
    control(index) = value;
    control(index2) = value;
  }

  LIBC_INLINE size_t find(const char *key, uint64_t primary) {
    uint8_t secondary = secondary_hash(primary);
    ProbeSequence sequence{static_cast<size_t>(primary), 0, entries_mask};
    while (true) {
      size_t pos = sequence.next();
      Group ctrls = Group::load(&control(pos));
      IteratableBitMask masks = ctrls.match_byte(secondary);
      for (size_t i : masks) {
        size_t index = (pos + i) & entries_mask;
        ENTRY &entry = this->entry(index);
        if (LIBC_LIKELY(entry.key != nullptr && strcmp(entry.key, key) == 0))
          return index;
      }
      BitMask available = ctrls.mask_available();
      // Since there is no deletion, the first time we find an available slot
      // it is also ready to be used as an insertion point. Therefore, we also
      // return the first available slot we find. If such an entry is empty,
      // its key will be nullptr.
      if (LIBC_LIKELY(available.any_bit_set())) {
        size_t index =
            (pos + available.lowest_set_bit_nonzero()) & entries_mask;
        return index;
      }
    }
  }

  LIBC_INLINE uint64_t oneshot_hash(const char *key) const {
    LIBC_NAMESPACE::internal::HashState hasher = state;
    hasher.update(key, strlen(key));
    return hasher.finish();
  }
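
  // Illustrative example of the two-level hashing scheme (the hash value is
  // made up): if oneshot_hash(key) returns 0x8F00'0000'0000'0021, the
  // secondary hash is its top 7 bits, 0x47, and probing starts at control
  // offset (primary & entries_mask). Each probed group is scanned for control
  // bytes equal to 0x47 before falling back to the first available slot.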

  // A fast insertion routine that checks neither whether the key already
  // exists nor whether the table is full.
  // This is only to be used in grow() where we insert all existing entries
  // into a new table. Hence, the requirements are naturally satisfied.
  LIBC_INLINE ENTRY *unsafe_insert(ENTRY item) {
    uint64_t primary = oneshot_hash(item.key);
    uint8_t secondary = secondary_hash(primary);
    ProbeSequence sequence{static_cast<size_t>(primary), 0, entries_mask};
    while (true) {
      size_t pos = sequence.next();
      Group ctrls = Group::load(&control(pos));
      BitMask available = ctrls.mask_available();
      if (available.any_bit_set()) {
        size_t index =
            (pos + available.lowest_set_bit_nonzero()) & entries_mask;
        set_ctrl(index, secondary);
        entry(index).key = item.key;
        entry(index).data = item.data;
        available_slots--;
        return &entry(index);
      }
    }
  }

  LIBC_INLINE HashTable *grow() const {
    size_t hint = full_capacity() + 1;
    HashState state = this->state;
    // migrate to a new random state
    state.update(&hint, sizeof(hint));
    HashTable *new_table = allocate(hint, state.finish());
    // It is safe to call unsafe_insert() because we know that:
    // - the new table has enough capacity to hold all the entries
    // - there is no duplicate key in the old table
    if (new_table != nullptr)
      for (ENTRY e : *this)
        new_table->unsafe_insert(e);
    return new_table;
  }
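
  // Illustrative growth step (assuming a table that currently has 32
  // buckets): full_capacity() == 28, so grow() requests capacity 29, and
  // capacity_to_entries(29) == bit_ceil(29 * 8 / 7) == bit_ceil(33) == 64,
  // i.e. a successful grow doubles the number of buckets.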

  LIBC_INLINE static ENTRY *insert(HashTable *&table, ENTRY item,
                                   uint64_t primary) {
    auto index = table->find(item.key, primary);
    auto slot = &table->entry(index);
    // SVr4 and POSIX.1-2001 specify that action is significant only for
    // unsuccessful searches, so that an ENTER should not do anything
    // for a successful search.
    if (slot->key != nullptr)
      return slot;

    // if the table is full, we try to grow the table
    if (table->is_full()) {
      HashTable *new_table = table->grow();
      // allocation failed, return nullptr to indicate failure
      if (new_table == nullptr)
        return nullptr;
      // resized successfully: clean up the old table and use the new one
      deallocate(table);
      table = new_table;
      // it is still valid to use the fastpath insertion.
      return table->unsafe_insert(item);
    }

    table->set_ctrl(index, secondary_hash(primary));
    slot->key = item.key;
    slot->data = item.data;
    table->available_slots--;
    return slot;
  }

public:
  LIBC_INLINE static void deallocate(HashTable *table) {
    if (table) {
      void *ptr =
          reinterpret_cast<uint8_t *>(table) - table->offset_from_entries();
      operator delete(ptr, std::align_val_t{table_alignment()});
    }
  }

  LIBC_INLINE static HashTable *allocate(size_t capacity, uint64_t randomness) {
    // check if capacity_to_entries overflows MAX_MEM_SIZE
    if (capacity > size_t{1} << (8 * sizeof(size_t) - 1 - 3))
      return nullptr;
    SafeMemSize entries{capacity_to_entries(capacity)};
    SafeMemSize entries_size = entries * SafeMemSize{sizeof(ENTRY)};
    SafeMemSize align_boundary = entries_size.align_up(table_alignment());
    SafeMemSize ctrl_sizes = entries + SafeMemSize{sizeof(Group)};
    SafeMemSize header_size{offset_to_groups()};
    SafeMemSize total_size =
        (align_boundary + header_size + ctrl_sizes).align_up(table_alignment());
    if (!total_size.valid())
      return nullptr;
    AllocChecker ac;

    void *mem = operator new(total_size, std::align_val_t{table_alignment()},
                             ac);

    HashTable *table = reinterpret_cast<HashTable *>(
        static_cast<uint8_t *>(mem) + align_boundary);
    if (ac) {
      table->entries_mask = entries - 1u;
      table->available_slots = entries / 8 * 7;
      table->state = HashState{randomness};
      memset(&table->control(0), 0x80, ctrl_sizes);
      memset(mem, 0, table->offset_from_entries());
    }
    return table;
  }
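
  // Minimal usage sketch of the public interface (illustrative only; `key`,
  // `value` and `seed` are hypothetical caller-provided data, with the seed
  // normally drawn from an entropy source):
  //
  //   HashTable *table = HashTable::allocate(/*capacity=*/16, seed);
  //   if (table != nullptr) {
  //     HashTable::insert(table, ENTRY{key, value}); // may replace `table`
  //     ENTRY *found = table->find(key);             // nullptr if absent
  //     HashTable::deallocate(table);
  //   }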

  struct FullTableIterator {
    size_t current_offset;
    size_t remaining;
    IteratableBitMask current_mask;
    const HashTable &table;

    // It is fine to use remaining to represent the iterator:
    // - this comparison only happens with the same table
    // - hashtable will not be mutated during the iteration
    LIBC_INLINE bool operator==(const FullTableIterator &other) const {
      return remaining == other.remaining;
    }
    LIBC_INLINE bool operator!=(const FullTableIterator &other) const {
      return remaining != other.remaining;
    }

    LIBC_INLINE FullTableIterator &operator++() {
      this->ensure_valid_group();
      current_mask.remove_lowest_bit();
      remaining--;
      return *this;
    }
    LIBC_INLINE const ENTRY &operator*() {
      this->ensure_valid_group();
      return table.entry(
          (current_offset + current_mask.lowest_set_bit_nonzero()) &
          table.entries_mask);
    }

  private:
    LIBC_INLINE void ensure_valid_group() {
      while (!current_mask.any_bit_set()) {
        current_offset += sizeof(Group);
        // It is ensured that the load will only happen at aligned boundaries.
        current_mask =
            Group::load_aligned(&table.control(current_offset)).occupied();
      }
    }
  };

  using value_type = ENTRY;
  using iterator = FullTableIterator;
  iterator begin() const {
    return {0, full_capacity() - available_slots,
            Group::load_aligned(&control(0)).occupied(), *this};
  }
  iterator end() const { return {0, 0, {BitMask{0}}, *this}; }

  LIBC_INLINE ENTRY *find(const char *key) {
    uint64_t primary = oneshot_hash(key);
    ENTRY &entry = this->entry(find(key, primary));
    if (entry.key == nullptr)
      return nullptr;
    return &entry;
  }

  LIBC_INLINE static ENTRY *insert(HashTable *&table, ENTRY item) {
    uint64_t primary = table->oneshot_hash(item.key);
    return insert(table, item, primary);
  }
};
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H