1 /* $NetBSD: subr_thmap.c,v 1.15 2023/10/17 11:57:20 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2018 Mindaugas Rasiukevicius <rmind at noxt eu> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * Upstream: https://github.com/rmind/thmap/ 29 */ 30 31 /* 32 * Concurrent trie-hash map. 33 * 34 * The data structure is conceptually a radix trie on hashed keys. 35 * Keys are hashed using a 32-bit function. The root level is a special 36 * case: it is managed using the compare-and-swap (CAS) atomic operation 37 * and has a fanout of 64. The subsequent levels are constructed using 38 * intermediate nodes with a fanout of 16 (using 4 bits). As more levels 39 * are created, more blocks of the 32-bit hash value might be generated 40 * by incrementing the seed parameter of the hash function. 41 * 42 * Concurrency 43 * 44 * - READERS: Descending is simply walking through the slot values of 45 * the intermediate nodes. It is lock-free as there is no intermediate 46 * state: the slot is either empty or has a pointer to the child node. 47 * The main assumptions here are the following: 48 * 49 * i) modifications must preserve consistency with the respect to the 50 * readers i.e. the readers can only see the valid node values; 51 * 52 * ii) any invalid view must "fail" the reads, e.g. by making them 53 * re-try from the root; this is a case for deletions and is achieved 54 * using the NODE_DELETED flag. 55 * 56 * iii) the node destruction must be synchronized with the readers, 57 * e.g. by using the Epoch-based reclamation or other techniques. 58 * 59 * - WRITERS AND LOCKING: Each intermediate node has a spin-lock (which 60 * is implemented using the NODE_LOCKED bit) -- it provides mutual 61 * exclusion amongst concurrent writers. The lock order for the nodes 62 * is "bottom-up" i.e. they are locked as we ascend the trie. A key 63 * constraint here is that parent pointer never changes. 64 * 65 * - DELETES: In addition to writer's locking, the deletion keeps the 66 * intermediate nodes in a valid state and sets the NODE_DELETED flag, 67 * to indicate that the readers must re-start the walk from the root. 68 * As the levels are collapsed, NODE_DELETED gets propagated up-tree. 69 * The leaf nodes just stay as-is until they are reclaimed. 70 * 71 * - ROOT LEVEL: The root level is a special case, as it is implemented 72 * as an array (rather than intermediate node). The root-level slot can 73 * only be set using CAS and it can only be set to a valid intermediate 74 * node. The root-level slot can only be cleared when the node it points 75 * at becomes empty, is locked and marked as NODE_DELETED (this causes 76 * the insert/delete operations to re-try until the slot is set to NULL). 77 * 78 * References: 79 * 80 * W. Litwin, 1981, Trie Hashing. 81 * Proceedings of the 1981 ACM SIGMOD, p. 19-29 82 * https://dl.acm.org/citation.cfm?id=582322 83 * 84 * P. L. Lehman and S. B. Yao. 85 * Efficient locking for concurrent operations on B-trees. 86 * ACM TODS, 6(4):650-670, 1981 87 * https://www.csd.uoc.gr/~hy460/pdf/p650-lehman.pdf 88 */ 89 90 #ifdef _KERNEL 91 #include <sys/cdefs.h> 92 #include <sys/param.h> 93 #include <sys/types.h> 94 #include <sys/thmap.h> 95 #include <sys/kmem.h> 96 #include <sys/lock.h> 97 #include <sys/atomic.h> 98 #include <sys/hash.h> 99 #include <sys/cprng.h> 100 #define THMAP_RCSID(a) __KERNEL_RCSID(0, a) 101 #else 102 #include <stdio.h> 103 #include <stdlib.h> 104 #include <stdbool.h> 105 #include <stddef.h> 106 #include <inttypes.h> 107 #include <string.h> 108 #include <limits.h> 109 #define THMAP_RCSID(a) __RCSID(a) 110 111 #include "thmap.h" 112 #include "utils.h" 113 #endif 114 115 THMAP_RCSID("$NetBSD: subr_thmap.c,v 1.15 2023/10/17 11:57:20 riastradh Exp $"); 116 117 #include <crypto/blake2/blake2s.h> 118 119 /* 120 * NetBSD kernel wrappers 121 */ 122 #ifdef _KERNEL 123 #define ASSERT KASSERT 124 #define atomic_thread_fence(x) membar_release() /* only used for release order */ 125 #define atomic_compare_exchange_weak_explicit_32(p, e, n, m1, m2) \ 126 (atomic_cas_32((p), *(e), (n)) == *(e)) 127 #define atomic_compare_exchange_weak_explicit_ptr(p, e, n, m1, m2) \ 128 (atomic_cas_ptr((p), *(void **)(e), (void *)(n)) == *(void **)(e)) 129 #define atomic_exchange_explicit(o, n, m1) atomic_swap_ptr((o), (n)) 130 #define murmurhash3 murmurhash2 131 #endif 132 133 /* 134 * The root level fanout is 64 (indexed by the last 6 bits of the hash 135 * value XORed with the length). Each subsequent level, represented by 136 * intermediate nodes, has a fanout of 16 (using 4 bits). 137 * 138 * The hash function produces 32-bit values. 139 */ 140 141 #define HASHVAL_SEEDLEN (16) 142 #define HASHVAL_BITS (32) 143 #define HASHVAL_MOD (HASHVAL_BITS - 1) 144 #define HASHVAL_SHIFT (5) 145 146 #define ROOT_BITS (6) 147 #define ROOT_SIZE (1 << ROOT_BITS) 148 #define ROOT_MASK (ROOT_SIZE - 1) 149 #define ROOT_MSBITS (HASHVAL_BITS - ROOT_BITS) 150 151 #define LEVEL_BITS (4) 152 #define LEVEL_SIZE (1 << LEVEL_BITS) 153 #define LEVEL_MASK (LEVEL_SIZE - 1) 154 155 /* 156 * Instead of raw pointers, we use offsets from the base address. 157 * This accommodates the use of this data structure in shared memory, 158 * where mappings can be in different address spaces. 159 * 160 * The pointers must be aligned, since pointer tagging is used to 161 * differentiate the intermediate nodes from leaves. We reserve the 162 * least significant bit. 163 */ 164 typedef uintptr_t thmap_ptr_t; 165 typedef uintptr_t atomic_thmap_ptr_t; // C11 _Atomic 166 167 #define THMAP_NULL ((thmap_ptr_t)0) 168 169 #define THMAP_LEAF_BIT (0x1) 170 171 #define THMAP_ALIGNED_P(p) (((uintptr_t)(p) & 3) == 0) 172 #define THMAP_ALIGN(p) ((uintptr_t)(p) & ~(uintptr_t)3) 173 #define THMAP_INODE_P(p) (((uintptr_t)(p) & THMAP_LEAF_BIT) == 0) 174 175 #define THMAP_GETPTR(th, p) ((void *)((th)->baseptr + (uintptr_t)(p))) 176 #define THMAP_GETOFF(th, p) ((thmap_ptr_t)((uintptr_t)(p) - (th)->baseptr)) 177 #define THMAP_NODE(th, p) THMAP_GETPTR(th, THMAP_ALIGN(p)) 178 179 /* 180 * State field. 181 */ 182 183 #define NODE_LOCKED (1U << 31) // lock (writers) 184 #define NODE_DELETED (1U << 30) // node deleted 185 #define NODE_COUNT(s) ((s) & 0x3fffffff) // slot count mask 186 187 /* 188 * There are two types of nodes: 189 * - Intermediate nodes -- arrays pointing to another level or a leaf; 190 * - Leaves, which store a key-value pair. 191 */ 192 193 typedef struct { 194 uint32_t state; // C11 _Atomic 195 thmap_ptr_t parent; 196 atomic_thmap_ptr_t slots[LEVEL_SIZE]; 197 } thmap_inode_t; 198 199 #define THMAP_INODE_LEN sizeof(thmap_inode_t) 200 201 typedef struct { 202 thmap_ptr_t key; 203 size_t len; 204 void * val; 205 } thmap_leaf_t; 206 207 typedef struct { 208 const uint8_t * seed; // secret seed 209 unsigned rslot; // root-level slot index 210 unsigned level; // current level in the tree 211 unsigned hashidx; // current hash index (block of bits) 212 uint32_t hashval; // current hash value 213 } thmap_query_t; 214 215 union thmap_align { 216 void * p; 217 uint64_t v; 218 }; 219 220 typedef struct thmap_gc thmap_gc_t; 221 struct thmap_gc { 222 size_t len; 223 thmap_gc_t * next; 224 char data[] __aligned(sizeof(union thmap_align)); 225 }; 226 227 #define THMAP_ROOT_LEN (sizeof(thmap_ptr_t) * ROOT_SIZE) 228 229 struct thmap { 230 uintptr_t baseptr; 231 atomic_thmap_ptr_t * root; 232 unsigned flags; 233 const thmap_ops_t * ops; 234 thmap_gc_t * gc_list; // C11 _Atomic 235 uint8_t seed[HASHVAL_SEEDLEN]; 236 }; 237 238 static void stage_mem_gc(thmap_t *, uintptr_t, size_t); 239 240 /* 241 * A few low-level helper routines. 242 */ 243 244 static uintptr_t 245 alloc_wrapper(size_t len) 246 { 247 return (uintptr_t)kmem_intr_alloc(len, KM_NOSLEEP); 248 } 249 250 static void 251 free_wrapper(uintptr_t addr, size_t len) 252 { 253 kmem_intr_free((void *)addr, len); 254 } 255 256 static const thmap_ops_t thmap_default_ops = { 257 .alloc = alloc_wrapper, 258 .free = free_wrapper 259 }; 260 261 static uintptr_t 262 gc_alloc(const thmap_t *thmap, size_t len) 263 { 264 const size_t alloclen = offsetof(struct thmap_gc, data[len]); 265 const uintptr_t gcaddr = thmap->ops->alloc(alloclen); 266 267 if (!gcaddr) 268 return 0; 269 270 thmap_gc_t *const gc = THMAP_GETPTR(thmap, gcaddr); 271 gc->len = len; 272 return THMAP_GETOFF(thmap, &gc->data[0]); 273 } 274 275 static void 276 gc_free(const thmap_t *thmap, uintptr_t addr, size_t len) 277 { 278 const size_t alloclen = offsetof(struct thmap_gc, data[len]); 279 char *const ptr = THMAP_GETPTR(thmap, addr); 280 thmap_gc_t *const gc = container_of(ptr, struct thmap_gc, data[0]); 281 const uintptr_t gcaddr = THMAP_GETOFF(thmap, gc); 282 283 KASSERTMSG(gc->len == len, "thmap=%p ops=%p addr=%p len=%zu" 284 " gc=%p gc->len=%zu", 285 thmap, thmap->ops, (void *)addr, len, gc, gc->len); 286 thmap->ops->free(gcaddr, alloclen); 287 } 288 289 /* 290 * NODE LOCKING. 291 */ 292 293 static inline bool __diagused 294 node_locked_p(thmap_inode_t *node) 295 { 296 return (atomic_load_relaxed(&node->state) & NODE_LOCKED) != 0; 297 } 298 299 static void 300 lock_node(thmap_inode_t *node) 301 { 302 unsigned bcount = SPINLOCK_BACKOFF_MIN; 303 uint32_t s; 304 again: 305 s = atomic_load_relaxed(&node->state); 306 if (s & NODE_LOCKED) { 307 SPINLOCK_BACKOFF(bcount); 308 goto again; 309 } 310 /* Acquire from prior release in unlock_node.() */ 311 if (!atomic_compare_exchange_weak_explicit_32(&node->state, 312 &s, s | NODE_LOCKED, memory_order_acquire, memory_order_relaxed)) { 313 bcount = SPINLOCK_BACKOFF_MIN; 314 goto again; 315 } 316 } 317 318 static void 319 unlock_node(thmap_inode_t *node) 320 { 321 uint32_t s = atomic_load_relaxed(&node->state) & ~NODE_LOCKED; 322 323 ASSERT(node_locked_p(node)); 324 /* Release to subsequent acquire in lock_node(). */ 325 atomic_store_release(&node->state, s); 326 } 327 328 /* 329 * HASH VALUE AND KEY OPERATIONS. 330 */ 331 332 static inline uint32_t 333 hash(const uint8_t seed[static HASHVAL_SEEDLEN], const void *key, size_t len, 334 uint32_t level) 335 { 336 struct blake2s B; 337 uint32_t h; 338 339 if (level == 0) 340 return murmurhash3(key, len, 0); 341 342 /* 343 * Byte order is not significant here because this is 344 * intentionally secret and independent for each thmap. 345 * 346 * XXX We get 32 bytes of output at a time; we could march 347 * through them sequentially rather than throwing away 28 bytes 348 * and recomputing BLAKE2 each time. But the number of 349 * iterations ought to be geometric in the collision 350 * probability at each level which should be very small anyway. 351 */ 352 blake2s_init(&B, sizeof h, seed, HASHVAL_SEEDLEN); 353 blake2s_update(&B, &level, sizeof level); 354 blake2s_update(&B, key, len); 355 blake2s_final(&B, &h); 356 357 return h; 358 } 359 360 static inline void 361 hashval_init(thmap_query_t *query, const uint8_t seed[static HASHVAL_SEEDLEN], 362 const void * restrict key, size_t len) 363 { 364 const uint32_t hashval = hash(seed, key, len, 0); 365 366 query->seed = seed; 367 query->rslot = ((hashval >> ROOT_MSBITS) ^ len) & ROOT_MASK; 368 query->level = 0; 369 query->hashval = hashval; 370 query->hashidx = 0; 371 } 372 373 /* 374 * hashval_getslot: given the key, compute the hash (if not already cached) 375 * and return the offset for the current level. 376 */ 377 static unsigned 378 hashval_getslot(thmap_query_t *query, const void * restrict key, size_t len) 379 { 380 const unsigned offset = query->level * LEVEL_BITS; 381 const unsigned shift = offset & HASHVAL_MOD; 382 const unsigned i = offset >> HASHVAL_SHIFT; 383 384 if (query->hashidx != i) { 385 /* Generate a hash value for a required range. */ 386 query->hashval = hash(query->seed, key, len, i); 387 query->hashidx = i; 388 } 389 return (query->hashval >> shift) & LEVEL_MASK; 390 } 391 392 static unsigned 393 hashval_getleafslot(const thmap_t *thmap, 394 const thmap_leaf_t *leaf, unsigned level) 395 { 396 const void *key = THMAP_GETPTR(thmap, leaf->key); 397 const unsigned offset = level * LEVEL_BITS; 398 const unsigned shift = offset & HASHVAL_MOD; 399 const unsigned i = offset >> HASHVAL_SHIFT; 400 401 return (hash(thmap->seed, key, leaf->len, i) >> shift) & LEVEL_MASK; 402 } 403 404 static inline unsigned 405 hashval_getl0slot(const thmap_t *thmap, const thmap_query_t *query, 406 const thmap_leaf_t *leaf) 407 { 408 if (__predict_true(query->hashidx == 0)) { 409 return query->hashval & LEVEL_MASK; 410 } 411 return hashval_getleafslot(thmap, leaf, 0); 412 } 413 414 static bool 415 key_cmp_p(const thmap_t *thmap, const thmap_leaf_t *leaf, 416 const void * restrict key, size_t len) 417 { 418 const void *leafkey = THMAP_GETPTR(thmap, leaf->key); 419 return len == leaf->len && memcmp(key, leafkey, len) == 0; 420 } 421 422 /* 423 * INTER-NODE OPERATIONS. 424 */ 425 426 static thmap_inode_t * 427 node_create(thmap_t *thmap, thmap_inode_t *parent) 428 { 429 thmap_inode_t *node; 430 uintptr_t p; 431 432 p = gc_alloc(thmap, THMAP_INODE_LEN); 433 if (!p) { 434 return NULL; 435 } 436 node = THMAP_GETPTR(thmap, p); 437 ASSERT(THMAP_ALIGNED_P(node)); 438 439 memset(node, 0, THMAP_INODE_LEN); 440 if (parent) { 441 /* Not yet published, no need for ordering. */ 442 atomic_store_relaxed(&node->state, NODE_LOCKED); 443 node->parent = THMAP_GETOFF(thmap, parent); 444 } 445 return node; 446 } 447 448 static void 449 node_insert(thmap_inode_t *node, unsigned slot, thmap_ptr_t child) 450 { 451 ASSERT(node_locked_p(node) || node->parent == THMAP_NULL); 452 ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0); 453 ASSERT(atomic_load_relaxed(&node->slots[slot]) == THMAP_NULL); 454 455 ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) < LEVEL_SIZE); 456 457 /* 458 * If node is public already, caller is responsible for issuing 459 * release fence; if node is not public, no ordering is needed. 460 * Hence relaxed ordering. 461 */ 462 atomic_store_relaxed(&node->slots[slot], child); 463 atomic_store_relaxed(&node->state, 464 atomic_load_relaxed(&node->state) + 1); 465 } 466 467 static void 468 node_remove(thmap_inode_t *node, unsigned slot) 469 { 470 ASSERT(node_locked_p(node)); 471 ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0); 472 ASSERT(atomic_load_relaxed(&node->slots[slot]) != THMAP_NULL); 473 474 ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) > 0); 475 ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) <= LEVEL_SIZE); 476 477 /* Element will be GC-ed later; no need for ordering here. */ 478 atomic_store_relaxed(&node->slots[slot], THMAP_NULL); 479 atomic_store_relaxed(&node->state, 480 atomic_load_relaxed(&node->state) - 1); 481 } 482 483 /* 484 * LEAF OPERATIONS. 485 */ 486 487 static thmap_leaf_t * 488 leaf_create(const thmap_t *thmap, const void *key, size_t len, void *val) 489 { 490 thmap_leaf_t *leaf; 491 uintptr_t leaf_off, key_off; 492 493 leaf_off = gc_alloc(thmap, sizeof(thmap_leaf_t)); 494 if (!leaf_off) { 495 return NULL; 496 } 497 leaf = THMAP_GETPTR(thmap, leaf_off); 498 ASSERT(THMAP_ALIGNED_P(leaf)); 499 500 if ((thmap->flags & THMAP_NOCOPY) == 0) { 501 /* 502 * Copy the key. 503 */ 504 key_off = gc_alloc(thmap, len); 505 if (!key_off) { 506 gc_free(thmap, leaf_off, sizeof(thmap_leaf_t)); 507 return NULL; 508 } 509 memcpy(THMAP_GETPTR(thmap, key_off), key, len); 510 leaf->key = key_off; 511 } else { 512 /* Otherwise, we use a reference. */ 513 leaf->key = (uintptr_t)key; 514 } 515 leaf->len = len; 516 leaf->val = val; 517 return leaf; 518 } 519 520 static void 521 leaf_free(const thmap_t *thmap, thmap_leaf_t *leaf) 522 { 523 if ((thmap->flags & THMAP_NOCOPY) == 0) { 524 gc_free(thmap, leaf->key, leaf->len); 525 } 526 gc_free(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t)); 527 } 528 529 static thmap_leaf_t * 530 get_leaf(const thmap_t *thmap, thmap_inode_t *parent, unsigned slot) 531 { 532 thmap_ptr_t node; 533 534 /* Consume from prior release in thmap_put(). */ 535 node = atomic_load_consume(&parent->slots[slot]); 536 if (THMAP_INODE_P(node)) { 537 return NULL; 538 } 539 return THMAP_NODE(thmap, node); 540 } 541 542 /* 543 * ROOT OPERATIONS. 544 */ 545 546 /* 547 * root_try_put: Try to set a root pointer at query->rslot. 548 * 549 * => Implies release operation on success. 550 * => Implies no ordering on failure. 551 */ 552 static inline int 553 root_try_put(thmap_t *thmap, const thmap_query_t *query, thmap_leaf_t *leaf) 554 { 555 thmap_ptr_t expected; 556 const unsigned i = query->rslot; 557 thmap_inode_t *node; 558 thmap_ptr_t nptr; 559 unsigned slot; 560 561 /* 562 * Must pre-check first. No ordering required because we will 563 * check again before taking any actions, and start over if 564 * this changes from null. 565 */ 566 if (atomic_load_relaxed(&thmap->root[i])) { 567 return EEXIST; 568 } 569 570 /* 571 * Create an intermediate node. Since there is no parent set, 572 * it will be created unlocked and the CAS operation will 573 * release it to readers. 574 */ 575 node = node_create(thmap, NULL); 576 if (__predict_false(node == NULL)) { 577 return ENOMEM; 578 } 579 slot = hashval_getl0slot(thmap, query, leaf); 580 node_insert(node, slot, THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT); 581 nptr = THMAP_GETOFF(thmap, node); 582 again: 583 if (atomic_load_relaxed(&thmap->root[i])) { 584 gc_free(thmap, nptr, THMAP_INODE_LEN); 585 return EEXIST; 586 } 587 /* Release to subsequent consume in find_edge_node(). */ 588 expected = THMAP_NULL; 589 if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->root[i], &expected, 590 nptr, memory_order_release, memory_order_relaxed)) { 591 goto again; 592 } 593 return 0; 594 } 595 596 /* 597 * find_edge_node: given the hash, traverse the tree to find the edge node. 598 * 599 * => Returns an aligned (clean) pointer to the parent node. 600 * => Returns the slot number and sets current level. 601 */ 602 static thmap_inode_t * 603 find_edge_node(const thmap_t *thmap, thmap_query_t *query, 604 const void * restrict key, size_t len, unsigned *slot) 605 { 606 thmap_ptr_t root_slot; 607 thmap_inode_t *parent; 608 thmap_ptr_t node; 609 unsigned off; 610 611 ASSERT(query->level == 0); 612 613 /* Consume from prior release in root_try_put(). */ 614 root_slot = atomic_load_consume(&thmap->root[query->rslot]); 615 parent = THMAP_NODE(thmap, root_slot); 616 if (!parent) { 617 return NULL; 618 } 619 descend: 620 off = hashval_getslot(query, key, len); 621 /* Consume from prior release in thmap_put(). */ 622 node = atomic_load_consume(&parent->slots[off]); 623 624 /* Descend the tree until we find a leaf or empty slot. */ 625 if (node && THMAP_INODE_P(node)) { 626 parent = THMAP_NODE(thmap, node); 627 query->level++; 628 goto descend; 629 } 630 /* 631 * NODE_DELETED does not become stale until GC runs, which 632 * cannot happen while we are in the middle of an operation, 633 * hence relaxed ordering. 634 */ 635 if (atomic_load_relaxed(&parent->state) & NODE_DELETED) { 636 return NULL; 637 } 638 *slot = off; 639 return parent; 640 } 641 642 /* 643 * find_edge_node_locked: traverse the tree, like find_edge_node(), 644 * but attempt to lock the edge node. 645 * 646 * => Returns NULL if the deleted node is found. This indicates that 647 * the caller must re-try from the root, as the root slot might have 648 * changed too. 649 */ 650 static thmap_inode_t * 651 find_edge_node_locked(const thmap_t *thmap, thmap_query_t *query, 652 const void * restrict key, size_t len, unsigned *slot) 653 { 654 thmap_inode_t *node; 655 thmap_ptr_t target; 656 retry: 657 /* 658 * Find the edge node and lock it! Re-check the state since 659 * the tree might change by the time we acquire the lock. 660 */ 661 node = find_edge_node(thmap, query, key, len, slot); 662 if (!node) { 663 /* The root slot is empty -- let the caller decide. */ 664 query->level = 0; 665 return NULL; 666 } 667 lock_node(node); 668 if (__predict_false(atomic_load_relaxed(&node->state) & NODE_DELETED)) { 669 /* 670 * The node has been deleted. The tree might have a new 671 * shape now, therefore we must re-start from the root. 672 */ 673 unlock_node(node); 674 query->level = 0; 675 return NULL; 676 } 677 target = atomic_load_relaxed(&node->slots[*slot]); 678 if (__predict_false(target && THMAP_INODE_P(target))) { 679 /* 680 * The target slot has been changed and it is now an 681 * intermediate node. Re-start from the top internode. 682 */ 683 unlock_node(node); 684 query->level = 0; 685 goto retry; 686 } 687 return node; 688 } 689 690 /* 691 * thmap_get: lookup a value given the key. 692 */ 693 void * 694 thmap_get(thmap_t *thmap, const void *key, size_t len) 695 { 696 thmap_query_t query; 697 thmap_inode_t *parent; 698 thmap_leaf_t *leaf; 699 unsigned slot; 700 701 hashval_init(&query, thmap->seed, key, len); 702 parent = find_edge_node(thmap, &query, key, len, &slot); 703 if (!parent) { 704 return NULL; 705 } 706 leaf = get_leaf(thmap, parent, slot); 707 if (!leaf) { 708 return NULL; 709 } 710 if (!key_cmp_p(thmap, leaf, key, len)) { 711 return NULL; 712 } 713 return leaf->val; 714 } 715 716 /* 717 * thmap_put: insert a value given the key. 718 * 719 * => If the key is already present, return the associated value. 720 * => Otherwise, on successful insert, return the given value. 721 */ 722 void * 723 thmap_put(thmap_t *thmap, const void *key, size_t len, void *val) 724 { 725 thmap_query_t query; 726 thmap_leaf_t *leaf, *other; 727 thmap_inode_t *parent, *child; 728 unsigned slot, other_slot; 729 thmap_ptr_t target; 730 731 /* 732 * First, pre-allocate and initialize the leaf node. 733 */ 734 leaf = leaf_create(thmap, key, len, val); 735 if (__predict_false(!leaf)) { 736 return NULL; 737 } 738 hashval_init(&query, thmap->seed, key, len); 739 retry: 740 /* 741 * Try to insert into the root first, if its slot is empty. 742 */ 743 switch (root_try_put(thmap, &query, leaf)) { 744 case 0: 745 /* Success: the leaf was inserted; no locking involved. */ 746 return val; 747 case EEXIST: 748 break; 749 case ENOMEM: 750 return NULL; 751 default: 752 __unreachable(); 753 } 754 755 /* 756 * Release node via store in node_insert (*) to subsequent 757 * consume in get_leaf() or find_edge_node(). 758 */ 759 atomic_thread_fence(memory_order_release); 760 761 /* 762 * Find the edge node and the target slot. 763 */ 764 parent = find_edge_node_locked(thmap, &query, key, len, &slot); 765 if (!parent) { 766 goto retry; 767 } 768 target = atomic_load_relaxed(&parent->slots[slot]); // tagged offset 769 if (THMAP_INODE_P(target)) { 770 /* 771 * Empty slot: simply insert the new leaf. The release 772 * fence is already issued for us. 773 */ 774 target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT; 775 node_insert(parent, slot, target); /* (*) */ 776 goto out; 777 } 778 779 /* 780 * Collision or duplicate. 781 */ 782 other = THMAP_NODE(thmap, target); 783 if (key_cmp_p(thmap, other, key, len)) { 784 /* 785 * Duplicate. Free the pre-allocated leaf and 786 * return the present value. 787 */ 788 leaf_free(thmap, leaf); 789 val = other->val; 790 goto out; 791 } 792 descend: 793 /* 794 * Collision -- expand the tree. Create an intermediate node 795 * which will be locked (NODE_LOCKED) for us. At this point, 796 * we advance to the next level. 797 */ 798 child = node_create(thmap, parent); 799 if (__predict_false(!child)) { 800 leaf_free(thmap, leaf); 801 val = NULL; 802 goto out; 803 } 804 query.level++; 805 806 /* 807 * Insert the other (colliding) leaf first. The new child is 808 * not yet published, so memory order is relaxed. 809 */ 810 other_slot = hashval_getleafslot(thmap, other, query.level); 811 target = THMAP_GETOFF(thmap, other) | THMAP_LEAF_BIT; 812 node_insert(child, other_slot, target); 813 814 /* 815 * Insert the intermediate node into the parent node. 816 * It becomes the new parent for the our new leaf. 817 * 818 * Ensure that stores to the child (and leaf) reach global 819 * visibility before it gets inserted to the parent, as 820 * consumed by get_leaf() or find_edge_node(). 821 */ 822 atomic_store_release(&parent->slots[slot], THMAP_GETOFF(thmap, child)); 823 824 unlock_node(parent); 825 ASSERT(node_locked_p(child)); 826 parent = child; 827 828 /* 829 * Get the new slot and check for another collision 830 * at the next level. 831 */ 832 slot = hashval_getslot(&query, key, len); 833 if (slot == other_slot) { 834 /* Another collision -- descend and expand again. */ 835 goto descend; 836 } 837 838 /* 839 * Insert our new leaf once we expanded enough. The release 840 * fence is already issued for us. 841 */ 842 target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT; 843 node_insert(parent, slot, target); /* (*) */ 844 out: 845 unlock_node(parent); 846 return val; 847 } 848 849 /* 850 * thmap_del: remove the entry given the key. 851 */ 852 void * 853 thmap_del(thmap_t *thmap, const void *key, size_t len) 854 { 855 thmap_query_t query; 856 thmap_leaf_t *leaf; 857 thmap_inode_t *parent; 858 unsigned slot; 859 void *val; 860 861 hashval_init(&query, thmap->seed, key, len); 862 parent = find_edge_node_locked(thmap, &query, key, len, &slot); 863 if (!parent) { 864 /* Root slot empty: not found. */ 865 return NULL; 866 } 867 leaf = get_leaf(thmap, parent, slot); 868 if (!leaf || !key_cmp_p(thmap, leaf, key, len)) { 869 /* Not found. */ 870 unlock_node(parent); 871 return NULL; 872 } 873 874 /* Remove the leaf. */ 875 ASSERT(THMAP_NODE(thmap, atomic_load_relaxed(&parent->slots[slot])) 876 == leaf); 877 node_remove(parent, slot); 878 879 /* 880 * Collapse the levels if removing the last item. 881 */ 882 while (query.level && 883 NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) { 884 thmap_inode_t *node = parent; 885 886 ASSERT(atomic_load_relaxed(&node->state) == NODE_LOCKED); 887 888 /* 889 * Ascend one level up. 890 * => Mark our current parent as deleted. 891 * => Lock the parent one level up. 892 */ 893 query.level--; 894 slot = hashval_getslot(&query, key, len); 895 parent = THMAP_NODE(thmap, node->parent); 896 ASSERT(parent != NULL); 897 898 lock_node(parent); 899 ASSERT((atomic_load_relaxed(&parent->state) & NODE_DELETED) 900 == 0); 901 902 /* 903 * Lock is exclusive, so nobody else can be writing at 904 * the same time, and no need for atomic R/M/W, but 905 * readers may read without the lock and so need atomic 906 * load/store. No ordering here needed because the 907 * entry itself stays valid until GC. 908 */ 909 atomic_store_relaxed(&node->state, 910 atomic_load_relaxed(&node->state) | NODE_DELETED); 911 unlock_node(node); // memory_order_release 912 913 ASSERT(THMAP_NODE(thmap, 914 atomic_load_relaxed(&parent->slots[slot])) == node); 915 node_remove(parent, slot); 916 917 /* Stage the removed node for G/C. */ 918 stage_mem_gc(thmap, THMAP_GETOFF(thmap, node), THMAP_INODE_LEN); 919 } 920 921 /* 922 * If the top node is empty, then we need to remove it from the 923 * root level. Mark the node as deleted and clear the slot. 924 * 925 * Note: acquiring the lock on the top node effectively prevents 926 * the root slot from changing. 927 */ 928 if (NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) { 929 const unsigned rslot = query.rslot; 930 const thmap_ptr_t nptr = 931 atomic_load_relaxed(&thmap->root[rslot]); 932 933 ASSERT(query.level == 0); 934 ASSERT(parent->parent == THMAP_NULL); 935 ASSERT(THMAP_GETOFF(thmap, parent) == nptr); 936 937 /* Mark as deleted and remove from the root-level slot. */ 938 atomic_store_relaxed(&parent->state, 939 atomic_load_relaxed(&parent->state) | NODE_DELETED); 940 atomic_store_relaxed(&thmap->root[rslot], THMAP_NULL); 941 942 stage_mem_gc(thmap, nptr, THMAP_INODE_LEN); 943 } 944 unlock_node(parent); 945 946 /* 947 * Save the value and stage the leaf for G/C. 948 */ 949 val = leaf->val; 950 if ((thmap->flags & THMAP_NOCOPY) == 0) { 951 stage_mem_gc(thmap, leaf->key, leaf->len); 952 } 953 stage_mem_gc(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t)); 954 return val; 955 } 956 957 /* 958 * G/C routines. 959 */ 960 961 static void 962 stage_mem_gc(thmap_t *thmap, uintptr_t addr, size_t len) 963 { 964 char *const ptr = THMAP_GETPTR(thmap, addr); 965 thmap_gc_t *head, *gc; 966 967 gc = container_of(ptr, struct thmap_gc, data[0]); 968 KASSERTMSG(gc->len == len, 969 "thmap=%p ops=%p ptr=%p len=%zu gc=%p gc->len=%zu", 970 thmap, thmap->ops, (char *)addr, len, gc, gc->len); 971 retry: 972 head = atomic_load_relaxed(&thmap->gc_list); 973 gc->next = head; // not yet published 974 975 /* Release to subsequent acquire in thmap_stage_gc(). */ 976 if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->gc_list, &head, gc, 977 memory_order_release, memory_order_relaxed)) { 978 goto retry; 979 } 980 } 981 982 void * 983 thmap_stage_gc(thmap_t *thmap) 984 { 985 /* Acquire from prior release in stage_mem_gc(). */ 986 return atomic_exchange_explicit(&thmap->gc_list, NULL, 987 memory_order_acquire); 988 } 989 990 void 991 thmap_gc(thmap_t *thmap, void *ref) 992 { 993 thmap_gc_t *gc = ref; 994 995 while (gc) { 996 thmap_gc_t *next = gc->next; 997 gc_free(thmap, THMAP_GETOFF(thmap, &gc->data[0]), gc->len); 998 gc = next; 999 } 1000 } 1001 1002 /* 1003 * thmap_create: construct a new trie-hash map object. 1004 */ 1005 thmap_t * 1006 thmap_create(uintptr_t baseptr, const thmap_ops_t *ops, unsigned flags) 1007 { 1008 thmap_t *thmap; 1009 uintptr_t root; 1010 1011 /* 1012 * Setup the map object. 1013 */ 1014 if (!THMAP_ALIGNED_P(baseptr)) { 1015 return NULL; 1016 } 1017 thmap = kmem_zalloc(sizeof(thmap_t), KM_SLEEP); 1018 thmap->baseptr = baseptr; 1019 thmap->ops = ops ? ops : &thmap_default_ops; 1020 thmap->flags = flags; 1021 1022 if ((thmap->flags & THMAP_SETROOT) == 0) { 1023 /* Allocate the root level. */ 1024 root = gc_alloc(thmap, THMAP_ROOT_LEN); 1025 if (!root) { 1026 kmem_free(thmap, sizeof(thmap_t)); 1027 return NULL; 1028 } 1029 thmap->root = THMAP_GETPTR(thmap, root); 1030 memset(thmap->root, 0, THMAP_ROOT_LEN); 1031 } 1032 1033 cprng_strong(kern_cprng, thmap->seed, sizeof thmap->seed, 0); 1034 1035 return thmap; 1036 } 1037 1038 int 1039 thmap_setroot(thmap_t *thmap, uintptr_t root_off) 1040 { 1041 if (thmap->root) { 1042 return -1; 1043 } 1044 thmap->root = THMAP_GETPTR(thmap, root_off); 1045 return 0; 1046 } 1047 1048 uintptr_t 1049 thmap_getroot(const thmap_t *thmap) 1050 { 1051 return THMAP_GETOFF(thmap, thmap->root); 1052 } 1053 1054 void 1055 thmap_destroy(thmap_t *thmap) 1056 { 1057 uintptr_t root = THMAP_GETOFF(thmap, thmap->root); 1058 void *ref; 1059 1060 ref = thmap_stage_gc(thmap); 1061 thmap_gc(thmap, ref); 1062 1063 if ((thmap->flags & THMAP_SETROOT) == 0) { 1064 gc_free(thmap, root, THMAP_ROOT_LEN); 1065 } 1066 kmem_free(thmap, sizeof(thmap_t)); 1067 } 1068