/*	$NetBSD: subr_thmap.c,v 1.15 2023/10/17 11:57:20 riastradh Exp $	*/

/*-
 * Copyright (c) 2018 Mindaugas Rasiukevicius <rmind at noxt eu>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Upstream: https://github.com/rmind/thmap/
 */

/*
 * Concurrent trie-hash map.
 *
 * The data structure is conceptually a radix trie on hashed keys.
 * Keys are hashed using a 32-bit hash function.  The root level is a
 * special case: it is managed using the compare-and-swap (CAS) atomic
 * operation and has a fanout of 64.  The subsequent levels are constructed
 * using intermediate nodes with a fanout of 16 (using 4 bits).  As more
 * levels are created, more blocks of the 32-bit hash value might be
 * generated by incrementing the seed parameter of the hash function.
 *
 * Concurrency
 *
 * - READERS: Descending is simply walking through the slot values of
 *   the intermediate nodes.  It is lock-free as there is no intermediate
 *   state: the slot is either empty or has a pointer to the child node.
 *   The main assumptions here are the following:
 *
 *   i) modifications must preserve consistency with respect to the
 *   readers, i.e. the readers can only see valid node values;
 *
 *   ii) any invalid view must "fail" the reads, e.g. by making them
 *   re-try from the root; this is the case for deletions and is achieved
 *   using the NODE_DELETED flag.
 *
 *   iii) node destruction must be synchronized with the readers,
 *   e.g. by using epoch-based reclamation or other techniques.
 *
 * - WRITERS AND LOCKING: Each intermediate node has a spin-lock (which
 *   is implemented using the NODE_LOCKED bit) -- it provides mutual
 *   exclusion amongst concurrent writers.  The lock order for the nodes
 *   is "bottom-up", i.e. they are locked as we ascend the trie.  A key
 *   constraint here is that the parent pointer never changes.
 *
 * - DELETES: In addition to the writers' locking, the deletion keeps the
 *   intermediate nodes in a valid state and sets the NODE_DELETED flag,
 *   to indicate that the readers must re-start the walk from the root.
 *   As the levels are collapsed, NODE_DELETED gets propagated up-tree.
 *   The leaf nodes just stay as-is until they are reclaimed.
 *
 * - ROOT LEVEL: The root level is a special case, as it is implemented
 *   as an array (rather than an intermediate node).  The root-level slot
 *   can only be set using CAS and it can only be set to a valid intermediate
 *   node.  The root-level slot can only be cleared when the node it points
 *   at becomes empty, is locked and marked as NODE_DELETED (this causes
 *   the insert/delete operations to re-try until the slot is set to NULL).
 *
 * References:
 *
 *	W. Litwin, 1981, Trie Hashing.
 *	Proceedings of the 1981 ACM SIGMOD, p. 19-29
 *	https://dl.acm.org/citation.cfm?id=582322
 *
 *	P. L. Lehman and S. B. Yao.
 *	Efficient locking for concurrent operations on B-trees.
 *	ACM TODS, 6(4):650-670, 1981
 *	https://www.csd.uoc.gr/~hy460/pdf/p650-lehman.pdf
 */
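
/*
 * Usage sketch (illustrative only, not part of this file's interface
 * documentation): a writer inserts and removes entries, then stages the
 * removed memory and reclaims it once no reader can still hold references,
 * per assumption (iii) above.  The synchronization mechanism (epoch/EBR,
 * passive serialization, etc.) is the caller's choice and is only hinted
 * at here.
 *
 *	thmap_t *map = thmap_create(0, NULL, 0);
 *	(void)thmap_put(map, "key", 3, val);
 *	void *v = thmap_get(map, "key", 3);
 *	(void)thmap_del(map, "key", 3);
 *	void *ref = thmap_stage_gc(map);
 *	// ... wait until concurrent readers are done with old entries ...
 *	thmap_gc(map, ref);
 *	thmap_destroy(map);
 */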

#ifdef _KERNEL
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/thmap.h>
#include <sys/kmem.h>
#include <sys/lock.h>
#include <sys/atomic.h>
#include <sys/hash.h>
#include <sys/cprng.h>
#define	THMAP_RCSID(a)	__KERNEL_RCSID(0, a)
#else
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stddef.h>
#include <inttypes.h>
#include <string.h>
#include <limits.h>
#define	THMAP_RCSID(a)	__RCSID(a)

#include "thmap.h"
#include "utils.h"
#endif

THMAP_RCSID("$NetBSD: subr_thmap.c,v 1.15 2023/10/17 11:57:20 riastradh Exp $");

#include <crypto/blake2/blake2s.h>

/*
 * NetBSD kernel wrappers
 */
#ifdef _KERNEL
#define	ASSERT KASSERT
#define	atomic_thread_fence(x) membar_release() /* only used for release order */
#define	atomic_compare_exchange_weak_explicit_32(p, e, n, m1, m2) \
    (atomic_cas_32((p), *(e), (n)) == *(e))
#define	atomic_compare_exchange_weak_explicit_ptr(p, e, n, m1, m2) \
    (atomic_cas_ptr((p), *(void **)(e), (void *)(n)) == *(void **)(e))
#define	atomic_exchange_explicit(o, n, m1) atomic_swap_ptr((o), (n))
#define	murmurhash3 murmurhash2
#endif

/*
 * The root level fanout is 64 (indexed by the last 6 bits of the hash
 * value XORed with the length).  Each subsequent level, represented by
 * intermediate nodes, has a fanout of 16 (using 4 bits).
 *
 * The hash function produces 32-bit values.
 */

#define	HASHVAL_SEEDLEN	(16)
#define	HASHVAL_BITS	(32)
#define	HASHVAL_MOD	(HASHVAL_BITS - 1)
#define	HASHVAL_SHIFT	(5)

#define	ROOT_BITS	(6)
#define	ROOT_SIZE	(1 << ROOT_BITS)
#define	ROOT_MASK	(ROOT_SIZE - 1)
#define	ROOT_MSBITS	(HASHVAL_BITS - ROOT_BITS)

#define	LEVEL_BITS	(4)
#define	LEVEL_SIZE	(1 << LEVEL_BITS)
#define	LEVEL_MASK	(LEVEL_SIZE - 1)
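
/*
 * Worked example of the slot arithmetic above (purely illustrative; the
 * value 0xdeadbeef stands in for a computed hash and 4 for a key length):
 *
 *	root slot    = ((0xdeadbeef >> ROOT_MSBITS) ^ 4) & ROOT_MASK = 51
 *	level 0 slot = (0xdeadbeef >> 0) & LEVEL_MASK = 0xf
 *	level 1 slot = (0xdeadbeef >> 4) & LEVEL_MASK = 0xe
 *	...
 *	level 7 slot = (0xdeadbeef >> 28) & LEVEL_MASK = 0xd
 */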

/*
 * Instead of raw pointers, we use offsets from the base address.
 * This accommodates the use of this data structure in shared memory,
 * where mappings can be in different address spaces.
 *
 * The pointers must be aligned, since pointer tagging is used to
 * differentiate the intermediate nodes from leaves.  We reserve the
 * least significant bit.
 */
typedef uintptr_t thmap_ptr_t;
typedef uintptr_t atomic_thmap_ptr_t;		// C11 _Atomic

#define	THMAP_NULL		((thmap_ptr_t)0)

#define	THMAP_LEAF_BIT		(0x1)

#define	THMAP_ALIGNED_P(p)	(((uintptr_t)(p) & 3) == 0)
#define	THMAP_ALIGN(p)		((uintptr_t)(p) & ~(uintptr_t)3)
#define	THMAP_INODE_P(p)	(((uintptr_t)(p) & THMAP_LEAF_BIT) == 0)

#define	THMAP_GETPTR(th, p)	((void *)((th)->baseptr + (uintptr_t)(p)))
#define	THMAP_GETOFF(th, p)	((thmap_ptr_t)((uintptr_t)(p) - (th)->baseptr))
#define	THMAP_NODE(th, p)	THMAP_GETPTR(th, THMAP_ALIGN(p))
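
/*
 * A short sketch of how the tagging is used (this mirrors what the
 * insert and lookup paths below do; it is not an additional interface):
 *
 *	// writer: store a leaf into a slot, tagged with the leaf bit
 *	slots[i] = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT;
 *
 *	// reader: an untagged non-NULL value is an intermediate node,
 *	// otherwise mask the tag off to recover the leaf
 *	if (p && THMAP_INODE_P(p))
 *		node = THMAP_NODE(thmap, p);
 *	else if (p)
 *		leaf = THMAP_NODE(thmap, p);
 */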

/*
 * State field.
 */

#define	NODE_LOCKED	(1U << 31)		// lock (writers)
#define	NODE_DELETED	(1U << 30)		// node deleted
#define	NODE_COUNT(s)	((s) & 0x3fffffff)	// slot count mask

/*
 * There are two types of nodes:
 * - Intermediate nodes -- arrays pointing to another level or a leaf;
 * - Leaves, which store a key-value pair.
 */

typedef struct {
	uint32_t	state;			// C11 _Atomic
	thmap_ptr_t	parent;
	atomic_thmap_ptr_t slots[LEVEL_SIZE];
} thmap_inode_t;

#define	THMAP_INODE_LEN	sizeof(thmap_inode_t)

typedef struct {
	thmap_ptr_t	key;
	size_t		len;
	void *		val;
} thmap_leaf_t;

typedef struct {
	const uint8_t *	seed;		// secret seed
	unsigned	rslot;		// root-level slot index
	unsigned	level;		// current level in the tree
	unsigned	hashidx;	// current hash index (block of bits)
	uint32_t	hashval;	// current hash value
} thmap_query_t;

union thmap_align {
	void *		p;
	uint64_t	v;
};

typedef struct thmap_gc thmap_gc_t;
struct thmap_gc {
	size_t		len;
	thmap_gc_t *	next;
	char		data[] __aligned(sizeof(union thmap_align));
};

#define	THMAP_ROOT_LEN	(sizeof(thmap_ptr_t) * ROOT_SIZE)

struct thmap {
	uintptr_t	baseptr;
	atomic_thmap_ptr_t *root;
	unsigned	flags;
	const thmap_ops_t *ops;
	thmap_gc_t *	gc_list;	// C11 _Atomic
	uint8_t		seed[HASHVAL_SEEDLEN];
};

static void	stage_mem_gc(thmap_t *, uintptr_t, size_t);

/*
 * A few low-level helper routines.
 */

static uintptr_t
alloc_wrapper(size_t len)
{
	return (uintptr_t)kmem_intr_alloc(len, KM_NOSLEEP);
}

static void
free_wrapper(uintptr_t addr, size_t len)
{
	kmem_intr_free((void *)addr, len);
}

static const thmap_ops_t thmap_default_ops = {
	.alloc = alloc_wrapper,
	.free = free_wrapper
};
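
/*
 * The default ops above allocate from kmem and use direct addresses
 * (baseptr == 0).  A minimal sketch of custom ops for a shared-memory
 * use case, where the allocator hands out offsets relative to the mapped
 * region; shmpool_alloc()/shmpool_free() are hypothetical stand-ins for
 * whatever allocator the consumer provides:
 *
 *	static uintptr_t
 *	shm_alloc_wrapper(size_t len)
 *	{
 *		return shmpool_alloc(len);	// offset into the region
 *	}
 *
 *	static void
 *	shm_free_wrapper(uintptr_t off, size_t len)
 *	{
 *		shmpool_free(off, len);
 *	}
 *
 *	static const thmap_ops_t shm_ops = {
 *		.alloc = shm_alloc_wrapper,
 *		.free = shm_free_wrapper
 *	};
 *
 *	// thmap_t *map = thmap_create((uintptr_t)region_base, &shm_ops, 0);
 */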

static uintptr_t
gc_alloc(const thmap_t *thmap, size_t len)
{
	const size_t alloclen = offsetof(struct thmap_gc, data[len]);
	const uintptr_t gcaddr = thmap->ops->alloc(alloclen);

	if (!gcaddr)
		return 0;

	thmap_gc_t *const gc = THMAP_GETPTR(thmap, gcaddr);
	gc->len = len;
	return THMAP_GETOFF(thmap, &gc->data[0]);
}

static void
gc_free(const thmap_t *thmap, uintptr_t addr, size_t len)
{
	const size_t alloclen = offsetof(struct thmap_gc, data[len]);
	char *const ptr = THMAP_GETPTR(thmap, addr);
	thmap_gc_t *const gc = container_of(ptr, struct thmap_gc, data[0]);
	const uintptr_t gcaddr = THMAP_GETOFF(thmap, gc);

	KASSERTMSG(gc->len == len, "thmap=%p ops=%p addr=%p len=%zu"
	    " gc=%p gc->len=%zu",
	    thmap, thmap->ops, (void *)addr, len, gc, gc->len);
	thmap->ops->free(gcaddr, alloclen);
}

/*
 * NODE LOCKING.
 */

static inline bool __diagused
node_locked_p(thmap_inode_t *node)
{
	return (atomic_load_relaxed(&node->state) & NODE_LOCKED) != 0;
}

static void
lock_node(thmap_inode_t *node)
{
	unsigned bcount = SPINLOCK_BACKOFF_MIN;
	uint32_t s;
again:
	s = atomic_load_relaxed(&node->state);
	if (s & NODE_LOCKED) {
		SPINLOCK_BACKOFF(bcount);
		goto again;
	}
	/* Acquire from prior release in unlock_node(). */
	if (!atomic_compare_exchange_weak_explicit_32(&node->state,
	    &s, s | NODE_LOCKED, memory_order_acquire, memory_order_relaxed)) {
		bcount = SPINLOCK_BACKOFF_MIN;
		goto again;
	}
}

static void
unlock_node(thmap_inode_t *node)
{
	uint32_t s = atomic_load_relaxed(&node->state) & ~NODE_LOCKED;

	ASSERT(node_locked_p(node));
	/* Release to subsequent acquire in lock_node(). */
	atomic_store_release(&node->state, s);
}

/*
 * HASH VALUE AND KEY OPERATIONS.
 */

static inline uint32_t
hash(const uint8_t seed[static HASHVAL_SEEDLEN], const void *key, size_t len,
    uint32_t level)
{
	struct blake2s B;
	uint32_t h;

	if (level == 0)
		return murmurhash3(key, len, 0);

	/*
	 * Byte order is not significant here because this is
	 * intentionally secret and independent for each thmap.
	 *
	 * XXX We get 32 bytes of output at a time; we could march
	 * through them sequentially rather than throwing away 28 bytes
	 * and recomputing BLAKE2 each time.  But the number of
	 * iterations ought to be geometric in the collision
	 * probability at each level which should be very small anyway.
	 */
	blake2s_init(&B, sizeof h, seed, HASHVAL_SEEDLEN);
	blake2s_update(&B, &level, sizeof level);
	blake2s_update(&B, key, len);
	blake2s_final(&B, &h);

	return h;
}

static inline void
hashval_init(thmap_query_t *query, const uint8_t seed[static HASHVAL_SEEDLEN],
    const void * restrict key, size_t len)
{
	const uint32_t hashval = hash(seed, key, len, 0);

	query->seed = seed;
	query->rslot = ((hashval >> ROOT_MSBITS) ^ len) & ROOT_MASK;
	query->level = 0;
	query->hashval = hashval;
	query->hashidx = 0;
}

/*
 * hashval_getslot: given the key, compute the hash (if not already cached)
 * and return the offset for the current level.
 */
static unsigned
hashval_getslot(thmap_query_t *query, const void * restrict key, size_t len)
{
	const unsigned offset = query->level * LEVEL_BITS;
	const unsigned shift = offset & HASHVAL_MOD;
	const unsigned i = offset >> HASHVAL_SHIFT;

	if (query->hashidx != i) {
		/* Generate a hash value for a required range. */
		query->hashval = hash(query->seed, key, len, i);
		query->hashidx = i;
	}
	return (query->hashval >> shift) & LEVEL_MASK;
}
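
/*
 * To make the block progression concrete (illustrative only): with
 * LEVEL_BITS == 4 and HASHVAL_BITS == 32, trie levels 0..7 consume hash
 * block 0 (the murmur value computed in hashval_init()), levels 8..15
 * consume block 1, and so on:
 *
 *	block = (level * LEVEL_BITS) >> HASHVAL_SHIFT;
 *	slot  = (hash(seed, key, len, block) >>
 *	    ((level * LEVEL_BITS) & HASHVAL_MOD)) & LEVEL_MASK;
 *
 * Blocks other than 0 are produced by the keyed BLAKE2s in hash() above,
 * with the block index mixed in as the "level" argument.
 */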

static unsigned
hashval_getleafslot(const thmap_t *thmap,
    const thmap_leaf_t *leaf, unsigned level)
{
	const void *key = THMAP_GETPTR(thmap, leaf->key);
	const unsigned offset = level * LEVEL_BITS;
	const unsigned shift = offset & HASHVAL_MOD;
	const unsigned i = offset >> HASHVAL_SHIFT;

	return (hash(thmap->seed, key, leaf->len, i) >> shift) & LEVEL_MASK;
}

static inline unsigned
hashval_getl0slot(const thmap_t *thmap, const thmap_query_t *query,
    const thmap_leaf_t *leaf)
{
	if (__predict_true(query->hashidx == 0)) {
		return query->hashval & LEVEL_MASK;
	}
	return hashval_getleafslot(thmap, leaf, 0);
}

static bool
key_cmp_p(const thmap_t *thmap, const thmap_leaf_t *leaf,
    const void * restrict key, size_t len)
{
	const void *leafkey = THMAP_GETPTR(thmap, leaf->key);
	return len == leaf->len && memcmp(key, leafkey, len) == 0;
}

/*
 * INTER-NODE OPERATIONS.
 */

static thmap_inode_t *
node_create(thmap_t *thmap, thmap_inode_t *parent)
{
	thmap_inode_t *node;
	uintptr_t p;

	p = gc_alloc(thmap, THMAP_INODE_LEN);
	if (!p) {
		return NULL;
	}
	node = THMAP_GETPTR(thmap, p);
	ASSERT(THMAP_ALIGNED_P(node));

	memset(node, 0, THMAP_INODE_LEN);
	if (parent) {
		/* Not yet published, no need for ordering. */
		atomic_store_relaxed(&node->state, NODE_LOCKED);
		node->parent = THMAP_GETOFF(thmap, parent);
	}
	return node;
}

static void
node_insert(thmap_inode_t *node, unsigned slot, thmap_ptr_t child)
{
	ASSERT(node_locked_p(node) || node->parent == THMAP_NULL);
	ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0);
	ASSERT(atomic_load_relaxed(&node->slots[slot]) == THMAP_NULL);

	ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) < LEVEL_SIZE);

	/*
	 * If node is public already, caller is responsible for issuing
	 * release fence; if node is not public, no ordering is needed.
	 * Hence relaxed ordering.
	 */
	atomic_store_relaxed(&node->slots[slot], child);
	atomic_store_relaxed(&node->state,
	    atomic_load_relaxed(&node->state) + 1);
}

static void
node_remove(thmap_inode_t *node, unsigned slot)
{
	ASSERT(node_locked_p(node));
	ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0);
	ASSERT(atomic_load_relaxed(&node->slots[slot]) != THMAP_NULL);

	ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) > 0);
	ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) <= LEVEL_SIZE);

	/* Element will be GC-ed later; no need for ordering here. */
	atomic_store_relaxed(&node->slots[slot], THMAP_NULL);
	atomic_store_relaxed(&node->state,
	    atomic_load_relaxed(&node->state) - 1);
}

/*
 * LEAF OPERATIONS.
 */

static thmap_leaf_t *
leaf_create(const thmap_t *thmap, const void *key, size_t len, void *val)
{
	thmap_leaf_t *leaf;
	uintptr_t leaf_off, key_off;

	leaf_off = gc_alloc(thmap, sizeof(thmap_leaf_t));
	if (!leaf_off) {
		return NULL;
	}
	leaf = THMAP_GETPTR(thmap, leaf_off);
	ASSERT(THMAP_ALIGNED_P(leaf));

	if ((thmap->flags & THMAP_NOCOPY) == 0) {
		/*
		 * Copy the key.
		 */
		key_off = gc_alloc(thmap, len);
		if (!key_off) {
			gc_free(thmap, leaf_off, sizeof(thmap_leaf_t));
			return NULL;
		}
		memcpy(THMAP_GETPTR(thmap, key_off), key, len);
		leaf->key = key_off;
	} else {
		/* Otherwise, we use a reference. */
		leaf->key = (uintptr_t)key;
	}
	leaf->len = len;
	leaf->val = val;
	return leaf;
}

static void
leaf_free(const thmap_t *thmap, thmap_leaf_t *leaf)
{
	if ((thmap->flags & THMAP_NOCOPY) == 0) {
		gc_free(thmap, leaf->key, leaf->len);
	}
	gc_free(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t));
}

static thmap_leaf_t *
get_leaf(const thmap_t *thmap, thmap_inode_t *parent, unsigned slot)
{
	thmap_ptr_t node;

	/* Consume from prior release in thmap_put(). */
	node = atomic_load_consume(&parent->slots[slot]);
	if (THMAP_INODE_P(node)) {
		return NULL;
	}
	return THMAP_NODE(thmap, node);
}

/*
 * ROOT OPERATIONS.
 */

/*
 * root_try_put: Try to set a root pointer at query->rslot.
 *
 * => Implies release operation on success.
 * => Implies no ordering on failure.
 */
static inline int
root_try_put(thmap_t *thmap, const thmap_query_t *query, thmap_leaf_t *leaf)
{
	thmap_ptr_t expected;
	const unsigned i = query->rslot;
	thmap_inode_t *node;
	thmap_ptr_t nptr;
	unsigned slot;

	/*
	 * Must pre-check first.  No ordering required because we will
	 * check again before taking any actions, and start over if
	 * this changes from null.
	 */
	if (atomic_load_relaxed(&thmap->root[i])) {
		return EEXIST;
	}

	/*
	 * Create an intermediate node.  Since there is no parent set,
	 * it will be created unlocked and the CAS operation will
	 * release it to readers.
	 */
	node = node_create(thmap, NULL);
	if (__predict_false(node == NULL)) {
		return ENOMEM;
	}
	slot = hashval_getl0slot(thmap, query, leaf);
	node_insert(node, slot, THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT);
	nptr = THMAP_GETOFF(thmap, node);
again:
	if (atomic_load_relaxed(&thmap->root[i])) {
		gc_free(thmap, nptr, THMAP_INODE_LEN);
		return EEXIST;
	}
	/* Release to subsequent consume in find_edge_node(). */
	expected = THMAP_NULL;
	if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->root[i], &expected,
	    nptr, memory_order_release, memory_order_relaxed)) {
		goto again;
	}
	return 0;
}

/*
 * find_edge_node: given the hash, traverse the tree to find the edge node.
 *
 * => Returns an aligned (clean) pointer to the parent node.
 * => Returns the slot number and sets current level.
 */
static thmap_inode_t *
find_edge_node(const thmap_t *thmap, thmap_query_t *query,
    const void * restrict key, size_t len, unsigned *slot)
{
	thmap_ptr_t root_slot;
	thmap_inode_t *parent;
	thmap_ptr_t node;
	unsigned off;

	ASSERT(query->level == 0);

	/* Consume from prior release in root_try_put(). */
	root_slot = atomic_load_consume(&thmap->root[query->rslot]);
	parent = THMAP_NODE(thmap, root_slot);
	if (!parent) {
		return NULL;
	}
descend:
	off = hashval_getslot(query, key, len);
	/* Consume from prior release in thmap_put(). */
	node = atomic_load_consume(&parent->slots[off]);

	/* Descend the tree until we find a leaf or empty slot. */
	if (node && THMAP_INODE_P(node)) {
		parent = THMAP_NODE(thmap, node);
		query->level++;
		goto descend;
	}
	/*
	 * NODE_DELETED does not become stale until GC runs, which
	 * cannot happen while we are in the middle of an operation,
	 * hence relaxed ordering.
	 */
	if (atomic_load_relaxed(&parent->state) & NODE_DELETED) {
		return NULL;
	}
	*slot = off;
	return parent;
}

/*
 * find_edge_node_locked: traverse the tree, like find_edge_node(),
 * but attempt to lock the edge node.
 *
 * => Returns NULL if the deleted node is found.  This indicates that
 *    the caller must re-try from the root, as the root slot might have
 *    changed too.
 */
static thmap_inode_t *
find_edge_node_locked(const thmap_t *thmap, thmap_query_t *query,
    const void * restrict key, size_t len, unsigned *slot)
{
	thmap_inode_t *node;
	thmap_ptr_t target;
retry:
	/*
	 * Find the edge node and lock it!  Re-check the state since
	 * the tree might change by the time we acquire the lock.
	 */
	node = find_edge_node(thmap, query, key, len, slot);
	if (!node) {
		/* The root slot is empty -- let the caller decide. */
		query->level = 0;
		return NULL;
	}
	lock_node(node);
	if (__predict_false(atomic_load_relaxed(&node->state) & NODE_DELETED)) {
		/*
		 * The node has been deleted.  The tree might have a new
		 * shape now, therefore we must re-start from the root.
		 */
		unlock_node(node);
		query->level = 0;
		return NULL;
	}
	target = atomic_load_relaxed(&node->slots[*slot]);
	if (__predict_false(target && THMAP_INODE_P(target))) {
		/*
		 * The target slot has been changed and it is now an
		 * intermediate node.  Re-start from the top internode.
		 */
		unlock_node(node);
		query->level = 0;
		goto retry;
	}
	return node;
}

/*
 * thmap_get: lookup a value given the key.
 */
void *
thmap_get(thmap_t *thmap, const void *key, size_t len)
{
	thmap_query_t query;
	thmap_inode_t *parent;
	thmap_leaf_t *leaf;
	unsigned slot;

	hashval_init(&query, thmap->seed, key, len);
	parent = find_edge_node(thmap, &query, key, len, &slot);
	if (!parent) {
		return NULL;
	}
	leaf = get_leaf(thmap, parent, slot);
	if (!leaf) {
		return NULL;
	}
	if (!key_cmp_p(thmap, leaf, key, len)) {
		return NULL;
	}
	return leaf->val;
}

/*
 * thmap_put: insert a value given the key.
 *
 * => If the key is already present, return the associated value.
 * => Otherwise, on successful insert, return the given value.
 */
void *
thmap_put(thmap_t *thmap, const void *key, size_t len, void *val)
{
	thmap_query_t query;
	thmap_leaf_t *leaf, *other;
	thmap_inode_t *parent, *child;
	unsigned slot, other_slot;
	thmap_ptr_t target;

	/*
	 * First, pre-allocate and initialize the leaf node.
	 */
	leaf = leaf_create(thmap, key, len, val);
	if (__predict_false(!leaf)) {
		return NULL;
	}
	hashval_init(&query, thmap->seed, key, len);
retry:
	/*
	 * Try to insert into the root first, if its slot is empty.
	 */
	switch (root_try_put(thmap, &query, leaf)) {
	case 0:
		/* Success: the leaf was inserted; no locking involved. */
		return val;
	case EEXIST:
		break;
	case ENOMEM:
		return NULL;
	default:
		__unreachable();
	}

	/*
	 * Release node via store in node_insert (*) to subsequent
	 * consume in get_leaf() or find_edge_node().
	 */
	atomic_thread_fence(memory_order_release);

	/*
	 * Find the edge node and the target slot.
	 */
	parent = find_edge_node_locked(thmap, &query, key, len, &slot);
	if (!parent) {
		goto retry;
	}
	target = atomic_load_relaxed(&parent->slots[slot]); // tagged offset
	if (THMAP_INODE_P(target)) {
		/*
		 * Empty slot: simply insert the new leaf.  The release
		 * fence is already issued for us.
		 */
		target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT;
		node_insert(parent, slot, target); /* (*) */
		goto out;
	}

	/*
	 * Collision or duplicate.
	 */
	other = THMAP_NODE(thmap, target);
	if (key_cmp_p(thmap, other, key, len)) {
		/*
		 * Duplicate.  Free the pre-allocated leaf and
		 * return the present value.
		 */
		leaf_free(thmap, leaf);
		val = other->val;
		goto out;
	}
descend:
	/*
	 * Collision -- expand the tree.  Create an intermediate node
	 * which will be locked (NODE_LOCKED) for us.  At this point,
	 * we advance to the next level.
	 */
	child = node_create(thmap, parent);
	if (__predict_false(!child)) {
		leaf_free(thmap, leaf);
		val = NULL;
		goto out;
	}
	query.level++;

	/*
	 * Insert the other (colliding) leaf first.  The new child is
	 * not yet published, so memory order is relaxed.
	 */
	other_slot = hashval_getleafslot(thmap, other, query.level);
	target = THMAP_GETOFF(thmap, other) | THMAP_LEAF_BIT;
	node_insert(child, other_slot, target);

	/*
	 * Insert the intermediate node into the parent node.
	 * It becomes the new parent for our new leaf.
	 *
	 * Ensure that stores to the child (and leaf) reach global
	 * visibility before it gets inserted to the parent, as
	 * consumed by get_leaf() or find_edge_node().
	 */
	atomic_store_release(&parent->slots[slot], THMAP_GETOFF(thmap, child));

	unlock_node(parent);
	ASSERT(node_locked_p(child));
	parent = child;

	/*
	 * Get the new slot and check for another collision
	 * at the next level.
	 */
	slot = hashval_getslot(&query, key, len);
	if (slot == other_slot) {
		/* Another collision -- descend and expand again. */
		goto descend;
	}

	/*
	 * Insert our new leaf once we expanded enough.  The release
	 * fence is already issued for us.
	 */
	target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT;
	node_insert(parent, slot, target); /* (*) */
out:
	unlock_node(parent);
	return val;
}

/*
 * thmap_del: remove the entry given the key.
 */
void *
thmap_del(thmap_t *thmap, const void *key, size_t len)
{
	thmap_query_t query;
	thmap_leaf_t *leaf;
	thmap_inode_t *parent;
	unsigned slot;
	void *val;

	hashval_init(&query, thmap->seed, key, len);
	parent = find_edge_node_locked(thmap, &query, key, len, &slot);
	if (!parent) {
		/* Root slot empty: not found. */
		return NULL;
	}
	leaf = get_leaf(thmap, parent, slot);
	if (!leaf || !key_cmp_p(thmap, leaf, key, len)) {
		/* Not found. */
		unlock_node(parent);
		return NULL;
	}

	/* Remove the leaf. */
	ASSERT(THMAP_NODE(thmap, atomic_load_relaxed(&parent->slots[slot]))
	    == leaf);
	node_remove(parent, slot);

	/*
	 * Collapse the levels if removing the last item.
	 */
	while (query.level &&
	    NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) {
		thmap_inode_t *node = parent;

		ASSERT(atomic_load_relaxed(&node->state) == NODE_LOCKED);

		/*
		 * Ascend one level up.
		 * => Mark our current parent as deleted.
		 * => Lock the parent one level up.
		 */
		query.level--;
		slot = hashval_getslot(&query, key, len);
		parent = THMAP_NODE(thmap, node->parent);
		ASSERT(parent != NULL);

		lock_node(parent);
		ASSERT((atomic_load_relaxed(&parent->state) & NODE_DELETED)
		    == 0);

		/*
		 * Lock is exclusive, so nobody else can be writing at
		 * the same time, and no need for atomic R/M/W, but
		 * readers may read without the lock and so need atomic
		 * load/store.  No ordering here needed because the
		 * entry itself stays valid until GC.
		 */
		atomic_store_relaxed(&node->state,
		    atomic_load_relaxed(&node->state) | NODE_DELETED);
		unlock_node(node); // memory_order_release

		ASSERT(THMAP_NODE(thmap,
		    atomic_load_relaxed(&parent->slots[slot])) == node);
		node_remove(parent, slot);

		/* Stage the removed node for G/C. */
		stage_mem_gc(thmap, THMAP_GETOFF(thmap, node), THMAP_INODE_LEN);
	}

	/*
	 * If the top node is empty, then we need to remove it from the
	 * root level.  Mark the node as deleted and clear the slot.
	 *
	 * Note: acquiring the lock on the top node effectively prevents
	 * the root slot from changing.
	 */
	if (NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) {
		const unsigned rslot = query.rslot;
		const thmap_ptr_t nptr =
		    atomic_load_relaxed(&thmap->root[rslot]);

		ASSERT(query.level == 0);
		ASSERT(parent->parent == THMAP_NULL);
		ASSERT(THMAP_GETOFF(thmap, parent) == nptr);

		/* Mark as deleted and remove from the root-level slot. */
		atomic_store_relaxed(&parent->state,
		    atomic_load_relaxed(&parent->state) | NODE_DELETED);
		atomic_store_relaxed(&thmap->root[rslot], THMAP_NULL);

		stage_mem_gc(thmap, nptr, THMAP_INODE_LEN);
	}
	unlock_node(parent);

	/*
	 * Save the value and stage the leaf for G/C.
	 */
	val = leaf->val;
	if ((thmap->flags & THMAP_NOCOPY) == 0) {
		stage_mem_gc(thmap, leaf->key, leaf->len);
	}
	stage_mem_gc(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t));
	return val;
}

/*
 * G/C routines.
 */

static void
stage_mem_gc(thmap_t *thmap, uintptr_t addr, size_t len)
{
	char *const ptr = THMAP_GETPTR(thmap, addr);
	thmap_gc_t *head, *gc;

	gc = container_of(ptr, struct thmap_gc, data[0]);
	KASSERTMSG(gc->len == len,
	    "thmap=%p ops=%p ptr=%p len=%zu gc=%p gc->len=%zu",
	    thmap, thmap->ops, (char *)addr, len, gc, gc->len);
retry:
	head = atomic_load_relaxed(&thmap->gc_list);
	gc->next = head; // not yet published

	/* Release to subsequent acquire in thmap_stage_gc(). */
	if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->gc_list, &head, gc,
	    memory_order_release, memory_order_relaxed)) {
		goto retry;
	}
}

void *
thmap_stage_gc(thmap_t *thmap)
{
	/* Acquire from prior release in stage_mem_gc(). */
	return atomic_exchange_explicit(&thmap->gc_list, NULL,
	    memory_order_acquire);
}

void
thmap_gc(thmap_t *thmap, void *ref)
{
	thmap_gc_t *gc = ref;

	while (gc) {
		thmap_gc_t *next = gc->next;
		gc_free(thmap, THMAP_GETOFF(thmap, &gc->data[0]), gc->len);
		gc = next;
	}
}
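
/*
 * A minimal reclamation sketch, tying the two routines above to the
 * reader synchronization required by the header comment; the
 * wait_for_readers() call is a hypothetical stand-in for whatever the
 * consumer uses (an EBR epoch advance, pserialize_perform(), etc.):
 *
 *	void *ref = thmap_stage_gc(map);	// detach staged entries
 *	if (ref != NULL) {
 *		wait_for_readers();		// no reader can still see them
 *		thmap_gc(map, ref);		// now safe to free
 *	}
 *
 * Entries staged after the thmap_stage_gc() call simply remain on the
 * gc_list for the next round.
 */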

/*
 * thmap_create: construct a new trie-hash map object.
 */
thmap_t *
thmap_create(uintptr_t baseptr, const thmap_ops_t *ops, unsigned flags)
{
	thmap_t *thmap;
	uintptr_t root;

	/*
	 * Setup the map object.
	 */
	if (!THMAP_ALIGNED_P(baseptr)) {
		return NULL;
	}
	thmap = kmem_zalloc(sizeof(thmap_t), KM_SLEEP);
	thmap->baseptr = baseptr;
	thmap->ops = ops ? ops : &thmap_default_ops;
	thmap->flags = flags;

	if ((thmap->flags & THMAP_SETROOT) == 0) {
		/* Allocate the root level. */
		root = gc_alloc(thmap, THMAP_ROOT_LEN);
		if (!root) {
			kmem_free(thmap, sizeof(thmap_t));
			return NULL;
		}
		thmap->root = THMAP_GETPTR(thmap, root);
		memset(thmap->root, 0, THMAP_ROOT_LEN);
	}

	cprng_strong(kern_cprng, thmap->seed, sizeof thmap->seed, 0);

	return thmap;
}

int
thmap_setroot(thmap_t *thmap, uintptr_t root_off)
{
	if (thmap->root) {
		return -1;
	}
	thmap->root = THMAP_GETPTR(thmap, root_off);
	return 0;
}

uintptr_t
thmap_getroot(const thmap_t *thmap)
{
	return THMAP_GETOFF(thmap, thmap->root);
}

void
thmap_destroy(thmap_t *thmap)
{
	uintptr_t root = THMAP_GETOFF(thmap, thmap->root);
	void *ref;

	ref = thmap_stage_gc(thmap);
	thmap_gc(thmap, ref);

	if ((thmap->flags & THMAP_SETROOT) == 0) {
		gc_free(thmap, root, THMAP_ROOT_LEN);
	}
	kmem_free(thmap, sizeof(thmap_t));
}