1*ed443730Sriastradh /* $NetBSD: subr_thmap.c,v 1.15 2023/10/17 11:57:20 riastradh Exp $ */
23f448747Schristos
36577bb50Srmind /*-
46577bb50Srmind * Copyright (c) 2018 Mindaugas Rasiukevicius <rmind at noxt eu>
56577bb50Srmind * All rights reserved.
66577bb50Srmind *
76577bb50Srmind * Redistribution and use in source and binary forms, with or without
86577bb50Srmind * modification, are permitted provided that the following conditions
96577bb50Srmind * are met:
106577bb50Srmind * 1. Redistributions of source code must retain the above copyright
116577bb50Srmind * notice, this list of conditions and the following disclaimer.
126577bb50Srmind * 2. Redistributions in binary form must reproduce the above copyright
136577bb50Srmind * notice, this list of conditions and the following disclaimer in the
146577bb50Srmind * documentation and/or other materials provided with the distribution.
156577bb50Srmind *
166577bb50Srmind * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
176577bb50Srmind * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
186577bb50Srmind * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
196577bb50Srmind * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
206577bb50Srmind * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
216577bb50Srmind * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
226577bb50Srmind * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
236577bb50Srmind * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
246577bb50Srmind * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
256577bb50Srmind * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
266577bb50Srmind * SUCH DAMAGE.
276577bb50Srmind *
286577bb50Srmind * Upstream: https://github.com/rmind/thmap/
296577bb50Srmind */
306577bb50Srmind
316577bb50Srmind /*
326577bb50Srmind * Concurrent trie-hash map.
336577bb50Srmind *
346577bb50Srmind * The data structure is conceptually a radix trie on hashed keys.
356577bb50Srmind * Keys are hashed using a 32-bit function. The root level is a special
366577bb50Srmind * case: it is managed using the compare-and-swap (CAS) atomic operation
376577bb50Srmind * and has a fanout of 64. The subsequent levels are constructed using
386577bb50Srmind * intermediate nodes with a fanout of 16 (using 4 bits). As more levels
396577bb50Srmind * are created, more blocks of the 32-bit hash value might be generated
406577bb50Srmind * by incrementing the seed parameter of the hash function.
416577bb50Srmind *
426577bb50Srmind * Concurrency
436577bb50Srmind *
446577bb50Srmind * - READERS: Descending is simply walking through the slot values of
456577bb50Srmind * the intermediate nodes. It is lock-free as there is no intermediate
466577bb50Srmind * state: the slot is either empty or has a pointer to the child node.
476577bb50Srmind * The main assumptions here are the following:
486577bb50Srmind *
496577bb50Srmind * i) modifications must preserve consistency with the respect to the
506577bb50Srmind * readers i.e. the readers can only see the valid node values;
516577bb50Srmind *
526577bb50Srmind * ii) any invalid view must "fail" the reads, e.g. by making them
536577bb50Srmind * re-try from the root; this is a case for deletions and is achieved
546577bb50Srmind * using the NODE_DELETED flag.
556577bb50Srmind *
5682ca45b1Srmind * iii) the node destruction must be synchronized with the readers,
576577bb50Srmind * e.g. by using the Epoch-based reclamation or other techniques.
586577bb50Srmind *
596577bb50Srmind * - WRITERS AND LOCKING: Each intermediate node has a spin-lock (which
606577bb50Srmind * is implemented using the NODE_LOCKED bit) -- it provides mutual
616577bb50Srmind * exclusion amongst concurrent writers. The lock order for the nodes
626577bb50Srmind * is "bottom-up" i.e. they are locked as we ascend the trie. A key
636577bb50Srmind * constraint here is that parent pointer never changes.
646577bb50Srmind *
656577bb50Srmind * - DELETES: In addition to writer's locking, the deletion keeps the
666577bb50Srmind * intermediate nodes in a valid state and sets the NODE_DELETED flag,
676577bb50Srmind * to indicate that the readers must re-start the walk from the root.
686577bb50Srmind * As the levels are collapsed, NODE_DELETED gets propagated up-tree.
696577bb50Srmind * The leaf nodes just stay as-is until they are reclaimed.
706577bb50Srmind *
716577bb50Srmind * - ROOT LEVEL: The root level is a special case, as it is implemented
726577bb50Srmind * as an array (rather than intermediate node). The root-level slot can
736577bb50Srmind * only be set using CAS and it can only be set to a valid intermediate
746577bb50Srmind * node. The root-level slot can only be cleared when the node it points
756577bb50Srmind * at becomes empty, is locked and marked as NODE_DELETED (this causes
766577bb50Srmind * the insert/delete operations to re-try until the slot is set to NULL).
776577bb50Srmind *
786577bb50Srmind * References:
796577bb50Srmind *
806577bb50Srmind * W. Litwin, 1981, Trie Hashing.
816577bb50Srmind * Proceedings of the 1981 ACM SIGMOD, p. 19-29
826577bb50Srmind * https://dl.acm.org/citation.cfm?id=582322
836577bb50Srmind *
846577bb50Srmind * P. L. Lehman and S. B. Yao.
856577bb50Srmind * Efficient locking for concurrent operations on B-trees.
866577bb50Srmind * ACM TODS, 6(4):650-670, 1981
876577bb50Srmind * https://www.csd.uoc.gr/~hy460/pdf/p650-lehman.pdf
886577bb50Srmind */
896577bb50Srmind
906577bb50Srmind #ifdef _KERNEL
916577bb50Srmind #include <sys/cdefs.h>
926577bb50Srmind #include <sys/param.h>
936577bb50Srmind #include <sys/types.h>
946577bb50Srmind #include <sys/thmap.h>
956577bb50Srmind #include <sys/kmem.h>
966577bb50Srmind #include <sys/lock.h>
976577bb50Srmind #include <sys/atomic.h>
986577bb50Srmind #include <sys/hash.h>
99aced4ea1Sriastradh #include <sys/cprng.h>
1003f448747Schristos #define THMAP_RCSID(a) __KERNEL_RCSID(0, a)
1016577bb50Srmind #else
1026577bb50Srmind #include <stdio.h>
1036577bb50Srmind #include <stdlib.h>
1046577bb50Srmind #include <stdbool.h>
1056577bb50Srmind #include <stddef.h>
1066577bb50Srmind #include <inttypes.h>
1076577bb50Srmind #include <string.h>
1086577bb50Srmind #include <limits.h>
1093f448747Schristos #define THMAP_RCSID(a) __RCSID(a)
1106577bb50Srmind
1116577bb50Srmind #include "thmap.h"
1126577bb50Srmind #include "utils.h"
1136577bb50Srmind #endif
1146577bb50Srmind
115*ed443730Sriastradh THMAP_RCSID("$NetBSD: subr_thmap.c,v 1.15 2023/10/17 11:57:20 riastradh Exp $");
116aced4ea1Sriastradh
117aced4ea1Sriastradh #include <crypto/blake2/blake2s.h>
1183f448747Schristos
1196577bb50Srmind /*
1206577bb50Srmind * NetBSD kernel wrappers
1216577bb50Srmind */
1226577bb50Srmind #ifdef _KERNEL
1236577bb50Srmind #define ASSERT KASSERT
12473f8fd37Sriastradh #define atomic_thread_fence(x) membar_release() /* only used for release order */
12582ca45b1Srmind #define atomic_compare_exchange_weak_explicit_32(p, e, n, m1, m2) \
12682ca45b1Srmind (atomic_cas_32((p), *(e), (n)) == *(e))
12782ca45b1Srmind #define atomic_compare_exchange_weak_explicit_ptr(p, e, n, m1, m2) \
12882ca45b1Srmind (atomic_cas_ptr((p), *(void **)(e), (void *)(n)) == *(void **)(e))
12982ca45b1Srmind #define atomic_exchange_explicit(o, n, m1) atomic_swap_ptr((o), (n))
1306577bb50Srmind #define murmurhash3 murmurhash2
1316577bb50Srmind #endif
1326577bb50Srmind
1336577bb50Srmind /*
1346577bb50Srmind * The root level fanout is 64 (indexed by the last 6 bits of the hash
1356577bb50Srmind * value XORed with the length). Each subsequent level, represented by
1366577bb50Srmind * intermediate nodes, has a fanout of 16 (using 4 bits).
1376577bb50Srmind *
1386577bb50Srmind * The hash function produces 32-bit values.
1396577bb50Srmind */
1406577bb50Srmind
141aced4ea1Sriastradh #define HASHVAL_SEEDLEN (16)
1426577bb50Srmind #define HASHVAL_BITS (32)
1436577bb50Srmind #define HASHVAL_MOD (HASHVAL_BITS - 1)
1446577bb50Srmind #define HASHVAL_SHIFT (5)
1456577bb50Srmind
1466577bb50Srmind #define ROOT_BITS (6)
1476577bb50Srmind #define ROOT_SIZE (1 << ROOT_BITS)
1486577bb50Srmind #define ROOT_MASK (ROOT_SIZE - 1)
1496577bb50Srmind #define ROOT_MSBITS (HASHVAL_BITS - ROOT_BITS)
1506577bb50Srmind
1516577bb50Srmind #define LEVEL_BITS (4)
1526577bb50Srmind #define LEVEL_SIZE (1 << LEVEL_BITS)
1536577bb50Srmind #define LEVEL_MASK (LEVEL_SIZE - 1)
1546577bb50Srmind
1556577bb50Srmind /*
1566577bb50Srmind * Instead of raw pointers, we use offsets from the base address.
1576577bb50Srmind * This accommodates the use of this data structure in shared memory,
1586577bb50Srmind * where mappings can be in different address spaces.
1596577bb50Srmind *
1606577bb50Srmind * The pointers must be aligned, since pointer tagging is used to
1616577bb50Srmind * differentiate the intermediate nodes from leaves. We reserve the
1626577bb50Srmind * least significant bit.
1636577bb50Srmind */
1646577bb50Srmind typedef uintptr_t thmap_ptr_t;
16582ca45b1Srmind typedef uintptr_t atomic_thmap_ptr_t; // C11 _Atomic
1666577bb50Srmind
1676577bb50Srmind #define THMAP_NULL ((thmap_ptr_t)0)
1686577bb50Srmind
1696577bb50Srmind #define THMAP_LEAF_BIT (0x1)
1706577bb50Srmind
1716577bb50Srmind #define THMAP_ALIGNED_P(p) (((uintptr_t)(p) & 3) == 0)
1726577bb50Srmind #define THMAP_ALIGN(p) ((uintptr_t)(p) & ~(uintptr_t)3)
1736577bb50Srmind #define THMAP_INODE_P(p) (((uintptr_t)(p) & THMAP_LEAF_BIT) == 0)
1746577bb50Srmind
1756577bb50Srmind #define THMAP_GETPTR(th, p) ((void *)((th)->baseptr + (uintptr_t)(p)))
1766577bb50Srmind #define THMAP_GETOFF(th, p) ((thmap_ptr_t)((uintptr_t)(p) - (th)->baseptr))
1776577bb50Srmind #define THMAP_NODE(th, p) THMAP_GETPTR(th, THMAP_ALIGN(p))
1786577bb50Srmind
1796577bb50Srmind /*
1806577bb50Srmind * State field.
1816577bb50Srmind */
1826577bb50Srmind
1836577bb50Srmind #define NODE_LOCKED (1U << 31) // lock (writers)
1846577bb50Srmind #define NODE_DELETED (1U << 30) // node deleted
1856577bb50Srmind #define NODE_COUNT(s) ((s) & 0x3fffffff) // slot count mask
1866577bb50Srmind
1876577bb50Srmind /*
1886577bb50Srmind * There are two types of nodes:
1896577bb50Srmind * - Intermediate nodes -- arrays pointing to another level or a leaf;
1906577bb50Srmind * - Leaves, which store a key-value pair.
1916577bb50Srmind */
1926577bb50Srmind
1936577bb50Srmind typedef struct {
19482ca45b1Srmind uint32_t state; // C11 _Atomic
1956577bb50Srmind thmap_ptr_t parent;
19682ca45b1Srmind atomic_thmap_ptr_t slots[LEVEL_SIZE];
1976577bb50Srmind } thmap_inode_t;
1986577bb50Srmind
1996577bb50Srmind #define THMAP_INODE_LEN sizeof(thmap_inode_t)
2006577bb50Srmind
2016577bb50Srmind typedef struct {
2026577bb50Srmind thmap_ptr_t key;
2036577bb50Srmind size_t len;
2046577bb50Srmind void * val;
2056577bb50Srmind } thmap_leaf_t;
2066577bb50Srmind
2076577bb50Srmind typedef struct {
208aced4ea1Sriastradh const uint8_t * seed; // secret seed
2096577bb50Srmind unsigned rslot; // root-level slot index
2106577bb50Srmind unsigned level; // current level in the tree
2116577bb50Srmind unsigned hashidx; // current hash index (block of bits)
2126577bb50Srmind uint32_t hashval; // current hash value
2136577bb50Srmind } thmap_query_t;
2146577bb50Srmind
215*ed443730Sriastradh union thmap_align {
216*ed443730Sriastradh void * p;
217*ed443730Sriastradh uint64_t v;
218*ed443730Sriastradh };
219*ed443730Sriastradh
220*ed443730Sriastradh typedef struct thmap_gc thmap_gc_t;
221*ed443730Sriastradh struct thmap_gc {
2226577bb50Srmind size_t len;
223*ed443730Sriastradh thmap_gc_t * next;
224*ed443730Sriastradh char data[] __aligned(sizeof(union thmap_align));
225*ed443730Sriastradh };
2266577bb50Srmind
2276577bb50Srmind #define THMAP_ROOT_LEN (sizeof(thmap_ptr_t) * ROOT_SIZE)
2286577bb50Srmind
2296577bb50Srmind struct thmap {
2306577bb50Srmind uintptr_t baseptr;
23182ca45b1Srmind atomic_thmap_ptr_t * root;
2326577bb50Srmind unsigned flags;
2336577bb50Srmind const thmap_ops_t * ops;
23482ca45b1Srmind thmap_gc_t * gc_list; // C11 _Atomic
235aced4ea1Sriastradh uint8_t seed[HASHVAL_SEEDLEN];
2366577bb50Srmind };
2376577bb50Srmind
2386577bb50Srmind static void stage_mem_gc(thmap_t *, uintptr_t, size_t);
2396577bb50Srmind
2406577bb50Srmind /*
2416577bb50Srmind * A few low-level helper routines.
2426577bb50Srmind */
2436577bb50Srmind
2446577bb50Srmind static uintptr_t
alloc_wrapper(size_t len)2456577bb50Srmind alloc_wrapper(size_t len)
2466577bb50Srmind {
247c2307f9cSrmind return (uintptr_t)kmem_intr_alloc(len, KM_NOSLEEP);
2486577bb50Srmind }
2496577bb50Srmind
2506577bb50Srmind static void
free_wrapper(uintptr_t addr,size_t len)2516577bb50Srmind free_wrapper(uintptr_t addr, size_t len)
2526577bb50Srmind {
2536577bb50Srmind kmem_intr_free((void *)addr, len);
2546577bb50Srmind }
2556577bb50Srmind
2566577bb50Srmind static const thmap_ops_t thmap_default_ops = {
2576577bb50Srmind .alloc = alloc_wrapper,
2586577bb50Srmind .free = free_wrapper
2596577bb50Srmind };
2606577bb50Srmind
261*ed443730Sriastradh static uintptr_t
gc_alloc(const thmap_t * thmap,size_t len)262*ed443730Sriastradh gc_alloc(const thmap_t *thmap, size_t len)
263*ed443730Sriastradh {
264*ed443730Sriastradh const size_t alloclen = offsetof(struct thmap_gc, data[len]);
265*ed443730Sriastradh const uintptr_t gcaddr = thmap->ops->alloc(alloclen);
266*ed443730Sriastradh
267*ed443730Sriastradh if (!gcaddr)
268*ed443730Sriastradh return 0;
269*ed443730Sriastradh
270*ed443730Sriastradh thmap_gc_t *const gc = THMAP_GETPTR(thmap, gcaddr);
271*ed443730Sriastradh gc->len = len;
272*ed443730Sriastradh return THMAP_GETOFF(thmap, &gc->data[0]);
273*ed443730Sriastradh }
274*ed443730Sriastradh
275*ed443730Sriastradh static void
gc_free(const thmap_t * thmap,uintptr_t addr,size_t len)276*ed443730Sriastradh gc_free(const thmap_t *thmap, uintptr_t addr, size_t len)
277*ed443730Sriastradh {
278*ed443730Sriastradh const size_t alloclen = offsetof(struct thmap_gc, data[len]);
279*ed443730Sriastradh char *const ptr = THMAP_GETPTR(thmap, addr);
280*ed443730Sriastradh thmap_gc_t *const gc = container_of(ptr, struct thmap_gc, data[0]);
281*ed443730Sriastradh const uintptr_t gcaddr = THMAP_GETOFF(thmap, gc);
282*ed443730Sriastradh
283*ed443730Sriastradh KASSERTMSG(gc->len == len, "thmap=%p ops=%p addr=%p len=%zu"
284*ed443730Sriastradh " gc=%p gc->len=%zu",
285*ed443730Sriastradh thmap, thmap->ops, (void *)addr, len, gc, gc->len);
286*ed443730Sriastradh thmap->ops->free(gcaddr, alloclen);
287*ed443730Sriastradh }
288*ed443730Sriastradh
2896577bb50Srmind /*
2906577bb50Srmind * NODE LOCKING.
2916577bb50Srmind */
2926577bb50Srmind
293ae43ee51Sriastradh static inline bool __diagused
node_locked_p(thmap_inode_t * node)29482ca45b1Srmind node_locked_p(thmap_inode_t *node)
2956577bb50Srmind {
29682ca45b1Srmind return (atomic_load_relaxed(&node->state) & NODE_LOCKED) != 0;
2976577bb50Srmind }
2986577bb50Srmind
2996577bb50Srmind static void
lock_node(thmap_inode_t * node)3006577bb50Srmind lock_node(thmap_inode_t *node)
3016577bb50Srmind {
3026577bb50Srmind unsigned bcount = SPINLOCK_BACKOFF_MIN;
3036577bb50Srmind uint32_t s;
3046577bb50Srmind again:
30582ca45b1Srmind s = atomic_load_relaxed(&node->state);
3066577bb50Srmind if (s & NODE_LOCKED) {
3076577bb50Srmind SPINLOCK_BACKOFF(bcount);
3086577bb50Srmind goto again;
3096577bb50Srmind }
31082ca45b1Srmind /* Acquire from prior release in unlock_node.() */
31182ca45b1Srmind if (!atomic_compare_exchange_weak_explicit_32(&node->state,
31282ca45b1Srmind &s, s | NODE_LOCKED, memory_order_acquire, memory_order_relaxed)) {
3136577bb50Srmind bcount = SPINLOCK_BACKOFF_MIN;
3146577bb50Srmind goto again;
3156577bb50Srmind }
3166577bb50Srmind }
3176577bb50Srmind
3186577bb50Srmind static void
unlock_node(thmap_inode_t * node)3196577bb50Srmind unlock_node(thmap_inode_t *node)
3206577bb50Srmind {
32182ca45b1Srmind uint32_t s = atomic_load_relaxed(&node->state) & ~NODE_LOCKED;
3226577bb50Srmind
3236577bb50Srmind ASSERT(node_locked_p(node));
32482ca45b1Srmind /* Release to subsequent acquire in lock_node(). */
32582ca45b1Srmind atomic_store_release(&node->state, s);
3266577bb50Srmind }
3276577bb50Srmind
3286577bb50Srmind /*
3296577bb50Srmind * HASH VALUE AND KEY OPERATIONS.
3306577bb50Srmind */
3316577bb50Srmind
332aced4ea1Sriastradh static inline uint32_t
hash(const uint8_t seed[static HASHVAL_SEEDLEN],const void * key,size_t len,uint32_t level)333aced4ea1Sriastradh hash(const uint8_t seed[static HASHVAL_SEEDLEN], const void *key, size_t len,
334aced4ea1Sriastradh uint32_t level)
3356577bb50Srmind {
336aced4ea1Sriastradh struct blake2s B;
337aced4ea1Sriastradh uint32_t h;
3386577bb50Srmind
339aced4ea1Sriastradh if (level == 0)
340aced4ea1Sriastradh return murmurhash3(key, len, 0);
341aced4ea1Sriastradh
342aced4ea1Sriastradh /*
343aced4ea1Sriastradh * Byte order is not significant here because this is
344aced4ea1Sriastradh * intentionally secret and independent for each thmap.
345aced4ea1Sriastradh *
346aced4ea1Sriastradh * XXX We get 32 bytes of output at a time; we could march
347aced4ea1Sriastradh * through them sequentially rather than throwing away 28 bytes
348aced4ea1Sriastradh * and recomputing BLAKE2 each time. But the number of
349aced4ea1Sriastradh * iterations ought to be geometric in the collision
350aced4ea1Sriastradh * probability at each level which should be very small anyway.
351aced4ea1Sriastradh */
352aced4ea1Sriastradh blake2s_init(&B, sizeof h, seed, HASHVAL_SEEDLEN);
353aced4ea1Sriastradh blake2s_update(&B, &level, sizeof level);
354aced4ea1Sriastradh blake2s_update(&B, key, len);
355aced4ea1Sriastradh blake2s_final(&B, &h);
356aced4ea1Sriastradh
357aced4ea1Sriastradh return h;
358aced4ea1Sriastradh }
359aced4ea1Sriastradh
360aced4ea1Sriastradh static inline void
hashval_init(thmap_query_t * query,const uint8_t seed[static HASHVAL_SEEDLEN],const void * restrict key,size_t len)361aced4ea1Sriastradh hashval_init(thmap_query_t *query, const uint8_t seed[static HASHVAL_SEEDLEN],
362aced4ea1Sriastradh const void * restrict key, size_t len)
363aced4ea1Sriastradh {
364aced4ea1Sriastradh const uint32_t hashval = hash(seed, key, len, 0);
365aced4ea1Sriastradh
366aced4ea1Sriastradh query->seed = seed;
3676577bb50Srmind query->rslot = ((hashval >> ROOT_MSBITS) ^ len) & ROOT_MASK;
3686577bb50Srmind query->level = 0;
3696577bb50Srmind query->hashval = hashval;
3706577bb50Srmind query->hashidx = 0;
3716577bb50Srmind }
3726577bb50Srmind
3736577bb50Srmind /*
3746577bb50Srmind * hashval_getslot: given the key, compute the hash (if not already cached)
3756577bb50Srmind * and return the offset for the current level.
3766577bb50Srmind */
3776577bb50Srmind static unsigned
hashval_getslot(thmap_query_t * query,const void * restrict key,size_t len)3786577bb50Srmind hashval_getslot(thmap_query_t *query, const void * restrict key, size_t len)
3796577bb50Srmind {
3806577bb50Srmind const unsigned offset = query->level * LEVEL_BITS;
3816577bb50Srmind const unsigned shift = offset & HASHVAL_MOD;
3826577bb50Srmind const unsigned i = offset >> HASHVAL_SHIFT;
3836577bb50Srmind
3846577bb50Srmind if (query->hashidx != i) {
3856577bb50Srmind /* Generate a hash value for a required range. */
386aced4ea1Sriastradh query->hashval = hash(query->seed, key, len, i);
3876577bb50Srmind query->hashidx = i;
3886577bb50Srmind }
3896577bb50Srmind return (query->hashval >> shift) & LEVEL_MASK;
3906577bb50Srmind }
3916577bb50Srmind
3926577bb50Srmind static unsigned
hashval_getleafslot(const thmap_t * thmap,const thmap_leaf_t * leaf,unsigned level)3936577bb50Srmind hashval_getleafslot(const thmap_t *thmap,
3946577bb50Srmind const thmap_leaf_t *leaf, unsigned level)
3956577bb50Srmind {
3966577bb50Srmind const void *key = THMAP_GETPTR(thmap, leaf->key);
3976577bb50Srmind const unsigned offset = level * LEVEL_BITS;
3986577bb50Srmind const unsigned shift = offset & HASHVAL_MOD;
3996577bb50Srmind const unsigned i = offset >> HASHVAL_SHIFT;
4006577bb50Srmind
401aced4ea1Sriastradh return (hash(thmap->seed, key, leaf->len, i) >> shift) & LEVEL_MASK;
4026577bb50Srmind }
4036577bb50Srmind
4046577bb50Srmind static inline unsigned
hashval_getl0slot(const thmap_t * thmap,const thmap_query_t * query,const thmap_leaf_t * leaf)4056577bb50Srmind hashval_getl0slot(const thmap_t *thmap, const thmap_query_t *query,
4066577bb50Srmind const thmap_leaf_t *leaf)
4076577bb50Srmind {
4086577bb50Srmind if (__predict_true(query->hashidx == 0)) {
4096577bb50Srmind return query->hashval & LEVEL_MASK;
4106577bb50Srmind }
4116577bb50Srmind return hashval_getleafslot(thmap, leaf, 0);
4126577bb50Srmind }
4136577bb50Srmind
4146577bb50Srmind static bool
key_cmp_p(const thmap_t * thmap,const thmap_leaf_t * leaf,const void * restrict key,size_t len)4156577bb50Srmind key_cmp_p(const thmap_t *thmap, const thmap_leaf_t *leaf,
4166577bb50Srmind const void * restrict key, size_t len)
4176577bb50Srmind {
4186577bb50Srmind const void *leafkey = THMAP_GETPTR(thmap, leaf->key);
4196577bb50Srmind return len == leaf->len && memcmp(key, leafkey, len) == 0;
4206577bb50Srmind }
4216577bb50Srmind
4226577bb50Srmind /*
4236577bb50Srmind * INTER-NODE OPERATIONS.
4246577bb50Srmind */
4256577bb50Srmind
4266577bb50Srmind static thmap_inode_t *
node_create(thmap_t * thmap,thmap_inode_t * parent)4276577bb50Srmind node_create(thmap_t *thmap, thmap_inode_t *parent)
4286577bb50Srmind {
4296577bb50Srmind thmap_inode_t *node;
4306577bb50Srmind uintptr_t p;
4316577bb50Srmind
432*ed443730Sriastradh p = gc_alloc(thmap, THMAP_INODE_LEN);
4336577bb50Srmind if (!p) {
4346577bb50Srmind return NULL;
4356577bb50Srmind }
4366577bb50Srmind node = THMAP_GETPTR(thmap, p);
4376577bb50Srmind ASSERT(THMAP_ALIGNED_P(node));
4386577bb50Srmind
4396577bb50Srmind memset(node, 0, THMAP_INODE_LEN);
4406577bb50Srmind if (parent) {
44182ca45b1Srmind /* Not yet published, no need for ordering. */
44282ca45b1Srmind atomic_store_relaxed(&node->state, NODE_LOCKED);
4436577bb50Srmind node->parent = THMAP_GETOFF(thmap, parent);
4446577bb50Srmind }
4456577bb50Srmind return node;
4466577bb50Srmind }
4476577bb50Srmind
4486577bb50Srmind static void
node_insert(thmap_inode_t * node,unsigned slot,thmap_ptr_t child)4496577bb50Srmind node_insert(thmap_inode_t *node, unsigned slot, thmap_ptr_t child)
4506577bb50Srmind {
4516577bb50Srmind ASSERT(node_locked_p(node) || node->parent == THMAP_NULL);
45282ca45b1Srmind ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0);
45382ca45b1Srmind ASSERT(atomic_load_relaxed(&node->slots[slot]) == THMAP_NULL);
4546577bb50Srmind
45582ca45b1Srmind ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) < LEVEL_SIZE);
4566577bb50Srmind
45782ca45b1Srmind /*
45882ca45b1Srmind * If node is public already, caller is responsible for issuing
45982ca45b1Srmind * release fence; if node is not public, no ordering is needed.
46082ca45b1Srmind * Hence relaxed ordering.
46182ca45b1Srmind */
46282ca45b1Srmind atomic_store_relaxed(&node->slots[slot], child);
46382ca45b1Srmind atomic_store_relaxed(&node->state,
46482ca45b1Srmind atomic_load_relaxed(&node->state) + 1);
4656577bb50Srmind }
4666577bb50Srmind
4676577bb50Srmind static void
node_remove(thmap_inode_t * node,unsigned slot)4686577bb50Srmind node_remove(thmap_inode_t *node, unsigned slot)
4696577bb50Srmind {
4706577bb50Srmind ASSERT(node_locked_p(node));
47182ca45b1Srmind ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0);
47282ca45b1Srmind ASSERT(atomic_load_relaxed(&node->slots[slot]) != THMAP_NULL);
4736577bb50Srmind
47482ca45b1Srmind ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) > 0);
47582ca45b1Srmind ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) <= LEVEL_SIZE);
4766577bb50Srmind
47782ca45b1Srmind /* Element will be GC-ed later; no need for ordering here. */
47882ca45b1Srmind atomic_store_relaxed(&node->slots[slot], THMAP_NULL);
47982ca45b1Srmind atomic_store_relaxed(&node->state,
48082ca45b1Srmind atomic_load_relaxed(&node->state) - 1);
4816577bb50Srmind }
4826577bb50Srmind
4836577bb50Srmind /*
4846577bb50Srmind * LEAF OPERATIONS.
4856577bb50Srmind */
4866577bb50Srmind
4876577bb50Srmind static thmap_leaf_t *
leaf_create(const thmap_t * thmap,const void * key,size_t len,void * val)4886577bb50Srmind leaf_create(const thmap_t *thmap, const void *key, size_t len, void *val)
4896577bb50Srmind {
4906577bb50Srmind thmap_leaf_t *leaf;
4916577bb50Srmind uintptr_t leaf_off, key_off;
4926577bb50Srmind
493*ed443730Sriastradh leaf_off = gc_alloc(thmap, sizeof(thmap_leaf_t));
4946577bb50Srmind if (!leaf_off) {
4956577bb50Srmind return NULL;
4966577bb50Srmind }
4976577bb50Srmind leaf = THMAP_GETPTR(thmap, leaf_off);
4986577bb50Srmind ASSERT(THMAP_ALIGNED_P(leaf));
4996577bb50Srmind
5006577bb50Srmind if ((thmap->flags & THMAP_NOCOPY) == 0) {
5016577bb50Srmind /*
5026577bb50Srmind * Copy the key.
5036577bb50Srmind */
504*ed443730Sriastradh key_off = gc_alloc(thmap, len);
5056577bb50Srmind if (!key_off) {
506*ed443730Sriastradh gc_free(thmap, leaf_off, sizeof(thmap_leaf_t));
5076577bb50Srmind return NULL;
5086577bb50Srmind }
5096577bb50Srmind memcpy(THMAP_GETPTR(thmap, key_off), key, len);
5106577bb50Srmind leaf->key = key_off;
5116577bb50Srmind } else {
5126577bb50Srmind /* Otherwise, we use a reference. */
5136577bb50Srmind leaf->key = (uintptr_t)key;
5146577bb50Srmind }
5156577bb50Srmind leaf->len = len;
5166577bb50Srmind leaf->val = val;
5176577bb50Srmind return leaf;
5186577bb50Srmind }
5196577bb50Srmind
5206577bb50Srmind static void
leaf_free(const thmap_t * thmap,thmap_leaf_t * leaf)5216577bb50Srmind leaf_free(const thmap_t *thmap, thmap_leaf_t *leaf)
5226577bb50Srmind {
5236577bb50Srmind if ((thmap->flags & THMAP_NOCOPY) == 0) {
524*ed443730Sriastradh gc_free(thmap, leaf->key, leaf->len);
5256577bb50Srmind }
526*ed443730Sriastradh gc_free(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t));
5276577bb50Srmind }
5286577bb50Srmind
5296577bb50Srmind static thmap_leaf_t *
get_leaf(const thmap_t * thmap,thmap_inode_t * parent,unsigned slot)5306577bb50Srmind get_leaf(const thmap_t *thmap, thmap_inode_t *parent, unsigned slot)
5316577bb50Srmind {
5326577bb50Srmind thmap_ptr_t node;
5336577bb50Srmind
53482ca45b1Srmind /* Consume from prior release in thmap_put(). */
53582ca45b1Srmind node = atomic_load_consume(&parent->slots[slot]);
5366577bb50Srmind if (THMAP_INODE_P(node)) {
5376577bb50Srmind return NULL;
5386577bb50Srmind }
5396577bb50Srmind return THMAP_NODE(thmap, node);
5406577bb50Srmind }
5416577bb50Srmind
5426577bb50Srmind /*
5436577bb50Srmind * ROOT OPERATIONS.
5446577bb50Srmind */
5456577bb50Srmind
54682ca45b1Srmind /*
54782ca45b1Srmind * root_try_put: Try to set a root pointer at query->rslot.
54882ca45b1Srmind *
54982ca45b1Srmind * => Implies release operation on success.
55082ca45b1Srmind * => Implies no ordering on failure.
55182ca45b1Srmind */
5520916fe48Sriastradh static inline int
root_try_put(thmap_t * thmap,const thmap_query_t * query,thmap_leaf_t * leaf)5536577bb50Srmind root_try_put(thmap_t *thmap, const thmap_query_t *query, thmap_leaf_t *leaf)
5546577bb50Srmind {
55582ca45b1Srmind thmap_ptr_t expected;
5566577bb50Srmind const unsigned i = query->rslot;
5576577bb50Srmind thmap_inode_t *node;
5586577bb50Srmind thmap_ptr_t nptr;
5596577bb50Srmind unsigned slot;
5606577bb50Srmind
5616577bb50Srmind /*
56282ca45b1Srmind * Must pre-check first. No ordering required because we will
56382ca45b1Srmind * check again before taking any actions, and start over if
56482ca45b1Srmind * this changes from null.
5656577bb50Srmind */
56682ca45b1Srmind if (atomic_load_relaxed(&thmap->root[i])) {
5670916fe48Sriastradh return EEXIST;
5686577bb50Srmind }
5696577bb50Srmind
5706577bb50Srmind /*
5716577bb50Srmind * Create an intermediate node. Since there is no parent set,
57282ca45b1Srmind * it will be created unlocked and the CAS operation will
57382ca45b1Srmind * release it to readers.
5746577bb50Srmind */
5756577bb50Srmind node = node_create(thmap, NULL);
5760916fe48Sriastradh if (__predict_false(node == NULL)) {
5770916fe48Sriastradh return ENOMEM;
5780916fe48Sriastradh }
5796577bb50Srmind slot = hashval_getl0slot(thmap, query, leaf);
5806577bb50Srmind node_insert(node, slot, THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT);
5816577bb50Srmind nptr = THMAP_GETOFF(thmap, node);
5826577bb50Srmind again:
58382ca45b1Srmind if (atomic_load_relaxed(&thmap->root[i])) {
584*ed443730Sriastradh gc_free(thmap, nptr, THMAP_INODE_LEN);
5850916fe48Sriastradh return EEXIST;
5866577bb50Srmind }
58782ca45b1Srmind /* Release to subsequent consume in find_edge_node(). */
58882ca45b1Srmind expected = THMAP_NULL;
58982ca45b1Srmind if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->root[i], &expected,
59082ca45b1Srmind nptr, memory_order_release, memory_order_relaxed)) {
5916577bb50Srmind goto again;
5926577bb50Srmind }
5930916fe48Sriastradh return 0;
5946577bb50Srmind }
5956577bb50Srmind
5966577bb50Srmind /*
5976577bb50Srmind * find_edge_node: given the hash, traverse the tree to find the edge node.
5986577bb50Srmind *
5996577bb50Srmind * => Returns an aligned (clean) pointer to the parent node.
6006577bb50Srmind * => Returns the slot number and sets current level.
6016577bb50Srmind */
6026577bb50Srmind static thmap_inode_t *
find_edge_node(const thmap_t * thmap,thmap_query_t * query,const void * restrict key,size_t len,unsigned * slot)6036577bb50Srmind find_edge_node(const thmap_t *thmap, thmap_query_t *query,
6046577bb50Srmind const void * restrict key, size_t len, unsigned *slot)
6056577bb50Srmind {
60682ca45b1Srmind thmap_ptr_t root_slot;
6076577bb50Srmind thmap_inode_t *parent;
6086577bb50Srmind thmap_ptr_t node;
6096577bb50Srmind unsigned off;
6106577bb50Srmind
6116577bb50Srmind ASSERT(query->level == 0);
6126577bb50Srmind
61382ca45b1Srmind /* Consume from prior release in root_try_put(). */
61482ca45b1Srmind root_slot = atomic_load_consume(&thmap->root[query->rslot]);
6156577bb50Srmind parent = THMAP_NODE(thmap, root_slot);
6166577bb50Srmind if (!parent) {
6176577bb50Srmind return NULL;
6186577bb50Srmind }
6196577bb50Srmind descend:
6206577bb50Srmind off = hashval_getslot(query, key, len);
62182ca45b1Srmind /* Consume from prior release in thmap_put(). */
62282ca45b1Srmind node = atomic_load_consume(&parent->slots[off]);
6236577bb50Srmind
6246577bb50Srmind /* Descend the tree until we find a leaf or empty slot. */
6256577bb50Srmind if (node && THMAP_INODE_P(node)) {
6266577bb50Srmind parent = THMAP_NODE(thmap, node);
6276577bb50Srmind query->level++;
6286577bb50Srmind goto descend;
6296577bb50Srmind }
63082ca45b1Srmind /*
63182ca45b1Srmind * NODE_DELETED does not become stale until GC runs, which
63282ca45b1Srmind * cannot happen while we are in the middle of an operation,
63382ca45b1Srmind * hence relaxed ordering.
63482ca45b1Srmind */
63582ca45b1Srmind if (atomic_load_relaxed(&parent->state) & NODE_DELETED) {
6366577bb50Srmind return NULL;
6376577bb50Srmind }
6386577bb50Srmind *slot = off;
6396577bb50Srmind return parent;
6406577bb50Srmind }
6416577bb50Srmind
6426577bb50Srmind /*
6436577bb50Srmind * find_edge_node_locked: traverse the tree, like find_edge_node(),
6446577bb50Srmind * but attempt to lock the edge node.
6456577bb50Srmind *
6466577bb50Srmind * => Returns NULL if the deleted node is found. This indicates that
6476577bb50Srmind * the caller must re-try from the root, as the root slot might have
6486577bb50Srmind * changed too.
6496577bb50Srmind */
6506577bb50Srmind static thmap_inode_t *
find_edge_node_locked(const thmap_t * thmap,thmap_query_t * query,const void * restrict key,size_t len,unsigned * slot)6516577bb50Srmind find_edge_node_locked(const thmap_t *thmap, thmap_query_t *query,
6526577bb50Srmind const void * restrict key, size_t len, unsigned *slot)
6536577bb50Srmind {
6546577bb50Srmind thmap_inode_t *node;
6556577bb50Srmind thmap_ptr_t target;
6566577bb50Srmind retry:
6576577bb50Srmind /*
6586577bb50Srmind * Find the edge node and lock it! Re-check the state since
6596577bb50Srmind * the tree might change by the time we acquire the lock.
6606577bb50Srmind */
6616577bb50Srmind node = find_edge_node(thmap, query, key, len, slot);
6626577bb50Srmind if (!node) {
6636577bb50Srmind /* The root slot is empty -- let the caller decide. */
6646577bb50Srmind query->level = 0;
6656577bb50Srmind return NULL;
6666577bb50Srmind }
6676577bb50Srmind lock_node(node);
66882ca45b1Srmind if (__predict_false(atomic_load_relaxed(&node->state) & NODE_DELETED)) {
6696577bb50Srmind /*
6706577bb50Srmind * The node has been deleted. The tree might have a new
6716577bb50Srmind * shape now, therefore we must re-start from the root.
6726577bb50Srmind */
6736577bb50Srmind unlock_node(node);
6746577bb50Srmind query->level = 0;
6756577bb50Srmind return NULL;
6766577bb50Srmind }
67782ca45b1Srmind target = atomic_load_relaxed(&node->slots[*slot]);
6786577bb50Srmind if (__predict_false(target && THMAP_INODE_P(target))) {
6796577bb50Srmind /*
6806577bb50Srmind * The target slot has been changed and it is now an
6816577bb50Srmind * intermediate node. Re-start from the top internode.
6826577bb50Srmind */
6836577bb50Srmind unlock_node(node);
6846577bb50Srmind query->level = 0;
6856577bb50Srmind goto retry;
6866577bb50Srmind }
6876577bb50Srmind return node;
6886577bb50Srmind }
6896577bb50Srmind
6906577bb50Srmind /*
6916577bb50Srmind * thmap_get: lookup a value given the key.
6926577bb50Srmind */
6936577bb50Srmind void *
thmap_get(thmap_t * thmap,const void * key,size_t len)6946577bb50Srmind thmap_get(thmap_t *thmap, const void *key, size_t len)
6956577bb50Srmind {
6966577bb50Srmind thmap_query_t query;
6976577bb50Srmind thmap_inode_t *parent;
6986577bb50Srmind thmap_leaf_t *leaf;
6996577bb50Srmind unsigned slot;
7006577bb50Srmind
701aced4ea1Sriastradh hashval_init(&query, thmap->seed, key, len);
7026577bb50Srmind parent = find_edge_node(thmap, &query, key, len, &slot);
7036577bb50Srmind if (!parent) {
7046577bb50Srmind return NULL;
7056577bb50Srmind }
7066577bb50Srmind leaf = get_leaf(thmap, parent, slot);
7076577bb50Srmind if (!leaf) {
7086577bb50Srmind return NULL;
7096577bb50Srmind }
7106577bb50Srmind if (!key_cmp_p(thmap, leaf, key, len)) {
7116577bb50Srmind return NULL;
7126577bb50Srmind }
7136577bb50Srmind return leaf->val;
7146577bb50Srmind }
7156577bb50Srmind
7166577bb50Srmind /*
7176577bb50Srmind * thmap_put: insert a value given the key.
7186577bb50Srmind *
7196577bb50Srmind * => If the key is already present, return the associated value.
7206577bb50Srmind * => Otherwise, on successful insert, return the given value.
7216577bb50Srmind */
7226577bb50Srmind void *
thmap_put(thmap_t * thmap,const void * key,size_t len,void * val)7236577bb50Srmind thmap_put(thmap_t *thmap, const void *key, size_t len, void *val)
7246577bb50Srmind {
7256577bb50Srmind thmap_query_t query;
7266577bb50Srmind thmap_leaf_t *leaf, *other;
7276577bb50Srmind thmap_inode_t *parent, *child;
7286577bb50Srmind unsigned slot, other_slot;
7296577bb50Srmind thmap_ptr_t target;
7306577bb50Srmind
7316577bb50Srmind /*
73282ca45b1Srmind * First, pre-allocate and initialize the leaf node.
7336577bb50Srmind */
7346577bb50Srmind leaf = leaf_create(thmap, key, len, val);
7356577bb50Srmind if (__predict_false(!leaf)) {
7366577bb50Srmind return NULL;
7376577bb50Srmind }
738aced4ea1Sriastradh hashval_init(&query, thmap->seed, key, len);
7396577bb50Srmind retry:
7406577bb50Srmind /*
7416577bb50Srmind * Try to insert into the root first, if its slot is empty.
7426577bb50Srmind */
7430916fe48Sriastradh switch (root_try_put(thmap, &query, leaf)) {
7440916fe48Sriastradh case 0:
7456577bb50Srmind /* Success: the leaf was inserted; no locking involved. */
7466577bb50Srmind return val;
7470916fe48Sriastradh case EEXIST:
7480916fe48Sriastradh break;
7490916fe48Sriastradh case ENOMEM:
7500916fe48Sriastradh return NULL;
7510916fe48Sriastradh default:
7520916fe48Sriastradh __unreachable();
7536577bb50Srmind }
7546577bb50Srmind
7556577bb50Srmind /*
75682ca45b1Srmind * Release node via store in node_insert (*) to subsequent
75782ca45b1Srmind * consume in get_leaf() or find_edge_node().
75882ca45b1Srmind */
75982ca45b1Srmind atomic_thread_fence(memory_order_release);
76082ca45b1Srmind
76182ca45b1Srmind /*
7626577bb50Srmind * Find the edge node and the target slot.
7636577bb50Srmind */
7646577bb50Srmind parent = find_edge_node_locked(thmap, &query, key, len, &slot);
7656577bb50Srmind if (!parent) {
7666577bb50Srmind goto retry;
7676577bb50Srmind }
76882ca45b1Srmind target = atomic_load_relaxed(&parent->slots[slot]); // tagged offset
7696577bb50Srmind if (THMAP_INODE_P(target)) {
7706577bb50Srmind /*
77182ca45b1Srmind * Empty slot: simply insert the new leaf. The release
7726577bb50Srmind * fence is already issued for us.
7736577bb50Srmind */
7746577bb50Srmind target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT;
77582ca45b1Srmind node_insert(parent, slot, target); /* (*) */
7766577bb50Srmind goto out;
7776577bb50Srmind }
7786577bb50Srmind
7796577bb50Srmind /*
7806577bb50Srmind * Collision or duplicate.
7816577bb50Srmind */
7826577bb50Srmind other = THMAP_NODE(thmap, target);
7836577bb50Srmind if (key_cmp_p(thmap, other, key, len)) {
7846577bb50Srmind /*
7856577bb50Srmind * Duplicate. Free the pre-allocated leaf and
7866577bb50Srmind * return the present value.
7876577bb50Srmind */
7886577bb50Srmind leaf_free(thmap, leaf);
7896577bb50Srmind val = other->val;
7906577bb50Srmind goto out;
7916577bb50Srmind }
7926577bb50Srmind descend:
7936577bb50Srmind /*
7946577bb50Srmind * Collision -- expand the tree. Create an intermediate node
7956577bb50Srmind * which will be locked (NODE_LOCKED) for us. At this point,
7966577bb50Srmind * we advance to the next level.
7976577bb50Srmind */
7986577bb50Srmind child = node_create(thmap, parent);
7996577bb50Srmind if (__predict_false(!child)) {
8006577bb50Srmind leaf_free(thmap, leaf);
8016577bb50Srmind val = NULL;
8026577bb50Srmind goto out;
8036577bb50Srmind }
8046577bb50Srmind query.level++;
8056577bb50Srmind
8066577bb50Srmind /*
80782ca45b1Srmind * Insert the other (colliding) leaf first. The new child is
80882ca45b1Srmind * not yet published, so memory order is relaxed.
8096577bb50Srmind */
8106577bb50Srmind other_slot = hashval_getleafslot(thmap, other, query.level);
8116577bb50Srmind target = THMAP_GETOFF(thmap, other) | THMAP_LEAF_BIT;
8126577bb50Srmind node_insert(child, other_slot, target);
8136577bb50Srmind
8146577bb50Srmind /*
8156577bb50Srmind * Insert the intermediate node into the parent node.
8166577bb50Srmind * It becomes the new parent for the our new leaf.
8176577bb50Srmind *
81882ca45b1Srmind * Ensure that stores to the child (and leaf) reach global
81982ca45b1Srmind * visibility before it gets inserted to the parent, as
82082ca45b1Srmind * consumed by get_leaf() or find_edge_node().
8216577bb50Srmind */
82282ca45b1Srmind atomic_store_release(&parent->slots[slot], THMAP_GETOFF(thmap, child));
8236577bb50Srmind
8246577bb50Srmind unlock_node(parent);
8256577bb50Srmind ASSERT(node_locked_p(child));
8266577bb50Srmind parent = child;
8276577bb50Srmind
8286577bb50Srmind /*
8296577bb50Srmind * Get the new slot and check for another collision
8306577bb50Srmind * at the next level.
8316577bb50Srmind */
8326577bb50Srmind slot = hashval_getslot(&query, key, len);
8336577bb50Srmind if (slot == other_slot) {
8346577bb50Srmind /* Another collision -- descend and expand again. */
8356577bb50Srmind goto descend;
8366577bb50Srmind }
8376577bb50Srmind
83882ca45b1Srmind /*
83982ca45b1Srmind * Insert our new leaf once we expanded enough. The release
84082ca45b1Srmind * fence is already issued for us.
84182ca45b1Srmind */
8426577bb50Srmind target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT;
84382ca45b1Srmind node_insert(parent, slot, target); /* (*) */
8446577bb50Srmind out:
8456577bb50Srmind unlock_node(parent);
8466577bb50Srmind return val;
8476577bb50Srmind }
8486577bb50Srmind
8496577bb50Srmind /*
8506577bb50Srmind * thmap_del: remove the entry given the key.
8516577bb50Srmind */
8526577bb50Srmind void *
thmap_del(thmap_t * thmap,const void * key,size_t len)8536577bb50Srmind thmap_del(thmap_t *thmap, const void *key, size_t len)
8546577bb50Srmind {
8556577bb50Srmind thmap_query_t query;
8566577bb50Srmind thmap_leaf_t *leaf;
8576577bb50Srmind thmap_inode_t *parent;
8586577bb50Srmind unsigned slot;
8596577bb50Srmind void *val;
8606577bb50Srmind
861aced4ea1Sriastradh hashval_init(&query, thmap->seed, key, len);
8626577bb50Srmind parent = find_edge_node_locked(thmap, &query, key, len, &slot);
8636577bb50Srmind if (!parent) {
8646577bb50Srmind /* Root slot empty: not found. */
8656577bb50Srmind return NULL;
8666577bb50Srmind }
8676577bb50Srmind leaf = get_leaf(thmap, parent, slot);
8686577bb50Srmind if (!leaf || !key_cmp_p(thmap, leaf, key, len)) {
8696577bb50Srmind /* Not found. */
8706577bb50Srmind unlock_node(parent);
8716577bb50Srmind return NULL;
8726577bb50Srmind }
8736577bb50Srmind
8746577bb50Srmind /* Remove the leaf. */
87582ca45b1Srmind ASSERT(THMAP_NODE(thmap, atomic_load_relaxed(&parent->slots[slot]))
87682ca45b1Srmind == leaf);
8776577bb50Srmind node_remove(parent, slot);
8786577bb50Srmind
8796577bb50Srmind /*
8806577bb50Srmind * Collapse the levels if removing the last item.
8816577bb50Srmind */
88282ca45b1Srmind while (query.level &&
88382ca45b1Srmind NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) {
8846577bb50Srmind thmap_inode_t *node = parent;
8856577bb50Srmind
88682ca45b1Srmind ASSERT(atomic_load_relaxed(&node->state) == NODE_LOCKED);
8876577bb50Srmind
8886577bb50Srmind /*
8896577bb50Srmind * Ascend one level up.
8906577bb50Srmind * => Mark our current parent as deleted.
8916577bb50Srmind * => Lock the parent one level up.
8926577bb50Srmind */
8936577bb50Srmind query.level--;
8946577bb50Srmind slot = hashval_getslot(&query, key, len);
8956577bb50Srmind parent = THMAP_NODE(thmap, node->parent);
8966577bb50Srmind ASSERT(parent != NULL);
8976577bb50Srmind
8986577bb50Srmind lock_node(parent);
89982ca45b1Srmind ASSERT((atomic_load_relaxed(&parent->state) & NODE_DELETED)
90082ca45b1Srmind == 0);
9016577bb50Srmind
90282ca45b1Srmind /*
90382ca45b1Srmind * Lock is exclusive, so nobody else can be writing at
90482ca45b1Srmind * the same time, and no need for atomic R/M/W, but
90582ca45b1Srmind * readers may read without the lock and so need atomic
90682ca45b1Srmind * load/store. No ordering here needed because the
90782ca45b1Srmind * entry itself stays valid until GC.
90882ca45b1Srmind */
90982ca45b1Srmind atomic_store_relaxed(&node->state,
91082ca45b1Srmind atomic_load_relaxed(&node->state) | NODE_DELETED);
91182ca45b1Srmind unlock_node(node); // memory_order_release
9126577bb50Srmind
91382ca45b1Srmind ASSERT(THMAP_NODE(thmap,
91482ca45b1Srmind atomic_load_relaxed(&parent->slots[slot])) == node);
9156577bb50Srmind node_remove(parent, slot);
9166577bb50Srmind
9176577bb50Srmind /* Stage the removed node for G/C. */
9186577bb50Srmind stage_mem_gc(thmap, THMAP_GETOFF(thmap, node), THMAP_INODE_LEN);
9196577bb50Srmind }
9206577bb50Srmind
9216577bb50Srmind /*
9226577bb50Srmind * If the top node is empty, then we need to remove it from the
9236577bb50Srmind * root level. Mark the node as deleted and clear the slot.
9246577bb50Srmind *
9256577bb50Srmind * Note: acquiring the lock on the top node effectively prevents
9266577bb50Srmind * the root slot from changing.
9276577bb50Srmind */
92882ca45b1Srmind if (NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) {
9296577bb50Srmind const unsigned rslot = query.rslot;
93082ca45b1Srmind const thmap_ptr_t nptr =
93182ca45b1Srmind atomic_load_relaxed(&thmap->root[rslot]);
9326577bb50Srmind
9336577bb50Srmind ASSERT(query.level == 0);
9346577bb50Srmind ASSERT(parent->parent == THMAP_NULL);
9356577bb50Srmind ASSERT(THMAP_GETOFF(thmap, parent) == nptr);
9366577bb50Srmind
9376577bb50Srmind /* Mark as deleted and remove from the root-level slot. */
93882ca45b1Srmind atomic_store_relaxed(&parent->state,
93982ca45b1Srmind atomic_load_relaxed(&parent->state) | NODE_DELETED);
94082ca45b1Srmind atomic_store_relaxed(&thmap->root[rslot], THMAP_NULL);
9416577bb50Srmind
9426577bb50Srmind stage_mem_gc(thmap, nptr, THMAP_INODE_LEN);
9436577bb50Srmind }
9446577bb50Srmind unlock_node(parent);
9456577bb50Srmind
9466577bb50Srmind /*
9476577bb50Srmind * Save the value and stage the leaf for G/C.
9486577bb50Srmind */
9496577bb50Srmind val = leaf->val;
9506577bb50Srmind if ((thmap->flags & THMAP_NOCOPY) == 0) {
9516577bb50Srmind stage_mem_gc(thmap, leaf->key, leaf->len);
9526577bb50Srmind }
9536577bb50Srmind stage_mem_gc(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t));
9546577bb50Srmind return val;
9556577bb50Srmind }
9566577bb50Srmind
9576577bb50Srmind /*
9586577bb50Srmind * G/C routines.
9596577bb50Srmind */
9606577bb50Srmind
9616577bb50Srmind static void
stage_mem_gc(thmap_t * thmap,uintptr_t addr,size_t len)9626577bb50Srmind stage_mem_gc(thmap_t *thmap, uintptr_t addr, size_t len)
9636577bb50Srmind {
964*ed443730Sriastradh char *const ptr = THMAP_GETPTR(thmap, addr);
9656577bb50Srmind thmap_gc_t *head, *gc;
9666577bb50Srmind
967*ed443730Sriastradh gc = container_of(ptr, struct thmap_gc, data[0]);
968*ed443730Sriastradh KASSERTMSG(gc->len == len,
969*ed443730Sriastradh "thmap=%p ops=%p ptr=%p len=%zu gc=%p gc->len=%zu",
970*ed443730Sriastradh thmap, thmap->ops, (char *)addr, len, gc, gc->len);
9716577bb50Srmind retry:
97282ca45b1Srmind head = atomic_load_relaxed(&thmap->gc_list);
97382ca45b1Srmind gc->next = head; // not yet published
97482ca45b1Srmind
97582ca45b1Srmind /* Release to subsequent acquire in thmap_stage_gc(). */
97682ca45b1Srmind if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->gc_list, &head, gc,
97782ca45b1Srmind memory_order_release, memory_order_relaxed)) {
9786577bb50Srmind goto retry;
9796577bb50Srmind }
9806577bb50Srmind }
9816577bb50Srmind
9826577bb50Srmind void *
thmap_stage_gc(thmap_t * thmap)9836577bb50Srmind thmap_stage_gc(thmap_t *thmap)
9846577bb50Srmind {
98582ca45b1Srmind /* Acquire from prior release in stage_mem_gc(). */
98682ca45b1Srmind return atomic_exchange_explicit(&thmap->gc_list, NULL,
98782ca45b1Srmind memory_order_acquire);
9886577bb50Srmind }
9896577bb50Srmind
9906577bb50Srmind void
thmap_gc(thmap_t * thmap,void * ref)9916577bb50Srmind thmap_gc(thmap_t *thmap, void *ref)
9926577bb50Srmind {
9936577bb50Srmind thmap_gc_t *gc = ref;
9946577bb50Srmind
9956577bb50Srmind while (gc) {
9966577bb50Srmind thmap_gc_t *next = gc->next;
997*ed443730Sriastradh gc_free(thmap, THMAP_GETOFF(thmap, &gc->data[0]), gc->len);
9986577bb50Srmind gc = next;
9996577bb50Srmind }
10006577bb50Srmind }
10016577bb50Srmind
10026577bb50Srmind /*
10036577bb50Srmind * thmap_create: construct a new trie-hash map object.
10046577bb50Srmind */
10056577bb50Srmind thmap_t *
thmap_create(uintptr_t baseptr,const thmap_ops_t * ops,unsigned flags)10066577bb50Srmind thmap_create(uintptr_t baseptr, const thmap_ops_t *ops, unsigned flags)
10076577bb50Srmind {
10086577bb50Srmind thmap_t *thmap;
10096577bb50Srmind uintptr_t root;
10106577bb50Srmind
10116577bb50Srmind /*
10126577bb50Srmind * Setup the map object.
10136577bb50Srmind */
10146577bb50Srmind if (!THMAP_ALIGNED_P(baseptr)) {
10156577bb50Srmind return NULL;
10166577bb50Srmind }
10176577bb50Srmind thmap = kmem_zalloc(sizeof(thmap_t), KM_SLEEP);
10186577bb50Srmind thmap->baseptr = baseptr;
10196577bb50Srmind thmap->ops = ops ? ops : &thmap_default_ops;
10206577bb50Srmind thmap->flags = flags;
10216577bb50Srmind
10226577bb50Srmind if ((thmap->flags & THMAP_SETROOT) == 0) {
10236577bb50Srmind /* Allocate the root level. */
1024*ed443730Sriastradh root = gc_alloc(thmap, THMAP_ROOT_LEN);
1025272918d6Sriastradh if (!root) {
10266577bb50Srmind kmem_free(thmap, sizeof(thmap_t));
10276577bb50Srmind return NULL;
10286577bb50Srmind }
1029272918d6Sriastradh thmap->root = THMAP_GETPTR(thmap, root);
10306577bb50Srmind memset(thmap->root, 0, THMAP_ROOT_LEN);
10316577bb50Srmind }
1032aced4ea1Sriastradh
1033aced4ea1Sriastradh cprng_strong(kern_cprng, thmap->seed, sizeof thmap->seed, 0);
1034aced4ea1Sriastradh
10356577bb50Srmind return thmap;
10366577bb50Srmind }
10376577bb50Srmind
10386577bb50Srmind int
thmap_setroot(thmap_t * thmap,uintptr_t root_off)10396577bb50Srmind thmap_setroot(thmap_t *thmap, uintptr_t root_off)
10406577bb50Srmind {
10416577bb50Srmind if (thmap->root) {
10426577bb50Srmind return -1;
10436577bb50Srmind }
10446577bb50Srmind thmap->root = THMAP_GETPTR(thmap, root_off);
10456577bb50Srmind return 0;
10466577bb50Srmind }
10476577bb50Srmind
10486577bb50Srmind uintptr_t
thmap_getroot(const thmap_t * thmap)10496577bb50Srmind thmap_getroot(const thmap_t *thmap)
10506577bb50Srmind {
10516577bb50Srmind return THMAP_GETOFF(thmap, thmap->root);
10526577bb50Srmind }
10536577bb50Srmind
10546577bb50Srmind void
thmap_destroy(thmap_t * thmap)10556577bb50Srmind thmap_destroy(thmap_t *thmap)
10566577bb50Srmind {
10576577bb50Srmind uintptr_t root = THMAP_GETOFF(thmap, thmap->root);
10586577bb50Srmind void *ref;
10596577bb50Srmind
10606577bb50Srmind ref = thmap_stage_gc(thmap);
10616577bb50Srmind thmap_gc(thmap, ref);
10626577bb50Srmind
10636577bb50Srmind if ((thmap->flags & THMAP_SETROOT) == 0) {
1064*ed443730Sriastradh gc_free(thmap, root, THMAP_ROOT_LEN);
10656577bb50Srmind }
10666577bb50Srmind kmem_free(thmap, sizeof(thmap_t));
10676577bb50Srmind }
1068