1*f7a6843eSchristos /* $NetBSD: qp.c,v 1.4 2025/01/27 15:40:36 christos Exp $ */ 29689912eSchristos 39689912eSchristos /* 49689912eSchristos * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 59689912eSchristos * 69689912eSchristos * SPDX-License-Identifier: MPL-2.0 79689912eSchristos * 89689912eSchristos * This Source Code Form is subject to the terms of the Mozilla Public 99689912eSchristos * License, v. 2.0. If a copy of the MPL was not distributed with this 109689912eSchristos * file, you can obtain one at https://mozilla.org/MPL/2.0/. 119689912eSchristos * 129689912eSchristos * See the COPYRIGHT file distributed with this work for additional 139689912eSchristos * information regarding copyright ownership. 149689912eSchristos */ 159689912eSchristos 169689912eSchristos /* 179689912eSchristos * For an overview, see doc/design/qp-trie.md 189689912eSchristos */ 199689912eSchristos 209689912eSchristos #include <inttypes.h> 219689912eSchristos #include <stdbool.h> 229689912eSchristos #include <stddef.h> 239689912eSchristos #include <stdint.h> 249689912eSchristos #include <string.h> 259689912eSchristos 269689912eSchristos #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 279689912eSchristos #include <sys/mman.h> 289689912eSchristos #include <unistd.h> 299689912eSchristos #endif 309689912eSchristos 319689912eSchristos #include <isc/atomic.h> 329689912eSchristos #include <isc/buffer.h> 339689912eSchristos #include <isc/magic.h> 349689912eSchristos #include <isc/mem.h> 359689912eSchristos #include <isc/mutex.h> 369689912eSchristos #include <isc/refcount.h> 379689912eSchristos #include <isc/result.h> 389689912eSchristos #include <isc/rwlock.h> 399689912eSchristos #include <isc/tid.h> 409689912eSchristos #include <isc/time.h> 419689912eSchristos #include <isc/types.h> 429689912eSchristos #include <isc/urcu.h> 439689912eSchristos #include <isc/util.h> 449689912eSchristos 459689912eSchristos #include <dns/fixedname.h> 469689912eSchristos #include <dns/log.h> 479689912eSchristos #include <dns/name.h> 489689912eSchristos #include <dns/qp.h> 499689912eSchristos #include <dns/types.h> 509689912eSchristos 519689912eSchristos #include "qp_p.h" 529689912eSchristos 539689912eSchristos #ifndef DNS_QP_LOG_STATS 549689912eSchristos #define DNS_QP_LOG_STATS 1 559689912eSchristos #endif 569689912eSchristos #ifndef DNS_QP_TRACE 579689912eSchristos #define DNS_QP_TRACE 0 589689912eSchristos #endif 599689912eSchristos 609689912eSchristos /* 619689912eSchristos * very basic garbage collector statistics 629689912eSchristos * 639689912eSchristos * XXXFANF for now we're logging GC times, but ideally we should 649689912eSchristos * accumulate stats more quietly and report via the statschannel 659689912eSchristos */ 66c2d18a95Schristos #ifdef _LP64 679689912eSchristos static atomic_uint_fast64_t compact_time; 689689912eSchristos static atomic_uint_fast64_t recycle_time; 699689912eSchristos static atomic_uint_fast64_t rollback_time; 70c2d18a95Schristos #define ISC_QP_ADD(v, a) atomic_fetch_add_relaxed(&(v), (a)) 71*f7a6843eSchristos #define ISC_QP_GET(v) atomic_load_relaxed(&(v)) 72c2d18a95Schristos #else 73c2d18a95Schristos static uint64_t compact_time; 74c2d18a95Schristos static uint64_t recycle_time; 75c2d18a95Schristos static uint64_t rollback_time; 76c2d18a95Schristos static isc_mutex_t qp_mutex = PTHREAD_MUTEX_INITIALIZER; 77c2d18a95Schristos #define ISC_QP_ADD(v, a) \ 78c2d18a95Schristos ({ \ 79c2d18a95Schristos isc_mutex_lock(&qp_mutex); \ 80c2d18a95Schristos uint64_t x = (v) + (a); \ 81c2d18a95Schristos isc_mutex_unlock(&qp_mutex); \ 82c2d18a95Schristos x; \ 83c2d18a95Schristos }) 84c2d18a95Schristos #define ISC_QP_GET(v) \ 85c2d18a95Schristos ({ \ 86c2d18a95Schristos isc_mutex_lock(&qp_mutex); \ 87c2d18a95Schristos uint64_t x = (v); \ 88c2d18a95Schristos isc_mutex_unlock(&qp_mutex); \ 89c2d18a95Schristos x; \ 90c2d18a95Schristos }) 91c2d18a95Schristos #endif 92c2d18a95Schristos 939689912eSchristos 949689912eSchristos /* for LOG_STATS() format strings */ 959689912eSchristos #define PRItime " %" PRIu64 " ns " 969689912eSchristos 979689912eSchristos #if DNS_QP_LOG_STATS 989689912eSchristos #define LOG_STATS(...) \ 999689912eSchristos isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \ 1009689912eSchristos ISC_LOG_DEBUG(1), __VA_ARGS__) 1019689912eSchristos #else 1029689912eSchristos #define LOG_STATS(...) 1039689912eSchristos #endif 1049689912eSchristos 1059689912eSchristos #if DNS_QP_TRACE 1069689912eSchristos /* 1079689912eSchristos * TRACE is generally used in allocation-related functions so it doesn't 1089689912eSchristos * trace very high-frequency ops 1099689912eSchristos */ 1109689912eSchristos #define TRACE(fmt, ...) \ 1119689912eSchristos do { \ 1129689912eSchristos if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(7))) { \ 1139689912eSchristos isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, \ 1149689912eSchristos DNS_LOGMODULE_QP, ISC_LOG_DEBUG(7), \ 1159689912eSchristos "%s:%d:%s(qp %p uctx \"%s\"):t%u: " fmt, \ 1169689912eSchristos __FILE__, __LINE__, __func__, qp, \ 1179689912eSchristos qp ? TRIENAME(qp) : "(null)", isc_tid(), \ 1189689912eSchristos ##__VA_ARGS__); \ 1199689912eSchristos } \ 1209689912eSchristos } while (0) 1219689912eSchristos #else 1229689912eSchristos #define TRACE(...) 1239689912eSchristos #endif 1249689912eSchristos 1259689912eSchristos /*********************************************************************** 1269689912eSchristos * 1279689912eSchristos * converting DNS names to trie keys 1289689912eSchristos */ 1299689912eSchristos 1309689912eSchristos /* 1319689912eSchristos * Number of distinct byte values, i.e. 256 1329689912eSchristos */ 1339689912eSchristos #define BYTE_VALUES (UINT8_MAX + 1) 1349689912eSchristos 1359689912eSchristos /* 1369689912eSchristos * Lookup table mapping bytes in DNS names to bit positions, used 1379689912eSchristos * by dns_qpkey_fromname() to convert DNS names to qp-trie keys. 1389689912eSchristos * 1399689912eSchristos * Each element holds one or two bit positions, bit_one in the 1409689912eSchristos * lower half and bit_two in the upper half. 1419689912eSchristos * 1429689912eSchristos * For common hostname characters, bit_two is zero (which cannot 1439689912eSchristos * be a valid bit position). 1449689912eSchristos * 1459689912eSchristos * For others, bit_one is the escape bit, and bit_two is the 1469689912eSchristos * position of the character within the escaped range. 1479689912eSchristos */ 1489689912eSchristos uint16_t dns_qp_bits_for_byte[BYTE_VALUES] = { 0 }; 1499689912eSchristos 1509689912eSchristos /* 1519689912eSchristos * And the reverse, mapping bit positions to characters, so the tests 1529689912eSchristos * can print diagnostics involving qp-trie keys. 1539689912eSchristos * 1549689912eSchristos * This table only handles the first bit in an escape sequence; we 1559689912eSchristos * arrange that we can calculate the byte value for both bits by 1569689912eSchristos * adding the the second bit to the first bit's byte value. 1579689912eSchristos */ 1589689912eSchristos uint8_t dns_qp_byte_for_bit[SHIFT_OFFSET] = { 0 }; 1599689912eSchristos 1609689912eSchristos /* 1619689912eSchristos * Fill in the lookup tables at program startup. (It doesn't matter 1629689912eSchristos * when this is initialized relative to other startup code.) 1639689912eSchristos */ 1649689912eSchristos static void 1659689912eSchristos initialize_bits_for_byte(void) ISC_CONSTRUCTOR; 1669689912eSchristos 1679689912eSchristos /* 1689689912eSchristos * The bit positions for bytes inside labels have to be between 1699689912eSchristos * SHIFT_BITMAP and SHIFT_OFFSET. (SHIFT_NOBYTE separates labels.) 1709689912eSchristos * 1719689912eSchristos * Each byte range in between common hostname characters has a different 1729689912eSchristos * escape character, to preserve the correct lexical order. 1739689912eSchristos * 1749689912eSchristos * Escaped byte ranges mostly fit into the space available in the 1759689912eSchristos * bitmap, except for those above 'z' (which is mostly bytes with the 1769689912eSchristos * top bit set). So, when we reach the end of the bitmap we roll over 1779689912eSchristos * to the next escape character. 1789689912eSchristos * 1799689912eSchristos * After filling the table we ensure that the bit positions for 1809689912eSchristos * hostname characters and escape characters all fit. 1819689912eSchristos */ 1829689912eSchristos static void 1839689912eSchristos initialize_bits_for_byte(void) { 1849689912eSchristos /* zero common character marker not a valid shift position */ 1859689912eSchristos INSIST(0 < SHIFT_BITMAP); 1869689912eSchristos /* first bit is common byte or escape byte */ 1879689912eSchristos dns_qpshift_t bit_one = SHIFT_BITMAP; 1889689912eSchristos /* second bit is position in escaped range */ 1899689912eSchristos dns_qpshift_t bit_two = SHIFT_BITMAP; 1909689912eSchristos bool escaping = true; 1919689912eSchristos 1929689912eSchristos for (unsigned int byte = 0; byte < BYTE_VALUES; byte++) { 1939689912eSchristos if (qp_common_character(byte)) { 1949689912eSchristos escaping = false; 1959689912eSchristos bit_one++; 1969689912eSchristos dns_qp_byte_for_bit[bit_one] = byte; 1979689912eSchristos dns_qp_bits_for_byte[byte] = bit_one; 1989689912eSchristos } else if ('A' <= byte && byte <= 'Z') { 1999689912eSchristos /* map upper case to lower case */ 2009689912eSchristos dns_qpshift_t after_esc = bit_one + 1; 2019689912eSchristos dns_qpshift_t skip_punct = 'a' - '_'; 2029689912eSchristos dns_qpshift_t letter = byte - 'A'; 2039689912eSchristos dns_qpshift_t bit = after_esc + skip_punct + letter; 2049689912eSchristos dns_qp_bits_for_byte[byte] = bit; 2059689912eSchristos /* to simplify reverse conversion */ 2069689912eSchristos bit_two++; 2079689912eSchristos } else { 2089689912eSchristos /* non-hostname characters need to be escaped */ 2099689912eSchristos if (!escaping || bit_two >= SHIFT_OFFSET) { 2109689912eSchristos escaping = true; 2119689912eSchristos bit_one++; 2129689912eSchristos dns_qp_byte_for_bit[bit_one] = byte; 2139689912eSchristos bit_two = SHIFT_BITMAP; 2149689912eSchristos } 2159689912eSchristos dns_qp_bits_for_byte[byte] = bit_two << 8 | bit_one; 2169689912eSchristos bit_two++; 2179689912eSchristos } 2189689912eSchristos } 2199689912eSchristos ENSURE(bit_one < SHIFT_OFFSET); 2209689912eSchristos } 2219689912eSchristos 2229689912eSchristos /* 2239689912eSchristos * Convert a DNS name into a trie lookup key. 2249689912eSchristos * 2259689912eSchristos * Returns the length of the key. 2269689912eSchristos * 2279689912eSchristos * For performance we get our hands dirty in the guts of the name. 2289689912eSchristos * 2299689912eSchristos * We don't worry about the distinction between absolute and relative 2309689912eSchristos * names. When the trie is only used with absolute names, the first byte 2319689912eSchristos * of the key will always be SHIFT_NOBYTE and it will always be skipped 2329689912eSchristos * when traversing the trie. So keeping the root label costs little, and 2339689912eSchristos * it allows us to support tries of relative names too. In fact absolute 2349689912eSchristos * and relative names can be mixed in the same trie without causing 2359689912eSchristos * confusion, because the presence or absence of the initial 2369689912eSchristos * SHIFT_NOBYTE in the key disambiguates them (exactly like a trailing 2379689912eSchristos * dot in a zone file). 2389689912eSchristos */ 2399689912eSchristos size_t 2409689912eSchristos dns_qpkey_fromname(dns_qpkey_t key, const dns_name_t *name) { 2419689912eSchristos size_t len, label; 2429689912eSchristos dns_fixedname_t fixed; 2439689912eSchristos 2449689912eSchristos REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC)); 2459689912eSchristos 2469689912eSchristos if (name->labels == 0) { 2479689912eSchristos key[0] = SHIFT_NOBYTE; 2489689912eSchristos return 0; 2499689912eSchristos } 2509689912eSchristos 2519689912eSchristos if (name->offsets == NULL) { 2529689912eSchristos dns_name_t *clone = dns_fixedname_initname(&fixed); 2539689912eSchristos dns_name_clone(name, clone); 2549689912eSchristos name = clone; 2559689912eSchristos } 2569689912eSchristos 2579689912eSchristos len = 0; 2589689912eSchristos label = name->labels; 2599689912eSchristos while (label-- > 0) { 2609689912eSchristos const uint8_t *ldata = name->ndata + name->offsets[label]; 2619689912eSchristos size_t label_len = *ldata++; 2629689912eSchristos while (label_len-- > 0) { 2639689912eSchristos uint16_t bits = dns_qp_bits_for_byte[*ldata++]; 2649689912eSchristos key[len++] = bits & 0xFF; /* bit_one */ 2659689912eSchristos if ((bits >> 8) != 0) { /* escape? */ 2669689912eSchristos key[len++] = bits >> 8; /* bit_two */ 2679689912eSchristos } 2689689912eSchristos } 2699689912eSchristos /* label terminator */ 2709689912eSchristos key[len++] = SHIFT_NOBYTE; 2719689912eSchristos } 2729689912eSchristos /* mark end with a double NOBYTE */ 2739689912eSchristos key[len] = SHIFT_NOBYTE; 2749689912eSchristos ENSURE(len < sizeof(dns_qpkey_t)); 2759689912eSchristos return len; 2769689912eSchristos } 2779689912eSchristos 2789689912eSchristos void 2799689912eSchristos dns_qpkey_toname(const dns_qpkey_t key, size_t keylen, dns_name_t *name) { 2809689912eSchristos size_t locs[DNS_NAME_MAXLABELS]; 2819689912eSchristos size_t loc = 0, opos = 0; 2829689912eSchristos size_t offset; 2839689912eSchristos 2849689912eSchristos REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC)); 2859689912eSchristos REQUIRE(name->buffer != NULL); 2869689912eSchristos REQUIRE(name->offsets != NULL); 2879689912eSchristos 2889689912eSchristos dns_name_reset(name); 2899689912eSchristos 2909689912eSchristos if (keylen == 0) { 2919689912eSchristos return; 2929689912eSchristos } 2939689912eSchristos 2949689912eSchristos /* Scan the key looking for label boundaries */ 2959689912eSchristos for (offset = 0; offset <= keylen; offset++) { 2969689912eSchristos INSIST(key[offset] >= SHIFT_NOBYTE && 2979689912eSchristos key[offset] < SHIFT_OFFSET); 2989689912eSchristos INSIST(loc < DNS_NAME_MAXLABELS); 2999689912eSchristos if (qpkey_bit(key, keylen, offset) == SHIFT_NOBYTE) { 3009689912eSchristos if (qpkey_bit(key, keylen, offset + 1) == SHIFT_NOBYTE) 3019689912eSchristos { 3029689912eSchristos locs[loc] = offset + 1; 3039689912eSchristos goto scanned; 3049689912eSchristos } 3059689912eSchristos locs[loc++] = offset + 1; 3069689912eSchristos } else if (offset == 0) { 3079689912eSchristos /* This happens for a relative name */ 3089689912eSchristos locs[loc++] = offset; 3099689912eSchristos } 3109689912eSchristos } 3119689912eSchristos UNREACHABLE(); 3129689912eSchristos scanned: 3139689912eSchristos 3149689912eSchristos /* 3159689912eSchristos * In the key the labels are encoded in reverse order, so 3169689912eSchristos * we step backward through the label boundaries, then forward 3179689912eSchristos * through the labels, to create the DNS wire format data. 3189689912eSchristos */ 3199689912eSchristos name->labels = loc; 3209689912eSchristos while (loc-- > 0) { 3219689912eSchristos uint8_t len = 0, *lenp = NULL; 3229689912eSchristos 3239689912eSchristos /* Add a length byte to the name data and set an offset */ 3249689912eSchristos lenp = isc_buffer_used(name->buffer); 3259689912eSchristos isc_buffer_putuint8(name->buffer, 0); 3269689912eSchristos name->offsets[opos++] = name->length++; 3279689912eSchristos 3289689912eSchristos /* Convert from escaped byte ranges to ASCII */ 3299689912eSchristos for (offset = locs[loc]; offset < locs[loc + 1] - 1; offset++) { 3309689912eSchristos uint8_t bit = qpkey_bit(key, keylen, offset); 3319689912eSchristos uint8_t byte = dns_qp_byte_for_bit[bit]; 3329689912eSchristos if (qp_common_character(byte)) { 3339689912eSchristos isc_buffer_putuint8(name->buffer, byte); 3349689912eSchristos } else { 3359689912eSchristos byte += key[++offset] - SHIFT_BITMAP; 3369689912eSchristos isc_buffer_putuint8(name->buffer, byte); 3379689912eSchristos } 3389689912eSchristos len++; 3399689912eSchristos } 3409689912eSchristos 3419689912eSchristos name->length += len; 3429689912eSchristos *lenp = len; 3439689912eSchristos } 3449689912eSchristos 3459689912eSchristos /* Add a root label for absolute names */ 3469689912eSchristos if (key[0] == SHIFT_NOBYTE) { 3479689912eSchristos name->attributes.absolute = true; 3489689912eSchristos isc_buffer_putuint8(name->buffer, 0); 3499689912eSchristos name->offsets[opos++] = name->length++; 3509689912eSchristos name->labels++; 3519689912eSchristos } 3529689912eSchristos 3539689912eSchristos name->ndata = isc_buffer_base(name->buffer); 3549689912eSchristos } 3559689912eSchristos 3569689912eSchristos /* 3579689912eSchristos * Sentinel value for equal keys 3589689912eSchristos */ 3599689912eSchristos #define QPKEY_EQUAL (~(size_t)0) 3609689912eSchristos 3619689912eSchristos /* 3629689912eSchristos * Compare two keys and return the offset where they differ. 3639689912eSchristos * 3649689912eSchristos * This offset is used to work out where a trie search diverged: when one 3659689912eSchristos * of the keys is in the trie and one is not, the common prefix (up to the 3669689912eSchristos * offset) is the part of the unknown key that exists in the trie. This 3679689912eSchristos * matters for adding new keys or finding neighbours of missing keys. 3689689912eSchristos * 3699689912eSchristos * When the keys are different lengths it is possible (but unwise) for 3709689912eSchristos * the longer key to be the same as the shorter key but with superfluous 3719689912eSchristos * trailing SHIFT_NOBYTE elements. This makes the keys equal for the 3729689912eSchristos * purpose of traversing the trie. 3739689912eSchristos */ 3749689912eSchristos static size_t 3759689912eSchristos qpkey_compare(const dns_qpkey_t key_a, const size_t keylen_a, 3769689912eSchristos const dns_qpkey_t key_b, const size_t keylen_b) { 3779689912eSchristos size_t keylen = ISC_MAX(keylen_a, keylen_b); 3789689912eSchristos for (size_t offset = 0; offset < keylen; offset++) { 3799689912eSchristos if (qpkey_bit(key_a, keylen_a, offset) != 3809689912eSchristos qpkey_bit(key_b, keylen_b, offset)) 3819689912eSchristos { 3829689912eSchristos return offset; 3839689912eSchristos } 3849689912eSchristos } 3859689912eSchristos return QPKEY_EQUAL; 3869689912eSchristos } 3879689912eSchristos 3889689912eSchristos /*********************************************************************** 3899689912eSchristos * 3909689912eSchristos * allocator wrappers 3919689912eSchristos */ 3929689912eSchristos 3939689912eSchristos #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 3949689912eSchristos 3959689912eSchristos /* 3969689912eSchristos * Optionally (for debugging) during a copy-on-write transaction, use 3979689912eSchristos * memory protection to ensure that the shared chunks are not modified. 3989689912eSchristos * Once a chunk becomes shared, it remains read-only until it is freed. 3999689912eSchristos * POSIX says we have to use mmap() to get an allocation that we can 4009689912eSchristos * definitely pass to mprotect(). 4019689912eSchristos */ 4029689912eSchristos 4039689912eSchristos static size_t 4049689912eSchristos chunk_size_raw(void) { 4059689912eSchristos size_t size = (size_t)sysconf(_SC_PAGE_SIZE); 4069689912eSchristos return ISC_MAX(size, QP_CHUNK_BYTES); 4079689912eSchristos } 4089689912eSchristos 4099689912eSchristos static void * 4109689912eSchristos chunk_get_raw(dns_qp_t *qp) { 4119689912eSchristos if (qp->write_protect) { 4129689912eSchristos size_t size = chunk_size_raw(); 4139689912eSchristos void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, 4149689912eSchristos MAP_ANON | MAP_PRIVATE, -1, 0); 4159689912eSchristos RUNTIME_CHECK(ptr != MAP_FAILED); 4169689912eSchristos return ptr; 4179689912eSchristos } else { 4189689912eSchristos return isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES); 4199689912eSchristos } 4209689912eSchristos } 4219689912eSchristos 4229689912eSchristos static void 4239689912eSchristos chunk_free_raw(dns_qp_t *qp, void *ptr) { 4249689912eSchristos if (qp->write_protect) { 4259689912eSchristos RUNTIME_CHECK(munmap(ptr, chunk_size_raw()) == 0); 4269689912eSchristos } else { 4279689912eSchristos isc_mem_free(qp->mctx, ptr); 4289689912eSchristos } 4299689912eSchristos } 4309689912eSchristos 4319689912eSchristos static void * 4329689912eSchristos chunk_shrink_raw(dns_qp_t *qp, void *ptr, size_t bytes) { 4339689912eSchristos if (qp->write_protect) { 4349689912eSchristos return ptr; 4359689912eSchristos } else { 4369689912eSchristos return isc_mem_reallocate(qp->mctx, ptr, bytes); 4379689912eSchristos } 4389689912eSchristos } 4399689912eSchristos 4409689912eSchristos static void 4419689912eSchristos write_protect(dns_qp_t *qp, dns_qpchunk_t chunk) { 4429689912eSchristos if (qp->write_protect) { 4439689912eSchristos /* see transaction_open() wrt this special case */ 4449689912eSchristos if (qp->transaction_mode == QP_WRITE && chunk == qp->bump) { 4459689912eSchristos return; 4469689912eSchristos } 4479689912eSchristos TRACE("chunk %u", chunk); 4489689912eSchristos void *ptr = qp->base->ptr[chunk]; 4499689912eSchristos size_t size = chunk_size_raw(); 4509689912eSchristos RUNTIME_CHECK(mprotect(ptr, size, PROT_READ) >= 0); 4519689912eSchristos } 4529689912eSchristos } 4539689912eSchristos 4549689912eSchristos #else 4559689912eSchristos 4569689912eSchristos #define chunk_get_raw(qp) isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES) 4579689912eSchristos #define chunk_free_raw(qp, ptr) isc_mem_free(qp->mctx, ptr) 4589689912eSchristos 4599689912eSchristos #define chunk_shrink_raw(qp, ptr, size) isc_mem_reallocate(qp->mctx, ptr, size) 4609689912eSchristos 4619689912eSchristos #define write_protect(qp, chunk) 4629689912eSchristos 4639689912eSchristos #endif 4649689912eSchristos 4659689912eSchristos /*********************************************************************** 4669689912eSchristos * 4679689912eSchristos * allocator 4689689912eSchristos */ 4699689912eSchristos 4709689912eSchristos /* 4719689912eSchristos * When we reuse the bump chunk across multiple write transactions, 4729689912eSchristos * it can have an immutable prefix and a mutable suffix. 4739689912eSchristos */ 4749689912eSchristos static inline bool 4759689912eSchristos cells_immutable(dns_qp_t *qp, dns_qpref_t ref) { 4769689912eSchristos dns_qpchunk_t chunk = ref_chunk(ref); 4779689912eSchristos dns_qpcell_t cell = ref_cell(ref); 4789689912eSchristos if (chunk == qp->bump) { 4799689912eSchristos return cell < qp->fender; 4809689912eSchristos } else { 4819689912eSchristos return qp->usage[chunk].immutable; 4829689912eSchristos } 4839689912eSchristos } 4849689912eSchristos 4859689912eSchristos /* 4869689912eSchristos * Create a fresh bump chunk and allocate some twigs from it. 4879689912eSchristos */ 4889689912eSchristos static dns_qpref_t 4899689912eSchristos chunk_alloc(dns_qp_t *qp, dns_qpchunk_t chunk, dns_qpweight_t size) { 4909689912eSchristos INSIST(qp->base->ptr[chunk] == NULL); 4919689912eSchristos INSIST(qp->usage[chunk].used == 0); 4929689912eSchristos INSIST(qp->usage[chunk].free == 0); 4939689912eSchristos 4949689912eSchristos qp->base->ptr[chunk] = chunk_get_raw(qp); 4959689912eSchristos qp->usage[chunk] = (qp_usage_t){ .exists = true, .used = size }; 4969689912eSchristos qp->used_count += size; 4979689912eSchristos qp->bump = chunk; 4989689912eSchristos qp->fender = 0; 4999689912eSchristos 5009689912eSchristos if (qp->write_protect) { 5019689912eSchristos TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]); 5029689912eSchristos } 5039689912eSchristos return make_ref(chunk, 0); 5049689912eSchristos } 5059689912eSchristos 5069689912eSchristos /* 5079689912eSchristos * This is used to grow the chunk arrays when they fill up. If the old 5089689912eSchristos * base array is in use by readers, we must make a clone, otherwise we 5099689912eSchristos * can reallocate in place. 5109689912eSchristos * 5119689912eSchristos * The isc_refcount_init() and qpbase_unref() in this function are a pair. 5129689912eSchristos */ 5139689912eSchristos static void 5149689912eSchristos realloc_chunk_arrays(dns_qp_t *qp, dns_qpchunk_t newmax) { 5159689912eSchristos size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max; 5169689912eSchristos size_t newptrs = sizeof(qp->base->ptr[0]) * newmax; 5179689912eSchristos size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax); 5189689912eSchristos 5199689912eSchristos if (qp->base == NULL || qpbase_unref(qp)) { 5209689912eSchristos qp->base = isc_mem_reallocate(qp->mctx, qp->base, size); 5219689912eSchristos } else { 5229689912eSchristos dns_qpbase_t *oldbase = qp->base; 5239689912eSchristos qp->base = isc_mem_allocate(qp->mctx, size); 5249689912eSchristos memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs); 5259689912eSchristos } 5269689912eSchristos memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs); 5279689912eSchristos isc_refcount_init(&qp->base->refcount, 1); 5289689912eSchristos qp->base->magic = QPBASE_MAGIC; 5299689912eSchristos 5309689912eSchristos /* usage array is exclusive to the writer */ 5319689912eSchristos size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max; 5329689912eSchristos size_t newusage = sizeof(qp->usage[0]) * newmax; 5339689912eSchristos qp->usage = isc_mem_reallocate(qp->mctx, qp->usage, newusage); 5349689912eSchristos memset(&qp->usage[qp->chunk_max], 0, newusage - oldusage); 5359689912eSchristos 5369689912eSchristos qp->chunk_max = newmax; 5379689912eSchristos 5389689912eSchristos TRACE("qpbase %p usage %p max %u", qp->base, qp->usage, qp->chunk_max); 5399689912eSchristos } 5409689912eSchristos 5419689912eSchristos /* 5429689912eSchristos * There was no space in the bump chunk, so find a place to put a fresh 5439689912eSchristos * chunk in the chunk arrays, then allocate some twigs from it. 5449689912eSchristos */ 5459689912eSchristos static dns_qpref_t 5469689912eSchristos alloc_slow(dns_qp_t *qp, dns_qpweight_t size) { 5479689912eSchristos dns_qpchunk_t chunk; 5489689912eSchristos 5499689912eSchristos for (chunk = 0; chunk < qp->chunk_max; chunk++) { 5509689912eSchristos if (!qp->usage[chunk].exists) { 5519689912eSchristos return chunk_alloc(qp, chunk, size); 5529689912eSchristos } 5539689912eSchristos } 5549689912eSchristos ENSURE(chunk == qp->chunk_max); 5559689912eSchristos realloc_chunk_arrays(qp, GROWTH_FACTOR(chunk)); 5569689912eSchristos return chunk_alloc(qp, chunk, size); 5579689912eSchristos } 5589689912eSchristos 5599689912eSchristos /* 5609689912eSchristos * Ensure we are using a fresh bump chunk. 5619689912eSchristos */ 5629689912eSchristos static void 5639689912eSchristos alloc_reset(dns_qp_t *qp) { 5649689912eSchristos (void)alloc_slow(qp, 0); 5659689912eSchristos } 5669689912eSchristos 5679689912eSchristos /* 5689689912eSchristos * Allocate some fresh twigs. This is the bump allocator fast path. 5699689912eSchristos */ 5709689912eSchristos static inline dns_qpref_t 5719689912eSchristos alloc_twigs(dns_qp_t *qp, dns_qpweight_t size) { 5729689912eSchristos dns_qpchunk_t chunk = qp->bump; 5739689912eSchristos dns_qpcell_t cell = qp->usage[chunk].used; 5749689912eSchristos 5759689912eSchristos if (cell + size <= QP_CHUNK_SIZE) { 5769689912eSchristos qp->usage[chunk].used += size; 5779689912eSchristos qp->used_count += size; 5789689912eSchristos return make_ref(chunk, cell); 5799689912eSchristos } else { 5809689912eSchristos return alloc_slow(qp, size); 5819689912eSchristos } 5829689912eSchristos } 5839689912eSchristos 5849689912eSchristos /* 5859689912eSchristos * Record that some twigs are no longer being used, and if possible 5869689912eSchristos * zero them to ensure that there isn't a spurious double detach when 5879689912eSchristos * the chunk is later recycled. 5889689912eSchristos * 5899689912eSchristos * Returns true if the twigs were immediately destroyed. 5909689912eSchristos * 5919689912eSchristos * NOTE: the caller is responsible for attaching or detaching any 5929689912eSchristos * leaves as required. 5939689912eSchristos */ 5949689912eSchristos static inline bool 5959689912eSchristos free_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) { 5969689912eSchristos dns_qpchunk_t chunk = ref_chunk(twigs); 5979689912eSchristos 5989689912eSchristos qp->free_count += size; 5999689912eSchristos qp->usage[chunk].free += size; 6009689912eSchristos ENSURE(qp->free_count <= qp->used_count); 6019689912eSchristos ENSURE(qp->usage[chunk].free <= qp->usage[chunk].used); 6029689912eSchristos 6039689912eSchristos if (cells_immutable(qp, twigs)) { 6049689912eSchristos qp->hold_count += size; 6059689912eSchristos ENSURE(qp->free_count >= qp->hold_count); 6069689912eSchristos return false; 6079689912eSchristos } else { 6089689912eSchristos zero_twigs(ref_ptr(qp, twigs), size); 6099689912eSchristos return true; 6109689912eSchristos } 6119689912eSchristos } 6129689912eSchristos 6139689912eSchristos /* 6149689912eSchristos * When some twigs have been copied, and free_twigs() could not 6159689912eSchristos * immediately destroy the old copy, we need to update the refcount 6169689912eSchristos * on any leaves that were duplicated. 6179689912eSchristos */ 6189689912eSchristos static void 6199689912eSchristos attach_twigs(dns_qp_t *qp, dns_qpnode_t *twigs, dns_qpweight_t size) { 6209689912eSchristos for (dns_qpweight_t pos = 0; pos < size; pos++) { 6219689912eSchristos if (node_tag(&twigs[pos]) == LEAF_TAG) { 6229689912eSchristos attach_leaf(qp, &twigs[pos]); 6239689912eSchristos } 6249689912eSchristos } 6259689912eSchristos } 6269689912eSchristos 6279689912eSchristos /*********************************************************************** 6289689912eSchristos * 6299689912eSchristos * chunk reclamation 6309689912eSchristos */ 6319689912eSchristos 6329689912eSchristos /* 6339689912eSchristos * Is any of this chunk still in use? 6349689912eSchristos */ 6359689912eSchristos static inline dns_qpcell_t 6369689912eSchristos chunk_usage(dns_qp_t *qp, dns_qpchunk_t chunk) { 6379689912eSchristos return qp->usage[chunk].used - qp->usage[chunk].free; 6389689912eSchristos } 6399689912eSchristos 6409689912eSchristos /* 6419689912eSchristos * We remove each empty chunk from the total counts when the chunk is 6429689912eSchristos * freed, or when it is scheduled for safe memory reclamation. We check 6439689912eSchristos * the chunk's phase to avoid discounting it twice in the latter case. 6449689912eSchristos */ 6459689912eSchristos static void 6469689912eSchristos chunk_discount(dns_qp_t *qp, dns_qpchunk_t chunk) { 6479689912eSchristos if (qp->usage[chunk].discounted) { 6489689912eSchristos return; 6499689912eSchristos } 6509689912eSchristos INSIST(qp->used_count >= qp->usage[chunk].used); 6519689912eSchristos INSIST(qp->free_count >= qp->usage[chunk].free); 6529689912eSchristos qp->used_count -= qp->usage[chunk].used; 6539689912eSchristos qp->free_count -= qp->usage[chunk].free; 6549689912eSchristos qp->usage[chunk].discounted = true; 6559689912eSchristos } 6569689912eSchristos 6579689912eSchristos /* 6589689912eSchristos * When a chunk is being recycled, we need to detach any leaves that 6599689912eSchristos * remain, and free any `base` arrays that have been marked as unused. 6609689912eSchristos */ 6619689912eSchristos static void 6629689912eSchristos chunk_free(dns_qp_t *qp, dns_qpchunk_t chunk) { 6639689912eSchristos if (qp->write_protect) { 6649689912eSchristos TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]); 6659689912eSchristos } 6669689912eSchristos 6679689912eSchristos dns_qpnode_t *n = qp->base->ptr[chunk]; 6689689912eSchristos for (dns_qpcell_t count = qp->usage[chunk].used; count > 0; 6699689912eSchristos count--, n++) 6709689912eSchristos { 6719689912eSchristos if (node_tag(n) == LEAF_TAG && node_pointer(n) != NULL) { 6729689912eSchristos detach_leaf(qp, n); 6739689912eSchristos } else if (count > 1 && reader_valid(n)) { 6749689912eSchristos dns_qpreader_t qpr; 6759689912eSchristos unpack_reader(&qpr, n); 6769689912eSchristos /* pairs with dns_qpmulti_commit() */ 6779689912eSchristos if (qpbase_unref(&qpr)) { 6789689912eSchristos isc_mem_free(qp->mctx, qpr.base); 6799689912eSchristos } 6809689912eSchristos } 6819689912eSchristos } 6829689912eSchristos chunk_discount(qp, chunk); 6839689912eSchristos chunk_free_raw(qp, qp->base->ptr[chunk]); 6849689912eSchristos qp->base->ptr[chunk] = NULL; 6859689912eSchristos qp->usage[chunk] = (qp_usage_t){}; 6869689912eSchristos } 6879689912eSchristos 6889689912eSchristos /* 6899689912eSchristos * Free any chunks that we can while a trie is in use. 6909689912eSchristos */ 6919689912eSchristos static void 6929689912eSchristos recycle(dns_qp_t *qp) { 6939689912eSchristos unsigned int free = 0; 6949689912eSchristos 6959689912eSchristos isc_nanosecs_t start = isc_time_monotonic(); 6969689912eSchristos 6979689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 6989689912eSchristos if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && 6999689912eSchristos qp->usage[chunk].exists && !qp->usage[chunk].immutable) 7009689912eSchristos { 7019689912eSchristos chunk_free(qp, chunk); 7029689912eSchristos free++; 7039689912eSchristos } 7049689912eSchristos } 7059689912eSchristos 7069689912eSchristos isc_nanosecs_t time = isc_time_monotonic() - start; 707c2d18a95Schristos ISC_QP_ADD(recycle_time, time); 7089689912eSchristos 7099689912eSchristos if (free > 0) { 7109689912eSchristos LOG_STATS("qp recycle" PRItime "free %u chunks", time, free); 7119689912eSchristos LOG_STATS("qp recycle leaf %u live %u used %u free %u hold %u", 7129689912eSchristos qp->leaf_count, qp->used_count - qp->free_count, 7139689912eSchristos qp->used_count, qp->free_count, qp->hold_count); 7149689912eSchristos } 7159689912eSchristos } 7169689912eSchristos 7179689912eSchristos /* 7189689912eSchristos * asynchronous cleanup 7199689912eSchristos */ 7209689912eSchristos static void 7219689912eSchristos reclaim_chunks_cb(struct rcu_head *arg) { 7229689912eSchristos qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head); 7239689912eSchristos REQUIRE(QPRCU_VALID(rcuctx)); 7249689912eSchristos dns_qpmulti_t *multi = rcuctx->multi; 7259689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 7269689912eSchristos 7279689912eSchristos LOCK(&multi->mutex); 7289689912eSchristos 7299689912eSchristos dns_qp_t *qp = &multi->writer; 7309689912eSchristos REQUIRE(QP_VALID(qp)); 7319689912eSchristos 7329689912eSchristos unsigned int free = 0; 7339689912eSchristos isc_nanosecs_t start = isc_time_monotonic(); 7349689912eSchristos 7359689912eSchristos for (unsigned int i = 0; i < rcuctx->count; i++) { 7369689912eSchristos dns_qpchunk_t chunk = rcuctx->chunk[i]; 7379689912eSchristos if (qp->usage[chunk].snapshot) { 7389689912eSchristos /* cleanup when snapshot is destroyed */ 7399689912eSchristos qp->usage[chunk].snapfree = true; 7409689912eSchristos } else { 7419689912eSchristos chunk_free(qp, chunk); 7429689912eSchristos free++; 7439689912eSchristos } 7449689912eSchristos } 7459689912eSchristos 7469689912eSchristos isc_mem_putanddetach(&rcuctx->mctx, rcuctx, 7479689912eSchristos STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); 7489689912eSchristos 7499689912eSchristos isc_nanosecs_t time = isc_time_monotonic() - start; 750c2d18a95Schristos ISC_QP_ADD(recycle_time, time); 7519689912eSchristos 7529689912eSchristos if (free > 0) { 7539689912eSchristos LOG_STATS("qp reclaim" PRItime "free %u chunks", time, free); 7549689912eSchristos LOG_STATS("qp reclaim leaf %u live %u used %u free %u hold %u", 7559689912eSchristos qp->leaf_count, qp->used_count - qp->free_count, 7569689912eSchristos qp->used_count, qp->free_count, qp->hold_count); 7579689912eSchristos } 7589689912eSchristos 7599689912eSchristos UNLOCK(&multi->mutex); 7609689912eSchristos } 7619689912eSchristos 7629689912eSchristos /* 7639689912eSchristos * At the end of a transaction, schedule empty but immutable chunks 7649689912eSchristos * for reclamation later. 7659689912eSchristos */ 7669689912eSchristos static void 7679689912eSchristos reclaim_chunks(dns_qpmulti_t *multi) { 7689689912eSchristos dns_qp_t *qp = &multi->writer; 7699689912eSchristos 7709689912eSchristos unsigned int count = 0; 7719689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 7729689912eSchristos if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && 7739689912eSchristos qp->usage[chunk].exists && qp->usage[chunk].immutable && 7749689912eSchristos !qp->usage[chunk].discounted) 7759689912eSchristos { 7769689912eSchristos count++; 7779689912eSchristos } 7789689912eSchristos } 7799689912eSchristos 7809689912eSchristos if (count == 0) { 7819689912eSchristos return; 7829689912eSchristos } 7839689912eSchristos 7849689912eSchristos qp_rcuctx_t *rcuctx = 7859689912eSchristos isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count)); 7869689912eSchristos *rcuctx = (qp_rcuctx_t){ 7879689912eSchristos .magic = QPRCU_MAGIC, 7889689912eSchristos .multi = multi, 7899689912eSchristos .count = count, 7909689912eSchristos }; 7919689912eSchristos isc_mem_attach(qp->mctx, &rcuctx->mctx); 7929689912eSchristos 7939689912eSchristos unsigned int i = 0; 7949689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 7959689912eSchristos if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && 7969689912eSchristos qp->usage[chunk].exists && qp->usage[chunk].immutable && 7979689912eSchristos !qp->usage[chunk].discounted) 7989689912eSchristos { 7999689912eSchristos rcuctx->chunk[i++] = chunk; 8009689912eSchristos chunk_discount(qp, chunk); 8019689912eSchristos } 8029689912eSchristos } 8039689912eSchristos 8049689912eSchristos call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb); 8059689912eSchristos 8069689912eSchristos LOG_STATS("qp will reclaim %u chunks", count); 8079689912eSchristos } 8089689912eSchristos 8099689912eSchristos /* 8109689912eSchristos * When a snapshot is destroyed, clean up chunks that need free()ing 8119689912eSchristos * and are not used by any remaining snapshots. 8129689912eSchristos */ 8139689912eSchristos static void 8149689912eSchristos marksweep_chunks(dns_qpmulti_t *multi) { 8159689912eSchristos unsigned int free = 0; 8169689912eSchristos 8179689912eSchristos isc_nanosecs_t start = isc_time_monotonic(); 8189689912eSchristos 8199689912eSchristos dns_qp_t *qpw = &multi->writer; 8209689912eSchristos 8219689912eSchristos for (dns_qpsnap_t *qps = ISC_LIST_HEAD(multi->snapshots); qps != NULL; 8229689912eSchristos qps = ISC_LIST_NEXT(qps, link)) 8239689912eSchristos { 8249689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qps->chunk_max; chunk++) { 8259689912eSchristos if (qps->base->ptr[chunk] != NULL) { 8269689912eSchristos INSIST(qps->base->ptr[chunk] == 8279689912eSchristos qpw->base->ptr[chunk]); 8289689912eSchristos qpw->usage[chunk].snapmark = true; 8299689912eSchristos } 8309689912eSchristos } 8319689912eSchristos } 8329689912eSchristos 8339689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) { 8349689912eSchristos qpw->usage[chunk].snapshot = qpw->usage[chunk].snapmark; 8359689912eSchristos qpw->usage[chunk].snapmark = false; 8369689912eSchristos if (qpw->usage[chunk].snapfree && !qpw->usage[chunk].snapshot) { 8379689912eSchristos chunk_free(qpw, chunk); 8389689912eSchristos free++; 8399689912eSchristos } 8409689912eSchristos } 8419689912eSchristos 8429689912eSchristos isc_nanosecs_t time = isc_time_monotonic() - start; 843c2d18a95Schristos ISC_QP_ADD(recycle_time, time); 8449689912eSchristos 8459689912eSchristos if (free > 0) { 8469689912eSchristos LOG_STATS("qp marksweep" PRItime "free %u chunks", time, free); 8479689912eSchristos LOG_STATS( 8489689912eSchristos "qp marksweep leaf %u live %u used %u free %u hold %u", 8499689912eSchristos qpw->leaf_count, qpw->used_count - qpw->free_count, 8509689912eSchristos qpw->used_count, qpw->free_count, qpw->hold_count); 8519689912eSchristos } 8529689912eSchristos } 8539689912eSchristos 8549689912eSchristos /*********************************************************************** 8559689912eSchristos * 8569689912eSchristos * garbage collector 8579689912eSchristos */ 8589689912eSchristos 8599689912eSchristos /* 8609689912eSchristos * Move a branch node's twigs to the `bump` chunk, for copy-on-write 8619689912eSchristos * or for garbage collection. We don't update the node in place 8629689912eSchristos * because `compact_recursive()` does not ensure the node itself is 8639689912eSchristos * mutable until after it discovers evacuation was necessary. 8649689912eSchristos * 8659689912eSchristos * If free_twigs() could not immediately destroy the old twigs, we have 8669689912eSchristos * to re-attach to any leaves. 8679689912eSchristos */ 8689689912eSchristos static dns_qpref_t 8699689912eSchristos evacuate(dns_qp_t *qp, dns_qpnode_t *n) { 8709689912eSchristos dns_qpweight_t size = branch_twigs_size(n); 8719689912eSchristos dns_qpref_t old_ref = branch_twigs_ref(n); 8729689912eSchristos dns_qpref_t new_ref = alloc_twigs(qp, size); 8739689912eSchristos dns_qpnode_t *old_twigs = ref_ptr(qp, old_ref); 8749689912eSchristos dns_qpnode_t *new_twigs = ref_ptr(qp, new_ref); 8759689912eSchristos 8769689912eSchristos move_twigs(new_twigs, old_twigs, size); 8779689912eSchristos if (!free_twigs(qp, old_ref, size)) { 8789689912eSchristos attach_twigs(qp, new_twigs, size); 8799689912eSchristos } 8809689912eSchristos 8819689912eSchristos return new_ref; 8829689912eSchristos } 8839689912eSchristos 8849689912eSchristos /* 8859689912eSchristos * Immutable nodes need copy-on-write. As we walk down the trie finding the 8869689912eSchristos * right place to modify, make_root_mutable() and make_twigs_mutable() 8879689912eSchristos * are called to ensure that immutable nodes on the path from the root are 8889689912eSchristos * copied to a mutable chunk. 8899689912eSchristos */ 8909689912eSchristos 8919689912eSchristos static inline dns_qpnode_t * 8929689912eSchristos make_root_mutable(dns_qp_t *qp) { 8939689912eSchristos if (cells_immutable(qp, qp->root_ref)) { 8949689912eSchristos qp->root_ref = evacuate(qp, MOVABLE_ROOT(qp)); 8959689912eSchristos } 8969689912eSchristos return ref_ptr(qp, qp->root_ref); 8979689912eSchristos } 8989689912eSchristos 8999689912eSchristos static inline void 9009689912eSchristos make_twigs_mutable(dns_qp_t *qp, dns_qpnode_t *n) { 9019689912eSchristos if (cells_immutable(qp, branch_twigs_ref(n))) { 9029689912eSchristos *n = make_node(branch_index(n), evacuate(qp, n)); 9039689912eSchristos } 9049689912eSchristos } 9059689912eSchristos 9069689912eSchristos /* 9079689912eSchristos * Compact the trie by traversing the whole thing recursively, copying 9089689912eSchristos * bottom-up as required. The aim is to avoid evacuation as much as 9099689912eSchristos * possible, but when parts of the trie are immutable, we need to evacuate 9109689912eSchristos * the paths from the root to the parts of the trie that occupy 9119689912eSchristos * fragmented chunks. 9129689912eSchristos * 9139689912eSchristos * Without the QP_MIN_USED check, the algorithm will leave the trie 9149689912eSchristos * unchanged. If the children are all leaves, the loop changes nothing, 9159689912eSchristos * so we will return this node's original ref. If all of the children 9169689912eSchristos * that are branches did not need moving, again, the loop changes 9179689912eSchristos * nothing. So the evacuation check is the only place that the 9189689912eSchristos * algorithm introduces ref changes, that then bubble up towards the 9199689912eSchristos * root through the logic inside the loop. 9209689912eSchristos */ 9219689912eSchristos static dns_qpref_t 9229689912eSchristos compact_recursive(dns_qp_t *qp, dns_qpnode_t *parent) { 9239689912eSchristos dns_qpweight_t size = branch_twigs_size(parent); 9249689912eSchristos dns_qpref_t twigs_ref = branch_twigs_ref(parent); 9259689912eSchristos dns_qpchunk_t chunk = ref_chunk(twigs_ref); 9269689912eSchristos 9279689912eSchristos if (qp->compact_all || 9289689912eSchristos (chunk != qp->bump && chunk_usage(qp, chunk) < QP_MIN_USED)) 9299689912eSchristos { 9309689912eSchristos twigs_ref = evacuate(qp, parent); 9319689912eSchristos } 9329689912eSchristos bool immutable = cells_immutable(qp, twigs_ref); 9339689912eSchristos for (dns_qpweight_t pos = 0; pos < size; pos++) { 9349689912eSchristos dns_qpnode_t *child = ref_ptr(qp, twigs_ref) + pos; 9359689912eSchristos if (!is_branch(child)) { 9369689912eSchristos continue; 9379689912eSchristos } 9389689912eSchristos dns_qpref_t old_grandtwigs = branch_twigs_ref(child); 9399689912eSchristos dns_qpref_t new_grandtwigs = compact_recursive(qp, child); 9409689912eSchristos if (old_grandtwigs == new_grandtwigs) { 9419689912eSchristos continue; 9429689912eSchristos } 9439689912eSchristos if (immutable) { 9449689912eSchristos twigs_ref = evacuate(qp, parent); 9459689912eSchristos /* the twigs have moved */ 9469689912eSchristos child = ref_ptr(qp, twigs_ref) + pos; 9479689912eSchristos immutable = false; 9489689912eSchristos } 9499689912eSchristos *child = make_node(branch_index(child), new_grandtwigs); 9509689912eSchristos } 9519689912eSchristos return twigs_ref; 9529689912eSchristos } 9539689912eSchristos 9549689912eSchristos static void 9559689912eSchristos compact(dns_qp_t *qp) { 9569689912eSchristos LOG_STATS("qp compact before leaf %u live %u used %u free %u hold %u", 9579689912eSchristos qp->leaf_count, qp->used_count - qp->free_count, 9589689912eSchristos qp->used_count, qp->free_count, qp->hold_count); 9599689912eSchristos 9609689912eSchristos isc_nanosecs_t start = isc_time_monotonic(); 9619689912eSchristos 9629689912eSchristos if (qp->usage[qp->bump].free > QP_MAX_FREE) { 9639689912eSchristos alloc_reset(qp); 9649689912eSchristos } 9659689912eSchristos 9669689912eSchristos if (qp->leaf_count > 0) { 9679689912eSchristos qp->root_ref = compact_recursive(qp, MOVABLE_ROOT(qp)); 9689689912eSchristos } 9699689912eSchristos qp->compact_all = false; 9709689912eSchristos 9719689912eSchristos isc_nanosecs_t time = isc_time_monotonic() - start; 972c2d18a95Schristos ISC_QP_ADD(compact_time, time); 9739689912eSchristos 9749689912eSchristos LOG_STATS("qp compact" PRItime 9759689912eSchristos "leaf %u live %u used %u free %u hold %u", 9769689912eSchristos time, qp->leaf_count, qp->used_count - qp->free_count, 9779689912eSchristos qp->used_count, qp->free_count, qp->hold_count); 9789689912eSchristos } 9799689912eSchristos 9809689912eSchristos void 9819689912eSchristos dns_qp_compact(dns_qp_t *qp, dns_qpgc_t mode) { 9829689912eSchristos REQUIRE(QP_VALID(qp)); 9839689912eSchristos if (mode == DNS_QPGC_MAYBE && !QP_NEEDGC(qp)) { 9849689912eSchristos return; 9859689912eSchristos } 9869689912eSchristos if (mode == DNS_QPGC_ALL) { 9879689912eSchristos alloc_reset(qp); 9889689912eSchristos qp->compact_all = true; 9899689912eSchristos } 9909689912eSchristos compact(qp); 9919689912eSchristos recycle(qp); 9929689912eSchristos } 9939689912eSchristos 9949689912eSchristos /* 9959689912eSchristos * Free some twigs and (if they were destroyed immediately so that the 9969689912eSchristos * result from QP_MAX_GARBAGE can change) compact the trie if necessary. 9979689912eSchristos * 9989689912eSchristos * This is called by the trie modification API entry points. The 9999689912eSchristos * free_twigs() function requires the caller to attach or detach any 10009689912eSchristos * leaves as necessary. Callers of squash_twigs() satisfy this 10019689912eSchristos * requirement by calling make_twigs_mutable(). 10029689912eSchristos * 10039689912eSchristos * Aside: In typical garbage collectors, compaction is triggered when 10049689912eSchristos * the allocator runs out of space. But that is because typical garbage 10059689912eSchristos * collectors do not know how much memory can be recovered, so they must 10069689912eSchristos * find out by scanning the heap. The qp-trie code was originally 10079689912eSchristos * designed to use malloc() and free(), so it has more information about 10089689912eSchristos * when garbage collection might be worthwhile. Hence we can trigger 10099689912eSchristos * collection when garbage passes a threshold. 10109689912eSchristos * 10119689912eSchristos * XXXFANF: If we need to avoid latency outliers caused by compaction in 10129689912eSchristos * write transactions, we can check qp->transaction_mode here. 10139689912eSchristos */ 10149689912eSchristos static inline bool 10159689912eSchristos squash_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) { 10169689912eSchristos bool destroyed = free_twigs(qp, twigs, size); 10179689912eSchristos if (destroyed && QP_AUTOGC(qp)) { 10189689912eSchristos compact(qp); 10199689912eSchristos recycle(qp); 10209689912eSchristos /* 10219689912eSchristos * This shouldn't happen if the garbage collector is 10229689912eSchristos * working correctly. We can recover at the cost of some 10239689912eSchristos * time and space, but recovery should be cheaper than 10249689912eSchristos * letting compact+recycle fail repeatedly. 10259689912eSchristos */ 10269689912eSchristos if (QP_AUTOGC(qp)) { 10279689912eSchristos isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, 10289689912eSchristos DNS_LOGMODULE_QP, ISC_LOG_NOTICE, 10299689912eSchristos "qp %p uctx \"%s\" compact/recycle " 10309689912eSchristos "failed to recover any space, " 10319689912eSchristos "scheduling a full compaction", 10329689912eSchristos qp, TRIENAME(qp)); 10339689912eSchristos qp->compact_all = true; 10349689912eSchristos } 10359689912eSchristos } 10369689912eSchristos return destroyed; 10379689912eSchristos } 10389689912eSchristos 10399689912eSchristos /*********************************************************************** 10409689912eSchristos * 10419689912eSchristos * public accessors for memory management internals 10429689912eSchristos */ 10439689912eSchristos 10449689912eSchristos dns_qp_memusage_t 10459689912eSchristos dns_qp_memusage(dns_qp_t *qp) { 10469689912eSchristos REQUIRE(QP_VALID(qp)); 10479689912eSchristos 10489689912eSchristos dns_qp_memusage_t memusage = { 10499689912eSchristos .uctx = qp->uctx, 10509689912eSchristos .leaves = qp->leaf_count, 10519689912eSchristos .live = qp->used_count - qp->free_count, 10529689912eSchristos .used = qp->used_count, 10539689912eSchristos .hold = qp->hold_count, 10549689912eSchristos .free = qp->free_count, 10559689912eSchristos .node_size = sizeof(dns_qpnode_t), 10569689912eSchristos .chunk_size = QP_CHUNK_SIZE, 10579689912eSchristos .fragmented = QP_NEEDGC(qp), 10589689912eSchristos }; 10599689912eSchristos 10609689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 10619689912eSchristos if (qp->base->ptr[chunk] != NULL) { 10629689912eSchristos memusage.chunk_count += 1; 10639689912eSchristos } 10649689912eSchristos } 10659689912eSchristos 10669689912eSchristos /* 10679689912eSchristos * XXXFANF does not subtract chunks that have been shrunk, 10689689912eSchristos * and does not count unreclaimed dns_qpbase_t objects 10699689912eSchristos */ 10709689912eSchristos memusage.bytes = memusage.chunk_count * QP_CHUNK_BYTES + 10719689912eSchristos qp->chunk_max * sizeof(qp->base->ptr[0]) + 10729689912eSchristos qp->chunk_max * sizeof(qp->usage[0]); 10739689912eSchristos 10749689912eSchristos return memusage; 10759689912eSchristos } 10769689912eSchristos 10779689912eSchristos dns_qp_memusage_t 10789689912eSchristos dns_qpmulti_memusage(dns_qpmulti_t *multi) { 10799689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 10809689912eSchristos LOCK(&multi->mutex); 10819689912eSchristos 10829689912eSchristos dns_qp_t *qp = &multi->writer; 10839689912eSchristos INSIST(QP_VALID(qp)); 10849689912eSchristos 10859689912eSchristos dns_qp_memusage_t memusage = dns_qp_memusage(qp); 10869689912eSchristos 10879689912eSchristos if (qp->transaction_mode == QP_UPDATE) { 10889689912eSchristos memusage.bytes -= QP_CHUNK_BYTES; 10899689912eSchristos memusage.bytes += qp->usage[qp->bump].used * 10909689912eSchristos sizeof(dns_qpnode_t); 10919689912eSchristos } 10929689912eSchristos 10939689912eSchristos UNLOCK(&multi->mutex); 10949689912eSchristos return memusage; 10959689912eSchristos } 10969689912eSchristos 10979689912eSchristos void 10989689912eSchristos dns_qp_gctime(isc_nanosecs_t *compact_p, isc_nanosecs_t *recycle_p, 10999689912eSchristos isc_nanosecs_t *rollback_p) { 1100c2d18a95Schristos *compact_p = ISC_QP_GET(compact_time); 1101c2d18a95Schristos *recycle_p = ISC_QP_GET(recycle_time); 1102c2d18a95Schristos *rollback_p = ISC_QP_GET(rollback_time); 11039689912eSchristos } 11049689912eSchristos 11059689912eSchristos /*********************************************************************** 11069689912eSchristos * 11079689912eSchristos * read-write transactions 11089689912eSchristos */ 11099689912eSchristos 11109689912eSchristos static dns_qp_t * 11119689912eSchristos transaction_open(dns_qpmulti_t *multi, dns_qp_t **qptp) { 11129689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 11139689912eSchristos REQUIRE(qptp != NULL && *qptp == NULL); 11149689912eSchristos 11159689912eSchristos LOCK(&multi->mutex); 11169689912eSchristos 11179689912eSchristos dns_qp_t *qp = &multi->writer; 11189689912eSchristos INSIST(QP_VALID(qp)); 11199689912eSchristos 11209689912eSchristos /* 11219689912eSchristos * Mark existing chunks as immutable. 11229689912eSchristos * 11239689912eSchristos * Aside: The bump chunk is special: in a series of write 11249689912eSchristos * transactions the bump chunk is reused; the first part (up 11259689912eSchristos * to fender) is immutable, the rest mutable. But we set its 11269689912eSchristos * immutable flag so that when the bump chunk fills up, the 11279689912eSchristos * first part continues to be treated as immutable. (And the 11289689912eSchristos * rest of the chunk too, but that's OK.) 11299689912eSchristos */ 11309689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 11319689912eSchristos if (qp->usage[chunk].exists) { 11329689912eSchristos qp->usage[chunk].immutable = true; 11339689912eSchristos write_protect(qp, chunk); 11349689912eSchristos } 11359689912eSchristos } 11369689912eSchristos 11379689912eSchristos /* 11389689912eSchristos * Ensure QP_AUTOGC() ignores free space in immutable chunks. 11399689912eSchristos */ 11409689912eSchristos qp->hold_count = qp->free_count; 11419689912eSchristos 11429689912eSchristos *qptp = qp; 11439689912eSchristos return qp; 11449689912eSchristos } 11459689912eSchristos 11469689912eSchristos /* 11479689912eSchristos * a write is light 11489689912eSchristos * 11499689912eSchristos * We need to ensure we allocate from a fresh chunk if the last transaction 11509689912eSchristos * shrunk the bump chunk; but usually in a sequence of write transactions 11519689912eSchristos * we just put `fender` at the point where we started this generation. 11529689912eSchristos * 11539689912eSchristos * (Aside: Instead of keeping the previous transaction's mode, I 11549689912eSchristos * considered forcing allocation into the slow path by fiddling with 11559689912eSchristos * the bump chunk's usage counters. But that is troublesome because 11569689912eSchristos * `chunk_free()` needs to know how much of the chunk to scan.) 11579689912eSchristos */ 11589689912eSchristos void 11599689912eSchristos dns_qpmulti_write(dns_qpmulti_t *multi, dns_qp_t **qptp) { 11609689912eSchristos dns_qp_t *qp = transaction_open(multi, qptp); 11619689912eSchristos TRACE(""); 11629689912eSchristos 11639689912eSchristos if (qp->transaction_mode == QP_WRITE) { 11649689912eSchristos qp->fender = qp->usage[qp->bump].used; 11659689912eSchristos } else { 11669689912eSchristos alloc_reset(qp); 11679689912eSchristos } 11689689912eSchristos qp->transaction_mode = QP_WRITE; 11699689912eSchristos } 11709689912eSchristos 11719689912eSchristos /* 11729689912eSchristos * an update is heavier 11739689912eSchristos * 11749689912eSchristos * We always reset the allocator to the start of a fresh chunk, 11759689912eSchristos * because the previous transaction was probably an update that shrunk 11769689912eSchristos * the bump chunk. It simplifies rollback because `fender` is always zero. 11779689912eSchristos * 11789689912eSchristos * To rollback a transaction, we need to reset all the allocation 11799689912eSchristos * counters to their previous state, in particular we need to un-free 11809689912eSchristos * any nodes that were copied to make them mutable. This means we need 11819689912eSchristos * to make a copy of basically the whole `dns_qp_t writer`: everything 11829689912eSchristos * but the chunks holding the trie nodes. 11839689912eSchristos * 11849689912eSchristos * We do most of the transaction setup before creating the rollback 11859689912eSchristos * state so that after rollback we have a correct idea of which chunks 11869689912eSchristos * are immutable, and so we have the correct transaction mode to make 11879689912eSchristos * the next transaction allocate a new bump chunk. The exception is 11889689912eSchristos * resetting the allocator, which we do after creating the rollback 11899689912eSchristos * state; if this transaction is rolled back then the next transaction 11909689912eSchristos * will start from the rollback state and also reset the allocator as 11919689912eSchristos * one of its first actions. 11929689912eSchristos */ 11939689912eSchristos void 11949689912eSchristos dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) { 11959689912eSchristos dns_qp_t *qp = transaction_open(multi, qptp); 11969689912eSchristos TRACE(""); 11979689912eSchristos 11989689912eSchristos qp->transaction_mode = QP_UPDATE; 11999689912eSchristos 12009689912eSchristos dns_qp_t *rollback = isc_mem_allocate(qp->mctx, sizeof(*rollback)); 12019689912eSchristos memmove(rollback, qp, sizeof(*rollback)); 12029689912eSchristos /* can be uninitialized on the first transaction */ 12039689912eSchristos if (rollback->base != NULL) { 12049689912eSchristos INSIST(QPBASE_VALID(rollback->base)); 12059689912eSchristos INSIST(qp->usage != NULL && qp->chunk_max > 0); 12069689912eSchristos /* paired with either _commit() or _rollback() */ 12079689912eSchristos isc_refcount_increment(&rollback->base->refcount); 12089689912eSchristos size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max; 12099689912eSchristos rollback->usage = isc_mem_allocate(qp->mctx, usage_bytes); 12109689912eSchristos memmove(rollback->usage, qp->usage, usage_bytes); 12119689912eSchristos } 12129689912eSchristos INSIST(multi->rollback == NULL); 12139689912eSchristos multi->rollback = rollback; 12149689912eSchristos 12159689912eSchristos alloc_reset(qp); 12169689912eSchristos } 12179689912eSchristos 12189689912eSchristos void 12199689912eSchristos dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) { 12209689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 12219689912eSchristos REQUIRE(qptp != NULL && *qptp == &multi->writer); 12229689912eSchristos REQUIRE(multi->writer.transaction_mode == QP_WRITE || 12239689912eSchristos multi->writer.transaction_mode == QP_UPDATE); 12249689912eSchristos 12259689912eSchristos dns_qp_t *qp = *qptp; 12269689912eSchristos TRACE(""); 12279689912eSchristos 12289689912eSchristos if (qp->transaction_mode == QP_UPDATE) { 12299689912eSchristos INSIST(multi->rollback != NULL); 12309689912eSchristos /* paired with dns_qpmulti_update() */ 12319689912eSchristos if (qpbase_unref(multi->rollback)) { 12329689912eSchristos isc_mem_free(qp->mctx, multi->rollback->base); 12339689912eSchristos } 12349689912eSchristos if (multi->rollback->usage != NULL) { 12359689912eSchristos isc_mem_free(qp->mctx, multi->rollback->usage); 12369689912eSchristos } 12379689912eSchristos isc_mem_free(qp->mctx, multi->rollback); 12389689912eSchristos } 12399689912eSchristos INSIST(multi->rollback == NULL); 12409689912eSchristos 12419689912eSchristos /* not the first commit? */ 12429689912eSchristos if (multi->reader_ref != INVALID_REF) { 12439689912eSchristos INSIST(cells_immutable(qp, multi->reader_ref)); 12449689912eSchristos free_twigs(qp, multi->reader_ref, READER_SIZE); 12459689912eSchristos } 12469689912eSchristos 12479689912eSchristos if (qp->transaction_mode == QP_UPDATE) { 12489689912eSchristos /* minimize memory overhead */ 12499689912eSchristos compact(qp); 12509689912eSchristos multi->reader_ref = alloc_twigs(qp, READER_SIZE); 12519689912eSchristos qp->base->ptr[qp->bump] = chunk_shrink_raw( 12529689912eSchristos qp, qp->base->ptr[qp->bump], 12539689912eSchristos qp->usage[qp->bump].used * sizeof(dns_qpnode_t)); 12549689912eSchristos } else { 12559689912eSchristos multi->reader_ref = alloc_twigs(qp, READER_SIZE); 12569689912eSchristos } 12579689912eSchristos 12589689912eSchristos /* anchor a new version of the trie */ 12599689912eSchristos dns_qpnode_t *reader = ref_ptr(qp, multi->reader_ref); 12609689912eSchristos make_reader(reader, multi); 12619689912eSchristos /* paired with chunk_free() */ 12629689912eSchristos isc_refcount_increment(&qp->base->refcount); 12639689912eSchristos 12649689912eSchristos rcu_assign_pointer(multi->reader, reader); /* COMMIT */ 12659689912eSchristos 12669689912eSchristos /* clean up what we can right now */ 12679689912eSchristos if (qp->transaction_mode == QP_UPDATE || QP_NEEDGC(qp)) { 12689689912eSchristos recycle(qp); 12699689912eSchristos } 12709689912eSchristos 12719689912eSchristos /* schedule the rest for later */ 12729689912eSchristos reclaim_chunks(multi); 12739689912eSchristos 12749689912eSchristos *qptp = NULL; 12759689912eSchristos UNLOCK(&multi->mutex); 12769689912eSchristos } 12779689912eSchristos 12789689912eSchristos /* 12799689912eSchristos * Throw away everything that was allocated during this transaction. 12809689912eSchristos */ 12819689912eSchristos void 12829689912eSchristos dns_qpmulti_rollback(dns_qpmulti_t *multi, dns_qp_t **qptp) { 12839689912eSchristos unsigned int free = 0; 12849689912eSchristos 12859689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 12869689912eSchristos REQUIRE(multi->writer.transaction_mode == QP_UPDATE); 12879689912eSchristos REQUIRE(qptp != NULL && *qptp == &multi->writer); 12889689912eSchristos 12899689912eSchristos dns_qp_t *qp = *qptp; 12909689912eSchristos TRACE(""); 12919689912eSchristos 12929689912eSchristos isc_nanosecs_t start = isc_time_monotonic(); 12939689912eSchristos 12949689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 12959689912eSchristos if (qp->base->ptr[chunk] != NULL && !qp->usage[chunk].immutable) 12969689912eSchristos { 12979689912eSchristos chunk_free(qp, chunk); 12989689912eSchristos /* 12999689912eSchristos * we need to clear its base pointer in the rollback 13009689912eSchristos * trie, in case the arrays were resized 13019689912eSchristos */ 13029689912eSchristos if (chunk < multi->rollback->chunk_max) { 13039689912eSchristos INSIST(!multi->rollback->usage[chunk].exists); 13049689912eSchristos multi->rollback->base->ptr[chunk] = NULL; 13059689912eSchristos } 13069689912eSchristos free++; 13079689912eSchristos } 13089689912eSchristos } 13099689912eSchristos 13109689912eSchristos /* 13119689912eSchristos * multi->rollback->base and multi->writer->base are the same, 13129689912eSchristos * unless there was a realloc_chunk_arrays() during the transaction 13139689912eSchristos */ 13149689912eSchristos if (qpbase_unref(qp)) { 13159689912eSchristos /* paired with dns_qpmulti_update() */ 13169689912eSchristos isc_mem_free(qp->mctx, qp->base); 13179689912eSchristos } 13189689912eSchristos isc_mem_free(qp->mctx, qp->usage); 13199689912eSchristos 13209689912eSchristos /* reset allocator state */ 13219689912eSchristos INSIST(multi->rollback != NULL); 13229689912eSchristos memmove(qp, multi->rollback, sizeof(*qp)); 13239689912eSchristos isc_mem_free(qp->mctx, multi->rollback); 13249689912eSchristos INSIST(multi->rollback == NULL); 13259689912eSchristos 13269689912eSchristos isc_nanosecs_t time = isc_time_monotonic() - start; 1327c2d18a95Schristos ISC_QP_ADD(rollback_time, time); 13289689912eSchristos 13299689912eSchristos LOG_STATS("qp rollback" PRItime "free %u chunks", time, free); 13309689912eSchristos 13319689912eSchristos *qptp = NULL; 13329689912eSchristos UNLOCK(&multi->mutex); 13339689912eSchristos } 13349689912eSchristos 13359689912eSchristos /*********************************************************************** 13369689912eSchristos * 13379689912eSchristos * read-only transactions 13389689912eSchristos */ 13399689912eSchristos 13409689912eSchristos static dns_qpmulti_t * 13419689912eSchristos reader_open(dns_qpmulti_t *multi, dns_qpreadable_t qpr) { 13429689912eSchristos dns_qpreader_t *qp = dns_qpreader(qpr); 13439689912eSchristos dns_qpnode_t *reader = rcu_dereference(multi->reader); 13449689912eSchristos if (reader == NULL) { 13459689912eSchristos QP_INIT(qp, multi->writer.methods, multi->writer.uctx); 13469689912eSchristos } else { 13479689912eSchristos multi = unpack_reader(qp, reader); 13489689912eSchristos } 13499689912eSchristos return multi; 13509689912eSchristos } 13519689912eSchristos 13529689912eSchristos /* 13539689912eSchristos * a query is light 13549689912eSchristos */ 13559689912eSchristos 13569689912eSchristos void 13579689912eSchristos dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) { 13589689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 13599689912eSchristos REQUIRE(qp != NULL); 13609689912eSchristos 13619689912eSchristos qp->tid = isc_tid(); 13629689912eSchristos rcu_read_lock(); 13639689912eSchristos 13649689912eSchristos dns_qpmulti_t *whence = reader_open(multi, qp); 13659689912eSchristos INSIST(whence == multi); 13669689912eSchristos } 13679689912eSchristos 13689689912eSchristos void 13699689912eSchristos dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) { 13709689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 13719689912eSchristos REQUIRE(QP_VALID(qp)); 13729689912eSchristos REQUIRE(qp->tid == isc_tid()); 13739689912eSchristos *qp = (dns_qpread_t){}; 13749689912eSchristos rcu_read_unlock(); 13759689912eSchristos } 13769689912eSchristos 13779689912eSchristos /* 13789689912eSchristos * a snapshot is heavy 13799689912eSchristos */ 13809689912eSchristos 13819689912eSchristos void 13829689912eSchristos dns_qpmulti_snapshot(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) { 13839689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 13849689912eSchristos REQUIRE(qpsp != NULL && *qpsp == NULL); 13859689912eSchristos 13869689912eSchristos rcu_read_lock(); 13879689912eSchristos 13889689912eSchristos LOCK(&multi->mutex); 13899689912eSchristos 13909689912eSchristos dns_qp_t *qpw = &multi->writer; 13919689912eSchristos size_t bytes = sizeof(dns_qpsnap_t) + sizeof(dns_qpbase_t) + 13929689912eSchristos sizeof(qpw->base->ptr[0]) * qpw->chunk_max; 13939689912eSchristos dns_qpsnap_t *qps = isc_mem_allocate(qpw->mctx, bytes); 13949689912eSchristos 13959689912eSchristos qps->whence = reader_open(multi, qps); 13969689912eSchristos INSIST(qps->whence == multi); 13979689912eSchristos 13989689912eSchristos /* not a separate allocation */ 13999689912eSchristos qps->base = (dns_qpbase_t *)(qps + 1); 14009689912eSchristos isc_refcount_init(&qps->base->refcount, 0); 14019689912eSchristos 14029689912eSchristos /* 14039689912eSchristos * only copy base pointers of chunks we need, so we can 14049689912eSchristos * reclaim unused memory in dns_qpsnap_destroy() 14059689912eSchristos */ 14069689912eSchristos qps->chunk_max = qpw->chunk_max; 14079689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) { 14089689912eSchristos if (qpw->usage[chunk].exists && chunk_usage(qpw, chunk) > 0) { 14099689912eSchristos qpw->usage[chunk].snapshot = true; 14109689912eSchristos qps->base->ptr[chunk] = qpw->base->ptr[chunk]; 14119689912eSchristos } else { 14129689912eSchristos qps->base->ptr[chunk] = NULL; 14139689912eSchristos } 14149689912eSchristos } 14159689912eSchristos ISC_LIST_INITANDAPPEND(multi->snapshots, qps, link); 14169689912eSchristos 14179689912eSchristos *qpsp = qps; 14189689912eSchristos UNLOCK(&multi->mutex); 14199689912eSchristos 14209689912eSchristos rcu_read_unlock(); 14219689912eSchristos } 14229689912eSchristos 14239689912eSchristos void 14249689912eSchristos dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) { 14259689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 14269689912eSchristos REQUIRE(qpsp != NULL && *qpsp != NULL); 14279689912eSchristos 14289689912eSchristos LOCK(&multi->mutex); 14299689912eSchristos 14309689912eSchristos dns_qpsnap_t *qp = *qpsp; 14319689912eSchristos 14329689912eSchristos /* make sure the API is being used correctly */ 14339689912eSchristos REQUIRE(qp->whence == multi); 14349689912eSchristos 14359689912eSchristos ISC_LIST_UNLINK(multi->snapshots, qp, link); 14369689912eSchristos 14379689912eSchristos /* 14389689912eSchristos * eagerly reclaim chunks that are now unused, so that memory does 14399689912eSchristos * not accumulate when a trie has a lot of updates and snapshots 14409689912eSchristos */ 14419689912eSchristos marksweep_chunks(multi); 14429689912eSchristos 14439689912eSchristos isc_mem_free(multi->writer.mctx, qp); 14449689912eSchristos 14459689912eSchristos *qpsp = NULL; 14469689912eSchristos UNLOCK(&multi->mutex); 14479689912eSchristos } 14489689912eSchristos 14499689912eSchristos /*********************************************************************** 14509689912eSchristos * 14519689912eSchristos * constructors, destructors 14529689912eSchristos */ 14539689912eSchristos 14549689912eSchristos void 14559689912eSchristos dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, 14569689912eSchristos dns_qp_t **qptp) { 14579689912eSchristos REQUIRE(qptp != NULL && *qptp == NULL); 14589689912eSchristos 14599689912eSchristos dns_qp_t *qp = isc_mem_get(mctx, sizeof(*qp)); 14609689912eSchristos QP_INIT(qp, methods, uctx); 14619689912eSchristos isc_mem_attach(mctx, &qp->mctx); 14629689912eSchristos alloc_reset(qp); 14639689912eSchristos TRACE(""); 14649689912eSchristos *qptp = qp; 14659689912eSchristos } 14669689912eSchristos 14679689912eSchristos void 14689689912eSchristos dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, 14699689912eSchristos dns_qpmulti_t **qpmp) { 14709689912eSchristos REQUIRE(qpmp != NULL && *qpmp == NULL); 14719689912eSchristos 14729689912eSchristos dns_qpmulti_t *multi = isc_mem_get(mctx, sizeof(*multi)); 14739689912eSchristos *multi = (dns_qpmulti_t){ 14749689912eSchristos .magic = QPMULTI_MAGIC, 14759689912eSchristos .reader_ref = INVALID_REF, 14769689912eSchristos }; 14779689912eSchristos isc_mutex_init(&multi->mutex); 14789689912eSchristos ISC_LIST_INIT(multi->snapshots); 14799689912eSchristos /* 14809689912eSchristos * Do not waste effort allocating a bump chunk that will be thrown 14819689912eSchristos * away when a transaction is opened. dns_qpmulti_update() always 14829689912eSchristos * allocates; to ensure dns_qpmulti_write() does too, pretend the 14839689912eSchristos * previous transaction was an update 14849689912eSchristos */ 14859689912eSchristos dns_qp_t *qp = &multi->writer; 14869689912eSchristos QP_INIT(qp, methods, uctx); 14879689912eSchristos isc_mem_attach(mctx, &qp->mctx); 14889689912eSchristos qp->transaction_mode = QP_UPDATE; 14899689912eSchristos TRACE(""); 14909689912eSchristos *qpmp = multi; 14919689912eSchristos } 14929689912eSchristos 14939689912eSchristos static void 14949689912eSchristos destroy_guts(dns_qp_t *qp) { 14959689912eSchristos if (qp->chunk_max == 0) { 14969689912eSchristos return; 14979689912eSchristos } 14989689912eSchristos for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 14999689912eSchristos if (qp->base->ptr[chunk] != NULL) { 15009689912eSchristos chunk_free(qp, chunk); 15019689912eSchristos } 15029689912eSchristos } 15039689912eSchristos ENSURE(qp->used_count == 0); 15049689912eSchristos ENSURE(qp->free_count == 0); 15059689912eSchristos ENSURE(isc_refcount_current(&qp->base->refcount) == 1); 15069689912eSchristos isc_mem_free(qp->mctx, qp->base); 15079689912eSchristos isc_mem_free(qp->mctx, qp->usage); 15089689912eSchristos qp->magic = 0; 15099689912eSchristos } 15109689912eSchristos 15119689912eSchristos void 15129689912eSchristos dns_qp_destroy(dns_qp_t **qptp) { 15139689912eSchristos REQUIRE(qptp != NULL); 15149689912eSchristos REQUIRE(QP_VALID(*qptp)); 15159689912eSchristos 15169689912eSchristos dns_qp_t *qp = *qptp; 15179689912eSchristos *qptp = NULL; 15189689912eSchristos 15199689912eSchristos /* do not try to destroy part of a dns_qpmulti_t */ 15209689912eSchristos REQUIRE(qp->transaction_mode == QP_NONE); 15219689912eSchristos 15229689912eSchristos TRACE(""); 15239689912eSchristos destroy_guts(qp); 15249689912eSchristos isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp)); 15259689912eSchristos } 15269689912eSchristos 15279689912eSchristos static void 15289689912eSchristos qpmulti_destroy_cb(struct rcu_head *arg) { 15299689912eSchristos qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head); 15309689912eSchristos REQUIRE(QPRCU_VALID(rcuctx)); 15319689912eSchristos /* only nonzero for reclaim_chunks_cb() */ 15329689912eSchristos REQUIRE(rcuctx->count == 0); 15339689912eSchristos 15349689912eSchristos dns_qpmulti_t *multi = rcuctx->multi; 15359689912eSchristos REQUIRE(QPMULTI_VALID(multi)); 15369689912eSchristos 15379689912eSchristos /* reassure thread sanitizer */ 15389689912eSchristos LOCK(&multi->mutex); 15399689912eSchristos 15409689912eSchristos dns_qp_t *qp = &multi->writer; 15419689912eSchristos REQUIRE(QP_VALID(qp)); 15429689912eSchristos 15439689912eSchristos destroy_guts(qp); 15449689912eSchristos 15459689912eSchristos UNLOCK(&multi->mutex); 15469689912eSchristos 15479689912eSchristos isc_mutex_destroy(&multi->mutex); 15489689912eSchristos isc_mem_putanddetach(&rcuctx->mctx, rcuctx, 15499689912eSchristos STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); 15509689912eSchristos isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi)); 15519689912eSchristos } 15529689912eSchristos 15539689912eSchristos void 15549689912eSchristos dns_qpmulti_destroy(dns_qpmulti_t **qpmp) { 15559689912eSchristos dns_qp_t *qp = NULL; 15569689912eSchristos dns_qpmulti_t *multi = NULL; 15579689912eSchristos qp_rcuctx_t *rcuctx = NULL; 15589689912eSchristos 15599689912eSchristos REQUIRE(qpmp != NULL); 15609689912eSchristos REQUIRE(QPMULTI_VALID(*qpmp)); 15619689912eSchristos 15629689912eSchristos multi = *qpmp; 15639689912eSchristos qp = &multi->writer; 15649689912eSchristos *qpmp = NULL; 15659689912eSchristos 15669689912eSchristos REQUIRE(QP_VALID(qp)); 15679689912eSchristos REQUIRE(multi->rollback == NULL); 15689689912eSchristos REQUIRE(ISC_LIST_EMPTY(multi->snapshots)); 15699689912eSchristos 15709689912eSchristos rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0)); 15719689912eSchristos *rcuctx = (qp_rcuctx_t){ 15729689912eSchristos .magic = QPRCU_MAGIC, 15739689912eSchristos .multi = multi, 15749689912eSchristos }; 15759689912eSchristos isc_mem_attach(qp->mctx, &rcuctx->mctx); 15769689912eSchristos call_rcu(&rcuctx->rcu_head, qpmulti_destroy_cb); 15779689912eSchristos } 15789689912eSchristos 15799689912eSchristos /*********************************************************************** 15809689912eSchristos * 15819689912eSchristos * modification 15829689912eSchristos */ 15839689912eSchristos 15849689912eSchristos isc_result_t 15859689912eSchristos dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) { 15869689912eSchristos dns_qpref_t new_ref, old_ref; 15879689912eSchristos dns_qpnode_t new_leaf, old_node; 15889689912eSchristos dns_qpnode_t *new_twigs = NULL, *old_twigs = NULL; 15899689912eSchristos dns_qpshift_t new_bit, old_bit; 15909689912eSchristos dns_qpweight_t old_size, new_size; 15919689912eSchristos dns_qpkey_t new_key, old_key; 15929689912eSchristos size_t new_keylen, old_keylen; 15939689912eSchristos size_t offset; 15949689912eSchristos uint64_t index; 15959689912eSchristos dns_qpshift_t bit; 15969689912eSchristos dns_qpweight_t pos; 15979689912eSchristos dns_qpnode_t *n = NULL; 15989689912eSchristos 15999689912eSchristos REQUIRE(QP_VALID(qp)); 16009689912eSchristos 16019689912eSchristos new_leaf = make_leaf(pval, ival); 16029689912eSchristos new_keylen = leaf_qpkey(qp, &new_leaf, new_key); 16039689912eSchristos 16049689912eSchristos /* first leaf in an empty trie? */ 16059689912eSchristos if (qp->leaf_count == 0) { 16069689912eSchristos new_ref = alloc_twigs(qp, 1); 16079689912eSchristos new_twigs = ref_ptr(qp, new_ref); 16089689912eSchristos *new_twigs = new_leaf; 16099689912eSchristos attach_leaf(qp, new_twigs); 16109689912eSchristos qp->leaf_count++; 16119689912eSchristos qp->root_ref = new_ref; 16129689912eSchristos return ISC_R_SUCCESS; 16139689912eSchristos } 16149689912eSchristos 16159689912eSchristos /* 16169689912eSchristos * We need to keep searching down to a leaf even if our key is 16179689912eSchristos * missing from this branch. It doesn't matter which twig we 16189689912eSchristos * choose since the keys are all the same up to this node's 16199689912eSchristos * offset. Note that if we simply use branch_twig_pos(n, bit) 16209689912eSchristos * we may get an out-of-bounds access if our bit is greater 16219689912eSchristos * than all the set bits in the node. 16229689912eSchristos */ 16239689912eSchristos n = ref_ptr(qp, qp->root_ref); 16249689912eSchristos while (is_branch(n)) { 16259689912eSchristos prefetch_twigs(qp, n); 16269689912eSchristos dns_qpref_t ref = branch_twigs_ref(n); 16279689912eSchristos bit = branch_keybit(n, new_key, new_keylen); 16289689912eSchristos pos = branch_has_twig(n, bit) ? branch_twig_pos(n, bit) : 0; 16299689912eSchristos n = ref_ptr(qp, ref + pos); 16309689912eSchristos } 16319689912eSchristos 16329689912eSchristos /* do the keys differ, and if so, where? */ 16339689912eSchristos old_keylen = leaf_qpkey(qp, n, old_key); 16349689912eSchristos offset = qpkey_compare(new_key, new_keylen, old_key, old_keylen); 16359689912eSchristos if (offset == QPKEY_EQUAL) { 16369689912eSchristos return ISC_R_EXISTS; 16379689912eSchristos } 16389689912eSchristos new_bit = qpkey_bit(new_key, new_keylen, offset); 16399689912eSchristos old_bit = qpkey_bit(old_key, old_keylen, offset); 16409689912eSchristos 16419689912eSchristos /* find where to insert a branch or grow an existing branch. */ 16429689912eSchristos n = make_root_mutable(qp); 16439689912eSchristos while (is_branch(n)) { 16449689912eSchristos prefetch_twigs(qp, n); 16459689912eSchristos if (offset < branch_key_offset(n)) { 16469689912eSchristos goto newbranch; 16479689912eSchristos } 16489689912eSchristos if (offset == branch_key_offset(n)) { 16499689912eSchristos goto growbranch; 16509689912eSchristos } 16519689912eSchristos make_twigs_mutable(qp, n); 16529689912eSchristos bit = branch_keybit(n, new_key, new_keylen); 16539689912eSchristos INSIST(branch_has_twig(n, bit)); 16549689912eSchristos n = branch_twig_ptr(qp, n, bit); 16559689912eSchristos } 16569689912eSchristos /* fall through */ 16579689912eSchristos 16589689912eSchristos newbranch: 16599689912eSchristos new_ref = alloc_twigs(qp, 2); 16609689912eSchristos new_twigs = ref_ptr(qp, new_ref); 16619689912eSchristos 16629689912eSchristos /* save before overwriting. */ 16639689912eSchristos old_node = *n; 16649689912eSchristos 16659689912eSchristos /* new branch node takes old node's place */ 16669689912eSchristos index = BRANCH_TAG | (1ULL << new_bit) | (1ULL << old_bit) | 16679689912eSchristos ((uint64_t)offset << SHIFT_OFFSET); 16689689912eSchristos *n = make_node(index, new_ref); 16699689912eSchristos 16709689912eSchristos /* populate twigs */ 16719689912eSchristos new_twigs[old_bit > new_bit] = old_node; 16729689912eSchristos new_twigs[new_bit > old_bit] = new_leaf; 16739689912eSchristos 16749689912eSchristos attach_leaf(qp, &new_leaf); 16759689912eSchristos qp->leaf_count++; 16769689912eSchristos 16779689912eSchristos return ISC_R_SUCCESS; 16789689912eSchristos 16799689912eSchristos growbranch: 16809689912eSchristos INSIST(!branch_has_twig(n, new_bit)); 16819689912eSchristos 16829689912eSchristos /* locate twigs vectors */ 16839689912eSchristos old_size = branch_twigs_size(n); 16849689912eSchristos new_size = old_size + 1; 16859689912eSchristos old_ref = branch_twigs_ref(n); 16869689912eSchristos new_ref = alloc_twigs(qp, new_size); 16879689912eSchristos old_twigs = ref_ptr(qp, old_ref); 16889689912eSchristos new_twigs = ref_ptr(qp, new_ref); 16899689912eSchristos 16909689912eSchristos /* embiggen branch node */ 16919689912eSchristos index = branch_index(n) | (1ULL << new_bit); 16929689912eSchristos *n = make_node(index, new_ref); 16939689912eSchristos 16949689912eSchristos /* embiggen twigs vector */ 16959689912eSchristos pos = branch_twig_pos(n, new_bit); 16969689912eSchristos move_twigs(new_twigs, old_twigs, pos); 16979689912eSchristos new_twigs[pos] = new_leaf; 16989689912eSchristos move_twigs(new_twigs + pos + 1, old_twigs + pos, old_size - pos); 16999689912eSchristos 17009689912eSchristos if (squash_twigs(qp, old_ref, old_size)) { 17019689912eSchristos /* old twigs destroyed, only attach to new leaf */ 17029689912eSchristos attach_leaf(qp, &new_leaf); 17039689912eSchristos } else { 17049689912eSchristos /* old twigs duplicated, attach to all leaves */ 17059689912eSchristos attach_twigs(qp, new_twigs, new_size); 17069689912eSchristos } 17079689912eSchristos qp->leaf_count++; 17089689912eSchristos 17099689912eSchristos return ISC_R_SUCCESS; 17109689912eSchristos } 17119689912eSchristos 17129689912eSchristos isc_result_t 17139689912eSchristos dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key, 17149689912eSchristos size_t search_keylen, void **pval_r, uint32_t *ival_r) { 17159689912eSchristos REQUIRE(QP_VALID(qp)); 17169689912eSchristos REQUIRE(search_keylen < sizeof(dns_qpkey_t)); 17179689912eSchristos 17189689912eSchristos if (get_root(qp) == NULL) { 17199689912eSchristos return ISC_R_NOTFOUND; 17209689912eSchristos } 17219689912eSchristos 17229689912eSchristos dns_qpshift_t bit = 0; /* suppress warning */ 17239689912eSchristos dns_qpnode_t *parent = NULL; 17249689912eSchristos dns_qpnode_t *n = make_root_mutable(qp); 17259689912eSchristos while (is_branch(n)) { 17269689912eSchristos prefetch_twigs(qp, n); 17279689912eSchristos bit = branch_keybit(n, search_key, search_keylen); 17289689912eSchristos if (!branch_has_twig(n, bit)) { 17299689912eSchristos return ISC_R_NOTFOUND; 17309689912eSchristos } 17319689912eSchristos make_twigs_mutable(qp, n); 17329689912eSchristos parent = n; 17339689912eSchristos n = branch_twig_ptr(qp, n, bit); 17349689912eSchristos } 17359689912eSchristos 17369689912eSchristos dns_qpkey_t found_key; 17379689912eSchristos size_t found_keylen = leaf_qpkey(qp, n, found_key); 17389689912eSchristos if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) != 17399689912eSchristos QPKEY_EQUAL) 17409689912eSchristos { 17419689912eSchristos return ISC_R_NOTFOUND; 17429689912eSchristos } 17439689912eSchristos 17449689912eSchristos SET_IF_NOT_NULL(pval_r, leaf_pval(n)); 17459689912eSchristos SET_IF_NOT_NULL(ival_r, leaf_ival(n)); 17469689912eSchristos detach_leaf(qp, n); 17479689912eSchristos qp->leaf_count--; 17489689912eSchristos 17499689912eSchristos /* trie becomes empty */ 17509689912eSchristos if (qp->leaf_count == 0) { 17519689912eSchristos INSIST(parent == NULL); 17529689912eSchristos INSIST(n == get_root(qp)); 17539689912eSchristos free_twigs(qp, qp->root_ref, 1); 17549689912eSchristos qp->root_ref = INVALID_REF; 17559689912eSchristos return ISC_R_SUCCESS; 17569689912eSchristos } 17579689912eSchristos 17589689912eSchristos /* step back to parent node */ 17599689912eSchristos n = parent; 17609689912eSchristos parent = NULL; 17619689912eSchristos 17629689912eSchristos INSIST(bit != 0); 17639689912eSchristos dns_qpweight_t size = branch_twigs_size(n); 17649689912eSchristos dns_qpweight_t pos = branch_twig_pos(n, bit); 17659689912eSchristos dns_qpref_t ref = branch_twigs_ref(n); 17669689912eSchristos dns_qpnode_t *twigs = ref_ptr(qp, ref); 17679689912eSchristos 17689689912eSchristos if (size == 2) { 17699689912eSchristos /* 17709689912eSchristos * move the other twig to the parent branch. 17719689912eSchristos */ 17729689912eSchristos *n = twigs[!pos]; 17739689912eSchristos squash_twigs(qp, ref, 2); 17749689912eSchristos } else { 17759689912eSchristos /* 17769689912eSchristos * shrink the twigs in place, to avoid using the bump 17779689912eSchristos * chunk too fast - the gc will clean up after us 17789689912eSchristos */ 17799689912eSchristos *n = make_node(branch_index(n) & ~(1ULL << bit), ref); 17809689912eSchristos move_twigs(twigs + pos, twigs + pos + 1, size - pos - 1); 17819689912eSchristos squash_twigs(qp, ref + size - 1, 1); 17829689912eSchristos } 17839689912eSchristos 17849689912eSchristos return ISC_R_SUCCESS; 17859689912eSchristos } 17869689912eSchristos 17879689912eSchristos isc_result_t 17889689912eSchristos dns_qp_deletename(dns_qp_t *qp, const dns_name_t *name, void **pval_r, 17899689912eSchristos uint32_t *ival_r) { 17909689912eSchristos dns_qpkey_t key; 17919689912eSchristos size_t keylen = dns_qpkey_fromname(key, name); 17929689912eSchristos return dns_qp_deletekey(qp, key, keylen, pval_r, ival_r); 17939689912eSchristos } 17949689912eSchristos 17959689912eSchristos /*********************************************************************** 17969689912eSchristos * chains 17979689912eSchristos */ 17989689912eSchristos static void 17999689912eSchristos maybe_set_name(dns_qpreader_t *qp, dns_qpnode_t *node, dns_name_t *name) { 18009689912eSchristos dns_qpkey_t key; 18019689912eSchristos size_t len; 18029689912eSchristos 18039689912eSchristos if (name == NULL) { 18049689912eSchristos return; 18059689912eSchristos } 18069689912eSchristos 18079689912eSchristos dns_name_reset(name); 18089689912eSchristos len = leaf_qpkey(qp, node, key); 18099689912eSchristos dns_qpkey_toname(key, len, name); 18109689912eSchristos } 18119689912eSchristos 18129689912eSchristos void 18139689912eSchristos dns_qpchain_init(dns_qpreadable_t qpr, dns_qpchain_t *chain) { 18149689912eSchristos dns_qpreader_t *qp = dns_qpreader(qpr); 18159689912eSchristos REQUIRE(QP_VALID(qp)); 18169689912eSchristos REQUIRE(chain != NULL); 18179689912eSchristos 18189689912eSchristos *chain = (dns_qpchain_t){ 18199689912eSchristos .magic = QPCHAIN_MAGIC, 18209689912eSchristos .qp = qp, 18219689912eSchristos }; 18229689912eSchristos } 18239689912eSchristos 18249689912eSchristos unsigned int 18259689912eSchristos dns_qpchain_length(dns_qpchain_t *chain) { 18269689912eSchristos REQUIRE(QPCHAIN_VALID(chain)); 18279689912eSchristos 18289689912eSchristos return chain->len; 18299689912eSchristos } 18309689912eSchristos 18319689912eSchristos void 18329689912eSchristos dns_qpchain_node(dns_qpchain_t *chain, unsigned int level, dns_name_t *name, 18339689912eSchristos void **pval_r, uint32_t *ival_r) { 18349689912eSchristos dns_qpnode_t *node = NULL; 18359689912eSchristos 18369689912eSchristos REQUIRE(QPCHAIN_VALID(chain)); 18379689912eSchristos REQUIRE(level < chain->len); 18389689912eSchristos 18399689912eSchristos node = chain->chain[level].node; 18409689912eSchristos maybe_set_name(chain->qp, node, name); 18419689912eSchristos SET_IF_NOT_NULL(pval_r, leaf_pval(node)); 18429689912eSchristos SET_IF_NOT_NULL(ival_r, leaf_ival(node)); 18439689912eSchristos } 18449689912eSchristos 18459689912eSchristos /*********************************************************************** 18469689912eSchristos * iterators 18479689912eSchristos */ 18489689912eSchristos 18499689912eSchristos void 18509689912eSchristos dns_qpiter_init(dns_qpreadable_t qpr, dns_qpiter_t *qpi) { 18519689912eSchristos dns_qpreader_t *qp = dns_qpreader(qpr); 18529689912eSchristos REQUIRE(QP_VALID(qp)); 18539689912eSchristos REQUIRE(qpi != NULL); 18549689912eSchristos *qpi = (dns_qpiter_t){ 18559689912eSchristos .qp = qp, 18569689912eSchristos .magic = QPITER_MAGIC, 18579689912eSchristos }; 18589689912eSchristos } 18599689912eSchristos 18609689912eSchristos /* 18619689912eSchristos * are we at the last twig in this branch (in whichever direction 18629689912eSchristos * we're iterating)? 18639689912eSchristos */ 18649689912eSchristos static bool 18659689912eSchristos last_twig(dns_qpiter_t *qpi, bool forward) { 18669689912eSchristos dns_qpweight_t pos = 0, max = 0; 18679689912eSchristos if (qpi->sp > 0) { 18689689912eSchristos dns_qpnode_t *child = qpi->stack[qpi->sp]; 18699689912eSchristos dns_qpnode_t *parent = qpi->stack[qpi->sp - 1]; 18709689912eSchristos pos = child - ref_ptr(qpi->qp, branch_twigs_ref(parent)); 18719689912eSchristos if (forward) { 18729689912eSchristos max = branch_twigs_size(parent) - 1; 18739689912eSchristos } 18749689912eSchristos } 18759689912eSchristos return pos == max; 18769689912eSchristos } 18779689912eSchristos 18789689912eSchristos /* 18799689912eSchristos * move a QP iterator forward or back to the next or previous leaf. 18809689912eSchristos * note: this function can go wrong when the iterator refers to 18819689912eSchristos * a mutable view of the trie which is altered while iterating 18829689912eSchristos */ 18839689912eSchristos static isc_result_t 18849689912eSchristos iterate(bool forward, dns_qpiter_t *qpi, dns_name_t *name, void **pval_r, 18859689912eSchristos uint32_t *ival_r) { 18869689912eSchristos dns_qpnode_t *node = NULL; 18879689912eSchristos bool initial_branch = true; 18889689912eSchristos 18899689912eSchristos REQUIRE(QPITER_VALID(qpi)); 18909689912eSchristos 18919689912eSchristos dns_qpreader_t *qp = qpi->qp; 18929689912eSchristos 18939689912eSchristos REQUIRE(QP_VALID(qp)); 18949689912eSchristos 18959689912eSchristos node = get_root(qp); 18969689912eSchristos if (node == NULL) { 18979689912eSchristos return ISC_R_NOMORE; 18989689912eSchristos } 18999689912eSchristos 19009689912eSchristos do { 19019689912eSchristos if (qpi->stack[qpi->sp] == NULL) { 19029689912eSchristos /* newly initialized iterator: use the root node */ 19039689912eSchristos INSIST(qpi->sp == 0); 19049689912eSchristos qpi->stack[0] = node; 19059689912eSchristos } else if (!initial_branch) { 19069689912eSchristos /* 19079689912eSchristos * in a prior loop, we reached a branch; from 19089689912eSchristos * here we just need to get the highest or lowest 19099689912eSchristos * leaf in the subtree; we don't need to bother 19109689912eSchristos * stepping forward or backward through twigs 19119689912eSchristos * anymore. 19129689912eSchristos */ 19139689912eSchristos INSIST(qpi->sp > 0); 19149689912eSchristos } else if (last_twig(qpi, forward)) { 19159689912eSchristos /* 19169689912eSchristos * we've stepped to the end (or the beginning, 19179689912eSchristos * if we're iterating backwards) of a set of twigs. 19189689912eSchristos */ 19199689912eSchristos if (qpi->sp == 0) { 19209689912eSchristos /* 19219689912eSchristos * we've finished iterating. reinitialize 19229689912eSchristos * the iterator, then return ISC_R_NOMORE. 19239689912eSchristos */ 19249689912eSchristos dns_qpiter_init(qpi->qp, qpi); 19259689912eSchristos return ISC_R_NOMORE; 19269689912eSchristos } 19279689912eSchristos 19289689912eSchristos /* 19299689912eSchristos * pop the stack, and resume at the parent branch. 19309689912eSchristos */ 19319689912eSchristos qpi->stack[qpi->sp] = NULL; 19329689912eSchristos qpi->sp--; 19339689912eSchristos continue; 19349689912eSchristos } else { 19359689912eSchristos /* 19369689912eSchristos * there are more twigs in the current branch, 19379689912eSchristos * so step the node pointer forward (or back). 19389689912eSchristos */ 19399689912eSchristos qpi->stack[qpi->sp] += (forward ? 1 : -1); 19409689912eSchristos node = qpi->stack[qpi->sp]; 19419689912eSchristos } 19429689912eSchristos 19439689912eSchristos /* 19449689912eSchristos * if we're at a branch now, we loop down to the 19459689912eSchristos * left- or rightmost leaf. 19469689912eSchristos */ 19479689912eSchristos if (is_branch(node)) { 19489689912eSchristos qpi->sp++; 19499689912eSchristos INSIST(qpi->sp < DNS_QP_MAXKEY); 19509689912eSchristos node = ref_ptr(qp, branch_twigs_ref(node)) + 19519689912eSchristos (forward ? 0 : branch_twigs_size(node) - 1); 19529689912eSchristos qpi->stack[qpi->sp] = node; 19539689912eSchristos initial_branch = false; 19549689912eSchristos } 19559689912eSchristos } while (is_branch(node)); 19569689912eSchristos 19579689912eSchristos /* we're at a leaf: return its data to the caller */ 19589689912eSchristos SET_IF_NOT_NULL(pval_r, leaf_pval(node)); 19599689912eSchristos SET_IF_NOT_NULL(ival_r, leaf_ival(node)); 19609689912eSchristos maybe_set_name(qpi->qp, node, name); 19619689912eSchristos return ISC_R_SUCCESS; 19629689912eSchristos } 19639689912eSchristos 19649689912eSchristos isc_result_t 19659689912eSchristos dns_qpiter_next(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r, 19669689912eSchristos uint32_t *ival_r) { 19679689912eSchristos return iterate(true, qpi, name, pval_r, ival_r); 19689689912eSchristos } 19699689912eSchristos 19709689912eSchristos isc_result_t 19719689912eSchristos dns_qpiter_prev(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r, 19729689912eSchristos uint32_t *ival_r) { 19739689912eSchristos return iterate(false, qpi, name, pval_r, ival_r); 19749689912eSchristos } 19759689912eSchristos 19769689912eSchristos isc_result_t 19779689912eSchristos dns_qpiter_current(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r, 19789689912eSchristos uint32_t *ival_r) { 19799689912eSchristos dns_qpnode_t *node = NULL; 19809689912eSchristos 19819689912eSchristos REQUIRE(QPITER_VALID(qpi)); 19829689912eSchristos 19839689912eSchristos node = qpi->stack[qpi->sp]; 19849689912eSchristos if (node == NULL || is_branch(node)) { 19859689912eSchristos return ISC_R_FAILURE; 19869689912eSchristos } 19879689912eSchristos 19889689912eSchristos SET_IF_NOT_NULL(pval_r, leaf_pval(node)); 19899689912eSchristos SET_IF_NOT_NULL(ival_r, leaf_ival(node)); 19909689912eSchristos maybe_set_name(qpi->qp, node, name); 19919689912eSchristos return ISC_R_SUCCESS; 19929689912eSchristos } 19939689912eSchristos 19949689912eSchristos /*********************************************************************** 19959689912eSchristos * 19969689912eSchristos * search 19979689912eSchristos */ 19989689912eSchristos 19999689912eSchristos isc_result_t 20009689912eSchristos dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key, 20019689912eSchristos size_t search_keylen, void **pval_r, uint32_t *ival_r) { 20029689912eSchristos dns_qpreader_t *qp = dns_qpreader(qpr); 20039689912eSchristos dns_qpkey_t found_key; 20049689912eSchristos size_t found_keylen; 20059689912eSchristos dns_qpshift_t bit; 20069689912eSchristos dns_qpnode_t *n = NULL; 20079689912eSchristos 20089689912eSchristos REQUIRE(QP_VALID(qp)); 20099689912eSchristos REQUIRE(search_keylen < sizeof(dns_qpkey_t)); 20109689912eSchristos 20119689912eSchristos n = get_root(qp); 20129689912eSchristos if (n == NULL) { 20139689912eSchristos return ISC_R_NOTFOUND; 20149689912eSchristos } 20159689912eSchristos 20169689912eSchristos while (is_branch(n)) { 20179689912eSchristos prefetch_twigs(qp, n); 20189689912eSchristos bit = branch_keybit(n, search_key, search_keylen); 20199689912eSchristos if (!branch_has_twig(n, bit)) { 20209689912eSchristos return ISC_R_NOTFOUND; 20219689912eSchristos } 20229689912eSchristos n = branch_twig_ptr(qp, n, bit); 20239689912eSchristos } 20249689912eSchristos 20259689912eSchristos found_keylen = leaf_qpkey(qp, n, found_key); 20269689912eSchristos if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) != 20279689912eSchristos QPKEY_EQUAL) 20289689912eSchristos { 20299689912eSchristos return ISC_R_NOTFOUND; 20309689912eSchristos } 20319689912eSchristos 20329689912eSchristos SET_IF_NOT_NULL(pval_r, leaf_pval(n)); 20339689912eSchristos SET_IF_NOT_NULL(ival_r, leaf_ival(n)); 20349689912eSchristos return ISC_R_SUCCESS; 20359689912eSchristos } 20369689912eSchristos 20379689912eSchristos isc_result_t 20389689912eSchristos dns_qp_getname(dns_qpreadable_t qpr, const dns_name_t *name, void **pval_r, 20399689912eSchristos uint32_t *ival_r) { 20409689912eSchristos dns_qpkey_t key; 20419689912eSchristos size_t keylen = dns_qpkey_fromname(key, name); 20429689912eSchristos return dns_qp_getkey(qpr, key, keylen, pval_r, ival_r); 20439689912eSchristos } 20449689912eSchristos 20459689912eSchristos static inline void 20469689912eSchristos add_link(dns_qpchain_t *chain, dns_qpnode_t *node, size_t offset) { 20479689912eSchristos /* prevent duplication */ 20489689912eSchristos if (chain->len != 0 && chain->chain[chain->len - 1].node == node) { 20499689912eSchristos return; 20509689912eSchristos } 20519689912eSchristos chain->chain[chain->len].node = node; 20529689912eSchristos chain->chain[chain->len].offset = offset; 20539689912eSchristos chain->len++; 20549689912eSchristos INSIST(chain->len <= DNS_NAME_MAXLABELS); 20559689912eSchristos } 20569689912eSchristos 20579689912eSchristos static inline void 20589689912eSchristos prevleaf(dns_qpiter_t *it) { 20599689912eSchristos isc_result_t result = dns_qpiter_prev(it, NULL, NULL, NULL); 20609689912eSchristos if (result == ISC_R_NOMORE) { 20619689912eSchristos result = dns_qpiter_prev(it, NULL, NULL, NULL); 20629689912eSchristos } 20639689912eSchristos RUNTIME_CHECK(result == ISC_R_SUCCESS); 20649689912eSchristos } 20659689912eSchristos 20669689912eSchristos static inline void 20679689912eSchristos greatest_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpiter_t *iter) { 20689689912eSchristos while (is_branch(n)) { 20699689912eSchristos dns_qpref_t ref = branch_twigs_ref(n) + branch_twigs_size(n) - 20709689912eSchristos 1; 20719689912eSchristos iter->stack[++iter->sp] = n; 20729689912eSchristos n = ref_ptr(qpr, ref); 20739689912eSchristos } 20749689912eSchristos iter->stack[++iter->sp] = n; 20759689912eSchristos } 20769689912eSchristos 20779689912eSchristos static inline dns_qpnode_t * 20789689912eSchristos anyleaf(dns_qpreader_t *qp, dns_qpnode_t *n) { 20799689912eSchristos while (is_branch(n)) { 20809689912eSchristos n = branch_twigs(qp, n); 20819689912eSchristos } 20829689912eSchristos return n; 20839689912eSchristos } 20849689912eSchristos 20859689912eSchristos static inline int 20869689912eSchristos twig_offset(dns_qpnode_t *n, dns_qpshift_t sbit, dns_qpshift_t kbit, 20879689912eSchristos dns_qpshift_t fbit) { 20889689912eSchristos dns_qpweight_t pos = branch_twig_pos(n, sbit); 20899689912eSchristos if (branch_has_twig(n, sbit)) { 20909689912eSchristos return pos - (kbit < fbit); 20919689912eSchristos } 20929689912eSchristos return pos - 1; 20939689912eSchristos } 20949689912eSchristos 20959689912eSchristos /* 20969689912eSchristos * If dns_qp_lookup() was passed an iterator, we want it to point at the 20979689912eSchristos * matching name in the case of an exact match, or at the predecessor name 20989689912eSchristos * for a non-exact match. 20999689912eSchristos * 21009689912eSchristos * If there is an exact match, then there is nothing to be done. Otherwise, 21019689912eSchristos * we pop up the iterator stack until we find a parent branch with an offset 21029689912eSchristos * that is before the position where the search key differs from the found key. 21039689912eSchristos * From there we can step to the leaf that is the predecessor of the searched 21049689912eSchristos * name. 21059689912eSchristos * 21069689912eSchristos * Requires the iterator to be pointing at a leaf node. 21079689912eSchristos */ 21089689912eSchristos static void 21099689912eSchristos fix_iterator(dns_qpreader_t *qp, dns_qpiter_t *it, dns_qpkey_t key, 21109689912eSchristos size_t len) { 21119689912eSchristos dns_qpnode_t *n = it->stack[it->sp]; 21129689912eSchristos 21139689912eSchristos REQUIRE(!is_branch(n)); 21149689912eSchristos 21159689912eSchristos dns_qpkey_t found; 21169689912eSchristos size_t foundlen = leaf_qpkey(qp, n, found); 21179689912eSchristos size_t to = qpkey_compare(key, len, found, foundlen); 21189689912eSchristos 21199689912eSchristos /* If the keys are equal, the iterator is already at the right node. */ 21209689912eSchristos if (to == QPKEY_EQUAL) { 21219689912eSchristos return; 21229689912eSchristos } 21239689912eSchristos 21249689912eSchristos /* 21259689912eSchristos * Special case: if the key differs even before the root 21269689912eSchristos * key offset, it means the name desired either precedes or 21279689912eSchristos * follows the entire range of names in the database, and 21289689912eSchristos * popping up the stack won't help us, so just move the 21299689912eSchristos * iterator one step back from the origin and return. 21309689912eSchristos */ 21319689912eSchristos if (to < branch_key_offset(it->stack[0])) { 21329689912eSchristos dns_qpiter_init(qp, it); 21339689912eSchristos prevleaf(it); 21349689912eSchristos return; 21359689912eSchristos } 21369689912eSchristos 21379689912eSchristos /* 21389689912eSchristos * As long as the branch offset point is after the point where the 21399689912eSchristos * key differs, we need to branch up and find a better node. 21409689912eSchristos */ 21419689912eSchristos while (it->sp > 0) { 21429689912eSchristos dns_qpnode_t *b = it->stack[it->sp - 1]; 21439689912eSchristos if (branch_key_offset(b) < to) { 21449689912eSchristos break; 21459689912eSchristos } 21469689912eSchristos it->sp--; 21479689912eSchristos } 21489689912eSchristos n = it->stack[it->sp]; 21499689912eSchristos 21509689912eSchristos /* 21519689912eSchristos * Either we are now at the correct branch, or we are at the 21529689912eSchristos * first unmatched node. Determine the bit position for the 21539689912eSchristos * twig we need (sbit). 21549689912eSchristos */ 21559689912eSchristos dns_qpshift_t kbit = qpkey_bit(key, len, to); 21569689912eSchristos dns_qpshift_t fbit = qpkey_bit(found, foundlen, to); 21579689912eSchristos dns_qpshift_t sbit = 0; 21589689912eSchristos 21599689912eSchristos if (is_branch(n) && branch_key_offset(n) == to) { 21609689912eSchristos /* We are on the correct branch now. */ 21619689912eSchristos sbit = kbit; 21629689912eSchristos } else if (it->sp == 0) { 21639689912eSchristos /* 21649689912eSchristos * We are on the root branch, popping up the stack won't 21659689912eSchristos * help us, so just move the iterator one step back from the 21669689912eSchristos * origin and return. 21679689912eSchristos */ 21689689912eSchristos dns_qpiter_init(qp, it); 21699689912eSchristos prevleaf(it); 21709689912eSchristos return; 21719689912eSchristos } else { 21729689912eSchristos /* We are at the first unmatched node, pop up the stack. */ 21739689912eSchristos n = it->stack[--it->sp]; 21749689912eSchristos sbit = qpkey_bit(key, len, branch_key_offset(n)); 21759689912eSchristos } 21769689912eSchristos 21779689912eSchristos INSIST(is_branch(n)); 21789689912eSchristos 21799689912eSchristos prefetch_twigs(qp, n); 21809689912eSchristos dns_qpnode_t *twigs = branch_twigs(qp, n); 21819689912eSchristos int toff = twig_offset(n, sbit, kbit, fbit); 21829689912eSchristos if (toff >= 0) { 21839689912eSchristos /* 21849689912eSchristos * The name we want would've been after some twig in 21859689912eSchristos * this branch. Walk down from that twig to the 21869689912eSchristos * highest leaf in its subtree to get the predecessor. 21879689912eSchristos */ 21889689912eSchristos greatest_leaf(qp, twigs + toff, it); 21899689912eSchristos } else { 21909689912eSchristos /* 21919689912eSchristos * Every leaf below this node is greater than the one we 21929689912eSchristos * wanted, so the previous leaf is the predecessor. 21939689912eSchristos */ 21949689912eSchristos prevleaf(it); 21959689912eSchristos } 21969689912eSchristos } 21979689912eSchristos 21989689912eSchristos /* 21999689912eSchristos * When searching for a requested name in dns_qp_lookup(), we might add 22009689912eSchristos * a leaf node to the chain, then subsequently determine that it was a 22019689912eSchristos * dead end. When this happens, the chain can be left holding a node 22029689912eSchristos * that is *not* an ancestor of the requested name. We correct for that 22039689912eSchristos * here. 22049689912eSchristos */ 22059689912eSchristos static void 22069689912eSchristos fix_chain(dns_qpchain_t *chain, size_t offset) { 22079689912eSchristos while (chain->len > 0 && chain->chain[chain->len - 1].offset >= offset) 22089689912eSchristos { 22099689912eSchristos chain->len--; 22109689912eSchristos chain->chain[chain->len].node = NULL; 22119689912eSchristos chain->chain[chain->len].offset = 0; 22129689912eSchristos } 22139689912eSchristos } 22149689912eSchristos 22159689912eSchristos isc_result_t 22169689912eSchristos dns_qp_lookup(dns_qpreadable_t qpr, const dns_name_t *name, 22179689912eSchristos dns_name_t *foundname, dns_qpiter_t *iter, dns_qpchain_t *chain, 22189689912eSchristos void **pval_r, uint32_t *ival_r) { 22199689912eSchristos dns_qpreader_t *qp = dns_qpreader(qpr); 22209689912eSchristos dns_qpkey_t search, found; 22219689912eSchristos size_t searchlen, foundlen; 22229689912eSchristos size_t offset = 0; 22239689912eSchristos dns_qpnode_t *n = NULL; 22249689912eSchristos dns_qpshift_t bit = SHIFT_NOBYTE; 22259689912eSchristos dns_qpchain_t oc; 22269689912eSchristos dns_qpiter_t it; 22279689912eSchristos bool matched = false; 22289689912eSchristos bool setiter = true; 22299689912eSchristos 22309689912eSchristos REQUIRE(QP_VALID(qp)); 22319689912eSchristos REQUIRE(foundname == NULL || ISC_MAGIC_VALID(name, DNS_NAME_MAGIC)); 22329689912eSchristos 22339689912eSchristos searchlen = dns_qpkey_fromname(search, name); 22349689912eSchristos 22359689912eSchristos if (chain == NULL) { 22369689912eSchristos chain = &oc; 22379689912eSchristos } 22389689912eSchristos if (iter == NULL) { 22399689912eSchristos iter = ⁢ 22409689912eSchristos setiter = false; 22419689912eSchristos } 22429689912eSchristos dns_qpchain_init(qp, chain); 22439689912eSchristos dns_qpiter_init(qp, iter); 22449689912eSchristos 22459689912eSchristos n = get_root(qp); 22469689912eSchristos if (n == NULL) { 22479689912eSchristos return ISC_R_NOTFOUND; 22489689912eSchristos } 22499689912eSchristos iter->stack[0] = n; 22509689912eSchristos 22519689912eSchristos /* 22529689912eSchristos * Like `dns_qp_insert()`, we must find a leaf. However, we don't make a 22539689912eSchristos * second pass: instead, we keep track of any leaves with shorter keys 22549689912eSchristos * that we discover along the way. (In general, qp-trie searches can be 22559689912eSchristos * one-pass, by recording their traversal, or two-pass, for less stack 22569689912eSchristos * memory usage.) 22579689912eSchristos */ 22589689912eSchristos while (is_branch(n)) { 22599689912eSchristos prefetch_twigs(qp, n); 22609689912eSchristos 22619689912eSchristos offset = branch_key_offset(n); 22629689912eSchristos bit = qpkey_bit(search, searchlen, offset); 22639689912eSchristos dns_qpnode_t *twigs = branch_twigs(qp, n); 22649689912eSchristos 22659689912eSchristos /* 22669689912eSchristos * A shorter key that can be a parent domain always has a 22679689912eSchristos * leaf node at SHIFT_NOBYTE (indicating end of its key) 22689689912eSchristos * where our search key has a normal character immediately 22699689912eSchristos * after a label separator. 22709689912eSchristos * 22719689912eSchristos * Note 1: It is OK if `off - 1` underflows: it will 22729689912eSchristos * become SIZE_MAX, which is greater than `searchlen`, so 22739689912eSchristos * `qpkey_bit()` will return SHIFT_NOBYTE, which is what we 22749689912eSchristos * want when `off == 0`. 22759689912eSchristos * 22769689912eSchristos * Note 2: If SHIFT_NOBYTE twig is present, it will always 22779689912eSchristos * be in position 0, the first location in 'twigs'. 22789689912eSchristos */ 22799689912eSchristos if (bit != SHIFT_NOBYTE && branch_has_twig(n, SHIFT_NOBYTE) && 22809689912eSchristos qpkey_bit(search, searchlen, offset - 1) == SHIFT_NOBYTE && 22819689912eSchristos !is_branch(twigs)) 22829689912eSchristos { 22839689912eSchristos add_link(chain, twigs, offset); 22849689912eSchristos } 22859689912eSchristos 22869689912eSchristos matched = branch_has_twig(n, bit); 22879689912eSchristos if (matched) { 22889689912eSchristos /* 22899689912eSchristos * found a match: if it's a branch, we keep 22909689912eSchristos * searching, and if it's a leaf, we drop out of 22919689912eSchristos * the loop. 22929689912eSchristos */ 22939689912eSchristos n = branch_twig_ptr(qp, n, bit); 22949689912eSchristos } else { 22959689912eSchristos /* 22969689912eSchristos * this branch is a dead end, and the predecessor 22979689912eSchristos * doesn't matter. now we just need to find a leaf 22989689912eSchristos * to end on so that qpkey_leaf() will work below. 22999689912eSchristos */ 23009689912eSchristos n = anyleaf(qp, twigs); 23019689912eSchristos } 23029689912eSchristos 23039689912eSchristos iter->stack[++iter->sp] = n; 23049689912eSchristos } 23059689912eSchristos 23069689912eSchristos if (setiter) { 23079689912eSchristos /* 23089689912eSchristos * we found a leaf, but it might not be the leaf we wanted. 23099689912eSchristos * if it isn't, and if the caller passed us an iterator, 23109689912eSchristos * then we might need to reposition it. 23119689912eSchristos */ 23129689912eSchristos fix_iterator(qp, iter, search, searchlen); 23139689912eSchristos n = iter->stack[iter->sp]; 23149689912eSchristos } 23159689912eSchristos 23169689912eSchristos /* at this point, n can only be a leaf node */ 23179689912eSchristos INSIST(!is_branch(n)); 23189689912eSchristos 23199689912eSchristos foundlen = leaf_qpkey(qp, n, found); 23209689912eSchristos offset = qpkey_compare(search, searchlen, found, foundlen); 23219689912eSchristos 23229689912eSchristos /* the search ended with an exact or partial match */ 23239689912eSchristos if (offset == QPKEY_EQUAL || offset == foundlen) { 23249689912eSchristos isc_result_t result = ISC_R_SUCCESS; 23259689912eSchristos 23269689912eSchristos if (offset == foundlen) { 23279689912eSchristos fix_chain(chain, offset); 23289689912eSchristos result = DNS_R_PARTIALMATCH; 23299689912eSchristos } 23309689912eSchristos add_link(chain, n, offset); 23319689912eSchristos 23329689912eSchristos SET_IF_NOT_NULL(pval_r, leaf_pval(n)); 23339689912eSchristos SET_IF_NOT_NULL(ival_r, leaf_ival(n)); 23349689912eSchristos maybe_set_name(qp, n, foundname); 23359689912eSchristos return result; 23369689912eSchristos } 23379689912eSchristos 23389689912eSchristos /* 23399689912eSchristos * the requested name was not found, but if an ancestor 23409689912eSchristos * was, we can retrieve that from the chain. 23419689912eSchristos */ 23429689912eSchristos int len = chain->len; 23439689912eSchristos while (len-- > 0) { 23449689912eSchristos if (offset >= chain->chain[len].offset) { 23459689912eSchristos n = chain->chain[len].node; 23469689912eSchristos SET_IF_NOT_NULL(pval_r, leaf_pval(n)); 23479689912eSchristos SET_IF_NOT_NULL(ival_r, leaf_ival(n)); 23489689912eSchristos maybe_set_name(qp, n, foundname); 23499689912eSchristos return DNS_R_PARTIALMATCH; 23509689912eSchristos } else { 23519689912eSchristos /* 23529689912eSchristos * oops, during the search we found and added 23539689912eSchristos * a leaf that's longer than the requested 23549689912eSchristos * name; remove it from the chain. 23559689912eSchristos */ 23569689912eSchristos chain->len--; 23579689912eSchristos } 23589689912eSchristos } 23599689912eSchristos 23609689912eSchristos /* nothing was found at all */ 23619689912eSchristos return ISC_R_NOTFOUND; 23629689912eSchristos } 23639689912eSchristos 23649689912eSchristos /**********************************************************************/ 2365