xref: /netbsd-src/external/mpl/bind/dist/lib/dns/qp.c (revision f7a6843ec2594e18c4f81c87e47ae35bab70f5e7)
1 /*	$NetBSD: qp.c,v 1.4 2025/01/27 15:40:36 christos Exp $	*/
2 
3 /*
4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5  *
6  * SPDX-License-Identifier: MPL-2.0
7  *
8  * This Source Code Form is subject to the terms of the Mozilla Public
9  * License, v. 2.0. If a copy of the MPL was not distributed with this
10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11  *
12  * See the COPYRIGHT file distributed with this work for additional
13  * information regarding copyright ownership.
14  */
15 
16 /*
17  * For an overview, see doc/design/qp-trie.md
18  */
19 
20 #include <inttypes.h>
21 #include <stdbool.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <string.h>
25 
26 #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
27 #include <sys/mman.h>
28 #include <unistd.h>
29 #endif
30 
31 #include <isc/atomic.h>
32 #include <isc/buffer.h>
33 #include <isc/magic.h>
34 #include <isc/mem.h>
35 #include <isc/mutex.h>
36 #include <isc/refcount.h>
37 #include <isc/result.h>
38 #include <isc/rwlock.h>
39 #include <isc/tid.h>
40 #include <isc/time.h>
41 #include <isc/types.h>
42 #include <isc/urcu.h>
43 #include <isc/util.h>
44 
45 #include <dns/fixedname.h>
46 #include <dns/log.h>
47 #include <dns/name.h>
48 #include <dns/qp.h>
49 #include <dns/types.h>
50 
51 #include "qp_p.h"
52 
53 #ifndef DNS_QP_LOG_STATS
54 #define DNS_QP_LOG_STATS 1
55 #endif
56 #ifndef DNS_QP_TRACE
57 #define DNS_QP_TRACE 0
58 #endif
59 
60 /*
61  * very basic garbage collector statistics
62  *
63  * XXXFANF for now we're logging GC times, but ideally we should
64  * accumulate stats more quietly and report via the statschannel
65  */
66 #ifdef _LP64
67 static atomic_uint_fast64_t compact_time;
68 static atomic_uint_fast64_t recycle_time;
69 static atomic_uint_fast64_t rollback_time;
70 #define ISC_QP_ADD(v, a) atomic_fetch_add_relaxed(&(v), (a))
71 #define ISC_QP_GET(v) atomic_load_relaxed(&(v))
72 #else
73 static uint64_t compact_time;
74 static uint64_t recycle_time;
75 static uint64_t rollback_time;
76 static isc_mutex_t qp_mutex = PTHREAD_MUTEX_INITIALIZER;
77 #define ISC_QP_ADD(v, a) \
78 	({ \
79 		isc_mutex_lock(&qp_mutex); \
80 		uint64_t x = (v) + (a); \
81 		isc_mutex_unlock(&qp_mutex); \
82 		x; \
83 	})
84 #define ISC_QP_GET(v) \
85 	({ \
86 		isc_mutex_lock(&qp_mutex); \
87 		uint64_t x = (v); \
88 		isc_mutex_unlock(&qp_mutex); \
89 		x; \
90 	})
91 #endif
92 
93 
94 /* for LOG_STATS() format strings */
95 #define PRItime " %" PRIu64 " ns "
96 
97 #if DNS_QP_LOG_STATS
98 #define LOG_STATS(...)                                                      \
99 	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \
100 		      ISC_LOG_DEBUG(1), __VA_ARGS__)
101 #else
102 #define LOG_STATS(...)
103 #endif
104 
105 #if DNS_QP_TRACE
106 /*
107  * TRACE is generally used in allocation-related functions so it doesn't
108  * trace very high-frequency ops
109  */
110 #define TRACE(fmt, ...)                                                        \
111 	do {                                                                   \
112 		if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(7))) {            \
113 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,      \
114 				      DNS_LOGMODULE_QP, ISC_LOG_DEBUG(7),      \
115 				      "%s:%d:%s(qp %p uctx \"%s\"):t%u: " fmt, \
116 				      __FILE__, __LINE__, __func__, qp,        \
117 				      qp ? TRIENAME(qp) : "(null)", isc_tid(), \
118 				      ##__VA_ARGS__);                          \
119 		}                                                              \
120 	} while (0)
121 #else
122 #define TRACE(...)
123 #endif
124 
125 /***********************************************************************
126  *
127  *  converting DNS names to trie keys
128  */
129 
130 /*
131  * Number of distinct byte values, i.e. 256
132  */
133 #define BYTE_VALUES (UINT8_MAX + 1)
134 
135 /*
136  * Lookup table mapping bytes in DNS names to bit positions, used
137  * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
138  *
139  * Each element holds one or two bit positions, bit_one in the
140  * lower half and bit_two in the upper half.
141  *
142  * For common hostname characters, bit_two is zero (which cannot
143  * be a valid bit position).
144  *
145  * For others, bit_one is the escape bit, and bit_two is the
146  * position of the character within the escaped range.
147  */
148 uint16_t dns_qp_bits_for_byte[BYTE_VALUES] = { 0 };
149 
150 /*
151  * And the reverse, mapping bit positions to characters, so the tests
152  * can print diagnostics involving qp-trie keys.
153  *
154  * This table only handles the first bit in an escape sequence; we
155  * arrange that we can calculate the byte value for both bits by
156  * adding the the second bit to the first bit's byte value.
157  */
158 uint8_t dns_qp_byte_for_bit[SHIFT_OFFSET] = { 0 };
159 
160 /*
161  * Fill in the lookup tables at program startup. (It doesn't matter
162  * when this is initialized relative to other startup code.)
163  */
164 static void
165 initialize_bits_for_byte(void) ISC_CONSTRUCTOR;
166 
167 /*
168  * The bit positions for bytes inside labels have to be between
169  * SHIFT_BITMAP and SHIFT_OFFSET. (SHIFT_NOBYTE separates labels.)
170  *
171  * Each byte range in between common hostname characters has a different
172  * escape character, to preserve the correct lexical order.
173  *
174  * Escaped byte ranges mostly fit into the space available in the
175  * bitmap, except for those above 'z' (which is mostly bytes with the
176  * top bit set). So, when we reach the end of the bitmap we roll over
177  * to the next escape character.
178  *
179  * After filling the table we ensure that the bit positions for
180  * hostname characters and escape characters all fit.
181  */
182 static void
183 initialize_bits_for_byte(void) {
184 	/* zero common character marker not a valid shift position */
185 	INSIST(0 < SHIFT_BITMAP);
186 	/* first bit is common byte or escape byte */
187 	dns_qpshift_t bit_one = SHIFT_BITMAP;
188 	/* second bit is position in escaped range */
189 	dns_qpshift_t bit_two = SHIFT_BITMAP;
190 	bool escaping = true;
191 
192 	for (unsigned int byte = 0; byte < BYTE_VALUES; byte++) {
193 		if (qp_common_character(byte)) {
194 			escaping = false;
195 			bit_one++;
196 			dns_qp_byte_for_bit[bit_one] = byte;
197 			dns_qp_bits_for_byte[byte] = bit_one;
198 		} else if ('A' <= byte && byte <= 'Z') {
199 			/* map upper case to lower case */
200 			dns_qpshift_t after_esc = bit_one + 1;
201 			dns_qpshift_t skip_punct = 'a' - '_';
202 			dns_qpshift_t letter = byte - 'A';
203 			dns_qpshift_t bit = after_esc + skip_punct + letter;
204 			dns_qp_bits_for_byte[byte] = bit;
205 			/* to simplify reverse conversion */
206 			bit_two++;
207 		} else {
208 			/* non-hostname characters need to be escaped */
209 			if (!escaping || bit_two >= SHIFT_OFFSET) {
210 				escaping = true;
211 				bit_one++;
212 				dns_qp_byte_for_bit[bit_one] = byte;
213 				bit_two = SHIFT_BITMAP;
214 			}
215 			dns_qp_bits_for_byte[byte] = bit_two << 8 | bit_one;
216 			bit_two++;
217 		}
218 	}
219 	ENSURE(bit_one < SHIFT_OFFSET);
220 }
221 
222 /*
223  * Convert a DNS name into a trie lookup key.
224  *
225  * Returns the length of the key.
226  *
227  * For performance we get our hands dirty in the guts of the name.
228  *
229  * We don't worry about the distinction between absolute and relative
230  * names. When the trie is only used with absolute names, the first byte
231  * of the key will always be SHIFT_NOBYTE and it will always be skipped
232  * when traversing the trie. So keeping the root label costs little, and
233  * it allows us to support tries of relative names too. In fact absolute
234  * and relative names can be mixed in the same trie without causing
235  * confusion, because the presence or absence of the initial
236  * SHIFT_NOBYTE in the key disambiguates them (exactly like a trailing
237  * dot in a zone file).
238  */
239 size_t
240 dns_qpkey_fromname(dns_qpkey_t key, const dns_name_t *name) {
241 	size_t len, label;
242 	dns_fixedname_t fixed;
243 
244 	REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
245 
246 	if (name->labels == 0) {
247 		key[0] = SHIFT_NOBYTE;
248 		return 0;
249 	}
250 
251 	if (name->offsets == NULL) {
252 		dns_name_t *clone = dns_fixedname_initname(&fixed);
253 		dns_name_clone(name, clone);
254 		name = clone;
255 	}
256 
257 	len = 0;
258 	label = name->labels;
259 	while (label-- > 0) {
260 		const uint8_t *ldata = name->ndata + name->offsets[label];
261 		size_t label_len = *ldata++;
262 		while (label_len-- > 0) {
263 			uint16_t bits = dns_qp_bits_for_byte[*ldata++];
264 			key[len++] = bits & 0xFF;	/* bit_one */
265 			if ((bits >> 8) != 0) {		/* escape? */
266 				key[len++] = bits >> 8; /* bit_two */
267 			}
268 		}
269 		/* label terminator */
270 		key[len++] = SHIFT_NOBYTE;
271 	}
272 	/* mark end with a double NOBYTE */
273 	key[len] = SHIFT_NOBYTE;
274 	ENSURE(len < sizeof(dns_qpkey_t));
275 	return len;
276 }
277 
278 void
279 dns_qpkey_toname(const dns_qpkey_t key, size_t keylen, dns_name_t *name) {
280 	size_t locs[DNS_NAME_MAXLABELS];
281 	size_t loc = 0, opos = 0;
282 	size_t offset;
283 
284 	REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
285 	REQUIRE(name->buffer != NULL);
286 	REQUIRE(name->offsets != NULL);
287 
288 	dns_name_reset(name);
289 
290 	if (keylen == 0) {
291 		return;
292 	}
293 
294 	/* Scan the key looking for label boundaries */
295 	for (offset = 0; offset <= keylen; offset++) {
296 		INSIST(key[offset] >= SHIFT_NOBYTE &&
297 		       key[offset] < SHIFT_OFFSET);
298 		INSIST(loc < DNS_NAME_MAXLABELS);
299 		if (qpkey_bit(key, keylen, offset) == SHIFT_NOBYTE) {
300 			if (qpkey_bit(key, keylen, offset + 1) == SHIFT_NOBYTE)
301 			{
302 				locs[loc] = offset + 1;
303 				goto scanned;
304 			}
305 			locs[loc++] = offset + 1;
306 		} else if (offset == 0) {
307 			/* This happens for a relative name */
308 			locs[loc++] = offset;
309 		}
310 	}
311 	UNREACHABLE();
312 scanned:
313 
314 	/*
315 	 * In the key the labels are encoded in reverse order, so
316 	 * we step backward through the label boundaries, then forward
317 	 * through the labels, to create the DNS wire format data.
318 	 */
319 	name->labels = loc;
320 	while (loc-- > 0) {
321 		uint8_t len = 0, *lenp = NULL;
322 
323 		/* Add a length byte to the name data and set an offset */
324 		lenp = isc_buffer_used(name->buffer);
325 		isc_buffer_putuint8(name->buffer, 0);
326 		name->offsets[opos++] = name->length++;
327 
328 		/* Convert from escaped byte ranges to ASCII */
329 		for (offset = locs[loc]; offset < locs[loc + 1] - 1; offset++) {
330 			uint8_t bit = qpkey_bit(key, keylen, offset);
331 			uint8_t byte = dns_qp_byte_for_bit[bit];
332 			if (qp_common_character(byte)) {
333 				isc_buffer_putuint8(name->buffer, byte);
334 			} else {
335 				byte += key[++offset] - SHIFT_BITMAP;
336 				isc_buffer_putuint8(name->buffer, byte);
337 			}
338 			len++;
339 		}
340 
341 		name->length += len;
342 		*lenp = len;
343 	}
344 
345 	/* Add a root label for absolute names */
346 	if (key[0] == SHIFT_NOBYTE) {
347 		name->attributes.absolute = true;
348 		isc_buffer_putuint8(name->buffer, 0);
349 		name->offsets[opos++] = name->length++;
350 		name->labels++;
351 	}
352 
353 	name->ndata = isc_buffer_base(name->buffer);
354 }
355 
356 /*
357  * Sentinel value for equal keys
358  */
359 #define QPKEY_EQUAL (~(size_t)0)
360 
361 /*
362  * Compare two keys and return the offset where they differ.
363  *
364  * This offset is used to work out where a trie search diverged: when one
365  * of the keys is in the trie and one is not, the common prefix (up to the
366  * offset) is the part of the unknown key that exists in the trie. This
367  * matters for adding new keys or finding neighbours of missing keys.
368  *
369  * When the keys are different lengths it is possible (but unwise) for
370  * the longer key to be the same as the shorter key but with superfluous
371  * trailing SHIFT_NOBYTE elements. This makes the keys equal for the
372  * purpose of traversing the trie.
373  */
374 static size_t
375 qpkey_compare(const dns_qpkey_t key_a, const size_t keylen_a,
376 	      const dns_qpkey_t key_b, const size_t keylen_b) {
377 	size_t keylen = ISC_MAX(keylen_a, keylen_b);
378 	for (size_t offset = 0; offset < keylen; offset++) {
379 		if (qpkey_bit(key_a, keylen_a, offset) !=
380 		    qpkey_bit(key_b, keylen_b, offset))
381 		{
382 			return offset;
383 		}
384 	}
385 	return QPKEY_EQUAL;
386 }
387 
388 /***********************************************************************
389  *
390  *  allocator wrappers
391  */
392 
393 #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
394 
395 /*
396  * Optionally (for debugging) during a copy-on-write transaction, use
397  * memory protection to ensure that the shared chunks are not modified.
398  * Once a chunk becomes shared, it remains read-only until it is freed.
399  * POSIX says we have to use mmap() to get an allocation that we can
400  * definitely pass to mprotect().
401  */
402 
403 static size_t
404 chunk_size_raw(void) {
405 	size_t size = (size_t)sysconf(_SC_PAGE_SIZE);
406 	return ISC_MAX(size, QP_CHUNK_BYTES);
407 }
408 
409 static void *
410 chunk_get_raw(dns_qp_t *qp) {
411 	if (qp->write_protect) {
412 		size_t size = chunk_size_raw();
413 		void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
414 				 MAP_ANON | MAP_PRIVATE, -1, 0);
415 		RUNTIME_CHECK(ptr != MAP_FAILED);
416 		return ptr;
417 	} else {
418 		return isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES);
419 	}
420 }
421 
422 static void
423 chunk_free_raw(dns_qp_t *qp, void *ptr) {
424 	if (qp->write_protect) {
425 		RUNTIME_CHECK(munmap(ptr, chunk_size_raw()) == 0);
426 	} else {
427 		isc_mem_free(qp->mctx, ptr);
428 	}
429 }
430 
431 static void *
432 chunk_shrink_raw(dns_qp_t *qp, void *ptr, size_t bytes) {
433 	if (qp->write_protect) {
434 		return ptr;
435 	} else {
436 		return isc_mem_reallocate(qp->mctx, ptr, bytes);
437 	}
438 }
439 
440 static void
441 write_protect(dns_qp_t *qp, dns_qpchunk_t chunk) {
442 	if (qp->write_protect) {
443 		/* see transaction_open() wrt this special case */
444 		if (qp->transaction_mode == QP_WRITE && chunk == qp->bump) {
445 			return;
446 		}
447 		TRACE("chunk %u", chunk);
448 		void *ptr = qp->base->ptr[chunk];
449 		size_t size = chunk_size_raw();
450 		RUNTIME_CHECK(mprotect(ptr, size, PROT_READ) >= 0);
451 	}
452 }
453 
454 #else
455 
456 #define chunk_get_raw(qp)	isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES)
457 #define chunk_free_raw(qp, ptr) isc_mem_free(qp->mctx, ptr)
458 
459 #define chunk_shrink_raw(qp, ptr, size) isc_mem_reallocate(qp->mctx, ptr, size)
460 
461 #define write_protect(qp, chunk)
462 
463 #endif
464 
465 /***********************************************************************
466  *
467  *  allocator
468  */
469 
470 /*
471  * When we reuse the bump chunk across multiple write transactions,
472  * it can have an immutable prefix and a mutable suffix.
473  */
474 static inline bool
475 cells_immutable(dns_qp_t *qp, dns_qpref_t ref) {
476 	dns_qpchunk_t chunk = ref_chunk(ref);
477 	dns_qpcell_t cell = ref_cell(ref);
478 	if (chunk == qp->bump) {
479 		return cell < qp->fender;
480 	} else {
481 		return qp->usage[chunk].immutable;
482 	}
483 }
484 
485 /*
486  * Create a fresh bump chunk and allocate some twigs from it.
487  */
488 static dns_qpref_t
489 chunk_alloc(dns_qp_t *qp, dns_qpchunk_t chunk, dns_qpweight_t size) {
490 	INSIST(qp->base->ptr[chunk] == NULL);
491 	INSIST(qp->usage[chunk].used == 0);
492 	INSIST(qp->usage[chunk].free == 0);
493 
494 	qp->base->ptr[chunk] = chunk_get_raw(qp);
495 	qp->usage[chunk] = (qp_usage_t){ .exists = true, .used = size };
496 	qp->used_count += size;
497 	qp->bump = chunk;
498 	qp->fender = 0;
499 
500 	if (qp->write_protect) {
501 		TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]);
502 	}
503 	return make_ref(chunk, 0);
504 }
505 
506 /*
507  * This is used to grow the chunk arrays when they fill up. If the old
508  * base array is in use by readers, we must make a clone, otherwise we
509  * can reallocate in place.
510  *
511  * The isc_refcount_init() and qpbase_unref() in this function are a pair.
512  */
513 static void
514 realloc_chunk_arrays(dns_qp_t *qp, dns_qpchunk_t newmax) {
515 	size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max;
516 	size_t newptrs = sizeof(qp->base->ptr[0]) * newmax;
517 	size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax);
518 
519 	if (qp->base == NULL || qpbase_unref(qp)) {
520 		qp->base = isc_mem_reallocate(qp->mctx, qp->base, size);
521 	} else {
522 		dns_qpbase_t *oldbase = qp->base;
523 		qp->base = isc_mem_allocate(qp->mctx, size);
524 		memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs);
525 	}
526 	memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs);
527 	isc_refcount_init(&qp->base->refcount, 1);
528 	qp->base->magic = QPBASE_MAGIC;
529 
530 	/* usage array is exclusive to the writer */
531 	size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max;
532 	size_t newusage = sizeof(qp->usage[0]) * newmax;
533 	qp->usage = isc_mem_reallocate(qp->mctx, qp->usage, newusage);
534 	memset(&qp->usage[qp->chunk_max], 0, newusage - oldusage);
535 
536 	qp->chunk_max = newmax;
537 
538 	TRACE("qpbase %p usage %p max %u", qp->base, qp->usage, qp->chunk_max);
539 }
540 
541 /*
542  * There was no space in the bump chunk, so find a place to put a fresh
543  * chunk in the chunk arrays, then allocate some twigs from it.
544  */
545 static dns_qpref_t
546 alloc_slow(dns_qp_t *qp, dns_qpweight_t size) {
547 	dns_qpchunk_t chunk;
548 
549 	for (chunk = 0; chunk < qp->chunk_max; chunk++) {
550 		if (!qp->usage[chunk].exists) {
551 			return chunk_alloc(qp, chunk, size);
552 		}
553 	}
554 	ENSURE(chunk == qp->chunk_max);
555 	realloc_chunk_arrays(qp, GROWTH_FACTOR(chunk));
556 	return chunk_alloc(qp, chunk, size);
557 }
558 
559 /*
560  * Ensure we are using a fresh bump chunk.
561  */
562 static void
563 alloc_reset(dns_qp_t *qp) {
564 	(void)alloc_slow(qp, 0);
565 }
566 
567 /*
568  * Allocate some fresh twigs. This is the bump allocator fast path.
569  */
570 static inline dns_qpref_t
571 alloc_twigs(dns_qp_t *qp, dns_qpweight_t size) {
572 	dns_qpchunk_t chunk = qp->bump;
573 	dns_qpcell_t cell = qp->usage[chunk].used;
574 
575 	if (cell + size <= QP_CHUNK_SIZE) {
576 		qp->usage[chunk].used += size;
577 		qp->used_count += size;
578 		return make_ref(chunk, cell);
579 	} else {
580 		return alloc_slow(qp, size);
581 	}
582 }
583 
584 /*
585  * Record that some twigs are no longer being used, and if possible
586  * zero them to ensure that there isn't a spurious double detach when
587  * the chunk is later recycled.
588  *
589  * Returns true if the twigs were immediately destroyed.
590  *
591  * NOTE: the caller is responsible for attaching or detaching any
592  * leaves as required.
593  */
594 static inline bool
595 free_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) {
596 	dns_qpchunk_t chunk = ref_chunk(twigs);
597 
598 	qp->free_count += size;
599 	qp->usage[chunk].free += size;
600 	ENSURE(qp->free_count <= qp->used_count);
601 	ENSURE(qp->usage[chunk].free <= qp->usage[chunk].used);
602 
603 	if (cells_immutable(qp, twigs)) {
604 		qp->hold_count += size;
605 		ENSURE(qp->free_count >= qp->hold_count);
606 		return false;
607 	} else {
608 		zero_twigs(ref_ptr(qp, twigs), size);
609 		return true;
610 	}
611 }
612 
613 /*
614  * When some twigs have been copied, and free_twigs() could not
615  * immediately destroy the old copy, we need to update the refcount
616  * on any leaves that were duplicated.
617  */
618 static void
619 attach_twigs(dns_qp_t *qp, dns_qpnode_t *twigs, dns_qpweight_t size) {
620 	for (dns_qpweight_t pos = 0; pos < size; pos++) {
621 		if (node_tag(&twigs[pos]) == LEAF_TAG) {
622 			attach_leaf(qp, &twigs[pos]);
623 		}
624 	}
625 }
626 
627 /***********************************************************************
628  *
629  *  chunk reclamation
630  */
631 
632 /*
633  * Is any of this chunk still in use?
634  */
635 static inline dns_qpcell_t
636 chunk_usage(dns_qp_t *qp, dns_qpchunk_t chunk) {
637 	return qp->usage[chunk].used - qp->usage[chunk].free;
638 }
639 
640 /*
641  * We remove each empty chunk from the total counts when the chunk is
642  * freed, or when it is scheduled for safe memory reclamation. We check
643  * the chunk's phase to avoid discounting it twice in the latter case.
644  */
645 static void
646 chunk_discount(dns_qp_t *qp, dns_qpchunk_t chunk) {
647 	if (qp->usage[chunk].discounted) {
648 		return;
649 	}
650 	INSIST(qp->used_count >= qp->usage[chunk].used);
651 	INSIST(qp->free_count >= qp->usage[chunk].free);
652 	qp->used_count -= qp->usage[chunk].used;
653 	qp->free_count -= qp->usage[chunk].free;
654 	qp->usage[chunk].discounted = true;
655 }
656 
657 /*
658  * When a chunk is being recycled, we need to detach any leaves that
659  * remain, and free any `base` arrays that have been marked as unused.
660  */
661 static void
662 chunk_free(dns_qp_t *qp, dns_qpchunk_t chunk) {
663 	if (qp->write_protect) {
664 		TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]);
665 	}
666 
667 	dns_qpnode_t *n = qp->base->ptr[chunk];
668 	for (dns_qpcell_t count = qp->usage[chunk].used; count > 0;
669 	     count--, n++)
670 	{
671 		if (node_tag(n) == LEAF_TAG && node_pointer(n) != NULL) {
672 			detach_leaf(qp, n);
673 		} else if (count > 1 && reader_valid(n)) {
674 			dns_qpreader_t qpr;
675 			unpack_reader(&qpr, n);
676 			/* pairs with dns_qpmulti_commit() */
677 			if (qpbase_unref(&qpr)) {
678 				isc_mem_free(qp->mctx, qpr.base);
679 			}
680 		}
681 	}
682 	chunk_discount(qp, chunk);
683 	chunk_free_raw(qp, qp->base->ptr[chunk]);
684 	qp->base->ptr[chunk] = NULL;
685 	qp->usage[chunk] = (qp_usage_t){};
686 }
687 
688 /*
689  * Free any chunks that we can while a trie is in use.
690  */
691 static void
692 recycle(dns_qp_t *qp) {
693 	unsigned int free = 0;
694 
695 	isc_nanosecs_t start = isc_time_monotonic();
696 
697 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
698 		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
699 		    qp->usage[chunk].exists && !qp->usage[chunk].immutable)
700 		{
701 			chunk_free(qp, chunk);
702 			free++;
703 		}
704 	}
705 
706 	isc_nanosecs_t time = isc_time_monotonic() - start;
707 	ISC_QP_ADD(recycle_time, time);
708 
709 	if (free > 0) {
710 		LOG_STATS("qp recycle" PRItime "free %u chunks", time, free);
711 		LOG_STATS("qp recycle leaf %u live %u used %u free %u hold %u",
712 			  qp->leaf_count, qp->used_count - qp->free_count,
713 			  qp->used_count, qp->free_count, qp->hold_count);
714 	}
715 }
716 
717 /*
718  * asynchronous cleanup
719  */
720 static void
721 reclaim_chunks_cb(struct rcu_head *arg) {
722 	qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head);
723 	REQUIRE(QPRCU_VALID(rcuctx));
724 	dns_qpmulti_t *multi = rcuctx->multi;
725 	REQUIRE(QPMULTI_VALID(multi));
726 
727 	LOCK(&multi->mutex);
728 
729 	dns_qp_t *qp = &multi->writer;
730 	REQUIRE(QP_VALID(qp));
731 
732 	unsigned int free = 0;
733 	isc_nanosecs_t start = isc_time_monotonic();
734 
735 	for (unsigned int i = 0; i < rcuctx->count; i++) {
736 		dns_qpchunk_t chunk = rcuctx->chunk[i];
737 		if (qp->usage[chunk].snapshot) {
738 			/* cleanup when snapshot is destroyed */
739 			qp->usage[chunk].snapfree = true;
740 		} else {
741 			chunk_free(qp, chunk);
742 			free++;
743 		}
744 	}
745 
746 	isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
747 			     STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
748 
749 	isc_nanosecs_t time = isc_time_monotonic() - start;
750 	ISC_QP_ADD(recycle_time, time);
751 
752 	if (free > 0) {
753 		LOG_STATS("qp reclaim" PRItime "free %u chunks", time, free);
754 		LOG_STATS("qp reclaim leaf %u live %u used %u free %u hold %u",
755 			  qp->leaf_count, qp->used_count - qp->free_count,
756 			  qp->used_count, qp->free_count, qp->hold_count);
757 	}
758 
759 	UNLOCK(&multi->mutex);
760 }
761 
762 /*
763  * At the end of a transaction, schedule empty but immutable chunks
764  * for reclamation later.
765  */
766 static void
767 reclaim_chunks(dns_qpmulti_t *multi) {
768 	dns_qp_t *qp = &multi->writer;
769 
770 	unsigned int count = 0;
771 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
772 		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
773 		    qp->usage[chunk].exists && qp->usage[chunk].immutable &&
774 		    !qp->usage[chunk].discounted)
775 		{
776 			count++;
777 		}
778 	}
779 
780 	if (count == 0) {
781 		return;
782 	}
783 
784 	qp_rcuctx_t *rcuctx =
785 		isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count));
786 	*rcuctx = (qp_rcuctx_t){
787 		.magic = QPRCU_MAGIC,
788 		.multi = multi,
789 		.count = count,
790 	};
791 	isc_mem_attach(qp->mctx, &rcuctx->mctx);
792 
793 	unsigned int i = 0;
794 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
795 		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
796 		    qp->usage[chunk].exists && qp->usage[chunk].immutable &&
797 		    !qp->usage[chunk].discounted)
798 		{
799 			rcuctx->chunk[i++] = chunk;
800 			chunk_discount(qp, chunk);
801 		}
802 	}
803 
804 	call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb);
805 
806 	LOG_STATS("qp will reclaim %u chunks", count);
807 }
808 
809 /*
810  * When a snapshot is destroyed, clean up chunks that need free()ing
811  * and are not used by any remaining snapshots.
812  */
813 static void
814 marksweep_chunks(dns_qpmulti_t *multi) {
815 	unsigned int free = 0;
816 
817 	isc_nanosecs_t start = isc_time_monotonic();
818 
819 	dns_qp_t *qpw = &multi->writer;
820 
821 	for (dns_qpsnap_t *qps = ISC_LIST_HEAD(multi->snapshots); qps != NULL;
822 	     qps = ISC_LIST_NEXT(qps, link))
823 	{
824 		for (dns_qpchunk_t chunk = 0; chunk < qps->chunk_max; chunk++) {
825 			if (qps->base->ptr[chunk] != NULL) {
826 				INSIST(qps->base->ptr[chunk] ==
827 				       qpw->base->ptr[chunk]);
828 				qpw->usage[chunk].snapmark = true;
829 			}
830 		}
831 	}
832 
833 	for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) {
834 		qpw->usage[chunk].snapshot = qpw->usage[chunk].snapmark;
835 		qpw->usage[chunk].snapmark = false;
836 		if (qpw->usage[chunk].snapfree && !qpw->usage[chunk].snapshot) {
837 			chunk_free(qpw, chunk);
838 			free++;
839 		}
840 	}
841 
842 	isc_nanosecs_t time = isc_time_monotonic() - start;
843 	ISC_QP_ADD(recycle_time, time);
844 
845 	if (free > 0) {
846 		LOG_STATS("qp marksweep" PRItime "free %u chunks", time, free);
847 		LOG_STATS(
848 			"qp marksweep leaf %u live %u used %u free %u hold %u",
849 			qpw->leaf_count, qpw->used_count - qpw->free_count,
850 			qpw->used_count, qpw->free_count, qpw->hold_count);
851 	}
852 }
853 
854 /***********************************************************************
855  *
856  *  garbage collector
857  */
858 
859 /*
860  * Move a branch node's twigs to the `bump` chunk, for copy-on-write
861  * or for garbage collection. We don't update the node in place
862  * because `compact_recursive()` does not ensure the node itself is
863  * mutable until after it discovers evacuation was necessary.
864  *
865  * If free_twigs() could not immediately destroy the old twigs, we have
866  * to re-attach to any leaves.
867  */
868 static dns_qpref_t
869 evacuate(dns_qp_t *qp, dns_qpnode_t *n) {
870 	dns_qpweight_t size = branch_twigs_size(n);
871 	dns_qpref_t old_ref = branch_twigs_ref(n);
872 	dns_qpref_t new_ref = alloc_twigs(qp, size);
873 	dns_qpnode_t *old_twigs = ref_ptr(qp, old_ref);
874 	dns_qpnode_t *new_twigs = ref_ptr(qp, new_ref);
875 
876 	move_twigs(new_twigs, old_twigs, size);
877 	if (!free_twigs(qp, old_ref, size)) {
878 		attach_twigs(qp, new_twigs, size);
879 	}
880 
881 	return new_ref;
882 }
883 
884 /*
885  * Immutable nodes need copy-on-write. As we walk down the trie finding the
886  * right place to modify, make_root_mutable() and make_twigs_mutable()
887  * are called to ensure that immutable nodes on the path from the root are
888  * copied to a mutable chunk.
889  */
890 
891 static inline dns_qpnode_t *
892 make_root_mutable(dns_qp_t *qp) {
893 	if (cells_immutable(qp, qp->root_ref)) {
894 		qp->root_ref = evacuate(qp, MOVABLE_ROOT(qp));
895 	}
896 	return ref_ptr(qp, qp->root_ref);
897 }
898 
899 static inline void
900 make_twigs_mutable(dns_qp_t *qp, dns_qpnode_t *n) {
901 	if (cells_immutable(qp, branch_twigs_ref(n))) {
902 		*n = make_node(branch_index(n), evacuate(qp, n));
903 	}
904 }
905 
906 /*
907  * Compact the trie by traversing the whole thing recursively, copying
908  * bottom-up as required. The aim is to avoid evacuation as much as
909  * possible, but when parts of the trie are immutable, we need to evacuate
910  * the paths from the root to the parts of the trie that occupy
911  * fragmented chunks.
912  *
913  * Without the QP_MIN_USED check, the algorithm will leave the trie
914  * unchanged. If the children are all leaves, the loop changes nothing,
915  * so we will return this node's original ref. If all of the children
916  * that are branches did not need moving, again, the loop changes
917  * nothing. So the evacuation check is the only place that the
918  * algorithm introduces ref changes, that then bubble up towards the
919  * root through the logic inside the loop.
920  */
921 static dns_qpref_t
922 compact_recursive(dns_qp_t *qp, dns_qpnode_t *parent) {
923 	dns_qpweight_t size = branch_twigs_size(parent);
924 	dns_qpref_t twigs_ref = branch_twigs_ref(parent);
925 	dns_qpchunk_t chunk = ref_chunk(twigs_ref);
926 
927 	if (qp->compact_all ||
928 	    (chunk != qp->bump && chunk_usage(qp, chunk) < QP_MIN_USED))
929 	{
930 		twigs_ref = evacuate(qp, parent);
931 	}
932 	bool immutable = cells_immutable(qp, twigs_ref);
933 	for (dns_qpweight_t pos = 0; pos < size; pos++) {
934 		dns_qpnode_t *child = ref_ptr(qp, twigs_ref) + pos;
935 		if (!is_branch(child)) {
936 			continue;
937 		}
938 		dns_qpref_t old_grandtwigs = branch_twigs_ref(child);
939 		dns_qpref_t new_grandtwigs = compact_recursive(qp, child);
940 		if (old_grandtwigs == new_grandtwigs) {
941 			continue;
942 		}
943 		if (immutable) {
944 			twigs_ref = evacuate(qp, parent);
945 			/* the twigs have moved */
946 			child = ref_ptr(qp, twigs_ref) + pos;
947 			immutable = false;
948 		}
949 		*child = make_node(branch_index(child), new_grandtwigs);
950 	}
951 	return twigs_ref;
952 }
953 
954 static void
955 compact(dns_qp_t *qp) {
956 	LOG_STATS("qp compact before leaf %u live %u used %u free %u hold %u",
957 		  qp->leaf_count, qp->used_count - qp->free_count,
958 		  qp->used_count, qp->free_count, qp->hold_count);
959 
960 	isc_nanosecs_t start = isc_time_monotonic();
961 
962 	if (qp->usage[qp->bump].free > QP_MAX_FREE) {
963 		alloc_reset(qp);
964 	}
965 
966 	if (qp->leaf_count > 0) {
967 		qp->root_ref = compact_recursive(qp, MOVABLE_ROOT(qp));
968 	}
969 	qp->compact_all = false;
970 
971 	isc_nanosecs_t time = isc_time_monotonic() - start;
972 	ISC_QP_ADD(compact_time, time);
973 
974 	LOG_STATS("qp compact" PRItime
975 		  "leaf %u live %u used %u free %u hold %u",
976 		  time, qp->leaf_count, qp->used_count - qp->free_count,
977 		  qp->used_count, qp->free_count, qp->hold_count);
978 }
979 
980 void
981 dns_qp_compact(dns_qp_t *qp, dns_qpgc_t mode) {
982 	REQUIRE(QP_VALID(qp));
983 	if (mode == DNS_QPGC_MAYBE && !QP_NEEDGC(qp)) {
984 		return;
985 	}
986 	if (mode == DNS_QPGC_ALL) {
987 		alloc_reset(qp);
988 		qp->compact_all = true;
989 	}
990 	compact(qp);
991 	recycle(qp);
992 }
993 
994 /*
995  * Free some twigs and (if they were destroyed immediately so that the
996  * result from QP_MAX_GARBAGE can change) compact the trie if necessary.
997  *
998  * This is called by the trie modification API entry points. The
999  * free_twigs() function requires the caller to attach or detach any
1000  * leaves as necessary. Callers of squash_twigs() satisfy this
1001  * requirement by calling make_twigs_mutable().
1002  *
1003  * Aside: In typical garbage collectors, compaction is triggered when
1004  * the allocator runs out of space. But that is because typical garbage
1005  * collectors do not know how much memory can be recovered, so they must
1006  * find out by scanning the heap. The qp-trie code was originally
1007  * designed to use malloc() and free(), so it has more information about
1008  * when garbage collection might be worthwhile. Hence we can trigger
1009  * collection when garbage passes a threshold.
1010  *
1011  * XXXFANF: If we need to avoid latency outliers caused by compaction in
1012  * write transactions, we can check qp->transaction_mode here.
1013  */
1014 static inline bool
1015 squash_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) {
1016 	bool destroyed = free_twigs(qp, twigs, size);
1017 	if (destroyed && QP_AUTOGC(qp)) {
1018 		compact(qp);
1019 		recycle(qp);
1020 		/*
1021 		 * This shouldn't happen if the garbage collector is
1022 		 * working correctly. We can recover at the cost of some
1023 		 * time and space, but recovery should be cheaper than
1024 		 * letting compact+recycle fail repeatedly.
1025 		 */
1026 		if (QP_AUTOGC(qp)) {
1027 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1028 				      DNS_LOGMODULE_QP, ISC_LOG_NOTICE,
1029 				      "qp %p uctx \"%s\" compact/recycle "
1030 				      "failed to recover any space, "
1031 				      "scheduling a full compaction",
1032 				      qp, TRIENAME(qp));
1033 			qp->compact_all = true;
1034 		}
1035 	}
1036 	return destroyed;
1037 }
1038 
1039 /***********************************************************************
1040  *
1041  *  public accessors for memory management internals
1042  */
1043 
1044 dns_qp_memusage_t
1045 dns_qp_memusage(dns_qp_t *qp) {
1046 	REQUIRE(QP_VALID(qp));
1047 
1048 	dns_qp_memusage_t memusage = {
1049 		.uctx = qp->uctx,
1050 		.leaves = qp->leaf_count,
1051 		.live = qp->used_count - qp->free_count,
1052 		.used = qp->used_count,
1053 		.hold = qp->hold_count,
1054 		.free = qp->free_count,
1055 		.node_size = sizeof(dns_qpnode_t),
1056 		.chunk_size = QP_CHUNK_SIZE,
1057 		.fragmented = QP_NEEDGC(qp),
1058 	};
1059 
1060 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
1061 		if (qp->base->ptr[chunk] != NULL) {
1062 			memusage.chunk_count += 1;
1063 		}
1064 	}
1065 
1066 	/*
1067 	 * XXXFANF does not subtract chunks that have been shrunk,
1068 	 * and does not count unreclaimed dns_qpbase_t objects
1069 	 */
1070 	memusage.bytes = memusage.chunk_count * QP_CHUNK_BYTES +
1071 			 qp->chunk_max * sizeof(qp->base->ptr[0]) +
1072 			 qp->chunk_max * sizeof(qp->usage[0]);
1073 
1074 	return memusage;
1075 }
1076 
1077 dns_qp_memusage_t
1078 dns_qpmulti_memusage(dns_qpmulti_t *multi) {
1079 	REQUIRE(QPMULTI_VALID(multi));
1080 	LOCK(&multi->mutex);
1081 
1082 	dns_qp_t *qp = &multi->writer;
1083 	INSIST(QP_VALID(qp));
1084 
1085 	dns_qp_memusage_t memusage = dns_qp_memusage(qp);
1086 
1087 	if (qp->transaction_mode == QP_UPDATE) {
1088 		memusage.bytes -= QP_CHUNK_BYTES;
1089 		memusage.bytes += qp->usage[qp->bump].used *
1090 				  sizeof(dns_qpnode_t);
1091 	}
1092 
1093 	UNLOCK(&multi->mutex);
1094 	return memusage;
1095 }
1096 
1097 void
1098 dns_qp_gctime(isc_nanosecs_t *compact_p, isc_nanosecs_t *recycle_p,
1099 	      isc_nanosecs_t *rollback_p) {
1100 	*compact_p = ISC_QP_GET(compact_time);
1101 	*recycle_p = ISC_QP_GET(recycle_time);
1102 	*rollback_p = ISC_QP_GET(rollback_time);
1103 }
1104 
1105 /***********************************************************************
1106  *
1107  *  read-write transactions
1108  */
1109 
1110 static dns_qp_t *
1111 transaction_open(dns_qpmulti_t *multi, dns_qp_t **qptp) {
1112 	REQUIRE(QPMULTI_VALID(multi));
1113 	REQUIRE(qptp != NULL && *qptp == NULL);
1114 
1115 	LOCK(&multi->mutex);
1116 
1117 	dns_qp_t *qp = &multi->writer;
1118 	INSIST(QP_VALID(qp));
1119 
1120 	/*
1121 	 * Mark existing chunks as immutable.
1122 	 *
1123 	 * Aside: The bump chunk is special: in a series of write
1124 	 * transactions the bump chunk is reused; the first part (up
1125 	 * to fender) is immutable, the rest mutable. But we set its
1126 	 * immutable flag so that when the bump chunk fills up, the
1127 	 * first part continues to be treated as immutable. (And the
1128 	 * rest of the chunk too, but that's OK.)
1129 	 */
1130 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
1131 		if (qp->usage[chunk].exists) {
1132 			qp->usage[chunk].immutable = true;
1133 			write_protect(qp, chunk);
1134 		}
1135 	}
1136 
1137 	/*
1138 	 * Ensure QP_AUTOGC() ignores free space in immutable chunks.
1139 	 */
1140 	qp->hold_count = qp->free_count;
1141 
1142 	*qptp = qp;
1143 	return qp;
1144 }
1145 
1146 /*
1147  * a write is light
1148  *
1149  * We need to ensure we allocate from a fresh chunk if the last transaction
1150  * shrunk the bump chunk; but usually in a sequence of write transactions
1151  * we just put `fender` at the point where we started this generation.
1152  *
1153  * (Aside: Instead of keeping the previous transaction's mode, I
1154  * considered forcing allocation into the slow path by fiddling with
1155  * the bump chunk's usage counters. But that is troublesome because
1156  * `chunk_free()` needs to know how much of the chunk to scan.)
1157  */
1158 void
1159 dns_qpmulti_write(dns_qpmulti_t *multi, dns_qp_t **qptp) {
1160 	dns_qp_t *qp = transaction_open(multi, qptp);
1161 	TRACE("");
1162 
1163 	if (qp->transaction_mode == QP_WRITE) {
1164 		qp->fender = qp->usage[qp->bump].used;
1165 	} else {
1166 		alloc_reset(qp);
1167 	}
1168 	qp->transaction_mode = QP_WRITE;
1169 }
1170 
1171 /*
1172  * an update is heavier
1173  *
1174  * We always reset the allocator to the start of a fresh chunk,
1175  * because the previous transaction was probably an update that shrunk
1176  * the bump chunk. It simplifies rollback because `fender` is always zero.
1177  *
1178  * To rollback a transaction, we need to reset all the allocation
1179  * counters to their previous state, in particular we need to un-free
1180  * any nodes that were copied to make them mutable. This means we need
1181  * to make a copy of basically the whole `dns_qp_t writer`: everything
1182  * but the chunks holding the trie nodes.
1183  *
1184  * We do most of the transaction setup before creating the rollback
1185  * state so that after rollback we have a correct idea of which chunks
1186  * are immutable, and so we have the correct transaction mode to make
1187  * the next transaction allocate a new bump chunk. The exception is
1188  * resetting the allocator, which we do after creating the rollback
1189  * state; if this transaction is rolled back then the next transaction
1190  * will start from the rollback state and also reset the allocator as
1191  * one of its first actions.
1192  */
1193 void
1194 dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) {
1195 	dns_qp_t *qp = transaction_open(multi, qptp);
1196 	TRACE("");
1197 
1198 	qp->transaction_mode = QP_UPDATE;
1199 
1200 	dns_qp_t *rollback = isc_mem_allocate(qp->mctx, sizeof(*rollback));
1201 	memmove(rollback, qp, sizeof(*rollback));
1202 	/* can be uninitialized on the first transaction */
1203 	if (rollback->base != NULL) {
1204 		INSIST(QPBASE_VALID(rollback->base));
1205 		INSIST(qp->usage != NULL && qp->chunk_max > 0);
1206 		/* paired with either _commit() or _rollback() */
1207 		isc_refcount_increment(&rollback->base->refcount);
1208 		size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max;
1209 		rollback->usage = isc_mem_allocate(qp->mctx, usage_bytes);
1210 		memmove(rollback->usage, qp->usage, usage_bytes);
1211 	}
1212 	INSIST(multi->rollback == NULL);
1213 	multi->rollback = rollback;
1214 
1215 	alloc_reset(qp);
1216 }
1217 
1218 void
1219 dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) {
1220 	REQUIRE(QPMULTI_VALID(multi));
1221 	REQUIRE(qptp != NULL && *qptp == &multi->writer);
1222 	REQUIRE(multi->writer.transaction_mode == QP_WRITE ||
1223 		multi->writer.transaction_mode == QP_UPDATE);
1224 
1225 	dns_qp_t *qp = *qptp;
1226 	TRACE("");
1227 
1228 	if (qp->transaction_mode == QP_UPDATE) {
1229 		INSIST(multi->rollback != NULL);
1230 		/* paired with dns_qpmulti_update() */
1231 		if (qpbase_unref(multi->rollback)) {
1232 			isc_mem_free(qp->mctx, multi->rollback->base);
1233 		}
1234 		if (multi->rollback->usage != NULL) {
1235 			isc_mem_free(qp->mctx, multi->rollback->usage);
1236 		}
1237 		isc_mem_free(qp->mctx, multi->rollback);
1238 	}
1239 	INSIST(multi->rollback == NULL);
1240 
1241 	/* not the first commit? */
1242 	if (multi->reader_ref != INVALID_REF) {
1243 		INSIST(cells_immutable(qp, multi->reader_ref));
1244 		free_twigs(qp, multi->reader_ref, READER_SIZE);
1245 	}
1246 
1247 	if (qp->transaction_mode == QP_UPDATE) {
1248 		/* minimize memory overhead */
1249 		compact(qp);
1250 		multi->reader_ref = alloc_twigs(qp, READER_SIZE);
1251 		qp->base->ptr[qp->bump] = chunk_shrink_raw(
1252 			qp, qp->base->ptr[qp->bump],
1253 			qp->usage[qp->bump].used * sizeof(dns_qpnode_t));
1254 	} else {
1255 		multi->reader_ref = alloc_twigs(qp, READER_SIZE);
1256 	}
1257 
1258 	/* anchor a new version of the trie */
1259 	dns_qpnode_t *reader = ref_ptr(qp, multi->reader_ref);
1260 	make_reader(reader, multi);
1261 	/* paired with chunk_free() */
1262 	isc_refcount_increment(&qp->base->refcount);
1263 
1264 	rcu_assign_pointer(multi->reader, reader); /* COMMIT */
1265 
1266 	/* clean up what we can right now */
1267 	if (qp->transaction_mode == QP_UPDATE || QP_NEEDGC(qp)) {
1268 		recycle(qp);
1269 	}
1270 
1271 	/* schedule the rest for later */
1272 	reclaim_chunks(multi);
1273 
1274 	*qptp = NULL;
1275 	UNLOCK(&multi->mutex);
1276 }
1277 
1278 /*
1279  * Throw away everything that was allocated during this transaction.
1280  */
1281 void
1282 dns_qpmulti_rollback(dns_qpmulti_t *multi, dns_qp_t **qptp) {
1283 	unsigned int free = 0;
1284 
1285 	REQUIRE(QPMULTI_VALID(multi));
1286 	REQUIRE(multi->writer.transaction_mode == QP_UPDATE);
1287 	REQUIRE(qptp != NULL && *qptp == &multi->writer);
1288 
1289 	dns_qp_t *qp = *qptp;
1290 	TRACE("");
1291 
1292 	isc_nanosecs_t start = isc_time_monotonic();
1293 
1294 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
1295 		if (qp->base->ptr[chunk] != NULL && !qp->usage[chunk].immutable)
1296 		{
1297 			chunk_free(qp, chunk);
1298 			/*
1299 			 * we need to clear its base pointer in the rollback
1300 			 * trie, in case the arrays were resized
1301 			 */
1302 			if (chunk < multi->rollback->chunk_max) {
1303 				INSIST(!multi->rollback->usage[chunk].exists);
1304 				multi->rollback->base->ptr[chunk] = NULL;
1305 			}
1306 			free++;
1307 		}
1308 	}
1309 
1310 	/*
1311 	 * multi->rollback->base and multi->writer->base are the same,
1312 	 * unless there was a realloc_chunk_arrays() during the transaction
1313 	 */
1314 	if (qpbase_unref(qp)) {
1315 		/* paired with dns_qpmulti_update() */
1316 		isc_mem_free(qp->mctx, qp->base);
1317 	}
1318 	isc_mem_free(qp->mctx, qp->usage);
1319 
1320 	/* reset allocator state */
1321 	INSIST(multi->rollback != NULL);
1322 	memmove(qp, multi->rollback, sizeof(*qp));
1323 	isc_mem_free(qp->mctx, multi->rollback);
1324 	INSIST(multi->rollback == NULL);
1325 
1326 	isc_nanosecs_t time = isc_time_monotonic() - start;
1327 	ISC_QP_ADD(rollback_time, time);
1328 
1329 	LOG_STATS("qp rollback" PRItime "free %u chunks", time, free);
1330 
1331 	*qptp = NULL;
1332 	UNLOCK(&multi->mutex);
1333 }
1334 
1335 /***********************************************************************
1336  *
1337  *  read-only transactions
1338  */
1339 
1340 static dns_qpmulti_t *
1341 reader_open(dns_qpmulti_t *multi, dns_qpreadable_t qpr) {
1342 	dns_qpreader_t *qp = dns_qpreader(qpr);
1343 	dns_qpnode_t *reader = rcu_dereference(multi->reader);
1344 	if (reader == NULL) {
1345 		QP_INIT(qp, multi->writer.methods, multi->writer.uctx);
1346 	} else {
1347 		multi = unpack_reader(qp, reader);
1348 	}
1349 	return multi;
1350 }
1351 
1352 /*
1353  * a query is light
1354  */
1355 
1356 void
1357 dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) {
1358 	REQUIRE(QPMULTI_VALID(multi));
1359 	REQUIRE(qp != NULL);
1360 
1361 	qp->tid = isc_tid();
1362 	rcu_read_lock();
1363 
1364 	dns_qpmulti_t *whence = reader_open(multi, qp);
1365 	INSIST(whence == multi);
1366 }
1367 
1368 void
1369 dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) {
1370 	REQUIRE(QPMULTI_VALID(multi));
1371 	REQUIRE(QP_VALID(qp));
1372 	REQUIRE(qp->tid == isc_tid());
1373 	*qp = (dns_qpread_t){};
1374 	rcu_read_unlock();
1375 }
1376 
1377 /*
1378  * a snapshot is heavy
1379  */
1380 
1381 void
1382 dns_qpmulti_snapshot(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) {
1383 	REQUIRE(QPMULTI_VALID(multi));
1384 	REQUIRE(qpsp != NULL && *qpsp == NULL);
1385 
1386 	rcu_read_lock();
1387 
1388 	LOCK(&multi->mutex);
1389 
1390 	dns_qp_t *qpw = &multi->writer;
1391 	size_t bytes = sizeof(dns_qpsnap_t) + sizeof(dns_qpbase_t) +
1392 		       sizeof(qpw->base->ptr[0]) * qpw->chunk_max;
1393 	dns_qpsnap_t *qps = isc_mem_allocate(qpw->mctx, bytes);
1394 
1395 	qps->whence = reader_open(multi, qps);
1396 	INSIST(qps->whence == multi);
1397 
1398 	/* not a separate allocation */
1399 	qps->base = (dns_qpbase_t *)(qps + 1);
1400 	isc_refcount_init(&qps->base->refcount, 0);
1401 
1402 	/*
1403 	 * only copy base pointers of chunks we need, so we can
1404 	 * reclaim unused memory in dns_qpsnap_destroy()
1405 	 */
1406 	qps->chunk_max = qpw->chunk_max;
1407 	for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) {
1408 		if (qpw->usage[chunk].exists && chunk_usage(qpw, chunk) > 0) {
1409 			qpw->usage[chunk].snapshot = true;
1410 			qps->base->ptr[chunk] = qpw->base->ptr[chunk];
1411 		} else {
1412 			qps->base->ptr[chunk] = NULL;
1413 		}
1414 	}
1415 	ISC_LIST_INITANDAPPEND(multi->snapshots, qps, link);
1416 
1417 	*qpsp = qps;
1418 	UNLOCK(&multi->mutex);
1419 
1420 	rcu_read_unlock();
1421 }
1422 
1423 void
1424 dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) {
1425 	REQUIRE(QPMULTI_VALID(multi));
1426 	REQUIRE(qpsp != NULL && *qpsp != NULL);
1427 
1428 	LOCK(&multi->mutex);
1429 
1430 	dns_qpsnap_t *qp = *qpsp;
1431 
1432 	/* make sure the API is being used correctly */
1433 	REQUIRE(qp->whence == multi);
1434 
1435 	ISC_LIST_UNLINK(multi->snapshots, qp, link);
1436 
1437 	/*
1438 	 * eagerly reclaim chunks that are now unused, so that memory does
1439 	 * not accumulate when a trie has a lot of updates and snapshots
1440 	 */
1441 	marksweep_chunks(multi);
1442 
1443 	isc_mem_free(multi->writer.mctx, qp);
1444 
1445 	*qpsp = NULL;
1446 	UNLOCK(&multi->mutex);
1447 }
1448 
1449 /***********************************************************************
1450  *
1451  *  constructors, destructors
1452  */
1453 
1454 void
1455 dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
1456 	      dns_qp_t **qptp) {
1457 	REQUIRE(qptp != NULL && *qptp == NULL);
1458 
1459 	dns_qp_t *qp = isc_mem_get(mctx, sizeof(*qp));
1460 	QP_INIT(qp, methods, uctx);
1461 	isc_mem_attach(mctx, &qp->mctx);
1462 	alloc_reset(qp);
1463 	TRACE("");
1464 	*qptp = qp;
1465 }
1466 
1467 void
1468 dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
1469 		   dns_qpmulti_t **qpmp) {
1470 	REQUIRE(qpmp != NULL && *qpmp == NULL);
1471 
1472 	dns_qpmulti_t *multi = isc_mem_get(mctx, sizeof(*multi));
1473 	*multi = (dns_qpmulti_t){
1474 		.magic = QPMULTI_MAGIC,
1475 		.reader_ref = INVALID_REF,
1476 	};
1477 	isc_mutex_init(&multi->mutex);
1478 	ISC_LIST_INIT(multi->snapshots);
1479 	/*
1480 	 * Do not waste effort allocating a bump chunk that will be thrown
1481 	 * away when a transaction is opened. dns_qpmulti_update() always
1482 	 * allocates; to ensure dns_qpmulti_write() does too, pretend the
1483 	 * previous transaction was an update
1484 	 */
1485 	dns_qp_t *qp = &multi->writer;
1486 	QP_INIT(qp, methods, uctx);
1487 	isc_mem_attach(mctx, &qp->mctx);
1488 	qp->transaction_mode = QP_UPDATE;
1489 	TRACE("");
1490 	*qpmp = multi;
1491 }
1492 
1493 static void
1494 destroy_guts(dns_qp_t *qp) {
1495 	if (qp->chunk_max == 0) {
1496 		return;
1497 	}
1498 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
1499 		if (qp->base->ptr[chunk] != NULL) {
1500 			chunk_free(qp, chunk);
1501 		}
1502 	}
1503 	ENSURE(qp->used_count == 0);
1504 	ENSURE(qp->free_count == 0);
1505 	ENSURE(isc_refcount_current(&qp->base->refcount) == 1);
1506 	isc_mem_free(qp->mctx, qp->base);
1507 	isc_mem_free(qp->mctx, qp->usage);
1508 	qp->magic = 0;
1509 }
1510 
1511 void
1512 dns_qp_destroy(dns_qp_t **qptp) {
1513 	REQUIRE(qptp != NULL);
1514 	REQUIRE(QP_VALID(*qptp));
1515 
1516 	dns_qp_t *qp = *qptp;
1517 	*qptp = NULL;
1518 
1519 	/* do not try to destroy part of a dns_qpmulti_t */
1520 	REQUIRE(qp->transaction_mode == QP_NONE);
1521 
1522 	TRACE("");
1523 	destroy_guts(qp);
1524 	isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp));
1525 }
1526 
1527 static void
1528 qpmulti_destroy_cb(struct rcu_head *arg) {
1529 	qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head);
1530 	REQUIRE(QPRCU_VALID(rcuctx));
1531 	/* only nonzero for reclaim_chunks_cb() */
1532 	REQUIRE(rcuctx->count == 0);
1533 
1534 	dns_qpmulti_t *multi = rcuctx->multi;
1535 	REQUIRE(QPMULTI_VALID(multi));
1536 
1537 	/* reassure thread sanitizer */
1538 	LOCK(&multi->mutex);
1539 
1540 	dns_qp_t *qp = &multi->writer;
1541 	REQUIRE(QP_VALID(qp));
1542 
1543 	destroy_guts(qp);
1544 
1545 	UNLOCK(&multi->mutex);
1546 
1547 	isc_mutex_destroy(&multi->mutex);
1548 	isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
1549 			     STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
1550 	isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi));
1551 }
1552 
1553 void
1554 dns_qpmulti_destroy(dns_qpmulti_t **qpmp) {
1555 	dns_qp_t *qp = NULL;
1556 	dns_qpmulti_t *multi = NULL;
1557 	qp_rcuctx_t *rcuctx = NULL;
1558 
1559 	REQUIRE(qpmp != NULL);
1560 	REQUIRE(QPMULTI_VALID(*qpmp));
1561 
1562 	multi = *qpmp;
1563 	qp = &multi->writer;
1564 	*qpmp = NULL;
1565 
1566 	REQUIRE(QP_VALID(qp));
1567 	REQUIRE(multi->rollback == NULL);
1568 	REQUIRE(ISC_LIST_EMPTY(multi->snapshots));
1569 
1570 	rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0));
1571 	*rcuctx = (qp_rcuctx_t){
1572 		.magic = QPRCU_MAGIC,
1573 		.multi = multi,
1574 	};
1575 	isc_mem_attach(qp->mctx, &rcuctx->mctx);
1576 	call_rcu(&rcuctx->rcu_head, qpmulti_destroy_cb);
1577 }
1578 
1579 /***********************************************************************
1580  *
1581  *  modification
1582  */
1583 
1584 isc_result_t
1585 dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) {
1586 	dns_qpref_t new_ref, old_ref;
1587 	dns_qpnode_t new_leaf, old_node;
1588 	dns_qpnode_t *new_twigs = NULL, *old_twigs = NULL;
1589 	dns_qpshift_t new_bit, old_bit;
1590 	dns_qpweight_t old_size, new_size;
1591 	dns_qpkey_t new_key, old_key;
1592 	size_t new_keylen, old_keylen;
1593 	size_t offset;
1594 	uint64_t index;
1595 	dns_qpshift_t bit;
1596 	dns_qpweight_t pos;
1597 	dns_qpnode_t *n = NULL;
1598 
1599 	REQUIRE(QP_VALID(qp));
1600 
1601 	new_leaf = make_leaf(pval, ival);
1602 	new_keylen = leaf_qpkey(qp, &new_leaf, new_key);
1603 
1604 	/* first leaf in an empty trie? */
1605 	if (qp->leaf_count == 0) {
1606 		new_ref = alloc_twigs(qp, 1);
1607 		new_twigs = ref_ptr(qp, new_ref);
1608 		*new_twigs = new_leaf;
1609 		attach_leaf(qp, new_twigs);
1610 		qp->leaf_count++;
1611 		qp->root_ref = new_ref;
1612 		return ISC_R_SUCCESS;
1613 	}
1614 
1615 	/*
1616 	 * We need to keep searching down to a leaf even if our key is
1617 	 * missing from this branch. It doesn't matter which twig we
1618 	 * choose since the keys are all the same up to this node's
1619 	 * offset. Note that if we simply use branch_twig_pos(n, bit)
1620 	 * we may get an out-of-bounds access if our bit is greater
1621 	 * than all the set bits in the node.
1622 	 */
1623 	n = ref_ptr(qp, qp->root_ref);
1624 	while (is_branch(n)) {
1625 		prefetch_twigs(qp, n);
1626 		dns_qpref_t ref = branch_twigs_ref(n);
1627 		bit = branch_keybit(n, new_key, new_keylen);
1628 		pos = branch_has_twig(n, bit) ? branch_twig_pos(n, bit) : 0;
1629 		n = ref_ptr(qp, ref + pos);
1630 	}
1631 
1632 	/* do the keys differ, and if so, where? */
1633 	old_keylen = leaf_qpkey(qp, n, old_key);
1634 	offset = qpkey_compare(new_key, new_keylen, old_key, old_keylen);
1635 	if (offset == QPKEY_EQUAL) {
1636 		return ISC_R_EXISTS;
1637 	}
1638 	new_bit = qpkey_bit(new_key, new_keylen, offset);
1639 	old_bit = qpkey_bit(old_key, old_keylen, offset);
1640 
1641 	/* find where to insert a branch or grow an existing branch. */
1642 	n = make_root_mutable(qp);
1643 	while (is_branch(n)) {
1644 		prefetch_twigs(qp, n);
1645 		if (offset < branch_key_offset(n)) {
1646 			goto newbranch;
1647 		}
1648 		if (offset == branch_key_offset(n)) {
1649 			goto growbranch;
1650 		}
1651 		make_twigs_mutable(qp, n);
1652 		bit = branch_keybit(n, new_key, new_keylen);
1653 		INSIST(branch_has_twig(n, bit));
1654 		n = branch_twig_ptr(qp, n, bit);
1655 	}
1656 	/* fall through */
1657 
1658 newbranch:
1659 	new_ref = alloc_twigs(qp, 2);
1660 	new_twigs = ref_ptr(qp, new_ref);
1661 
1662 	/* save before overwriting. */
1663 	old_node = *n;
1664 
1665 	/* new branch node takes old node's place */
1666 	index = BRANCH_TAG | (1ULL << new_bit) | (1ULL << old_bit) |
1667 		((uint64_t)offset << SHIFT_OFFSET);
1668 	*n = make_node(index, new_ref);
1669 
1670 	/* populate twigs */
1671 	new_twigs[old_bit > new_bit] = old_node;
1672 	new_twigs[new_bit > old_bit] = new_leaf;
1673 
1674 	attach_leaf(qp, &new_leaf);
1675 	qp->leaf_count++;
1676 
1677 	return ISC_R_SUCCESS;
1678 
1679 growbranch:
1680 	INSIST(!branch_has_twig(n, new_bit));
1681 
1682 	/* locate twigs vectors */
1683 	old_size = branch_twigs_size(n);
1684 	new_size = old_size + 1;
1685 	old_ref = branch_twigs_ref(n);
1686 	new_ref = alloc_twigs(qp, new_size);
1687 	old_twigs = ref_ptr(qp, old_ref);
1688 	new_twigs = ref_ptr(qp, new_ref);
1689 
1690 	/* embiggen branch node */
1691 	index = branch_index(n) | (1ULL << new_bit);
1692 	*n = make_node(index, new_ref);
1693 
1694 	/* embiggen twigs vector */
1695 	pos = branch_twig_pos(n, new_bit);
1696 	move_twigs(new_twigs, old_twigs, pos);
1697 	new_twigs[pos] = new_leaf;
1698 	move_twigs(new_twigs + pos + 1, old_twigs + pos, old_size - pos);
1699 
1700 	if (squash_twigs(qp, old_ref, old_size)) {
1701 		/* old twigs destroyed, only attach to new leaf */
1702 		attach_leaf(qp, &new_leaf);
1703 	} else {
1704 		/* old twigs duplicated, attach to all leaves */
1705 		attach_twigs(qp, new_twigs, new_size);
1706 	}
1707 	qp->leaf_count++;
1708 
1709 	return ISC_R_SUCCESS;
1710 }
1711 
1712 isc_result_t
1713 dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key,
1714 		 size_t search_keylen, void **pval_r, uint32_t *ival_r) {
1715 	REQUIRE(QP_VALID(qp));
1716 	REQUIRE(search_keylen < sizeof(dns_qpkey_t));
1717 
1718 	if (get_root(qp) == NULL) {
1719 		return ISC_R_NOTFOUND;
1720 	}
1721 
1722 	dns_qpshift_t bit = 0; /* suppress warning */
1723 	dns_qpnode_t *parent = NULL;
1724 	dns_qpnode_t *n = make_root_mutable(qp);
1725 	while (is_branch(n)) {
1726 		prefetch_twigs(qp, n);
1727 		bit = branch_keybit(n, search_key, search_keylen);
1728 		if (!branch_has_twig(n, bit)) {
1729 			return ISC_R_NOTFOUND;
1730 		}
1731 		make_twigs_mutable(qp, n);
1732 		parent = n;
1733 		n = branch_twig_ptr(qp, n, bit);
1734 	}
1735 
1736 	dns_qpkey_t found_key;
1737 	size_t found_keylen = leaf_qpkey(qp, n, found_key);
1738 	if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) !=
1739 	    QPKEY_EQUAL)
1740 	{
1741 		return ISC_R_NOTFOUND;
1742 	}
1743 
1744 	SET_IF_NOT_NULL(pval_r, leaf_pval(n));
1745 	SET_IF_NOT_NULL(ival_r, leaf_ival(n));
1746 	detach_leaf(qp, n);
1747 	qp->leaf_count--;
1748 
1749 	/* trie becomes empty */
1750 	if (qp->leaf_count == 0) {
1751 		INSIST(parent == NULL);
1752 		INSIST(n == get_root(qp));
1753 		free_twigs(qp, qp->root_ref, 1);
1754 		qp->root_ref = INVALID_REF;
1755 		return ISC_R_SUCCESS;
1756 	}
1757 
1758 	/* step back to parent node */
1759 	n = parent;
1760 	parent = NULL;
1761 
1762 	INSIST(bit != 0);
1763 	dns_qpweight_t size = branch_twigs_size(n);
1764 	dns_qpweight_t pos = branch_twig_pos(n, bit);
1765 	dns_qpref_t ref = branch_twigs_ref(n);
1766 	dns_qpnode_t *twigs = ref_ptr(qp, ref);
1767 
1768 	if (size == 2) {
1769 		/*
1770 		 * move the other twig to the parent branch.
1771 		 */
1772 		*n = twigs[!pos];
1773 		squash_twigs(qp, ref, 2);
1774 	} else {
1775 		/*
1776 		 * shrink the twigs in place, to avoid using the bump
1777 		 * chunk too fast - the gc will clean up after us
1778 		 */
1779 		*n = make_node(branch_index(n) & ~(1ULL << bit), ref);
1780 		move_twigs(twigs + pos, twigs + pos + 1, size - pos - 1);
1781 		squash_twigs(qp, ref + size - 1, 1);
1782 	}
1783 
1784 	return ISC_R_SUCCESS;
1785 }
1786 
1787 isc_result_t
1788 dns_qp_deletename(dns_qp_t *qp, const dns_name_t *name, void **pval_r,
1789 		  uint32_t *ival_r) {
1790 	dns_qpkey_t key;
1791 	size_t keylen = dns_qpkey_fromname(key, name);
1792 	return dns_qp_deletekey(qp, key, keylen, pval_r, ival_r);
1793 }
1794 
1795 /***********************************************************************
1796  *  chains
1797  */
1798 static void
1799 maybe_set_name(dns_qpreader_t *qp, dns_qpnode_t *node, dns_name_t *name) {
1800 	dns_qpkey_t key;
1801 	size_t len;
1802 
1803 	if (name == NULL) {
1804 		return;
1805 	}
1806 
1807 	dns_name_reset(name);
1808 	len = leaf_qpkey(qp, node, key);
1809 	dns_qpkey_toname(key, len, name);
1810 }
1811 
1812 void
1813 dns_qpchain_init(dns_qpreadable_t qpr, dns_qpchain_t *chain) {
1814 	dns_qpreader_t *qp = dns_qpreader(qpr);
1815 	REQUIRE(QP_VALID(qp));
1816 	REQUIRE(chain != NULL);
1817 
1818 	*chain = (dns_qpchain_t){
1819 		.magic = QPCHAIN_MAGIC,
1820 		.qp = qp,
1821 	};
1822 }
1823 
1824 unsigned int
1825 dns_qpchain_length(dns_qpchain_t *chain) {
1826 	REQUIRE(QPCHAIN_VALID(chain));
1827 
1828 	return chain->len;
1829 }
1830 
1831 void
1832 dns_qpchain_node(dns_qpchain_t *chain, unsigned int level, dns_name_t *name,
1833 		 void **pval_r, uint32_t *ival_r) {
1834 	dns_qpnode_t *node = NULL;
1835 
1836 	REQUIRE(QPCHAIN_VALID(chain));
1837 	REQUIRE(level < chain->len);
1838 
1839 	node = chain->chain[level].node;
1840 	maybe_set_name(chain->qp, node, name);
1841 	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
1842 	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
1843 }
1844 
1845 /***********************************************************************
1846  *  iterators
1847  */
1848 
1849 void
1850 dns_qpiter_init(dns_qpreadable_t qpr, dns_qpiter_t *qpi) {
1851 	dns_qpreader_t *qp = dns_qpreader(qpr);
1852 	REQUIRE(QP_VALID(qp));
1853 	REQUIRE(qpi != NULL);
1854 	*qpi = (dns_qpiter_t){
1855 		.qp = qp,
1856 		.magic = QPITER_MAGIC,
1857 	};
1858 }
1859 
1860 /*
1861  * are we at the last twig in this branch (in whichever direction
1862  * we're iterating)?
1863  */
1864 static bool
1865 last_twig(dns_qpiter_t *qpi, bool forward) {
1866 	dns_qpweight_t pos = 0, max = 0;
1867 	if (qpi->sp > 0) {
1868 		dns_qpnode_t *child = qpi->stack[qpi->sp];
1869 		dns_qpnode_t *parent = qpi->stack[qpi->sp - 1];
1870 		pos = child - ref_ptr(qpi->qp, branch_twigs_ref(parent));
1871 		if (forward) {
1872 			max = branch_twigs_size(parent) - 1;
1873 		}
1874 	}
1875 	return pos == max;
1876 }
1877 
1878 /*
1879  * move a QP iterator forward or back to the next or previous leaf.
1880  * note: this function can go wrong when the iterator refers to
1881  * a mutable view of the trie which is altered while iterating
1882  */
1883 static isc_result_t
1884 iterate(bool forward, dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
1885 	uint32_t *ival_r) {
1886 	dns_qpnode_t *node = NULL;
1887 	bool initial_branch = true;
1888 
1889 	REQUIRE(QPITER_VALID(qpi));
1890 
1891 	dns_qpreader_t *qp = qpi->qp;
1892 
1893 	REQUIRE(QP_VALID(qp));
1894 
1895 	node = get_root(qp);
1896 	if (node == NULL) {
1897 		return ISC_R_NOMORE;
1898 	}
1899 
1900 	do {
1901 		if (qpi->stack[qpi->sp] == NULL) {
1902 			/* newly initialized iterator: use the root node */
1903 			INSIST(qpi->sp == 0);
1904 			qpi->stack[0] = node;
1905 		} else if (!initial_branch) {
1906 			/*
1907 			 * in a prior loop, we reached a branch; from
1908 			 * here we just need to get the highest or lowest
1909 			 * leaf in the subtree; we don't need to bother
1910 			 * stepping forward or backward through twigs
1911 			 * anymore.
1912 			 */
1913 			INSIST(qpi->sp > 0);
1914 		} else if (last_twig(qpi, forward)) {
1915 			/*
1916 			 * we've stepped to the end (or the beginning,
1917 			 * if we're iterating backwards) of a set of twigs.
1918 			 */
1919 			if (qpi->sp == 0) {
1920 				/*
1921 				 * we've finished iterating. reinitialize
1922 				 * the iterator, then return ISC_R_NOMORE.
1923 				 */
1924 				dns_qpiter_init(qpi->qp, qpi);
1925 				return ISC_R_NOMORE;
1926 			}
1927 
1928 			/*
1929 			 * pop the stack, and resume at the parent branch.
1930 			 */
1931 			qpi->stack[qpi->sp] = NULL;
1932 			qpi->sp--;
1933 			continue;
1934 		} else {
1935 			/*
1936 			 * there are more twigs in the current branch,
1937 			 * so step the node pointer forward (or back).
1938 			 */
1939 			qpi->stack[qpi->sp] += (forward ? 1 : -1);
1940 			node = qpi->stack[qpi->sp];
1941 		}
1942 
1943 		/*
1944 		 * if we're at a branch now, we loop down to the
1945 		 * left- or rightmost leaf.
1946 		 */
1947 		if (is_branch(node)) {
1948 			qpi->sp++;
1949 			INSIST(qpi->sp < DNS_QP_MAXKEY);
1950 			node = ref_ptr(qp, branch_twigs_ref(node)) +
1951 			       (forward ? 0 : branch_twigs_size(node) - 1);
1952 			qpi->stack[qpi->sp] = node;
1953 			initial_branch = false;
1954 		}
1955 	} while (is_branch(node));
1956 
1957 	/* we're at a leaf: return its data to the caller */
1958 	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
1959 	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
1960 	maybe_set_name(qpi->qp, node, name);
1961 	return ISC_R_SUCCESS;
1962 }
1963 
1964 isc_result_t
1965 dns_qpiter_next(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
1966 		uint32_t *ival_r) {
1967 	return iterate(true, qpi, name, pval_r, ival_r);
1968 }
1969 
1970 isc_result_t
1971 dns_qpiter_prev(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
1972 		uint32_t *ival_r) {
1973 	return iterate(false, qpi, name, pval_r, ival_r);
1974 }
1975 
1976 isc_result_t
1977 dns_qpiter_current(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
1978 		   uint32_t *ival_r) {
1979 	dns_qpnode_t *node = NULL;
1980 
1981 	REQUIRE(QPITER_VALID(qpi));
1982 
1983 	node = qpi->stack[qpi->sp];
1984 	if (node == NULL || is_branch(node)) {
1985 		return ISC_R_FAILURE;
1986 	}
1987 
1988 	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
1989 	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
1990 	maybe_set_name(qpi->qp, node, name);
1991 	return ISC_R_SUCCESS;
1992 }
1993 
1994 /***********************************************************************
1995  *
1996  *  search
1997  */
1998 
1999 isc_result_t
2000 dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key,
2001 	      size_t search_keylen, void **pval_r, uint32_t *ival_r) {
2002 	dns_qpreader_t *qp = dns_qpreader(qpr);
2003 	dns_qpkey_t found_key;
2004 	size_t found_keylen;
2005 	dns_qpshift_t bit;
2006 	dns_qpnode_t *n = NULL;
2007 
2008 	REQUIRE(QP_VALID(qp));
2009 	REQUIRE(search_keylen < sizeof(dns_qpkey_t));
2010 
2011 	n = get_root(qp);
2012 	if (n == NULL) {
2013 		return ISC_R_NOTFOUND;
2014 	}
2015 
2016 	while (is_branch(n)) {
2017 		prefetch_twigs(qp, n);
2018 		bit = branch_keybit(n, search_key, search_keylen);
2019 		if (!branch_has_twig(n, bit)) {
2020 			return ISC_R_NOTFOUND;
2021 		}
2022 		n = branch_twig_ptr(qp, n, bit);
2023 	}
2024 
2025 	found_keylen = leaf_qpkey(qp, n, found_key);
2026 	if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) !=
2027 	    QPKEY_EQUAL)
2028 	{
2029 		return ISC_R_NOTFOUND;
2030 	}
2031 
2032 	SET_IF_NOT_NULL(pval_r, leaf_pval(n));
2033 	SET_IF_NOT_NULL(ival_r, leaf_ival(n));
2034 	return ISC_R_SUCCESS;
2035 }
2036 
2037 isc_result_t
2038 dns_qp_getname(dns_qpreadable_t qpr, const dns_name_t *name, void **pval_r,
2039 	       uint32_t *ival_r) {
2040 	dns_qpkey_t key;
2041 	size_t keylen = dns_qpkey_fromname(key, name);
2042 	return dns_qp_getkey(qpr, key, keylen, pval_r, ival_r);
2043 }
2044 
2045 static inline void
2046 add_link(dns_qpchain_t *chain, dns_qpnode_t *node, size_t offset) {
2047 	/* prevent duplication */
2048 	if (chain->len != 0 && chain->chain[chain->len - 1].node == node) {
2049 		return;
2050 	}
2051 	chain->chain[chain->len].node = node;
2052 	chain->chain[chain->len].offset = offset;
2053 	chain->len++;
2054 	INSIST(chain->len <= DNS_NAME_MAXLABELS);
2055 }
2056 
2057 static inline void
2058 prevleaf(dns_qpiter_t *it) {
2059 	isc_result_t result = dns_qpiter_prev(it, NULL, NULL, NULL);
2060 	if (result == ISC_R_NOMORE) {
2061 		result = dns_qpiter_prev(it, NULL, NULL, NULL);
2062 	}
2063 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
2064 }
2065 
2066 static inline void
2067 greatest_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpiter_t *iter) {
2068 	while (is_branch(n)) {
2069 		dns_qpref_t ref = branch_twigs_ref(n) + branch_twigs_size(n) -
2070 				  1;
2071 		iter->stack[++iter->sp] = n;
2072 		n = ref_ptr(qpr, ref);
2073 	}
2074 	iter->stack[++iter->sp] = n;
2075 }
2076 
2077 static inline dns_qpnode_t *
2078 anyleaf(dns_qpreader_t *qp, dns_qpnode_t *n) {
2079 	while (is_branch(n)) {
2080 		n = branch_twigs(qp, n);
2081 	}
2082 	return n;
2083 }
2084 
2085 static inline int
2086 twig_offset(dns_qpnode_t *n, dns_qpshift_t sbit, dns_qpshift_t kbit,
2087 	    dns_qpshift_t fbit) {
2088 	dns_qpweight_t pos = branch_twig_pos(n, sbit);
2089 	if (branch_has_twig(n, sbit)) {
2090 		return pos - (kbit < fbit);
2091 	}
2092 	return pos - 1;
2093 }
2094 
2095 /*
2096  * If dns_qp_lookup() was passed an iterator, we want it to point at the
2097  * matching name in the case of an exact match, or at the predecessor name
2098  * for a non-exact match.
2099  *
2100  * If there is an exact match, then there is nothing to be done. Otherwise,
2101  * we pop up the iterator stack until we find a parent branch with an offset
2102  * that is before the position where the search key differs from the found key.
2103  * From there we can step to the leaf that is the predecessor of the searched
2104  * name.
2105  *
2106  * Requires the iterator to be pointing at a leaf node.
2107  */
2108 static void
2109 fix_iterator(dns_qpreader_t *qp, dns_qpiter_t *it, dns_qpkey_t key,
2110 	     size_t len) {
2111 	dns_qpnode_t *n = it->stack[it->sp];
2112 
2113 	REQUIRE(!is_branch(n));
2114 
2115 	dns_qpkey_t found;
2116 	size_t foundlen = leaf_qpkey(qp, n, found);
2117 	size_t to = qpkey_compare(key, len, found, foundlen);
2118 
2119 	/* If the keys are equal, the iterator is already at the right node. */
2120 	if (to == QPKEY_EQUAL) {
2121 		return;
2122 	}
2123 
2124 	/*
2125 	 * Special case: if the key differs even before the root
2126 	 * key offset, it means the name desired either precedes or
2127 	 * follows the entire range of names in the database, and
2128 	 * popping up the stack won't help us, so just move the
2129 	 * iterator one step back from the origin and return.
2130 	 */
2131 	if (to < branch_key_offset(it->stack[0])) {
2132 		dns_qpiter_init(qp, it);
2133 		prevleaf(it);
2134 		return;
2135 	}
2136 
2137 	/*
2138 	 * As long as the branch offset point is after the point where the
2139 	 * key differs, we need to branch up and find a better node.
2140 	 */
2141 	while (it->sp > 0) {
2142 		dns_qpnode_t *b = it->stack[it->sp - 1];
2143 		if (branch_key_offset(b) < to) {
2144 			break;
2145 		}
2146 		it->sp--;
2147 	}
2148 	n = it->stack[it->sp];
2149 
2150 	/*
2151 	 * Either we are now at the correct branch, or we are at the
2152 	 * first unmatched node. Determine the bit position for the
2153 	 * twig we need (sbit).
2154 	 */
2155 	dns_qpshift_t kbit = qpkey_bit(key, len, to);
2156 	dns_qpshift_t fbit = qpkey_bit(found, foundlen, to);
2157 	dns_qpshift_t sbit = 0;
2158 
2159 	if (is_branch(n) && branch_key_offset(n) == to) {
2160 		/* We are on the correct branch now. */
2161 		sbit = kbit;
2162 	} else if (it->sp == 0) {
2163 		/*
2164 		 * We are on the root branch, popping up the stack won't
2165 		 * help us, so just move the iterator one step back from the
2166 		 * origin and return.
2167 		 */
2168 		dns_qpiter_init(qp, it);
2169 		prevleaf(it);
2170 		return;
2171 	} else {
2172 		/* We are at the first unmatched node, pop up the stack. */
2173 		n = it->stack[--it->sp];
2174 		sbit = qpkey_bit(key, len, branch_key_offset(n));
2175 	}
2176 
2177 	INSIST(is_branch(n));
2178 
2179 	prefetch_twigs(qp, n);
2180 	dns_qpnode_t *twigs = branch_twigs(qp, n);
2181 	int toff = twig_offset(n, sbit, kbit, fbit);
2182 	if (toff >= 0) {
2183 		/*
2184 		 * The name we want would've been after some twig in
2185 		 * this branch. Walk down from that twig to the
2186 		 * highest leaf in its subtree to get the predecessor.
2187 		 */
2188 		greatest_leaf(qp, twigs + toff, it);
2189 	} else {
2190 		/*
2191 		 * Every leaf below this node is greater than the one we
2192 		 * wanted, so the previous leaf is the predecessor.
2193 		 */
2194 		prevleaf(it);
2195 	}
2196 }
2197 
2198 /*
2199  * When searching for a requested name in dns_qp_lookup(), we might add
2200  * a leaf node to the chain, then subsequently determine that it was a
2201  * dead end. When this happens, the chain can be left holding a node
2202  * that is *not* an ancestor of the requested name. We correct for that
2203  * here.
2204  */
2205 static void
2206 fix_chain(dns_qpchain_t *chain, size_t offset) {
2207 	while (chain->len > 0 && chain->chain[chain->len - 1].offset >= offset)
2208 	{
2209 		chain->len--;
2210 		chain->chain[chain->len].node = NULL;
2211 		chain->chain[chain->len].offset = 0;
2212 	}
2213 }
2214 
2215 isc_result_t
2216 dns_qp_lookup(dns_qpreadable_t qpr, const dns_name_t *name,
2217 	      dns_name_t *foundname, dns_qpiter_t *iter, dns_qpchain_t *chain,
2218 	      void **pval_r, uint32_t *ival_r) {
2219 	dns_qpreader_t *qp = dns_qpreader(qpr);
2220 	dns_qpkey_t search, found;
2221 	size_t searchlen, foundlen;
2222 	size_t offset = 0;
2223 	dns_qpnode_t *n = NULL;
2224 	dns_qpshift_t bit = SHIFT_NOBYTE;
2225 	dns_qpchain_t oc;
2226 	dns_qpiter_t it;
2227 	bool matched = false;
2228 	bool setiter = true;
2229 
2230 	REQUIRE(QP_VALID(qp));
2231 	REQUIRE(foundname == NULL || ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
2232 
2233 	searchlen = dns_qpkey_fromname(search, name);
2234 
2235 	if (chain == NULL) {
2236 		chain = &oc;
2237 	}
2238 	if (iter == NULL) {
2239 		iter = &it;
2240 		setiter = false;
2241 	}
2242 	dns_qpchain_init(qp, chain);
2243 	dns_qpiter_init(qp, iter);
2244 
2245 	n = get_root(qp);
2246 	if (n == NULL) {
2247 		return ISC_R_NOTFOUND;
2248 	}
2249 	iter->stack[0] = n;
2250 
2251 	/*
2252 	 * Like `dns_qp_insert()`, we must find a leaf. However, we don't make a
2253 	 * second pass: instead, we keep track of any leaves with shorter keys
2254 	 * that we discover along the way. (In general, qp-trie searches can be
2255 	 * one-pass, by recording their traversal, or two-pass, for less stack
2256 	 * memory usage.)
2257 	 */
2258 	while (is_branch(n)) {
2259 		prefetch_twigs(qp, n);
2260 
2261 		offset = branch_key_offset(n);
2262 		bit = qpkey_bit(search, searchlen, offset);
2263 		dns_qpnode_t *twigs = branch_twigs(qp, n);
2264 
2265 		/*
2266 		 * A shorter key that can be a parent domain always has a
2267 		 * leaf node at SHIFT_NOBYTE (indicating end of its key)
2268 		 * where our search key has a normal character immediately
2269 		 * after a label separator.
2270 		 *
2271 		 * Note 1: It is OK if `off - 1` underflows: it will
2272 		 * become SIZE_MAX, which is greater than `searchlen`, so
2273 		 * `qpkey_bit()` will return SHIFT_NOBYTE, which is what we
2274 		 * want when `off == 0`.
2275 		 *
2276 		 * Note 2: If SHIFT_NOBYTE twig is present, it will always
2277 		 * be in position 0, the first location in 'twigs'.
2278 		 */
2279 		if (bit != SHIFT_NOBYTE && branch_has_twig(n, SHIFT_NOBYTE) &&
2280 		    qpkey_bit(search, searchlen, offset - 1) == SHIFT_NOBYTE &&
2281 		    !is_branch(twigs))
2282 		{
2283 			add_link(chain, twigs, offset);
2284 		}
2285 
2286 		matched = branch_has_twig(n, bit);
2287 		if (matched) {
2288 			/*
2289 			 * found a match: if it's a branch, we keep
2290 			 * searching, and if it's a leaf, we drop out of
2291 			 * the loop.
2292 			 */
2293 			n = branch_twig_ptr(qp, n, bit);
2294 		} else {
2295 			/*
2296 			 * this branch is a dead end, and the predecessor
2297 			 * doesn't matter. now we just need to find a leaf
2298 			 * to end on so that qpkey_leaf() will work below.
2299 			 */
2300 			n = anyleaf(qp, twigs);
2301 		}
2302 
2303 		iter->stack[++iter->sp] = n;
2304 	}
2305 
2306 	if (setiter) {
2307 		/*
2308 		 * we found a leaf, but it might not be the leaf we wanted.
2309 		 * if it isn't, and if the caller passed us an iterator,
2310 		 * then we might need to reposition it.
2311 		 */
2312 		fix_iterator(qp, iter, search, searchlen);
2313 		n = iter->stack[iter->sp];
2314 	}
2315 
2316 	/* at this point, n can only be a leaf node */
2317 	INSIST(!is_branch(n));
2318 
2319 	foundlen = leaf_qpkey(qp, n, found);
2320 	offset = qpkey_compare(search, searchlen, found, foundlen);
2321 
2322 	/* the search ended with an exact or partial match */
2323 	if (offset == QPKEY_EQUAL || offset == foundlen) {
2324 		isc_result_t result = ISC_R_SUCCESS;
2325 
2326 		if (offset == foundlen) {
2327 			fix_chain(chain, offset);
2328 			result = DNS_R_PARTIALMATCH;
2329 		}
2330 		add_link(chain, n, offset);
2331 
2332 		SET_IF_NOT_NULL(pval_r, leaf_pval(n));
2333 		SET_IF_NOT_NULL(ival_r, leaf_ival(n));
2334 		maybe_set_name(qp, n, foundname);
2335 		return result;
2336 	}
2337 
2338 	/*
2339 	 * the requested name was not found, but if an ancestor
2340 	 * was, we can retrieve that from the chain.
2341 	 */
2342 	int len = chain->len;
2343 	while (len-- > 0) {
2344 		if (offset >= chain->chain[len].offset) {
2345 			n = chain->chain[len].node;
2346 			SET_IF_NOT_NULL(pval_r, leaf_pval(n));
2347 			SET_IF_NOT_NULL(ival_r, leaf_ival(n));
2348 			maybe_set_name(qp, n, foundname);
2349 			return DNS_R_PARTIALMATCH;
2350 		} else {
2351 			/*
2352 			 * oops, during the search we found and added
2353 			 * a leaf that's longer than the requested
2354 			 * name; remove it from the chain.
2355 			 */
2356 			chain->len--;
2357 		}
2358 	}
2359 
2360 	/* nothing was found at all */
2361 	return ISC_R_NOTFOUND;
2362 }
2363 
2364 /**********************************************************************/
2365