1 /* $NetBSD: qp.c,v 1.4 2025/01/27 15:40:36 christos Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16 /* 17 * For an overview, see doc/design/qp-trie.md 18 */ 19 20 #include <inttypes.h> 21 #include <stdbool.h> 22 #include <stddef.h> 23 #include <stdint.h> 24 #include <string.h> 25 26 #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 27 #include <sys/mman.h> 28 #include <unistd.h> 29 #endif 30 31 #include <isc/atomic.h> 32 #include <isc/buffer.h> 33 #include <isc/magic.h> 34 #include <isc/mem.h> 35 #include <isc/mutex.h> 36 #include <isc/refcount.h> 37 #include <isc/result.h> 38 #include <isc/rwlock.h> 39 #include <isc/tid.h> 40 #include <isc/time.h> 41 #include <isc/types.h> 42 #include <isc/urcu.h> 43 #include <isc/util.h> 44 45 #include <dns/fixedname.h> 46 #include <dns/log.h> 47 #include <dns/name.h> 48 #include <dns/qp.h> 49 #include <dns/types.h> 50 51 #include "qp_p.h" 52 53 #ifndef DNS_QP_LOG_STATS 54 #define DNS_QP_LOG_STATS 1 55 #endif 56 #ifndef DNS_QP_TRACE 57 #define DNS_QP_TRACE 0 58 #endif 59 60 /* 61 * very basic garbage collector statistics 62 * 63 * XXXFANF for now we're logging GC times, but ideally we should 64 * accumulate stats more quietly and report via the statschannel 65 */ 66 #ifdef _LP64 67 static atomic_uint_fast64_t compact_time; 68 static atomic_uint_fast64_t recycle_time; 69 static atomic_uint_fast64_t rollback_time; 70 #define ISC_QP_ADD(v, a) atomic_fetch_add_relaxed(&(v), (a)) 71 #define ISC_QP_GET(v) atomic_load_relaxed(&(v)) 72 #else 73 static uint64_t compact_time; 74 static uint64_t recycle_time; 75 static uint64_t rollback_time; 76 static isc_mutex_t qp_mutex = PTHREAD_MUTEX_INITIALIZER; 77 #define ISC_QP_ADD(v, a) \ 78 ({ \ 79 isc_mutex_lock(&qp_mutex); \ 80 uint64_t x = (v) + (a); \ 81 isc_mutex_unlock(&qp_mutex); \ 82 x; \ 83 }) 84 #define ISC_QP_GET(v) \ 85 ({ \ 86 isc_mutex_lock(&qp_mutex); \ 87 uint64_t x = (v); \ 88 isc_mutex_unlock(&qp_mutex); \ 89 x; \ 90 }) 91 #endif 92 93 94 /* for LOG_STATS() format strings */ 95 #define PRItime " %" PRIu64 " ns " 96 97 #if DNS_QP_LOG_STATS 98 #define LOG_STATS(...) \ 99 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \ 100 ISC_LOG_DEBUG(1), __VA_ARGS__) 101 #else 102 #define LOG_STATS(...) 103 #endif 104 105 #if DNS_QP_TRACE 106 /* 107 * TRACE is generally used in allocation-related functions so it doesn't 108 * trace very high-frequency ops 109 */ 110 #define TRACE(fmt, ...) \ 111 do { \ 112 if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(7))) { \ 113 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, \ 114 DNS_LOGMODULE_QP, ISC_LOG_DEBUG(7), \ 115 "%s:%d:%s(qp %p uctx \"%s\"):t%u: " fmt, \ 116 __FILE__, __LINE__, __func__, qp, \ 117 qp ? TRIENAME(qp) : "(null)", isc_tid(), \ 118 ##__VA_ARGS__); \ 119 } \ 120 } while (0) 121 #else 122 #define TRACE(...) 123 #endif 124 125 /*********************************************************************** 126 * 127 * converting DNS names to trie keys 128 */ 129 130 /* 131 * Number of distinct byte values, i.e. 
256 132 */ 133 #define BYTE_VALUES (UINT8_MAX + 1) 134 135 /* 136 * Lookup table mapping bytes in DNS names to bit positions, used 137 * by dns_qpkey_fromname() to convert DNS names to qp-trie keys. 138 * 139 * Each element holds one or two bit positions, bit_one in the 140 * lower half and bit_two in the upper half. 141 * 142 * For common hostname characters, bit_two is zero (which cannot 143 * be a valid bit position). 144 * 145 * For others, bit_one is the escape bit, and bit_two is the 146 * position of the character within the escaped range. 147 */ 148 uint16_t dns_qp_bits_for_byte[BYTE_VALUES] = { 0 }; 149 150 /* 151 * And the reverse, mapping bit positions to characters, so the tests 152 * can print diagnostics involving qp-trie keys. 153 * 154 * This table only handles the first bit in an escape sequence; we 155 * arrange that we can calculate the byte value for both bits by 156 * adding the the second bit to the first bit's byte value. 157 */ 158 uint8_t dns_qp_byte_for_bit[SHIFT_OFFSET] = { 0 }; 159 160 /* 161 * Fill in the lookup tables at program startup. (It doesn't matter 162 * when this is initialized relative to other startup code.) 163 */ 164 static void 165 initialize_bits_for_byte(void) ISC_CONSTRUCTOR; 166 167 /* 168 * The bit positions for bytes inside labels have to be between 169 * SHIFT_BITMAP and SHIFT_OFFSET. (SHIFT_NOBYTE separates labels.) 170 * 171 * Each byte range in between common hostname characters has a different 172 * escape character, to preserve the correct lexical order. 173 * 174 * Escaped byte ranges mostly fit into the space available in the 175 * bitmap, except for those above 'z' (which is mostly bytes with the 176 * top bit set). So, when we reach the end of the bitmap we roll over 177 * to the next escape character. 178 * 179 * After filling the table we ensure that the bit positions for 180 * hostname characters and escape characters all fit. 181 */ 182 static void 183 initialize_bits_for_byte(void) { 184 /* zero common character marker not a valid shift position */ 185 INSIST(0 < SHIFT_BITMAP); 186 /* first bit is common byte or escape byte */ 187 dns_qpshift_t bit_one = SHIFT_BITMAP; 188 /* second bit is position in escaped range */ 189 dns_qpshift_t bit_two = SHIFT_BITMAP; 190 bool escaping = true; 191 192 for (unsigned int byte = 0; byte < BYTE_VALUES; byte++) { 193 if (qp_common_character(byte)) { 194 escaping = false; 195 bit_one++; 196 dns_qp_byte_for_bit[bit_one] = byte; 197 dns_qp_bits_for_byte[byte] = bit_one; 198 } else if ('A' <= byte && byte <= 'Z') { 199 /* map upper case to lower case */ 200 dns_qpshift_t after_esc = bit_one + 1; 201 dns_qpshift_t skip_punct = 'a' - '_'; 202 dns_qpshift_t letter = byte - 'A'; 203 dns_qpshift_t bit = after_esc + skip_punct + letter; 204 dns_qp_bits_for_byte[byte] = bit; 205 /* to simplify reverse conversion */ 206 bit_two++; 207 } else { 208 /* non-hostname characters need to be escaped */ 209 if (!escaping || bit_two >= SHIFT_OFFSET) { 210 escaping = true; 211 bit_one++; 212 dns_qp_byte_for_bit[bit_one] = byte; 213 bit_two = SHIFT_BITMAP; 214 } 215 dns_qp_bits_for_byte[byte] = bit_two << 8 | bit_one; 216 bit_two++; 217 } 218 } 219 ENSURE(bit_one < SHIFT_OFFSET); 220 } 221 222 /* 223 * Convert a DNS name into a trie lookup key. 224 * 225 * Returns the length of the key. 226 * 227 * For performance we get our hands dirty in the guts of the name. 228 * 229 * We don't worry about the distinction between absolute and relative 230 * names. 
When the trie is only used with absolute names, the first byte 231 * of the key will always be SHIFT_NOBYTE and it will always be skipped 232 * when traversing the trie. So keeping the root label costs little, and 233 * it allows us to support tries of relative names too. In fact absolute 234 * and relative names can be mixed in the same trie without causing 235 * confusion, because the presence or absence of the initial 236 * SHIFT_NOBYTE in the key disambiguates them (exactly like a trailing 237 * dot in a zone file). 238 */ 239 size_t 240 dns_qpkey_fromname(dns_qpkey_t key, const dns_name_t *name) { 241 size_t len, label; 242 dns_fixedname_t fixed; 243 244 REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC)); 245 246 if (name->labels == 0) { 247 key[0] = SHIFT_NOBYTE; 248 return 0; 249 } 250 251 if (name->offsets == NULL) { 252 dns_name_t *clone = dns_fixedname_initname(&fixed); 253 dns_name_clone(name, clone); 254 name = clone; 255 } 256 257 len = 0; 258 label = name->labels; 259 while (label-- > 0) { 260 const uint8_t *ldata = name->ndata + name->offsets[label]; 261 size_t label_len = *ldata++; 262 while (label_len-- > 0) { 263 uint16_t bits = dns_qp_bits_for_byte[*ldata++]; 264 key[len++] = bits & 0xFF; /* bit_one */ 265 if ((bits >> 8) != 0) { /* escape? */ 266 key[len++] = bits >> 8; /* bit_two */ 267 } 268 } 269 /* label terminator */ 270 key[len++] = SHIFT_NOBYTE; 271 } 272 /* mark end with a double NOBYTE */ 273 key[len] = SHIFT_NOBYTE; 274 ENSURE(len < sizeof(dns_qpkey_t)); 275 return len; 276 } 277 278 void 279 dns_qpkey_toname(const dns_qpkey_t key, size_t keylen, dns_name_t *name) { 280 size_t locs[DNS_NAME_MAXLABELS]; 281 size_t loc = 0, opos = 0; 282 size_t offset; 283 284 REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC)); 285 REQUIRE(name->buffer != NULL); 286 REQUIRE(name->offsets != NULL); 287 288 dns_name_reset(name); 289 290 if (keylen == 0) { 291 return; 292 } 293 294 /* Scan the key looking for label boundaries */ 295 for (offset = 0; offset <= keylen; offset++) { 296 INSIST(key[offset] >= SHIFT_NOBYTE && 297 key[offset] < SHIFT_OFFSET); 298 INSIST(loc < DNS_NAME_MAXLABELS); 299 if (qpkey_bit(key, keylen, offset) == SHIFT_NOBYTE) { 300 if (qpkey_bit(key, keylen, offset + 1) == SHIFT_NOBYTE) 301 { 302 locs[loc] = offset + 1; 303 goto scanned; 304 } 305 locs[loc++] = offset + 1; 306 } else if (offset == 0) { 307 /* This happens for a relative name */ 308 locs[loc++] = offset; 309 } 310 } 311 UNREACHABLE(); 312 scanned: 313 314 /* 315 * In the key the labels are encoded in reverse order, so 316 * we step backward through the label boundaries, then forward 317 * through the labels, to create the DNS wire format data. 
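 * (A hypothetical example: in the key built from "www.example", the
 * label "example" is encoded before "www", so stepping backward
 * through locs[] emits "www" first, which is the order the wire
 * format expects.)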
318 */ 319 name->labels = loc; 320 while (loc-- > 0) { 321 uint8_t len = 0, *lenp = NULL; 322 323 /* Add a length byte to the name data and set an offset */ 324 lenp = isc_buffer_used(name->buffer); 325 isc_buffer_putuint8(name->buffer, 0); 326 name->offsets[opos++] = name->length++; 327 328 /* Convert from escaped byte ranges to ASCII */ 329 for (offset = locs[loc]; offset < locs[loc + 1] - 1; offset++) { 330 uint8_t bit = qpkey_bit(key, keylen, offset); 331 uint8_t byte = dns_qp_byte_for_bit[bit]; 332 if (qp_common_character(byte)) { 333 isc_buffer_putuint8(name->buffer, byte); 334 } else { 335 byte += key[++offset] - SHIFT_BITMAP; 336 isc_buffer_putuint8(name->buffer, byte); 337 } 338 len++; 339 } 340 341 name->length += len; 342 *lenp = len; 343 } 344 345 /* Add a root label for absolute names */ 346 if (key[0] == SHIFT_NOBYTE) { 347 name->attributes.absolute = true; 348 isc_buffer_putuint8(name->buffer, 0); 349 name->offsets[opos++] = name->length++; 350 name->labels++; 351 } 352 353 name->ndata = isc_buffer_base(name->buffer); 354 } 355 356 /* 357 * Sentinel value for equal keys 358 */ 359 #define QPKEY_EQUAL (~(size_t)0) 360 361 /* 362 * Compare two keys and return the offset where they differ. 363 * 364 * This offset is used to work out where a trie search diverged: when one 365 * of the keys is in the trie and one is not, the common prefix (up to the 366 * offset) is the part of the unknown key that exists in the trie. This 367 * matters for adding new keys or finding neighbours of missing keys. 368 * 369 * When the keys are different lengths it is possible (but unwise) for 370 * the longer key to be the same as the shorter key but with superfluous 371 * trailing SHIFT_NOBYTE elements. This makes the keys equal for the 372 * purpose of traversing the trie. 373 */ 374 static size_t 375 qpkey_compare(const dns_qpkey_t key_a, const size_t keylen_a, 376 const dns_qpkey_t key_b, const size_t keylen_b) { 377 size_t keylen = ISC_MAX(keylen_a, keylen_b); 378 for (size_t offset = 0; offset < keylen; offset++) { 379 if (qpkey_bit(key_a, keylen_a, offset) != 380 qpkey_bit(key_b, keylen_b, offset)) 381 { 382 return offset; 383 } 384 } 385 return QPKEY_EQUAL; 386 } 387 388 /*********************************************************************** 389 * 390 * allocator wrappers 391 */ 392 393 #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 394 395 /* 396 * Optionally (for debugging) during a copy-on-write transaction, use 397 * memory protection to ensure that the shared chunks are not modified. 398 * Once a chunk becomes shared, it remains read-only until it is freed. 399 * POSIX says we have to use mmap() to get an allocation that we can 400 * definitely pass to mprotect(). 
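 * (isc_mem_allocate() memory is not guaranteed to be page-aligned,
 * and POSIX leaves mprotect() unspecified for memory that did not
 * come from mmap(), hence the dedicated mmap()/munmap() path below
 * when write_protect is enabled.)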
401 */ 402 403 static size_t 404 chunk_size_raw(void) { 405 size_t size = (size_t)sysconf(_SC_PAGE_SIZE); 406 return ISC_MAX(size, QP_CHUNK_BYTES); 407 } 408 409 static void * 410 chunk_get_raw(dns_qp_t *qp) { 411 if (qp->write_protect) { 412 size_t size = chunk_size_raw(); 413 void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, 414 MAP_ANON | MAP_PRIVATE, -1, 0); 415 RUNTIME_CHECK(ptr != MAP_FAILED); 416 return ptr; 417 } else { 418 return isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES); 419 } 420 } 421 422 static void 423 chunk_free_raw(dns_qp_t *qp, void *ptr) { 424 if (qp->write_protect) { 425 RUNTIME_CHECK(munmap(ptr, chunk_size_raw()) == 0); 426 } else { 427 isc_mem_free(qp->mctx, ptr); 428 } 429 } 430 431 static void * 432 chunk_shrink_raw(dns_qp_t *qp, void *ptr, size_t bytes) { 433 if (qp->write_protect) { 434 return ptr; 435 } else { 436 return isc_mem_reallocate(qp->mctx, ptr, bytes); 437 } 438 } 439 440 static void 441 write_protect(dns_qp_t *qp, dns_qpchunk_t chunk) { 442 if (qp->write_protect) { 443 /* see transaction_open() wrt this special case */ 444 if (qp->transaction_mode == QP_WRITE && chunk == qp->bump) { 445 return; 446 } 447 TRACE("chunk %u", chunk); 448 void *ptr = qp->base->ptr[chunk]; 449 size_t size = chunk_size_raw(); 450 RUNTIME_CHECK(mprotect(ptr, size, PROT_READ) >= 0); 451 } 452 } 453 454 #else 455 456 #define chunk_get_raw(qp) isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES) 457 #define chunk_free_raw(qp, ptr) isc_mem_free(qp->mctx, ptr) 458 459 #define chunk_shrink_raw(qp, ptr, size) isc_mem_reallocate(qp->mctx, ptr, size) 460 461 #define write_protect(qp, chunk) 462 463 #endif 464 465 /*********************************************************************** 466 * 467 * allocator 468 */ 469 470 /* 471 * When we reuse the bump chunk across multiple write transactions, 472 * it can have an immutable prefix and a mutable suffix. 473 */ 474 static inline bool 475 cells_immutable(dns_qp_t *qp, dns_qpref_t ref) { 476 dns_qpchunk_t chunk = ref_chunk(ref); 477 dns_qpcell_t cell = ref_cell(ref); 478 if (chunk == qp->bump) { 479 return cell < qp->fender; 480 } else { 481 return qp->usage[chunk].immutable; 482 } 483 } 484 485 /* 486 * Create a fresh bump chunk and allocate some twigs from it. 487 */ 488 static dns_qpref_t 489 chunk_alloc(dns_qp_t *qp, dns_qpchunk_t chunk, dns_qpweight_t size) { 490 INSIST(qp->base->ptr[chunk] == NULL); 491 INSIST(qp->usage[chunk].used == 0); 492 INSIST(qp->usage[chunk].free == 0); 493 494 qp->base->ptr[chunk] = chunk_get_raw(qp); 495 qp->usage[chunk] = (qp_usage_t){ .exists = true, .used = size }; 496 qp->used_count += size; 497 qp->bump = chunk; 498 qp->fender = 0; 499 500 if (qp->write_protect) { 501 TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]); 502 } 503 return make_ref(chunk, 0); 504 } 505 506 /* 507 * This is used to grow the chunk arrays when they fill up. If the old 508 * base array is in use by readers, we must make a clone, otherwise we 509 * can reallocate in place. 510 * 511 * The isc_refcount_init() and qpbase_unref() in this function are a pair. 
512 */ 513 static void 514 realloc_chunk_arrays(dns_qp_t *qp, dns_qpchunk_t newmax) { 515 size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max; 516 size_t newptrs = sizeof(qp->base->ptr[0]) * newmax; 517 size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax); 518 519 if (qp->base == NULL || qpbase_unref(qp)) { 520 qp->base = isc_mem_reallocate(qp->mctx, qp->base, size); 521 } else { 522 dns_qpbase_t *oldbase = qp->base; 523 qp->base = isc_mem_allocate(qp->mctx, size); 524 memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs); 525 } 526 memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs); 527 isc_refcount_init(&qp->base->refcount, 1); 528 qp->base->magic = QPBASE_MAGIC; 529 530 /* usage array is exclusive to the writer */ 531 size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max; 532 size_t newusage = sizeof(qp->usage[0]) * newmax; 533 qp->usage = isc_mem_reallocate(qp->mctx, qp->usage, newusage); 534 memset(&qp->usage[qp->chunk_max], 0, newusage - oldusage); 535 536 qp->chunk_max = newmax; 537 538 TRACE("qpbase %p usage %p max %u", qp->base, qp->usage, qp->chunk_max); 539 } 540 541 /* 542 * There was no space in the bump chunk, so find a place to put a fresh 543 * chunk in the chunk arrays, then allocate some twigs from it. 544 */ 545 static dns_qpref_t 546 alloc_slow(dns_qp_t *qp, dns_qpweight_t size) { 547 dns_qpchunk_t chunk; 548 549 for (chunk = 0; chunk < qp->chunk_max; chunk++) { 550 if (!qp->usage[chunk].exists) { 551 return chunk_alloc(qp, chunk, size); 552 } 553 } 554 ENSURE(chunk == qp->chunk_max); 555 realloc_chunk_arrays(qp, GROWTH_FACTOR(chunk)); 556 return chunk_alloc(qp, chunk, size); 557 } 558 559 /* 560 * Ensure we are using a fresh bump chunk. 561 */ 562 static void 563 alloc_reset(dns_qp_t *qp) { 564 (void)alloc_slow(qp, 0); 565 } 566 567 /* 568 * Allocate some fresh twigs. This is the bump allocator fast path. 569 */ 570 static inline dns_qpref_t 571 alloc_twigs(dns_qp_t *qp, dns_qpweight_t size) { 572 dns_qpchunk_t chunk = qp->bump; 573 dns_qpcell_t cell = qp->usage[chunk].used; 574 575 if (cell + size <= QP_CHUNK_SIZE) { 576 qp->usage[chunk].used += size; 577 qp->used_count += size; 578 return make_ref(chunk, cell); 579 } else { 580 return alloc_slow(qp, size); 581 } 582 } 583 584 /* 585 * Record that some twigs are no longer being used, and if possible 586 * zero them to ensure that there isn't a spurious double detach when 587 * the chunk is later recycled. 588 * 589 * Returns true if the twigs were immediately destroyed. 590 * 591 * NOTE: the caller is responsible for attaching or detaching any 592 * leaves as required. 593 */ 594 static inline bool 595 free_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) { 596 dns_qpchunk_t chunk = ref_chunk(twigs); 597 598 qp->free_count += size; 599 qp->usage[chunk].free += size; 600 ENSURE(qp->free_count <= qp->used_count); 601 ENSURE(qp->usage[chunk].free <= qp->usage[chunk].used); 602 603 if (cells_immutable(qp, twigs)) { 604 qp->hold_count += size; 605 ENSURE(qp->free_count >= qp->hold_count); 606 return false; 607 } else { 608 zero_twigs(ref_ptr(qp, twigs), size); 609 return true; 610 } 611 } 612 613 /* 614 * When some twigs have been copied, and free_twigs() could not 615 * immediately destroy the old copy, we need to update the refcount 616 * on any leaves that were duplicated. 
617 */ 618 static void 619 attach_twigs(dns_qp_t *qp, dns_qpnode_t *twigs, dns_qpweight_t size) { 620 for (dns_qpweight_t pos = 0; pos < size; pos++) { 621 if (node_tag(&twigs[pos]) == LEAF_TAG) { 622 attach_leaf(qp, &twigs[pos]); 623 } 624 } 625 } 626 627 /*********************************************************************** 628 * 629 * chunk reclamation 630 */ 631 632 /* 633 * Is any of this chunk still in use? 634 */ 635 static inline dns_qpcell_t 636 chunk_usage(dns_qp_t *qp, dns_qpchunk_t chunk) { 637 return qp->usage[chunk].used - qp->usage[chunk].free; 638 } 639 640 /* 641 * We remove each empty chunk from the total counts when the chunk is 642 * freed, or when it is scheduled for safe memory reclamation. We check 643 * the chunk's phase to avoid discounting it twice in the latter case. 644 */ 645 static void 646 chunk_discount(dns_qp_t *qp, dns_qpchunk_t chunk) { 647 if (qp->usage[chunk].discounted) { 648 return; 649 } 650 INSIST(qp->used_count >= qp->usage[chunk].used); 651 INSIST(qp->free_count >= qp->usage[chunk].free); 652 qp->used_count -= qp->usage[chunk].used; 653 qp->free_count -= qp->usage[chunk].free; 654 qp->usage[chunk].discounted = true; 655 } 656 657 /* 658 * When a chunk is being recycled, we need to detach any leaves that 659 * remain, and free any `base` arrays that have been marked as unused. 660 */ 661 static void 662 chunk_free(dns_qp_t *qp, dns_qpchunk_t chunk) { 663 if (qp->write_protect) { 664 TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]); 665 } 666 667 dns_qpnode_t *n = qp->base->ptr[chunk]; 668 for (dns_qpcell_t count = qp->usage[chunk].used; count > 0; 669 count--, n++) 670 { 671 if (node_tag(n) == LEAF_TAG && node_pointer(n) != NULL) { 672 detach_leaf(qp, n); 673 } else if (count > 1 && reader_valid(n)) { 674 dns_qpreader_t qpr; 675 unpack_reader(&qpr, n); 676 /* pairs with dns_qpmulti_commit() */ 677 if (qpbase_unref(&qpr)) { 678 isc_mem_free(qp->mctx, qpr.base); 679 } 680 } 681 } 682 chunk_discount(qp, chunk); 683 chunk_free_raw(qp, qp->base->ptr[chunk]); 684 qp->base->ptr[chunk] = NULL; 685 qp->usage[chunk] = (qp_usage_t){}; 686 } 687 688 /* 689 * Free any chunks that we can while a trie is in use. 
690 */ 691 static void 692 recycle(dns_qp_t *qp) { 693 unsigned int free = 0; 694 695 isc_nanosecs_t start = isc_time_monotonic(); 696 697 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 698 if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && 699 qp->usage[chunk].exists && !qp->usage[chunk].immutable) 700 { 701 chunk_free(qp, chunk); 702 free++; 703 } 704 } 705 706 isc_nanosecs_t time = isc_time_monotonic() - start; 707 ISC_QP_ADD(recycle_time, time); 708 709 if (free > 0) { 710 LOG_STATS("qp recycle" PRItime "free %u chunks", time, free); 711 LOG_STATS("qp recycle leaf %u live %u used %u free %u hold %u", 712 qp->leaf_count, qp->used_count - qp->free_count, 713 qp->used_count, qp->free_count, qp->hold_count); 714 } 715 } 716 717 /* 718 * asynchronous cleanup 719 */ 720 static void 721 reclaim_chunks_cb(struct rcu_head *arg) { 722 qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head); 723 REQUIRE(QPRCU_VALID(rcuctx)); 724 dns_qpmulti_t *multi = rcuctx->multi; 725 REQUIRE(QPMULTI_VALID(multi)); 726 727 LOCK(&multi->mutex); 728 729 dns_qp_t *qp = &multi->writer; 730 REQUIRE(QP_VALID(qp)); 731 732 unsigned int free = 0; 733 isc_nanosecs_t start = isc_time_monotonic(); 734 735 for (unsigned int i = 0; i < rcuctx->count; i++) { 736 dns_qpchunk_t chunk = rcuctx->chunk[i]; 737 if (qp->usage[chunk].snapshot) { 738 /* cleanup when snapshot is destroyed */ 739 qp->usage[chunk].snapfree = true; 740 } else { 741 chunk_free(qp, chunk); 742 free++; 743 } 744 } 745 746 isc_mem_putanddetach(&rcuctx->mctx, rcuctx, 747 STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); 748 749 isc_nanosecs_t time = isc_time_monotonic() - start; 750 ISC_QP_ADD(recycle_time, time); 751 752 if (free > 0) { 753 LOG_STATS("qp reclaim" PRItime "free %u chunks", time, free); 754 LOG_STATS("qp reclaim leaf %u live %u used %u free %u hold %u", 755 qp->leaf_count, qp->used_count - qp->free_count, 756 qp->used_count, qp->free_count, qp->hold_count); 757 } 758 759 UNLOCK(&multi->mutex); 760 } 761 762 /* 763 * At the end of a transaction, schedule empty but immutable chunks 764 * for reclamation later. 765 */ 766 static void 767 reclaim_chunks(dns_qpmulti_t *multi) { 768 dns_qp_t *qp = &multi->writer; 769 770 unsigned int count = 0; 771 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 772 if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && 773 qp->usage[chunk].exists && qp->usage[chunk].immutable && 774 !qp->usage[chunk].discounted) 775 { 776 count++; 777 } 778 } 779 780 if (count == 0) { 781 return; 782 } 783 784 qp_rcuctx_t *rcuctx = 785 isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count)); 786 *rcuctx = (qp_rcuctx_t){ 787 .magic = QPRCU_MAGIC, 788 .multi = multi, 789 .count = count, 790 }; 791 isc_mem_attach(qp->mctx, &rcuctx->mctx); 792 793 unsigned int i = 0; 794 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 795 if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && 796 qp->usage[chunk].exists && qp->usage[chunk].immutable && 797 !qp->usage[chunk].discounted) 798 { 799 rcuctx->chunk[i++] = chunk; 800 chunk_discount(qp, chunk); 801 } 802 } 803 804 call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb); 805 806 LOG_STATS("qp will reclaim %u chunks", count); 807 } 808 809 /* 810 * When a snapshot is destroyed, clean up chunks that need free()ing 811 * and are not used by any remaining snapshots. 
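 * The two loops below are the mark and sweep phases: the first marks
 * (snapmark) every chunk still reachable through some snapshot's base
 * array; the second turns the marks into the new snapshot flags and
 * frees any chunk whose deferred reclamation (snapfree) is no longer
 * blocked by a snapshot.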
812 */ 813 static void 814 marksweep_chunks(dns_qpmulti_t *multi) { 815 unsigned int free = 0; 816 817 isc_nanosecs_t start = isc_time_monotonic(); 818 819 dns_qp_t *qpw = &multi->writer; 820 821 for (dns_qpsnap_t *qps = ISC_LIST_HEAD(multi->snapshots); qps != NULL; 822 qps = ISC_LIST_NEXT(qps, link)) 823 { 824 for (dns_qpchunk_t chunk = 0; chunk < qps->chunk_max; chunk++) { 825 if (qps->base->ptr[chunk] != NULL) { 826 INSIST(qps->base->ptr[chunk] == 827 qpw->base->ptr[chunk]); 828 qpw->usage[chunk].snapmark = true; 829 } 830 } 831 } 832 833 for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) { 834 qpw->usage[chunk].snapshot = qpw->usage[chunk].snapmark; 835 qpw->usage[chunk].snapmark = false; 836 if (qpw->usage[chunk].snapfree && !qpw->usage[chunk].snapshot) { 837 chunk_free(qpw, chunk); 838 free++; 839 } 840 } 841 842 isc_nanosecs_t time = isc_time_monotonic() - start; 843 ISC_QP_ADD(recycle_time, time); 844 845 if (free > 0) { 846 LOG_STATS("qp marksweep" PRItime "free %u chunks", time, free); 847 LOG_STATS( 848 "qp marksweep leaf %u live %u used %u free %u hold %u", 849 qpw->leaf_count, qpw->used_count - qpw->free_count, 850 qpw->used_count, qpw->free_count, qpw->hold_count); 851 } 852 } 853 854 /*********************************************************************** 855 * 856 * garbage collector 857 */ 858 859 /* 860 * Move a branch node's twigs to the `bump` chunk, for copy-on-write 861 * or for garbage collection. We don't update the node in place 862 * because `compact_recursive()` does not ensure the node itself is 863 * mutable until after it discovers evacuation was necessary. 864 * 865 * If free_twigs() could not immediately destroy the old twigs, we have 866 * to re-attach to any leaves. 867 */ 868 static dns_qpref_t 869 evacuate(dns_qp_t *qp, dns_qpnode_t *n) { 870 dns_qpweight_t size = branch_twigs_size(n); 871 dns_qpref_t old_ref = branch_twigs_ref(n); 872 dns_qpref_t new_ref = alloc_twigs(qp, size); 873 dns_qpnode_t *old_twigs = ref_ptr(qp, old_ref); 874 dns_qpnode_t *new_twigs = ref_ptr(qp, new_ref); 875 876 move_twigs(new_twigs, old_twigs, size); 877 if (!free_twigs(qp, old_ref, size)) { 878 attach_twigs(qp, new_twigs, size); 879 } 880 881 return new_ref; 882 } 883 884 /* 885 * Immutable nodes need copy-on-write. As we walk down the trie finding the 886 * right place to modify, make_root_mutable() and make_twigs_mutable() 887 * are called to ensure that immutable nodes on the path from the root are 888 * copied to a mutable chunk. 889 */ 890 891 static inline dns_qpnode_t * 892 make_root_mutable(dns_qp_t *qp) { 893 if (cells_immutable(qp, qp->root_ref)) { 894 qp->root_ref = evacuate(qp, MOVABLE_ROOT(qp)); 895 } 896 return ref_ptr(qp, qp->root_ref); 897 } 898 899 static inline void 900 make_twigs_mutable(dns_qp_t *qp, dns_qpnode_t *n) { 901 if (cells_immutable(qp, branch_twigs_ref(n))) { 902 *n = make_node(branch_index(n), evacuate(qp, n)); 903 } 904 } 905 906 /* 907 * Compact the trie by traversing the whole thing recursively, copying 908 * bottom-up as required. The aim is to avoid evacuation as much as 909 * possible, but when parts of the trie are immutable, we need to evacuate 910 * the paths from the root to the parts of the trie that occupy 911 * fragmented chunks. 912 * 913 * Without the QP_MIN_USED check, the algorithm will leave the trie 914 * unchanged. If the children are all leaves, the loop changes nothing, 915 * so we will return this node's original ref. 
If all of the children 916 * that are branches did not need moving, again, the loop changes 917 * nothing. So the evacuation check is the only place that the 918 * algorithm introduces ref changes, that then bubble up towards the 919 * root through the logic inside the loop. 920 */ 921 static dns_qpref_t 922 compact_recursive(dns_qp_t *qp, dns_qpnode_t *parent) { 923 dns_qpweight_t size = branch_twigs_size(parent); 924 dns_qpref_t twigs_ref = branch_twigs_ref(parent); 925 dns_qpchunk_t chunk = ref_chunk(twigs_ref); 926 927 if (qp->compact_all || 928 (chunk != qp->bump && chunk_usage(qp, chunk) < QP_MIN_USED)) 929 { 930 twigs_ref = evacuate(qp, parent); 931 } 932 bool immutable = cells_immutable(qp, twigs_ref); 933 for (dns_qpweight_t pos = 0; pos < size; pos++) { 934 dns_qpnode_t *child = ref_ptr(qp, twigs_ref) + pos; 935 if (!is_branch(child)) { 936 continue; 937 } 938 dns_qpref_t old_grandtwigs = branch_twigs_ref(child); 939 dns_qpref_t new_grandtwigs = compact_recursive(qp, child); 940 if (old_grandtwigs == new_grandtwigs) { 941 continue; 942 } 943 if (immutable) { 944 twigs_ref = evacuate(qp, parent); 945 /* the twigs have moved */ 946 child = ref_ptr(qp, twigs_ref) + pos; 947 immutable = false; 948 } 949 *child = make_node(branch_index(child), new_grandtwigs); 950 } 951 return twigs_ref; 952 } 953 954 static void 955 compact(dns_qp_t *qp) { 956 LOG_STATS("qp compact before leaf %u live %u used %u free %u hold %u", 957 qp->leaf_count, qp->used_count - qp->free_count, 958 qp->used_count, qp->free_count, qp->hold_count); 959 960 isc_nanosecs_t start = isc_time_monotonic(); 961 962 if (qp->usage[qp->bump].free > QP_MAX_FREE) { 963 alloc_reset(qp); 964 } 965 966 if (qp->leaf_count > 0) { 967 qp->root_ref = compact_recursive(qp, MOVABLE_ROOT(qp)); 968 } 969 qp->compact_all = false; 970 971 isc_nanosecs_t time = isc_time_monotonic() - start; 972 ISC_QP_ADD(compact_time, time); 973 974 LOG_STATS("qp compact" PRItime 975 "leaf %u live %u used %u free %u hold %u", 976 time, qp->leaf_count, qp->used_count - qp->free_count, 977 qp->used_count, qp->free_count, qp->hold_count); 978 } 979 980 void 981 dns_qp_compact(dns_qp_t *qp, dns_qpgc_t mode) { 982 REQUIRE(QP_VALID(qp)); 983 if (mode == DNS_QPGC_MAYBE && !QP_NEEDGC(qp)) { 984 return; 985 } 986 if (mode == DNS_QPGC_ALL) { 987 alloc_reset(qp); 988 qp->compact_all = true; 989 } 990 compact(qp); 991 recycle(qp); 992 } 993 994 /* 995 * Free some twigs and (if they were destroyed immediately so that the 996 * result from QP_MAX_GARBAGE can change) compact the trie if necessary. 997 * 998 * This is called by the trie modification API entry points. The 999 * free_twigs() function requires the caller to attach or detach any 1000 * leaves as necessary. Callers of squash_twigs() satisfy this 1001 * requirement by calling make_twigs_mutable(). 1002 * 1003 * Aside: In typical garbage collectors, compaction is triggered when 1004 * the allocator runs out of space. But that is because typical garbage 1005 * collectors do not know how much memory can be recovered, so they must 1006 * find out by scanning the heap. The qp-trie code was originally 1007 * designed to use malloc() and free(), so it has more information about 1008 * when garbage collection might be worthwhile. Hence we can trigger 1009 * collection when garbage passes a threshold. 1010 * 1011 * XXXFANF: If we need to avoid latency outliers caused by compaction in 1012 * write transactions, we can check qp->transaction_mode here. 
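 * The actual trigger lives in QP_AUTOGC() in qp_p.h; roughly
 * speaking, the writer already tracks used_count, free_count and
 * hold_count, so it can tell when the immediately-reclaimable part
 * of the trie is large enough to make a compact() plus recycle()
 * pass worthwhile.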
1013 */ 1014 static inline bool 1015 squash_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) { 1016 bool destroyed = free_twigs(qp, twigs, size); 1017 if (destroyed && QP_AUTOGC(qp)) { 1018 compact(qp); 1019 recycle(qp); 1020 /* 1021 * This shouldn't happen if the garbage collector is 1022 * working correctly. We can recover at the cost of some 1023 * time and space, but recovery should be cheaper than 1024 * letting compact+recycle fail repeatedly. 1025 */ 1026 if (QP_AUTOGC(qp)) { 1027 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, 1028 DNS_LOGMODULE_QP, ISC_LOG_NOTICE, 1029 "qp %p uctx \"%s\" compact/recycle " 1030 "failed to recover any space, " 1031 "scheduling a full compaction", 1032 qp, TRIENAME(qp)); 1033 qp->compact_all = true; 1034 } 1035 } 1036 return destroyed; 1037 } 1038 1039 /*********************************************************************** 1040 * 1041 * public accessors for memory management internals 1042 */ 1043 1044 dns_qp_memusage_t 1045 dns_qp_memusage(dns_qp_t *qp) { 1046 REQUIRE(QP_VALID(qp)); 1047 1048 dns_qp_memusage_t memusage = { 1049 .uctx = qp->uctx, 1050 .leaves = qp->leaf_count, 1051 .live = qp->used_count - qp->free_count, 1052 .used = qp->used_count, 1053 .hold = qp->hold_count, 1054 .free = qp->free_count, 1055 .node_size = sizeof(dns_qpnode_t), 1056 .chunk_size = QP_CHUNK_SIZE, 1057 .fragmented = QP_NEEDGC(qp), 1058 }; 1059 1060 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 1061 if (qp->base->ptr[chunk] != NULL) { 1062 memusage.chunk_count += 1; 1063 } 1064 } 1065 1066 /* 1067 * XXXFANF does not subtract chunks that have been shrunk, 1068 * and does not count unreclaimed dns_qpbase_t objects 1069 */ 1070 memusage.bytes = memusage.chunk_count * QP_CHUNK_BYTES + 1071 qp->chunk_max * sizeof(qp->base->ptr[0]) + 1072 qp->chunk_max * sizeof(qp->usage[0]); 1073 1074 return memusage; 1075 } 1076 1077 dns_qp_memusage_t 1078 dns_qpmulti_memusage(dns_qpmulti_t *multi) { 1079 REQUIRE(QPMULTI_VALID(multi)); 1080 LOCK(&multi->mutex); 1081 1082 dns_qp_t *qp = &multi->writer; 1083 INSIST(QP_VALID(qp)); 1084 1085 dns_qp_memusage_t memusage = dns_qp_memusage(qp); 1086 1087 if (qp->transaction_mode == QP_UPDATE) { 1088 memusage.bytes -= QP_CHUNK_BYTES; 1089 memusage.bytes += qp->usage[qp->bump].used * 1090 sizeof(dns_qpnode_t); 1091 } 1092 1093 UNLOCK(&multi->mutex); 1094 return memusage; 1095 } 1096 1097 void 1098 dns_qp_gctime(isc_nanosecs_t *compact_p, isc_nanosecs_t *recycle_p, 1099 isc_nanosecs_t *rollback_p) { 1100 *compact_p = ISC_QP_GET(compact_time); 1101 *recycle_p = ISC_QP_GET(recycle_time); 1102 *rollback_p = ISC_QP_GET(rollback_time); 1103 } 1104 1105 /*********************************************************************** 1106 * 1107 * read-write transactions 1108 */ 1109 1110 static dns_qp_t * 1111 transaction_open(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1112 REQUIRE(QPMULTI_VALID(multi)); 1113 REQUIRE(qptp != NULL && *qptp == NULL); 1114 1115 LOCK(&multi->mutex); 1116 1117 dns_qp_t *qp = &multi->writer; 1118 INSIST(QP_VALID(qp)); 1119 1120 /* 1121 * Mark existing chunks as immutable. 1122 * 1123 * Aside: The bump chunk is special: in a series of write 1124 * transactions the bump chunk is reused; the first part (up 1125 * to fender) is immutable, the rest mutable. But we set its 1126 * immutable flag so that when the bump chunk fills up, the 1127 * first part continues to be treated as immutable. (And the 1128 * rest of the chunk too, but that's OK.) 
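 * For example, if the previous write transaction left 10 cells of
 * the bump chunk used, the next dns_qpmulti_write() sets fender to
 * 10, so cells_immutable() treats cells 0..9 as immutable while
 * cells from 10 onwards can still be modified in place.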
1129 */ 1130 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 1131 if (qp->usage[chunk].exists) { 1132 qp->usage[chunk].immutable = true; 1133 write_protect(qp, chunk); 1134 } 1135 } 1136 1137 /* 1138 * Ensure QP_AUTOGC() ignores free space in immutable chunks. 1139 */ 1140 qp->hold_count = qp->free_count; 1141 1142 *qptp = qp; 1143 return qp; 1144 } 1145 1146 /* 1147 * a write is light 1148 * 1149 * We need to ensure we allocate from a fresh chunk if the last transaction 1150 * shrunk the bump chunk; but usually in a sequence of write transactions 1151 * we just put `fender` at the point where we started this generation. 1152 * 1153 * (Aside: Instead of keeping the previous transaction's mode, I 1154 * considered forcing allocation into the slow path by fiddling with 1155 * the bump chunk's usage counters. But that is troublesome because 1156 * `chunk_free()` needs to know how much of the chunk to scan.) 1157 */ 1158 void 1159 dns_qpmulti_write(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1160 dns_qp_t *qp = transaction_open(multi, qptp); 1161 TRACE(""); 1162 1163 if (qp->transaction_mode == QP_WRITE) { 1164 qp->fender = qp->usage[qp->bump].used; 1165 } else { 1166 alloc_reset(qp); 1167 } 1168 qp->transaction_mode = QP_WRITE; 1169 } 1170 1171 /* 1172 * an update is heavier 1173 * 1174 * We always reset the allocator to the start of a fresh chunk, 1175 * because the previous transaction was probably an update that shrunk 1176 * the bump chunk. It simplifies rollback because `fender` is always zero. 1177 * 1178 * To rollback a transaction, we need to reset all the allocation 1179 * counters to their previous state, in particular we need to un-free 1180 * any nodes that were copied to make them mutable. This means we need 1181 * to make a copy of basically the whole `dns_qp_t writer`: everything 1182 * but the chunks holding the trie nodes. 1183 * 1184 * We do most of the transaction setup before creating the rollback 1185 * state so that after rollback we have a correct idea of which chunks 1186 * are immutable, and so we have the correct transaction mode to make 1187 * the next transaction allocate a new bump chunk. The exception is 1188 * resetting the allocator, which we do after creating the rollback 1189 * state; if this transaction is rolled back then the next transaction 1190 * will start from the rollback state and also reset the allocator as 1191 * one of its first actions. 
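 *
 * A typical caller looks roughly like this (sketch):
 *
 *	dns_qp_t *qp = NULL;
 *	dns_qpmulti_update(multi, &qp);
 *	... calls to dns_qp_insert(), dns_qp_deletename(), etc ...
 *	dns_qpmulti_commit(multi, &qp);
 *	(or dns_qpmulti_rollback(multi, &qp) to discard the changes)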
1192 */ 1193 void 1194 dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1195 dns_qp_t *qp = transaction_open(multi, qptp); 1196 TRACE(""); 1197 1198 qp->transaction_mode = QP_UPDATE; 1199 1200 dns_qp_t *rollback = isc_mem_allocate(qp->mctx, sizeof(*rollback)); 1201 memmove(rollback, qp, sizeof(*rollback)); 1202 /* can be uninitialized on the first transaction */ 1203 if (rollback->base != NULL) { 1204 INSIST(QPBASE_VALID(rollback->base)); 1205 INSIST(qp->usage != NULL && qp->chunk_max > 0); 1206 /* paired with either _commit() or _rollback() */ 1207 isc_refcount_increment(&rollback->base->refcount); 1208 size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max; 1209 rollback->usage = isc_mem_allocate(qp->mctx, usage_bytes); 1210 memmove(rollback->usage, qp->usage, usage_bytes); 1211 } 1212 INSIST(multi->rollback == NULL); 1213 multi->rollback = rollback; 1214 1215 alloc_reset(qp); 1216 } 1217 1218 void 1219 dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1220 REQUIRE(QPMULTI_VALID(multi)); 1221 REQUIRE(qptp != NULL && *qptp == &multi->writer); 1222 REQUIRE(multi->writer.transaction_mode == QP_WRITE || 1223 multi->writer.transaction_mode == QP_UPDATE); 1224 1225 dns_qp_t *qp = *qptp; 1226 TRACE(""); 1227 1228 if (qp->transaction_mode == QP_UPDATE) { 1229 INSIST(multi->rollback != NULL); 1230 /* paired with dns_qpmulti_update() */ 1231 if (qpbase_unref(multi->rollback)) { 1232 isc_mem_free(qp->mctx, multi->rollback->base); 1233 } 1234 if (multi->rollback->usage != NULL) { 1235 isc_mem_free(qp->mctx, multi->rollback->usage); 1236 } 1237 isc_mem_free(qp->mctx, multi->rollback); 1238 } 1239 INSIST(multi->rollback == NULL); 1240 1241 /* not the first commit? */ 1242 if (multi->reader_ref != INVALID_REF) { 1243 INSIST(cells_immutable(qp, multi->reader_ref)); 1244 free_twigs(qp, multi->reader_ref, READER_SIZE); 1245 } 1246 1247 if (qp->transaction_mode == QP_UPDATE) { 1248 /* minimize memory overhead */ 1249 compact(qp); 1250 multi->reader_ref = alloc_twigs(qp, READER_SIZE); 1251 qp->base->ptr[qp->bump] = chunk_shrink_raw( 1252 qp, qp->base->ptr[qp->bump], 1253 qp->usage[qp->bump].used * sizeof(dns_qpnode_t)); 1254 } else { 1255 multi->reader_ref = alloc_twigs(qp, READER_SIZE); 1256 } 1257 1258 /* anchor a new version of the trie */ 1259 dns_qpnode_t *reader = ref_ptr(qp, multi->reader_ref); 1260 make_reader(reader, multi); 1261 /* paired with chunk_free() */ 1262 isc_refcount_increment(&qp->base->refcount); 1263 1264 rcu_assign_pointer(multi->reader, reader); /* COMMIT */ 1265 1266 /* clean up what we can right now */ 1267 if (qp->transaction_mode == QP_UPDATE || QP_NEEDGC(qp)) { 1268 recycle(qp); 1269 } 1270 1271 /* schedule the rest for later */ 1272 reclaim_chunks(multi); 1273 1274 *qptp = NULL; 1275 UNLOCK(&multi->mutex); 1276 } 1277 1278 /* 1279 * Throw away everything that was allocated during this transaction. 
1280 */ 1281 void 1282 dns_qpmulti_rollback(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1283 unsigned int free = 0; 1284 1285 REQUIRE(QPMULTI_VALID(multi)); 1286 REQUIRE(multi->writer.transaction_mode == QP_UPDATE); 1287 REQUIRE(qptp != NULL && *qptp == &multi->writer); 1288 1289 dns_qp_t *qp = *qptp; 1290 TRACE(""); 1291 1292 isc_nanosecs_t start = isc_time_monotonic(); 1293 1294 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 1295 if (qp->base->ptr[chunk] != NULL && !qp->usage[chunk].immutable) 1296 { 1297 chunk_free(qp, chunk); 1298 /* 1299 * we need to clear its base pointer in the rollback 1300 * trie, in case the arrays were resized 1301 */ 1302 if (chunk < multi->rollback->chunk_max) { 1303 INSIST(!multi->rollback->usage[chunk].exists); 1304 multi->rollback->base->ptr[chunk] = NULL; 1305 } 1306 free++; 1307 } 1308 } 1309 1310 /* 1311 * multi->rollback->base and multi->writer->base are the same, 1312 * unless there was a realloc_chunk_arrays() during the transaction 1313 */ 1314 if (qpbase_unref(qp)) { 1315 /* paired with dns_qpmulti_update() */ 1316 isc_mem_free(qp->mctx, qp->base); 1317 } 1318 isc_mem_free(qp->mctx, qp->usage); 1319 1320 /* reset allocator state */ 1321 INSIST(multi->rollback != NULL); 1322 memmove(qp, multi->rollback, sizeof(*qp)); 1323 isc_mem_free(qp->mctx, multi->rollback); 1324 INSIST(multi->rollback == NULL); 1325 1326 isc_nanosecs_t time = isc_time_monotonic() - start; 1327 ISC_QP_ADD(rollback_time, time); 1328 1329 LOG_STATS("qp rollback" PRItime "free %u chunks", time, free); 1330 1331 *qptp = NULL; 1332 UNLOCK(&multi->mutex); 1333 } 1334 1335 /*********************************************************************** 1336 * 1337 * read-only transactions 1338 */ 1339 1340 static dns_qpmulti_t * 1341 reader_open(dns_qpmulti_t *multi, dns_qpreadable_t qpr) { 1342 dns_qpreader_t *qp = dns_qpreader(qpr); 1343 dns_qpnode_t *reader = rcu_dereference(multi->reader); 1344 if (reader == NULL) { 1345 QP_INIT(qp, multi->writer.methods, multi->writer.uctx); 1346 } else { 1347 multi = unpack_reader(qp, reader); 1348 } 1349 return multi; 1350 } 1351 1352 /* 1353 * a query is light 1354 */ 1355 1356 void 1357 dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) { 1358 REQUIRE(QPMULTI_VALID(multi)); 1359 REQUIRE(qp != NULL); 1360 1361 qp->tid = isc_tid(); 1362 rcu_read_lock(); 1363 1364 dns_qpmulti_t *whence = reader_open(multi, qp); 1365 INSIST(whence == multi); 1366 } 1367 1368 void 1369 dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) { 1370 REQUIRE(QPMULTI_VALID(multi)); 1371 REQUIRE(QP_VALID(qp)); 1372 REQUIRE(qp->tid == isc_tid()); 1373 *qp = (dns_qpread_t){}; 1374 rcu_read_unlock(); 1375 } 1376 1377 /* 1378 * a snapshot is heavy 1379 */ 1380 1381 void 1382 dns_qpmulti_snapshot(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) { 1383 REQUIRE(QPMULTI_VALID(multi)); 1384 REQUIRE(qpsp != NULL && *qpsp == NULL); 1385 1386 rcu_read_lock(); 1387 1388 LOCK(&multi->mutex); 1389 1390 dns_qp_t *qpw = &multi->writer; 1391 size_t bytes = sizeof(dns_qpsnap_t) + sizeof(dns_qpbase_t) + 1392 sizeof(qpw->base->ptr[0]) * qpw->chunk_max; 1393 dns_qpsnap_t *qps = isc_mem_allocate(qpw->mctx, bytes); 1394 1395 qps->whence = reader_open(multi, qps); 1396 INSIST(qps->whence == multi); 1397 1398 /* not a separate allocation */ 1399 qps->base = (dns_qpbase_t *)(qps + 1); 1400 isc_refcount_init(&qps->base->refcount, 0); 1401 1402 /* 1403 * only copy base pointers of chunks we need, so we can 1404 * reclaim unused memory in dns_qpsnap_destroy() 1405 */ 1406 qps->chunk_max = 
qpw->chunk_max; 1407 for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) { 1408 if (qpw->usage[chunk].exists && chunk_usage(qpw, chunk) > 0) { 1409 qpw->usage[chunk].snapshot = true; 1410 qps->base->ptr[chunk] = qpw->base->ptr[chunk]; 1411 } else { 1412 qps->base->ptr[chunk] = NULL; 1413 } 1414 } 1415 ISC_LIST_INITANDAPPEND(multi->snapshots, qps, link); 1416 1417 *qpsp = qps; 1418 UNLOCK(&multi->mutex); 1419 1420 rcu_read_unlock(); 1421 } 1422 1423 void 1424 dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) { 1425 REQUIRE(QPMULTI_VALID(multi)); 1426 REQUIRE(qpsp != NULL && *qpsp != NULL); 1427 1428 LOCK(&multi->mutex); 1429 1430 dns_qpsnap_t *qp = *qpsp; 1431 1432 /* make sure the API is being used correctly */ 1433 REQUIRE(qp->whence == multi); 1434 1435 ISC_LIST_UNLINK(multi->snapshots, qp, link); 1436 1437 /* 1438 * eagerly reclaim chunks that are now unused, so that memory does 1439 * not accumulate when a trie has a lot of updates and snapshots 1440 */ 1441 marksweep_chunks(multi); 1442 1443 isc_mem_free(multi->writer.mctx, qp); 1444 1445 *qpsp = NULL; 1446 UNLOCK(&multi->mutex); 1447 } 1448 1449 /*********************************************************************** 1450 * 1451 * constructors, destructors 1452 */ 1453 1454 void 1455 dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, 1456 dns_qp_t **qptp) { 1457 REQUIRE(qptp != NULL && *qptp == NULL); 1458 1459 dns_qp_t *qp = isc_mem_get(mctx, sizeof(*qp)); 1460 QP_INIT(qp, methods, uctx); 1461 isc_mem_attach(mctx, &qp->mctx); 1462 alloc_reset(qp); 1463 TRACE(""); 1464 *qptp = qp; 1465 } 1466 1467 void 1468 dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, 1469 dns_qpmulti_t **qpmp) { 1470 REQUIRE(qpmp != NULL && *qpmp == NULL); 1471 1472 dns_qpmulti_t *multi = isc_mem_get(mctx, sizeof(*multi)); 1473 *multi = (dns_qpmulti_t){ 1474 .magic = QPMULTI_MAGIC, 1475 .reader_ref = INVALID_REF, 1476 }; 1477 isc_mutex_init(&multi->mutex); 1478 ISC_LIST_INIT(multi->snapshots); 1479 /* 1480 * Do not waste effort allocating a bump chunk that will be thrown 1481 * away when a transaction is opened. 
dns_qpmulti_update() always 1482 * allocates; to ensure dns_qpmulti_write() does too, pretend the 1483 * previous transaction was an update 1484 */ 1485 dns_qp_t *qp = &multi->writer; 1486 QP_INIT(qp, methods, uctx); 1487 isc_mem_attach(mctx, &qp->mctx); 1488 qp->transaction_mode = QP_UPDATE; 1489 TRACE(""); 1490 *qpmp = multi; 1491 } 1492 1493 static void 1494 destroy_guts(dns_qp_t *qp) { 1495 if (qp->chunk_max == 0) { 1496 return; 1497 } 1498 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 1499 if (qp->base->ptr[chunk] != NULL) { 1500 chunk_free(qp, chunk); 1501 } 1502 } 1503 ENSURE(qp->used_count == 0); 1504 ENSURE(qp->free_count == 0); 1505 ENSURE(isc_refcount_current(&qp->base->refcount) == 1); 1506 isc_mem_free(qp->mctx, qp->base); 1507 isc_mem_free(qp->mctx, qp->usage); 1508 qp->magic = 0; 1509 } 1510 1511 void 1512 dns_qp_destroy(dns_qp_t **qptp) { 1513 REQUIRE(qptp != NULL); 1514 REQUIRE(QP_VALID(*qptp)); 1515 1516 dns_qp_t *qp = *qptp; 1517 *qptp = NULL; 1518 1519 /* do not try to destroy part of a dns_qpmulti_t */ 1520 REQUIRE(qp->transaction_mode == QP_NONE); 1521 1522 TRACE(""); 1523 destroy_guts(qp); 1524 isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp)); 1525 } 1526 1527 static void 1528 qpmulti_destroy_cb(struct rcu_head *arg) { 1529 qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head); 1530 REQUIRE(QPRCU_VALID(rcuctx)); 1531 /* only nonzero for reclaim_chunks_cb() */ 1532 REQUIRE(rcuctx->count == 0); 1533 1534 dns_qpmulti_t *multi = rcuctx->multi; 1535 REQUIRE(QPMULTI_VALID(multi)); 1536 1537 /* reassure thread sanitizer */ 1538 LOCK(&multi->mutex); 1539 1540 dns_qp_t *qp = &multi->writer; 1541 REQUIRE(QP_VALID(qp)); 1542 1543 destroy_guts(qp); 1544 1545 UNLOCK(&multi->mutex); 1546 1547 isc_mutex_destroy(&multi->mutex); 1548 isc_mem_putanddetach(&rcuctx->mctx, rcuctx, 1549 STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); 1550 isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi)); 1551 } 1552 1553 void 1554 dns_qpmulti_destroy(dns_qpmulti_t **qpmp) { 1555 dns_qp_t *qp = NULL; 1556 dns_qpmulti_t *multi = NULL; 1557 qp_rcuctx_t *rcuctx = NULL; 1558 1559 REQUIRE(qpmp != NULL); 1560 REQUIRE(QPMULTI_VALID(*qpmp)); 1561 1562 multi = *qpmp; 1563 qp = &multi->writer; 1564 *qpmp = NULL; 1565 1566 REQUIRE(QP_VALID(qp)); 1567 REQUIRE(multi->rollback == NULL); 1568 REQUIRE(ISC_LIST_EMPTY(multi->snapshots)); 1569 1570 rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0)); 1571 *rcuctx = (qp_rcuctx_t){ 1572 .magic = QPRCU_MAGIC, 1573 .multi = multi, 1574 }; 1575 isc_mem_attach(qp->mctx, &rcuctx->mctx); 1576 call_rcu(&rcuctx->rcu_head, qpmulti_destroy_cb); 1577 } 1578 1579 /*********************************************************************** 1580 * 1581 * modification 1582 */ 1583 1584 isc_result_t 1585 dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) { 1586 dns_qpref_t new_ref, old_ref; 1587 dns_qpnode_t new_leaf, old_node; 1588 dns_qpnode_t *new_twigs = NULL, *old_twigs = NULL; 1589 dns_qpshift_t new_bit, old_bit; 1590 dns_qpweight_t old_size, new_size; 1591 dns_qpkey_t new_key, old_key; 1592 size_t new_keylen, old_keylen; 1593 size_t offset; 1594 uint64_t index; 1595 dns_qpshift_t bit; 1596 dns_qpweight_t pos; 1597 dns_qpnode_t *n = NULL; 1598 1599 REQUIRE(QP_VALID(qp)); 1600 1601 new_leaf = make_leaf(pval, ival); 1602 new_keylen = leaf_qpkey(qp, &new_leaf, new_key); 1603 1604 /* first leaf in an empty trie? 
*/ 1605 if (qp->leaf_count == 0) { 1606 new_ref = alloc_twigs(qp, 1); 1607 new_twigs = ref_ptr(qp, new_ref); 1608 *new_twigs = new_leaf; 1609 attach_leaf(qp, new_twigs); 1610 qp->leaf_count++; 1611 qp->root_ref = new_ref; 1612 return ISC_R_SUCCESS; 1613 } 1614 1615 /* 1616 * We need to keep searching down to a leaf even if our key is 1617 * missing from this branch. It doesn't matter which twig we 1618 * choose since the keys are all the same up to this node's 1619 * offset. Note that if we simply use branch_twig_pos(n, bit) 1620 * we may get an out-of-bounds access if our bit is greater 1621 * than all the set bits in the node. 1622 */ 1623 n = ref_ptr(qp, qp->root_ref); 1624 while (is_branch(n)) { 1625 prefetch_twigs(qp, n); 1626 dns_qpref_t ref = branch_twigs_ref(n); 1627 bit = branch_keybit(n, new_key, new_keylen); 1628 pos = branch_has_twig(n, bit) ? branch_twig_pos(n, bit) : 0; 1629 n = ref_ptr(qp, ref + pos); 1630 } 1631 1632 /* do the keys differ, and if so, where? */ 1633 old_keylen = leaf_qpkey(qp, n, old_key); 1634 offset = qpkey_compare(new_key, new_keylen, old_key, old_keylen); 1635 if (offset == QPKEY_EQUAL) { 1636 return ISC_R_EXISTS; 1637 } 1638 new_bit = qpkey_bit(new_key, new_keylen, offset); 1639 old_bit = qpkey_bit(old_key, old_keylen, offset); 1640 1641 /* find where to insert a branch or grow an existing branch. */ 1642 n = make_root_mutable(qp); 1643 while (is_branch(n)) { 1644 prefetch_twigs(qp, n); 1645 if (offset < branch_key_offset(n)) { 1646 goto newbranch; 1647 } 1648 if (offset == branch_key_offset(n)) { 1649 goto growbranch; 1650 } 1651 make_twigs_mutable(qp, n); 1652 bit = branch_keybit(n, new_key, new_keylen); 1653 INSIST(branch_has_twig(n, bit)); 1654 n = branch_twig_ptr(qp, n, bit); 1655 } 1656 /* fall through */ 1657 1658 newbranch: 1659 new_ref = alloc_twigs(qp, 2); 1660 new_twigs = ref_ptr(qp, new_ref); 1661 1662 /* save before overwriting. 
*/ 1663 old_node = *n; 1664 1665 /* new branch node takes old node's place */ 1666 index = BRANCH_TAG | (1ULL << new_bit) | (1ULL << old_bit) | 1667 ((uint64_t)offset << SHIFT_OFFSET); 1668 *n = make_node(index, new_ref); 1669 1670 /* populate twigs */ 1671 new_twigs[old_bit > new_bit] = old_node; 1672 new_twigs[new_bit > old_bit] = new_leaf; 1673 1674 attach_leaf(qp, &new_leaf); 1675 qp->leaf_count++; 1676 1677 return ISC_R_SUCCESS; 1678 1679 growbranch: 1680 INSIST(!branch_has_twig(n, new_bit)); 1681 1682 /* locate twigs vectors */ 1683 old_size = branch_twigs_size(n); 1684 new_size = old_size + 1; 1685 old_ref = branch_twigs_ref(n); 1686 new_ref = alloc_twigs(qp, new_size); 1687 old_twigs = ref_ptr(qp, old_ref); 1688 new_twigs = ref_ptr(qp, new_ref); 1689 1690 /* embiggen branch node */ 1691 index = branch_index(n) | (1ULL << new_bit); 1692 *n = make_node(index, new_ref); 1693 1694 /* embiggen twigs vector */ 1695 pos = branch_twig_pos(n, new_bit); 1696 move_twigs(new_twigs, old_twigs, pos); 1697 new_twigs[pos] = new_leaf; 1698 move_twigs(new_twigs + pos + 1, old_twigs + pos, old_size - pos); 1699 1700 if (squash_twigs(qp, old_ref, old_size)) { 1701 /* old twigs destroyed, only attach to new leaf */ 1702 attach_leaf(qp, &new_leaf); 1703 } else { 1704 /* old twigs duplicated, attach to all leaves */ 1705 attach_twigs(qp, new_twigs, new_size); 1706 } 1707 qp->leaf_count++; 1708 1709 return ISC_R_SUCCESS; 1710 } 1711 1712 isc_result_t 1713 dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key, 1714 size_t search_keylen, void **pval_r, uint32_t *ival_r) { 1715 REQUIRE(QP_VALID(qp)); 1716 REQUIRE(search_keylen < sizeof(dns_qpkey_t)); 1717 1718 if (get_root(qp) == NULL) { 1719 return ISC_R_NOTFOUND; 1720 } 1721 1722 dns_qpshift_t bit = 0; /* suppress warning */ 1723 dns_qpnode_t *parent = NULL; 1724 dns_qpnode_t *n = make_root_mutable(qp); 1725 while (is_branch(n)) { 1726 prefetch_twigs(qp, n); 1727 bit = branch_keybit(n, search_key, search_keylen); 1728 if (!branch_has_twig(n, bit)) { 1729 return ISC_R_NOTFOUND; 1730 } 1731 make_twigs_mutable(qp, n); 1732 parent = n; 1733 n = branch_twig_ptr(qp, n, bit); 1734 } 1735 1736 dns_qpkey_t found_key; 1737 size_t found_keylen = leaf_qpkey(qp, n, found_key); 1738 if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) != 1739 QPKEY_EQUAL) 1740 { 1741 return ISC_R_NOTFOUND; 1742 } 1743 1744 SET_IF_NOT_NULL(pval_r, leaf_pval(n)); 1745 SET_IF_NOT_NULL(ival_r, leaf_ival(n)); 1746 detach_leaf(qp, n); 1747 qp->leaf_count--; 1748 1749 /* trie becomes empty */ 1750 if (qp->leaf_count == 0) { 1751 INSIST(parent == NULL); 1752 INSIST(n == get_root(qp)); 1753 free_twigs(qp, qp->root_ref, 1); 1754 qp->root_ref = INVALID_REF; 1755 return ISC_R_SUCCESS; 1756 } 1757 1758 /* step back to parent node */ 1759 n = parent; 1760 parent = NULL; 1761 1762 INSIST(bit != 0); 1763 dns_qpweight_t size = branch_twigs_size(n); 1764 dns_qpweight_t pos = branch_twig_pos(n, bit); 1765 dns_qpref_t ref = branch_twigs_ref(n); 1766 dns_qpnode_t *twigs = ref_ptr(qp, ref); 1767 1768 if (size == 2) { 1769 /* 1770 * move the other twig to the parent branch. 
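 * (size is 2 here, so pos is either 0 or 1 and twigs[!pos] is the
 * surviving sibling twig.)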
1771 */ 1772 *n = twigs[!pos]; 1773 squash_twigs(qp, ref, 2); 1774 } else { 1775 /* 1776 * shrink the twigs in place, to avoid using the bump 1777 * chunk too fast - the gc will clean up after us 1778 */ 1779 *n = make_node(branch_index(n) & ~(1ULL << bit), ref); 1780 move_twigs(twigs + pos, twigs + pos + 1, size - pos - 1); 1781 squash_twigs(qp, ref + size - 1, 1); 1782 } 1783 1784 return ISC_R_SUCCESS; 1785 } 1786 1787 isc_result_t 1788 dns_qp_deletename(dns_qp_t *qp, const dns_name_t *name, void **pval_r, 1789 uint32_t *ival_r) { 1790 dns_qpkey_t key; 1791 size_t keylen = dns_qpkey_fromname(key, name); 1792 return dns_qp_deletekey(qp, key, keylen, pval_r, ival_r); 1793 } 1794 1795 /*********************************************************************** 1796 * chains 1797 */ 1798 static void 1799 maybe_set_name(dns_qpreader_t *qp, dns_qpnode_t *node, dns_name_t *name) { 1800 dns_qpkey_t key; 1801 size_t len; 1802 1803 if (name == NULL) { 1804 return; 1805 } 1806 1807 dns_name_reset(name); 1808 len = leaf_qpkey(qp, node, key); 1809 dns_qpkey_toname(key, len, name); 1810 } 1811 1812 void 1813 dns_qpchain_init(dns_qpreadable_t qpr, dns_qpchain_t *chain) { 1814 dns_qpreader_t *qp = dns_qpreader(qpr); 1815 REQUIRE(QP_VALID(qp)); 1816 REQUIRE(chain != NULL); 1817 1818 *chain = (dns_qpchain_t){ 1819 .magic = QPCHAIN_MAGIC, 1820 .qp = qp, 1821 }; 1822 } 1823 1824 unsigned int 1825 dns_qpchain_length(dns_qpchain_t *chain) { 1826 REQUIRE(QPCHAIN_VALID(chain)); 1827 1828 return chain->len; 1829 } 1830 1831 void 1832 dns_qpchain_node(dns_qpchain_t *chain, unsigned int level, dns_name_t *name, 1833 void **pval_r, uint32_t *ival_r) { 1834 dns_qpnode_t *node = NULL; 1835 1836 REQUIRE(QPCHAIN_VALID(chain)); 1837 REQUIRE(level < chain->len); 1838 1839 node = chain->chain[level].node; 1840 maybe_set_name(chain->qp, node, name); 1841 SET_IF_NOT_NULL(pval_r, leaf_pval(node)); 1842 SET_IF_NOT_NULL(ival_r, leaf_ival(node)); 1843 } 1844 1845 /*********************************************************************** 1846 * iterators 1847 */ 1848 1849 void 1850 dns_qpiter_init(dns_qpreadable_t qpr, dns_qpiter_t *qpi) { 1851 dns_qpreader_t *qp = dns_qpreader(qpr); 1852 REQUIRE(QP_VALID(qp)); 1853 REQUIRE(qpi != NULL); 1854 *qpi = (dns_qpiter_t){ 1855 .qp = qp, 1856 .magic = QPITER_MAGIC, 1857 }; 1858 } 1859 1860 /* 1861 * are we at the last twig in this branch (in whichever direction 1862 * we're iterating)? 1863 */ 1864 static bool 1865 last_twig(dns_qpiter_t *qpi, bool forward) { 1866 dns_qpweight_t pos = 0, max = 0; 1867 if (qpi->sp > 0) { 1868 dns_qpnode_t *child = qpi->stack[qpi->sp]; 1869 dns_qpnode_t *parent = qpi->stack[qpi->sp - 1]; 1870 pos = child - ref_ptr(qpi->qp, branch_twigs_ref(parent)); 1871 if (forward) { 1872 max = branch_twigs_size(parent) - 1; 1873 } 1874 } 1875 return pos == max; 1876 } 1877 1878 /* 1879 * move a QP iterator forward or back to the next or previous leaf. 
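 *
 * Typical forward iteration looks roughly like this (sketch):
 *
 *	dns_qpiter_t it;
 *	void *pval = NULL;
 *	uint32_t ival;
 *	dns_qpiter_init(qp, &it);
 *	while (dns_qpiter_next(&it, NULL, &pval, &ival) == ISC_R_SUCCESS) {
 *		... each leaf is visited in key order ...
 *	}
 *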

/***********************************************************************
 * iterators
 */

void
dns_qpiter_init(dns_qpreadable_t qpr, dns_qpiter_t *qpi) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	REQUIRE(QP_VALID(qp));
	REQUIRE(qpi != NULL);
	*qpi = (dns_qpiter_t){
		.qp = qp,
		.magic = QPITER_MAGIC,
	};
}
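
/*
 * Typical iteration, as an illustrative sketch (error handling and the
 * source of `qp` are assumed to be supplied by the caller):
 *
 *	dns_qpiter_t it;
 *	void *pval = NULL;
 *	dns_qpiter_init(qp, &it);
 *	while (dns_qpiter_next(&it, NULL, &pval, NULL) == ISC_R_SUCCESS) {
 *		... visit pval; leaves are visited in key order ...
 *	}
 *
 * When iteration runs off the end, iterate() below returns ISC_R_NOMORE
 * and reinitializes the iterator, so the loop can simply be restarted.
 */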

/*
 * are we at the last twig in this branch (in whichever direction
 * we're iterating)?
 */
static bool
last_twig(dns_qpiter_t *qpi, bool forward) {
	dns_qpweight_t pos = 0, max = 0;
	if (qpi->sp > 0) {
		dns_qpnode_t *child = qpi->stack[qpi->sp];
		dns_qpnode_t *parent = qpi->stack[qpi->sp - 1];
		pos = child - ref_ptr(qpi->qp, branch_twigs_ref(parent));
		if (forward) {
			max = branch_twigs_size(parent) - 1;
		}
	}
	return pos == max;
}

/*
 * move a QP iterator forward or back to the next or previous leaf.
 * note: this function can go wrong when the iterator refers to
 * a mutable view of the trie which is altered while iterating
 */
static isc_result_t
iterate(bool forward, dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
	uint32_t *ival_r) {
	dns_qpnode_t *node = NULL;
	bool initial_branch = true;

	REQUIRE(QPITER_VALID(qpi));

	dns_qpreader_t *qp = qpi->qp;

	REQUIRE(QP_VALID(qp));

	node = get_root(qp);
	if (node == NULL) {
		return ISC_R_NOMORE;
	}

	do {
		if (qpi->stack[qpi->sp] == NULL) {
			/* newly initialized iterator: use the root node */
			INSIST(qpi->sp == 0);
			qpi->stack[0] = node;
		} else if (!initial_branch) {
			/*
			 * in a prior loop, we reached a branch; from
			 * here we just need to get the highest or lowest
			 * leaf in the subtree; we don't need to bother
			 * stepping forward or backward through twigs
			 * anymore.
			 */
			INSIST(qpi->sp > 0);
		} else if (last_twig(qpi, forward)) {
			/*
			 * we've stepped to the end (or the beginning,
			 * if we're iterating backwards) of a set of twigs.
			 */
			if (qpi->sp == 0) {
				/*
				 * we've finished iterating. reinitialize
				 * the iterator, then return ISC_R_NOMORE.
				 */
				dns_qpiter_init(qpi->qp, qpi);
				return ISC_R_NOMORE;
			}

			/*
			 * pop the stack, and resume at the parent branch.
			 */
			qpi->stack[qpi->sp] = NULL;
			qpi->sp--;
			continue;
		} else {
			/*
			 * there are more twigs in the current branch,
			 * so step the node pointer forward (or back).
			 */
			qpi->stack[qpi->sp] += (forward ? 1 : -1);
			node = qpi->stack[qpi->sp];
		}

		/*
		 * if we're at a branch now, we loop down to the
		 * left- or rightmost leaf.
		 */
		if (is_branch(node)) {
			qpi->sp++;
			INSIST(qpi->sp < DNS_QP_MAXKEY);
			node = ref_ptr(qp, branch_twigs_ref(node)) +
			       (forward ? 0 : branch_twigs_size(node) - 1);
			qpi->stack[qpi->sp] = node;
			initial_branch = false;
		}
	} while (is_branch(node));

	/* we're at a leaf: return its data to the caller */
	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
	maybe_set_name(qpi->qp, node, name);
	return ISC_R_SUCCESS;
}

isc_result_t
dns_qpiter_next(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
		uint32_t *ival_r) {
	return iterate(true, qpi, name, pval_r, ival_r);
}

isc_result_t
dns_qpiter_prev(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
		uint32_t *ival_r) {
	return iterate(false, qpi, name, pval_r, ival_r);
}

isc_result_t
dns_qpiter_current(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
		   uint32_t *ival_r) {
	dns_qpnode_t *node = NULL;

	REQUIRE(QPITER_VALID(qpi));

	node = qpi->stack[qpi->sp];
	if (node == NULL || is_branch(node)) {
		return ISC_R_FAILURE;
	}

	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
	maybe_set_name(qpi->qp, node, name);
	return ISC_R_SUCCESS;
}

/***********************************************************************
 *
 * search
 */

isc_result_t
dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key,
	      size_t search_keylen, void **pval_r, uint32_t *ival_r) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	dns_qpkey_t found_key;
	size_t found_keylen;
	dns_qpshift_t bit;
	dns_qpnode_t *n = NULL;

	REQUIRE(QP_VALID(qp));
	REQUIRE(search_keylen < sizeof(dns_qpkey_t));

	n = get_root(qp);
	if (n == NULL) {
		return ISC_R_NOTFOUND;
	}

	while (is_branch(n)) {
		prefetch_twigs(qp, n);
		bit = branch_keybit(n, search_key, search_keylen);
		if (!branch_has_twig(n, bit)) {
			return ISC_R_NOTFOUND;
		}
		n = branch_twig_ptr(qp, n, bit);
	}

	found_keylen = leaf_qpkey(qp, n, found_key);
	if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) !=
	    QPKEY_EQUAL)
	{
		return ISC_R_NOTFOUND;
	}

	SET_IF_NOT_NULL(pval_r, leaf_pval(n));
	SET_IF_NOT_NULL(ival_r, leaf_ival(n));
	return ISC_R_SUCCESS;
}

isc_result_t
dns_qp_getname(dns_qpreadable_t qpr, const dns_name_t *name, void **pval_r,
	       uint32_t *ival_r) {
	dns_qpkey_t key;
	size_t keylen = dns_qpkey_fromname(key, name);
	return dns_qp_getkey(qpr, key, keylen, pval_r, ival_r);
}
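
/*
 * Exact-match lookup, as an illustrative sketch (the trie handle and the
 * name are assumed to exist already):
 *
 *	void *pval = NULL;
 *	uint32_t ival = 0;
 *	isc_result_t result = dns_qp_getname(qp, name, &pval, &ival);
 *
 * The result is ISC_R_SUCCESS only when the name itself is present;
 * ancestors and other partial matches are reported by dns_qp_lookup()
 * below, not here.
 */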

static inline void
add_link(dns_qpchain_t *chain, dns_qpnode_t *node, size_t offset) {
	/* prevent duplication */
	if (chain->len != 0 && chain->chain[chain->len - 1].node == node) {
		return;
	}
	chain->chain[chain->len].node = node;
	chain->chain[chain->len].offset = offset;
	chain->len++;
	INSIST(chain->len <= DNS_NAME_MAXLABELS);
}

static inline void
prevleaf(dns_qpiter_t *it) {
	isc_result_t result = dns_qpiter_prev(it, NULL, NULL, NULL);
	if (result == ISC_R_NOMORE) {
		/* we stepped past the first leaf: wrap around to the last */
		result = dns_qpiter_prev(it, NULL, NULL, NULL);
	}
	RUNTIME_CHECK(result == ISC_R_SUCCESS);
}

static inline void
greatest_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpiter_t *iter) {
	while (is_branch(n)) {
		dns_qpref_t ref = branch_twigs_ref(n) + branch_twigs_size(n) -
				  1;
		iter->stack[++iter->sp] = n;
		n = ref_ptr(qpr, ref);
	}
	iter->stack[++iter->sp] = n;
}

static inline dns_qpnode_t *
anyleaf(dns_qpreader_t *qp, dns_qpnode_t *n) {
	while (is_branch(n)) {
		n = branch_twigs(qp, n);
	}
	return n;
}
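
/*
 * Work out which twig of branch `n` the predecessor of the search key
 * lies under, based on the twig bit wanted at this branch (sbit) and the
 * bits at the point where the search key (kbit) and the found key (fbit)
 * diverge. A negative result means that every leaf under this branch
 * sorts after the search key, in which case fix_iterator() below steps
 * back to the previous leaf instead.
 */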

static inline int
twig_offset(dns_qpnode_t *n, dns_qpshift_t sbit, dns_qpshift_t kbit,
	    dns_qpshift_t fbit) {
	dns_qpweight_t pos = branch_twig_pos(n, sbit);
	if (branch_has_twig(n, sbit)) {
		return pos - (kbit < fbit);
	}
	return pos - 1;
}

/*
 * If dns_qp_lookup() was passed an iterator, we want it to point at the
 * matching name in the case of an exact match, or at the predecessor name
 * for a non-exact match.
 *
 * If there is an exact match, then there is nothing to be done. Otherwise,
 * we pop up the iterator stack until we find a parent branch with an offset
 * that is before the position where the search key differs from the found
 * key. From there we can step to the leaf that is the predecessor of the
 * searched name.
 *
 * Requires the iterator to be pointing at a leaf node.
 */
static void
fix_iterator(dns_qpreader_t *qp, dns_qpiter_t *it, dns_qpkey_t key,
	     size_t len) {
	dns_qpnode_t *n = it->stack[it->sp];

	REQUIRE(!is_branch(n));

	dns_qpkey_t found;
	size_t foundlen = leaf_qpkey(qp, n, found);
	size_t to = qpkey_compare(key, len, found, foundlen);

	/* If the keys are equal, the iterator is already at the right node. */
	if (to == QPKEY_EQUAL) {
		return;
	}

	/*
	 * Special case: if the key differs even before the root
	 * key offset, it means the name desired either precedes or
	 * follows the entire range of names in the database, and
	 * popping up the stack won't help us, so just move the
	 * iterator one step back from the origin and return.
	 */
	if (to < branch_key_offset(it->stack[0])) {
		dns_qpiter_init(qp, it);
		prevleaf(it);
		return;
	}

	/*
	 * As long as the branch offset point is after the point where the
	 * key differs, we need to branch up and find a better node.
	 */
	while (it->sp > 0) {
		dns_qpnode_t *b = it->stack[it->sp - 1];
		if (branch_key_offset(b) < to) {
			break;
		}
		it->sp--;
	}
	n = it->stack[it->sp];

	/*
	 * Either we are now at the correct branch, or we are at the
	 * first unmatched node. Determine the bit position for the
	 * twig we need (sbit).
	 */
	dns_qpshift_t kbit = qpkey_bit(key, len, to);
	dns_qpshift_t fbit = qpkey_bit(found, foundlen, to);
	dns_qpshift_t sbit = 0;

	if (is_branch(n) && branch_key_offset(n) == to) {
		/* We are on the correct branch now. */
		sbit = kbit;
	} else if (it->sp == 0) {
		/*
		 * We are on the root branch, popping up the stack won't
		 * help us, so just move the iterator one step back from the
		 * origin and return.
		 */
		dns_qpiter_init(qp, it);
		prevleaf(it);
		return;
	} else {
		/* We are at the first unmatched node, pop up the stack. */
		n = it->stack[--it->sp];
		sbit = qpkey_bit(key, len, branch_key_offset(n));
	}

	INSIST(is_branch(n));

	prefetch_twigs(qp, n);
	dns_qpnode_t *twigs = branch_twigs(qp, n);
	int toff = twig_offset(n, sbit, kbit, fbit);
	if (toff >= 0) {
		/*
		 * The name we want would've been after some twig in
		 * this branch. Walk down from that twig to the
		 * highest leaf in its subtree to get the predecessor.
		 */
		greatest_leaf(qp, twigs + toff, it);
	} else {
		/*
		 * Every leaf below this node is greater than the one we
		 * wanted, so the previous leaf is the predecessor.
		 */
		prevleaf(it);
	}
}

/*
 * When searching for a requested name in dns_qp_lookup(), we might add
 * a leaf node to the chain, then subsequently determine that it was a
 * dead end. When this happens, the chain can be left holding a node
 * that is *not* an ancestor of the requested name. We correct for that
 * here.
 */
static void
fix_chain(dns_qpchain_t *chain, size_t offset) {
	while (chain->len > 0 && chain->chain[chain->len - 1].offset >= offset)
	{
		chain->len--;
		chain->chain[chain->len].node = NULL;
		chain->chain[chain->len].offset = 0;
	}
}
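
/*
 * A worked example of the predecessor and partial-match behaviour
 * described above (the names are illustrative, not taken from any real
 * trie): if the trie holds only "example.com" and "www.example.com",
 * then looking up "mail.example.com" returns DNS_R_PARTIALMATCH,
 * reports the "example.com" leaf through foundname/pval_r/ival_r and
 * the chain, and leaves a caller-supplied iterator on "example.com",
 * which is the predecessor of the requested name in key order.
 */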

isc_result_t
dns_qp_lookup(dns_qpreadable_t qpr, const dns_name_t *name,
	      dns_name_t *foundname, dns_qpiter_t *iter, dns_qpchain_t *chain,
	      void **pval_r, uint32_t *ival_r) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	dns_qpkey_t search, found;
	size_t searchlen, foundlen;
	size_t offset = 0;
	dns_qpnode_t *n = NULL;
	dns_qpshift_t bit = SHIFT_NOBYTE;
	dns_qpchain_t oc;
	dns_qpiter_t it;
	bool matched = false;
	bool setiter = true;

	REQUIRE(QP_VALID(qp));
	REQUIRE(foundname == NULL || ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));

	searchlen = dns_qpkey_fromname(search, name);

	if (chain == NULL) {
		chain = &oc;
	}
	if (iter == NULL) {
		iter = &it;
		setiter = false;
	}
	dns_qpchain_init(qp, chain);
	dns_qpiter_init(qp, iter);

	n = get_root(qp);
	if (n == NULL) {
		return ISC_R_NOTFOUND;
	}
	iter->stack[0] = n;

	/*
	 * Like `dns_qp_insert()`, we must find a leaf. However, we don't
	 * make a second pass: instead, we keep track of any leaves with
	 * shorter keys that we discover along the way. (In general, qp-trie
	 * searches can be one-pass, by recording their traversal, or
	 * two-pass, for less stack memory usage.)
	 */
	while (is_branch(n)) {
		prefetch_twigs(qp, n);

		offset = branch_key_offset(n);
		bit = qpkey_bit(search, searchlen, offset);
		dns_qpnode_t *twigs = branch_twigs(qp, n);

		/*
		 * A shorter key that can be a parent domain always has a
		 * leaf node at SHIFT_NOBYTE (indicating end of its key)
		 * where our search key has a normal character immediately
		 * after a label separator.
		 *
		 * Note 1: It is OK if `offset - 1` underflows: it will
		 * become SIZE_MAX, which is greater than `searchlen`, so
		 * `qpkey_bit()` will return SHIFT_NOBYTE, which is what we
		 * want when `offset == 0`.
		 *
		 * Note 2: If a SHIFT_NOBYTE twig is present, it will always
		 * be in position 0, the first location in 'twigs'.
		 */
		if (bit != SHIFT_NOBYTE && branch_has_twig(n, SHIFT_NOBYTE) &&
		    qpkey_bit(search, searchlen, offset - 1) == SHIFT_NOBYTE &&
		    !is_branch(twigs))
		{
			add_link(chain, twigs, offset);
		}

		matched = branch_has_twig(n, bit);
		if (matched) {
			/*
			 * found a match: if it's a branch, we keep
			 * searching, and if it's a leaf, we drop out of
			 * the loop.
			 */
			n = branch_twig_ptr(qp, n, bit);
		} else {
			/*
			 * this branch is a dead end, and the predecessor
			 * doesn't matter. now we just need to find a leaf
			 * to end on so that leaf_qpkey() will work below.
			 */
			n = anyleaf(qp, twigs);
		}

		iter->stack[++iter->sp] = n;
	}

	if (setiter) {
		/*
		 * we found a leaf, but it might not be the leaf we wanted.
		 * if it isn't, and if the caller passed us an iterator,
		 * then we might need to reposition it.
		 */
		fix_iterator(qp, iter, search, searchlen);
		n = iter->stack[iter->sp];
	}

	/* at this point, n can only be a leaf node */
	INSIST(!is_branch(n));

	foundlen = leaf_qpkey(qp, n, found);
	offset = qpkey_compare(search, searchlen, found, foundlen);

	/* the search ended with an exact or partial match */
	if (offset == QPKEY_EQUAL || offset == foundlen) {
		isc_result_t result = ISC_R_SUCCESS;

		if (offset == foundlen) {
			fix_chain(chain, offset);
			result = DNS_R_PARTIALMATCH;
		}
		add_link(chain, n, offset);

		SET_IF_NOT_NULL(pval_r, leaf_pval(n));
		SET_IF_NOT_NULL(ival_r, leaf_ival(n));
		maybe_set_name(qp, n, foundname);
		return result;
	}

	/*
	 * the requested name was not found, but if an ancestor
	 * was, we can retrieve that from the chain.
	 */
	int len = chain->len;
	while (len-- > 0) {
		if (offset >= chain->chain[len].offset) {
			n = chain->chain[len].node;
			SET_IF_NOT_NULL(pval_r, leaf_pval(n));
			SET_IF_NOT_NULL(ival_r, leaf_ival(n));
			maybe_set_name(qp, n, foundname);
			return DNS_R_PARTIALMATCH;
		} else {
			/*
			 * oops, during the search we found and added
			 * a leaf that's longer than the requested
			 * name; remove it from the chain.
			 */
			chain->len--;
		}
	}

	/* nothing was found at all */
	return ISC_R_NOTFOUND;
}

/**********************************************************************/