/*	$NetBSD: qp_p.h,v 1.2 2025/01/26 16:25:24 christos Exp $	*/

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

/*
 * For an overview, see doc/design/qp-trie.md
 *
 * This private header defines the internal data structures,
 */

#pragma once

/***********************************************************************
 *
 *  interior node basics
 */

/*
 * A qp-trie node is almost always one of two types: branch or leaf.
 * (A third type is used only to anchor the root of a trie; see below.)
 *
 * A node contains a 64-bit word and a 32-bit word. In order to avoid
 * unwanted padding, they are declared as three 32-bit words; this keeps
 * the size down to 12 bytes. They are in native endian order, so getting
 * the 64-bit part should compile down to an unaligned load.
 *
 * The node type is identified by the least significant bits of the 64-bit
 * word.
 *
 * In a leaf node:
 * - The 64-bit word is used to store a pointer value. (Pointers must be
 *   word-aligned so the least significant bits are zero; those bits can
 *   then act as a node tag to indicate that this is a leaf. This
 *   requirement is enforced by the make_leaf() constructor.)
 * - The 32-bit word is used to store an integer value. Both the
 *   pointer and integer values can be retrieved when looking up a key.
 *
 * In a branch node:
 * - The 64-bit word is subdivided into three portions: the least
 *   significant bits are the node type (for a branch, 0x1); the
 *   most significant 15 bits are an offset value into the key, and
 *   the 47 bits in the middle are a bitmap; see the documentation
 *   for the SHIFT_* enum below.
 * - The 32-bit word is a reference (dns_qpref_t) to the packed sparse
 *   vector of "twigs", i.e. child nodes. A branch node has at least
 *   two and at most 47 twigs. (The qp-trie update functions ensure that
 *   branches actually branch, i.e. a branch cannot have only one child.)
 *
 * A third node type, reader nodes, anchors the root of a trie.
 * A pair of reader nodes together contain a packed `dns_qpreader_t`.
 * See the section on "packed reader nodes" for details.
 */
struct dns_qpnode {
#if WORDS_BIGENDIAN
	uint32_t bighi, biglo, small;
#else
	uint32_t biglo, bighi, small;
#endif
};

/*
 * The possible values of the node type tag. Type tags must fit in two bits
 * for compatibility with 4-byte pointer alignment on 32-bit systems.
 */
enum {
	LEAF_TAG = 0,	/* leaf node */
	BRANCH_TAG = 1, /* branch node */
	READER_TAG = 2, /* reader node */
	TAG_MASK = 3,	/* mask covering tag bits */
};

/*
 * This code does not work on CPUs with large pointers, e.g. CHERI capability
 * architectures. When porting to that kind of machine, a `dns_qpnode` should
 * be just a `uintptr_t`; a leaf node will contain a single pointer, and a
 * branch node will fit in the same space with room to spare.
 */
STATIC_ASSERT(sizeof(void *) <= sizeof(uint64_t),
	      "pointers must fit in 64 bits");

/*
 * The 64-bit word in a branch node is comprised of a node type tag, a
 * bitmap, and an offset into the key. It is called an "index word" because
 * it describes how to access the twigs vector (think "database index").
 * The following enum sets up the bit positions of these parts.
 *
 * The bitmap is just above the type tag. The `dns_qp_bits_for_byte[]` table
 * is used to fill in a key so that bit tests can work directly against the
 * index word without superfluous masking or shifting; we don't need to
 * mask out the bitmap before testing a bit, but we do need to mask the
 * bitmap before calling popcount.
 *
 * The byte offset into the key is at the top of the word, so that it
 * can be extracted with just a shift, with no masking needed.
 *
 * The names are SHIFT_thing because they are dns_qpshift_t values. (See
 * below for the various `qp_*` type declarations.)
 *
 * These values are relatively fixed in practice: SHIFT_NOBYTE needs
 * to leave space for the type tag, and the implementation of
 * `dns_qpkey_fromname()` depends on the bitmap being large enough.
 * The symbolic names avoid mystery numbers in the code.
 */
enum {
	SHIFT_NOBYTE = 2,  /* label separator has no byte value */
	SHIFT_BITMAP,	   /* many bits here */
	SHIFT_OFFSET = 49, /* offset of byte in key */
};

/***********************************************************************
 *
 *  garbage collector tuning parameters
 */

/*
 * A "cell" is a location that can contain a `dns_qpnode_t`, and a "chunk"
 * is a moderately large array of cells. A big trie can occupy
 * multiple chunks. (Unlike other nodes, a trie's root node lives in
 * its `struct dns_qp` instead of being allocated in a cell.)
 *
 * The qp-trie allocator hands out space for twigs vectors. Allocations are
 * made sequentially from one of the chunks; this kind of "sequential
 * allocator" is also known as a "bump allocator", so in `struct dns_qp`
 * (see below) the allocation chunk is called `bump`.
 */

/*
 * Number of cells in a chunk is a power of 2, which must have space for
 * a full twigs vector (48 wide). When testing, use a much smaller chunk
 * size to make the allocator work harder.
 */
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
#define QP_CHUNK_LOG 7
#else
#define QP_CHUNK_LOG 10
#endif

STATIC_ASSERT(6 <= QP_CHUNK_LOG && QP_CHUNK_LOG <= 20,
	      "qp-trie chunk size is unreasonable");

#define QP_CHUNK_SIZE  (1U << QP_CHUNK_LOG)
#define QP_CHUNK_BYTES (QP_CHUNK_SIZE * sizeof(dns_qpnode_t))

/*
 * We need a bitfield this big to count how much of a chunk is in use:
 * it needs to count from 0 up to and including `1 << QP_CHUNK_LOG`.
 */
#define QP_USAGE_BITS (QP_CHUNK_LOG + 1)

/*
 * A chunk needs to be compacted if it is less full than this threshold.
 * (12% overhead seems reasonable)
 */
#define QP_MAX_FREE (QP_CHUNK_SIZE / 8)
#define QP_MIN_USED (QP_CHUNK_SIZE - QP_MAX_FREE)

/*
 * Compact automatically when we pass this threshold: when there is a lot
 * of free space in absolute terms, and when we have freed more than half
 * of the space we allocated.
 *
 * The current compaction algorithm scans the whole trie, so it is important
 * to scale the threshold based on the size of the trie to avoid quadratic
 * behaviour. XXXFANF find an algorithm that scans less of the trie!
 *
 * During a modification transaction, when we copy-on-write some twigs we
 * count the old copy as "free", because they will be when the transaction
 * commits. But they cannot be recovered immediately so they are also
 * counted as on hold, and discounted when we decide whether to compact.
 */
#define QP_GC_HEURISTIC(qp, free) \
	((free) > QP_CHUNK_SIZE * 4 && (free) > (qp)->used_count / 2)

#define QP_NEEDGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count)
#define QP_AUTOGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count - (qp)->hold_count)

/*
 * The chunk base and usage arrays are resized geometrically and start off
 * with two entries.
 */
#define GROWTH_FACTOR(size) ((size) + (size) / 2 + 2)

/*
 * Constructors and accessors for dns_qpref_t values, defined here to show
 * how the dns_qpref_t, dns_qpchunk_t, dns_qpcell_t types relate to each other
 */

/* Combine a chunk number and a cell index into a single reference. */
static inline dns_qpref_t
make_ref(dns_qpchunk_t chunk, dns_qpcell_t cell) {
	return QP_CHUNK_SIZE * chunk + cell;
}

/* Which chunk does this reference point into? */
static inline dns_qpchunk_t
ref_chunk(dns_qpref_t ref) {
	return ref / QP_CHUNK_SIZE;
}

/* Which cell within its chunk does this reference point at? */
static inline dns_qpcell_t
ref_cell(dns_qpref_t ref) {
	return ref % QP_CHUNK_SIZE;
}

/*
 * We should not use the `root_ref` in an empty trie, so we set it
 * to a value that should trigger an obvious bug. See qp_init()
 * and get_root() below.
 */
#define INVALID_REF ((dns_qpref_t)~0UL)

/***********************************************************************
 *
 *  chunk arrays
 */

/*
 * A `dns_qp_t` contains two arrays holding information about each chunk.
 *
 * The `base` array holds pointers to the base of each chunk.
 * The `usage` array holds the allocator's state for each chunk.
 *
 * The `base` array is used by the hot qp-trie traversal paths. It can
 * be shared by multiple versions of a trie, which are tracked with a
 * refcount. Old versions of the trie can retain old versions of the
 * `base` array.
 *
 * In multithreaded code, the `usage` array is only used when the
 * `dns_qpmulti_t` mutex is held, and there is only one version of
 * it in active use (maybe with a snapshot for rollback support).
 *
 * The two arrays are separate because they have rather different
 * access patterns, different lifetimes, and different element sizes.
 */

/*
 * For most purposes we don't need to know exactly which cells are
 * in use in a chunk, we only need to know how many of them there are.
 *
 * After we have finished allocating from a chunk, the `used` counter
 * is the size we need to know for shrinking the chunk and for
 * scanning it to detach leaf values before the chunk is free()d. The
 * `free` counter tells us when the chunk needs compacting and when it
 * has become empty.
 *
 * The `exists` flag allows the chunk scanning loops to look at the
 * usage array only.
 *
 * In multithreaded code, we mark chunks as `immutable` when a modify
 * transaction is opened. (We don't mark them immutable on commit,
 * because the old bump chunk must remain mutable between write
 * transactions, but it must become immutable when an update
 * transaction is opened.)
 *
 * There are a few flags used to mark which chunks are still needed by
 * snapshots after the chunks have passed their normal reclamation
 * phase.
 */
typedef struct qp_usage {
	/*% the allocation point, increases monotonically */
	dns_qpcell_t used : QP_USAGE_BITS;
	/*% count of nodes no longer needed, also monotonic */
	dns_qpcell_t free : QP_USAGE_BITS;
	/*% qp->base->ptr[chunk] != NULL */
	bool exists : 1;
	/*% is this chunk shared? [MT] */
	bool immutable : 1;
	/*% already subtracted from multi->*_count [MT] */
	bool discounted : 1;
	/*% is a snapshot using this chunk? [MT] */
	bool snapshot : 1;
	/*% tried to free it but a snapshot needs it [MT] */
	bool snapfree : 1;
	/*% for mark/sweep snapshot flag updates [MT] */
	bool snapmark : 1;
} qp_usage_t;

/*
 * The chunks are owned by the current version of the `base` array.
 * When the array is resized, the old version might still be in use by
 * concurrent readers, in which case it is free()d later when its
 * refcount drops to zero.
 *
 * A `dns_qpbase_t` counts references from `dns_qp_t` objects and
 * from packed readers, but not from `dns_qpread_t` nor from
 * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t`
 * would wreck multicore scalability; instead we rely on RCU.
 *
 * The `usage` array determines when a chunk is no longer needed: old
 * chunk pointers in old `base` arrays are ignored. (They can become
 * dangling pointers to free memory, but they will never be
 * dereferenced.)
 *
 * We ensure that individual chunk base pointers remain immutable
 * after assignment, and they are not cleared until the chunk is
 * free()d, after all readers have departed. Slots can be reused, and
 * we allow transactions to fill or re-fill empty slots adjacent to
 * busy slots that are in use by readers.
 */
struct dns_qpbase {
	unsigned int magic;
	isc_refcount_t refcount;
	dns_qpnode_t *ptr[];
};

/*
 * Chunks that may be in use by readers are reclaimed asynchronously.
 * When a transaction commits, immutable chunks that are now empty are
 * listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`.
 */
typedef struct qp_rcuctx {
	unsigned int magic;
	struct rcu_head rcu_head;
	isc_mem_t *mctx;
	dns_qpmulti_t *multi;
	dns_qpchunk_t count;
	dns_qpchunk_t chunk[];
} qp_rcuctx_t;

/*
 * Returns true when the base array can be free()d, i.e. this call
 * dropped the last reference. (A NULL base never needs freeing.)
 */
static inline bool
qpbase_unref(dns_qpreadable_t qpr) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	return qp->base != NULL &&
	       isc_refcount_decrement(&qp->base->refcount) == 1;
}

/*
 * Now we know about `dns_qpreader_t` and `dns_qpbase_t`,
 * here's how we convert a twig reference into a pointer.
 */
static inline dns_qpnode_t *
ref_ptr(dns_qpreadable_t qpr, dns_qpref_t ref) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	return qp->base->ptr[ref_chunk(ref)] + ref_cell(ref);
}

/***********************************************************************
 *
 *  main qp-trie structures
 */

#define QP_MAGIC       ISC_MAGIC('t', 'r', 'i', 'e')
#define QPITER_MAGIC   ISC_MAGIC('q', 'p', 'i', 't')
#define QPCHAIN_MAGIC  ISC_MAGIC('q', 'p', 'c', 'h')
#define QPMULTI_MAGIC  ISC_MAGIC('q', 'p', 'm', 'v')
#define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x')
#define QPBASE_MAGIC   ISC_MAGIC('q', 'p', 'b', 'p')
#define QPRCU_MAGIC    ISC_MAGIC('q', 'p', 'c', 'b')

#define QP_VALID(qp)	  ISC_MAGIC_VALID(qp, QP_MAGIC)
#define QPITER_VALID(qp)  ISC_MAGIC_VALID(qp, QPITER_MAGIC)
#define QPCHAIN_VALID(qp) ISC_MAGIC_VALID(qp, QPCHAIN_MAGIC)
#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC)
#define QPBASE_VALID(qp)  ISC_MAGIC_VALID(qp, QPBASE_MAGIC)
#define QPRCU_VALID(qp)	  ISC_MAGIC_VALID(qp, QPRCU_MAGIC)

/*
 * Polymorphic initialization of the `dns_qpreader_t` prefix.
 *
 * The location of the root node is actually a dns_qpref_t, but is
 * declared in DNS_QPREADER_FIELDS as uint32_t to avoid leaking too
 * many internal details into the public API.
 *
 * The `uctx` and `methods` support callbacks into the user's code.
 * They are constant after initialization.
 */
#define QP_INIT(qp, m, x)                 \
	(*(qp) = (typeof(*(qp))){         \
		 .magic = QP_MAGIC,       \
		 .root_ref = INVALID_REF, \
		 .uctx = x,               \
		 .methods = m,            \
	 })

/*
 * Snapshots have some extra cleanup machinery.
 *
 * Originally, a snapshot was basically just a `dns_qpread_t`
 * allocated on the heap, with the extra behaviour that memory
 * reclamation is suppressed for a particular trie while it has any
 * snapshots.
 * However that design gets into trouble for a zone with
 * frequent updates and many zone transfers.
 *
 * Instead, each snapshot records which chunks it needs. When a
 * snapshot is created, it makes a copy of the `base` array, except
 * for chunks that are empty and waiting to be reclaimed. When a
 * snapshot is destroyed, we can traverse the list of snapshots to
 * accurately mark which chunks are still needed.
 *
 * A snapshot's `whence` pointer helps ensure that a `dns_qpsnap_t` is
 * not muddled up with the wrong `dns_qpmulti_t`.
 *
 * A trie's `base` array might have grown after the snapshot was
 * created, so it records its own `chunk_max`.
 */
struct dns_qpsnap {
	DNS_QPREADER_FIELDS;
	dns_qpmulti_t *whence;
	uint32_t chunk_max;
	ISC_LINK(struct dns_qpsnap) link;
};

/*
 * Read-write access to a qp-trie requires extra fields to support the
 * allocator and garbage collector.
 *
 * Bare instances of a `struct dns_qp` are used for stand-alone
 * single-threaded tries. For multithreaded access, a `dns_qpmulti_t`
 * wraps a `dns_qp_t` with a mutex and other fields that are only needed
 * at the start or end of a transaction.
 *
 * Allocations are made sequentially in the `bump` chunk. A sequence
 * of lightweight write transactions can use the same `bump` chunk, so
 * its prefix before `fender` is immutable, and the rest is mutable.
 *
 * To decide when to compact and reclaim space, QP_MAX_GARBAGE() examines
 * the values of `used_count`, `free_count`, and `hold_count`. The
 * `hold_count` tracks nodes that need to be retained while readers are
 * using them; they are free but cannot be reclaimed until the transaction
 * has committed, so the `hold_count` is discounted from QP_MAX_GARBAGE()
 * during a transaction.
 *
 * There are some flags that alter the behaviour of write transactions.
 *
 * - The `transaction_mode` indicates whether the current transaction is a
 *   light write or a heavy update, or (between transactions) the previous
 *   transaction's mode, because the setup for the next transaction
 *   depends on how the previous one committed. The mode is set at the
 *   start of each transaction. It is QP_NONE in a single-threaded qp-trie
 *   to detect if part of a `dns_qpmulti_t` is passed to dns_qp_destroy().
 *
 * - The `compact_all` flag is used when every node in the trie should be
 *   copied. (Usually compaction aims to avoid moving nodes out of
 *   unfragmented chunks.) It is used when compaction is explicitly
 *   requested via `dns_qp_compact()`, and as an emergency mechanism if
 *   normal compaction failed to clear the QP_MAX_GARBAGE() condition.
 *   (This emergency is a bug even though we have a rescue mechanism.)
 *
 * - When a qp-trie is destroyed while it has pending cleanup work, its
 *   `destroy` flag is set so that it is destroyed by the reclaim worker.
 *   (Because items cannot be removed from the middle of the cleanup list.)
 *
 * - When built with fuzzing support, we can use mprotect() and munmap()
 *   to ensure that incorrect memory accesses cause fatal errors. The
 *   `write_protect` flag must be set straight after the `dns_qpmulti_t`
 *   is created, then left unchanged.
 *
 * Some of the dns_qp_t fields are only needed for multithreaded transactions
 * (marked [MT] below) but the same code paths are also used for single-
 * threaded writes.
 */
struct dns_qp {
	DNS_QPREADER_FIELDS;
	/*% memory context (const) */
	isc_mem_t *mctx;
	/*% array of per-chunk allocation counters */
	qp_usage_t *usage;
	/*% number of slots in `chunk` and `usage` arrays */
	dns_qpchunk_t chunk_max;
	/*% which chunk is used for allocations */
	dns_qpchunk_t bump;
	/*% nodes in the `bump` chunk below `fender` are read only [MT] */
	dns_qpcell_t fender;
	/*% number of leaf nodes */
	dns_qpcell_t leaf_count;
	/*% total of all usage[] counters */
	dns_qpcell_t used_count, free_count;
	/*% free cells that cannot be recovered right now */
	dns_qpcell_t hold_count;
	/*% what kind of transaction was most recently started [MT] */
	enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2;
	/*% compact the entire trie [MT] */
	bool compact_all : 1;
	/*% optionally when compiled with fuzzing support [MT] */
	bool write_protect : 1;
};

/*
 * Concurrent access to a qp-trie.
 *
 * The `reader` pointer provides wait-free access to the current version
 * of the trie. See the "packed reader nodes" section below for a
 * description of what it points to.
 *
 * The main object under the protection of the mutex is the `writer`
 * containing all the allocator state. There can be a backup copy when
 * we want to be able to rollback an update transaction.
 *
 * There is a `reader_ref` which corresponds to the `reader` pointer
 * (`ref_ptr(multi->reader_ref) == multi->reader`). The `reader_ref` is
 * necessary when freeing the space used by the reader, because there
 * isn't a good way to recover a dns_qpref_t from a dns_qpnode_t pointer.
 *
 * There is a per-trie list of snapshots that is used for reclaiming
 * memory when a snapshot is destroyed.
 *
 * Finally, we maintain a global list of `dns_qpmulti_t` objects that
 * need asynchronous safe memory recovery.
 */
struct dns_qpmulti {
	uint32_t magic;
	/*% RCU-protected pointer to current packed reader */
	dns_qpnode_t *reader;
	/*% the mutex protects the rest of this structure */
	isc_mutex_t mutex;
	/*% ref_ptr(writer, reader_ref) == reader */
	dns_qpref_t reader_ref;
	/*% the main working structure */
	dns_qp_t writer;
	/*% saved allocator state to support rollback */
	dns_qp_t *rollback;
	/*% all snapshots of this trie */
	ISC_LIST(dns_qpsnap_t) snapshots;
};

/***********************************************************************
 *
 *  interior node constructors and accessors
 */

/*
 * See the comments under "interior node basics" above, which explain
 * the layout of nodes as implemented by the following functions.
 *
 * These functions are (mostly) constructors and getters. Imagine how
 * much less code there would be if C had sum types with control over
 * the layout...
 */

/*
 * Get the 64-bit word of a node, reassembled from its two 32-bit halves.
 */
static inline uint64_t
node64(dns_qpnode_t *n) {
	uint64_t lo = n->biglo;
	uint64_t hi = n->bighi;
	return lo | (hi << 32);
}

/*
 * Get the 32-bit word of a node.
 */
static inline uint32_t
node32(dns_qpnode_t *n) {
	return n->small;
}

/*
 * Create a node from its parts
 */
static inline dns_qpnode_t
make_node(uint64_t big, uint32_t small) {
	return (dns_qpnode_t){
		.biglo = (uint32_t)(big),
		.bighi = (uint32_t)(big >> 32),
		.small = small,
	};
}

/*
 * Extract a pointer from a node's 64 bit word. The double cast is to avoid
 * a warning about mismatched pointer/integer sizes on 32 bit systems.
 */
static inline void *
node_pointer(dns_qpnode_t *n) {
	return (void *)(uintptr_t)(node64(n) & ~TAG_MASK);
}

/*
 * Examine a node's tag bits
 */
static inline uint32_t
node_tag(dns_qpnode_t *n) {
	return n->biglo & TAG_MASK;
}

/*
 * simplified for the hot path: BRANCH_TAG is the only tag value with
 * the bottom bit set, so testing that one bit is enough
 */
static inline bool
is_branch(dns_qpnode_t *n) {
	return n->biglo & BRANCH_TAG;
}

/* leaf nodes *********************************************************/

/*
 * Get a leaf's pointer value.
 */
static inline void *
leaf_pval(dns_qpnode_t *n) {
	return node_pointer(n);
}

/*
 * Get a leaf's integer value
 */
static inline uint32_t
leaf_ival(dns_qpnode_t *n) {
	return node32(n);
}

/*
 * Create a leaf node from its parts. The REQUIRE enforces the pointer
 * alignment described under "interior node basics" above.
 */
static inline dns_qpnode_t
make_leaf(const void *pval, uint32_t ival) {
	dns_qpnode_t leaf = make_node((uintptr_t)pval, ival);
	REQUIRE(node_tag(&leaf) == LEAF_TAG);
	return leaf;
}

/* branch nodes *******************************************************/

/*
 * The following function names use plural `twigs` when they work on a
 * branch's twigs vector as a whole, and singular `twig` when they work on
 * a particular twig.
 */

/*
 * Get a branch node's index word
 */
static inline uint64_t
branch_index(dns_qpnode_t *n) {
	return node64(n);
}

/*
 * Get a reference to a branch node's child twigs.
 */
static inline dns_qpref_t
branch_twigs_ref(dns_qpnode_t *n) {
	return node32(n);
}

/*
 * Bit positions in the bitmap come directly from the key. DNS names are
 * converted to keys using the tables declared at the end of this file.
 */
static inline dns_qpshift_t
qpkey_bit(const dns_qpkey_t key, size_t len, size_t offset) {
	/* offsets past the end of the key read as the label separator */
	if (offset < len) {
		return key[offset];
	} else {
		return SHIFT_NOBYTE;
	}
}

/*
 * Extract a branch node's offset field, used to index the key.
 */
static inline size_t
branch_key_offset(dns_qpnode_t *n) {
	return (size_t)(branch_index(n) >> SHIFT_OFFSET);
}

/*
 * Which bit identifies the twig of this node for this key?
 */
static inline dns_qpshift_t
branch_keybit(dns_qpnode_t *n, const dns_qpkey_t key, size_t len) {
	return qpkey_bit(key, len, branch_key_offset(n));
}

/*
 * Get a pointer to the first twig of a branch (this also functions
 * as a pointer to the entire twig vector).
 */
static inline dns_qpnode_t *
branch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) {
	return ref_ptr(qpr, branch_twigs_ref(n));
}

/*
 * Warm up the cache while calculating which twig we want.
 */
static inline void
prefetch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) {
	__builtin_prefetch(ref_ptr(qpr, branch_twigs_ref(n)));
}

/* root node **********************************************************/

/*
 * Get a pointer to the root node, checking if the trie is empty.
 */
static inline dns_qpnode_t *
get_root(dns_qpreadable_t qpr) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	if (qp->root_ref == INVALID_REF) {
		return NULL;
	} else {
		return ref_ptr(qp, qp->root_ref);
	}
}

/*
 * When we need to move the root node, we avoid repeating allocation
 * logistics by making a temporary fake branch node that has
 * `branch_twigs_size() == 1 && branch_twigs_ref() == root_ref`
 * just enough to treat the root node as a vector of one twig.
 */
#define MOVABLE_ROOT(qp)                                   \
	(&(dns_qpnode_t){                                  \
		.biglo = BRANCH_TAG | (1 << SHIFT_NOBYTE), \
		.small = qp->root_ref,                     \
	})

/***********************************************************************
 *
 *  bitmap popcount shenanigans
 */

/*
 * How many twigs appear in the vector before the one corresponding to the
 * given bit? Calculated using popcount of part of the branch's bitmap.
 *
 * To calculate a mask that covers the lesser bits in the bitmap,
 * we subtract 1 to set all lesser bits, and subtract the tag mask
 * because the type tag is not part of the bitmap.
 */
static inline dns_qpweight_t
branch_count_bitmap_before(dns_qpnode_t *n, dns_qpshift_t bit) {
	uint64_t mask = (1ULL << bit) - 1 - TAG_MASK;
	uint64_t bitmap = branch_index(n) & mask;
	return (dns_qpweight_t)__builtin_popcountll(bitmap);
}

/*
 * How many twigs does this branch have?
 *
 * The offset is directly after the bitmap so the offset's lesser bits
 * cover the whole bitmap, and the bitmap's weight is the number of twigs.
 */
static inline dns_qpweight_t
branch_twigs_size(dns_qpnode_t *n) {
	return branch_count_bitmap_before(n, SHIFT_OFFSET);
}

/*
 * Position of a twig within the packed sparse vector.
 */
static inline dns_qpweight_t
branch_twig_pos(dns_qpnode_t *n, dns_qpshift_t bit) {
	return branch_count_bitmap_before(n, bit);
}

/*
 * Get a pointer to the twig for a given bit number.
 */
static inline dns_qpnode_t *
branch_twig_ptr(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpshift_t bit) {
	return ref_ptr(qpr, branch_twigs_ref(n) + branch_twig_pos(n, bit));
}

/*
 * Is the twig identified by this bit present?
 */
static inline bool
branch_has_twig(dns_qpnode_t *n, dns_qpshift_t bit) {
	return branch_index(n) & (1ULL << bit);
}

/* twig logistics *****************************************************/

/* memmove() because the source and destination vectors can overlap */
static inline void
move_twigs(dns_qpnode_t *to, dns_qpnode_t *from, dns_qpweight_t size) {
	memmove(to, from, size * sizeof(dns_qpnode_t));
}

static inline void
zero_twigs(dns_qpnode_t *twigs, dns_qpweight_t size) {
	memset(twigs, 0, size * sizeof(dns_qpnode_t));
}

/***********************************************************************
 *
 *  packed reader nodes
 */

/*
 * The purpose of these packed reader nodes is to simplify safe memory
 * reclamation for a multithreaded qp-trie.
 *
 * After the `reader` pointer in a qpmulti is replaced, we need to wait
 * for a grace period before we can reclaim the memory that is no longer
 * needed by the trie. So we need some kind of structure to hold
 * pointers to the (logically) detached memory until it is safe to free.
 * This memory includes the chunks and the `base` arrays.
 *
 * Packed reader nodes save us from having to track `dns_qpread_t`
 * objects as distinct allocations: the packed reader nodes get
 * reclaimed when the chunk containing their cells is reclaimed.
 * When a real `dns_qpread_t` object is needed, it is allocated on the
 * stack (it must not live longer than a isc_loop callback) and the
 * packed reader is unpacked into it.
 *
 * Chunks are owned by the current `base` array, so unused chunks are
 * held there until they are free()d. Old `base` arrays are attached
 * to packed reader nodes with a refcount. When a chunk is reclaimed,
 * it is scanned so that `chunk_free()` can call `detach_leaf()` on
 * any remaining references to leaf objects. Similarly, it calls
 * `qpbase_unref()` to reclaim old `base` arrays.
 */

/*
 * Two nodes is just enough space for the information needed by
 * readers and for deferred memory reclamation.
 */
#define READER_SIZE 2

/*
 * Create a packed reader; space for the reader should have been
 * allocated using `alloc_twigs(&multi->writer, READER_SIZE)`.
 *
 * Both cells are tagged READER_TAG; the first carries the `multi`
 * pointer and a magic number, the second the `base` array and root ref.
 */
static inline void
make_reader(dns_qpnode_t *reader, dns_qpmulti_t *multi) {
	dns_qp_t *qp = &multi->writer;
	reader[0] = make_node(READER_TAG | (uintptr_t)multi, QPREADER_MAGIC);
	reader[1] = make_node(READER_TAG | (uintptr_t)qp->base, qp->root_ref);
}

static inline bool
reader_valid(dns_qpnode_t *reader) {
	return reader != NULL && //
	       node_tag(&reader[0]) == READER_TAG &&
	       node_tag(&reader[1]) == READER_TAG &&
	       node32(&reader[0]) == QPREADER_MAGIC;
}

/*
 * Verify and unpack a reader. We return the `multi` pointer to use in
 * consistency checks.
 */
static inline dns_qpmulti_t *
unpack_reader(dns_qpreader_t *qp, dns_qpnode_t *reader) {
	INSIST(reader_valid(reader));
	dns_qpmulti_t *multi = node_pointer(&reader[0]);
	dns_qpbase_t *base = node_pointer(&reader[1]);
	INSIST(QPMULTI_VALID(multi));
	INSIST(QPBASE_VALID(base));
	*qp = (dns_qpreader_t){
		.magic = QP_MAGIC,
		.uctx = multi->writer.uctx,
		.methods = multi->writer.methods,
		.root_ref = node32(&reader[1]),
		.base = base,
	};
	return multi;
}

/***********************************************************************
 *
 *  method invocation helpers
 */

static inline void
attach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	qp->methods->attach(qp->uctx, leaf_pval(n), leaf_ival(n));
}

static inline void
detach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	qp->methods->detach(qp->uctx, leaf_pval(n), leaf_ival(n));
}

/*
 * Ask the user's makekey method for this leaf's lookup key;
 * returns the key length, which must fit inside a dns_qpkey_t.
 */
static inline size_t
leaf_qpkey(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpkey_t key) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	size_t len = qp->methods->makekey(key, qp->uctx, leaf_pval(n),
					  leaf_ival(n));
	INSIST(len < sizeof(dns_qpkey_t));
	return len;
}

static inline char *
triename(dns_qpreadable_t qpr, char *buf, size_t size) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	qp->methods->triename(qp->uctx, buf, size);
	return buf;
}

#define TRIENAME(qp) \
	triename(qp, (char[DNS_QP_TRIENAME_MAX]){}, DNS_QP_TRIENAME_MAX)

/***********************************************************************
 *
 *  converting DNS names to trie keys
 */

/*
 * This is a deliberate simplification of the hostname characters,
 * because it doesn't matter much if we treat a few extra characters
 * favourably: there is plenty of space in the index word for a
 * slightly larger bitmap.
 */
static inline bool
qp_common_character(uint8_t byte) {
	return ('-' <= byte && byte <= '9') || ('_' <= byte && byte <= 'z');
}

/*
 * Lookup table mapping bytes in DNS names to bit positions, used
 * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
 */
extern uint16_t dns_qp_bits_for_byte[];

/*
 * And the reverse, mapping bit positions to characters, so the tests
 * can print diagnostics involving qp-trie keys.
 */
extern uint8_t dns_qp_byte_for_bit[];

/**********************************************************************/