xref: /netbsd-src/external/mpl/bind/dist/lib/dns/rbtdb.c (revision bcda20f65a8566e103791ec395f7f499ef322704)
1 /*	$NetBSD: rbtdb.c,v 1.21 2025/01/26 16:25:24 christos Exp $	*/
2 
3 /*
4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5  *
6  * SPDX-License-Identifier: MPL-2.0
7  *
8  * This Source Code Form is subject to the terms of the Mozilla Public
9  * License, v. 2.0. If a copy of the MPL was not distributed with this
10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11  *
12  * See the COPYRIGHT file distributed with this work for additional
13  * information regarding copyright ownership.
14  */
15 
16 /*! \file */
17 
18 #include <inttypes.h>
19 #include <stdbool.h>
20 #include <sys/mman.h>
21 
22 #include <isc/ascii.h>
23 #include <isc/async.h>
24 #include <isc/atomic.h>
25 #include <isc/crc64.h>
26 #include <isc/file.h>
27 #include <isc/hash.h>
28 #include <isc/hashmap.h>
29 #include <isc/heap.h>
30 #include <isc/hex.h>
31 #include <isc/loop.h>
32 #include <isc/mem.h>
33 #include <isc/mutex.h>
34 #include <isc/once.h>
35 #include <isc/random.h>
36 #include <isc/refcount.h>
37 #include <isc/result.h>
38 #include <isc/rwlock.h>
39 #include <isc/serial.h>
40 #include <isc/stdio.h>
41 #include <isc/string.h>
42 #include <isc/time.h>
43 #include <isc/urcu.h>
44 #include <isc/util.h>
45 
46 #include <dns/callbacks.h>
47 #include <dns/db.h>
48 #include <dns/dbiterator.h>
49 #include <dns/fixedname.h>
50 #include <dns/log.h>
51 #include <dns/masterdump.h>
52 #include <dns/nsec.h>
53 #include <dns/nsec3.h>
54 #include <dns/rbt.h>
55 #include <dns/rdata.h>
56 #include <dns/rdataset.h>
57 #include <dns/rdatasetiter.h>
58 #include <dns/rdataslab.h>
59 #include <dns/rdatastruct.h>
60 #include <dns/stats.h>
61 #include <dns/time.h>
62 #include <dns/view.h>
63 #include <dns/zone.h>
64 #include <dns/zonekey.h>
65 
66 #include "db_p.h"
67 #include "rbtdb_p.h"
68 
69 #define CHECK(op)                            \
70 	do {                                 \
71 		result = (op);               \
72 		if (result != ISC_R_SUCCESS) \
73 			goto failure;        \
74 	} while (0)
75 
76 #define EXISTS(header)                                 \
77 	((atomic_load_acquire(&(header)->attributes) & \
78 	  DNS_SLABHEADERATTR_NONEXISTENT) == 0)
79 #define NONEXISTENT(header)                            \
80 	((atomic_load_acquire(&(header)->attributes) & \
81 	  DNS_SLABHEADERATTR_NONEXISTENT) != 0)
82 #define IGNORE(header)                                 \
83 	((atomic_load_acquire(&(header)->attributes) & \
84 	  DNS_SLABHEADERATTR_IGNORE) != 0)
85 #define NXDOMAIN(header)                               \
86 	((atomic_load_acquire(&(header)->attributes) & \
87 	  DNS_SLABHEADERATTR_NXDOMAIN) != 0)
88 #define STALE(header)                                  \
89 	((atomic_load_acquire(&(header)->attributes) & \
90 	  DNS_SLABHEADERATTR_STALE) != 0)
91 #define STALE_WINDOW(header)                           \
92 	((atomic_load_acquire(&(header)->attributes) & \
93 	  DNS_SLABHEADERATTR_STALE_WINDOW) != 0)
94 #define RESIGN(header)                                 \
95 	((atomic_load_acquire(&(header)->attributes) & \
96 	  DNS_SLABHEADERATTR_RESIGN) != 0)
97 #define OPTOUT(header)                                 \
98 	((atomic_load_acquire(&(header)->attributes) & \
99 	  DNS_SLABHEADERATTR_OPTOUT) != 0)
100 #define NEGATIVE(header)                               \
101 	((atomic_load_acquire(&(header)->attributes) & \
102 	  DNS_SLABHEADERATTR_NEGATIVE) != 0)
103 #define PREFETCH(header)                               \
104 	((atomic_load_acquire(&(header)->attributes) & \
105 	  DNS_SLABHEADERATTR_PREFETCH) != 0)
106 #define CASESET(header)                                \
107 	((atomic_load_acquire(&(header)->attributes) & \
108 	  DNS_SLABHEADERATTR_CASESET) != 0)
109 #define ZEROTTL(header)                                \
110 	((atomic_load_acquire(&(header)->attributes) & \
111 	  DNS_SLABHEADERATTR_ZEROTTL) != 0)
112 #define ANCIENT(header)                                \
113 	((atomic_load_acquire(&(header)->attributes) & \
114 	  DNS_SLABHEADERATTR_ANCIENT) != 0)
115 #define STATCOUNT(header)                              \
116 	((atomic_load_acquire(&(header)->attributes) & \
117 	  DNS_SLABHEADERATTR_STATCOUNT) != 0)
118 
119 #define STALE_TTL(header, rbtdb) \
120 	(NXDOMAIN(header) ? 0 : rbtdb->common.serve_stale_ttl)
121 
122 #define ACTIVE(header, now) \
123 	(((header)->ttl > (now)) || ((header)->ttl == (now) && ZEROTTL(header)))
124 
125 #define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */
126 
127 #define EXPIREDOK(rbtiterator) \
128 	(((rbtiterator)->common.options & DNS_DB_EXPIREDOK) != 0)
129 
130 #define STALEOK(rbtiterator) \
131 	(((rbtiterator)->common.options & DNS_DB_STALEOK) != 0)
132 
133 #define KEEPSTALE(rbtdb) ((rbtdb)->common.serve_stale_ttl > 0)
134 
135 #define RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, iterator)       \
136 	((iterator)->current == &(iterator)->nsec3chain && \
137 	 (iterator)->node == (rbtdb)->nsec3_origin_node)
138 
139 /*%
140  * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
141  * There is a tradeoff issue about configuring this value: if this is too
142  * small, it may cause heavier contention between threads; if this is too large,
143  * LRU purge algorithm won't work well (entries tend to be purged prematurely).
144  * The default value should work well for most environments, but this can
145  * also be configurable at compilation time via the
146  * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
147  * 1 due to the assumption of dns__cacherbt_overmem().
148  */
149 #ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
150 #if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
151 #error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
152 #else /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */
153 #define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
154 #endif /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */
155 #else  /* ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
156 #define DEFAULT_CACHE_NODE_LOCK_COUNT 17
157 #endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
158 
159 /*
160  * This defines the number of headers that we try to expire each time the
161  * expire_ttl_headers() is run.  The number should be small enough, so the
162  * TTL-based header expiration doesn't take too long, but it should be large
163  * enough, so we expire enough headers if their TTL is clustered.
164  */
165 #define DNS_RBTDB_EXPIRE_TTL_COUNT 10
166 
167 static void
168 delete_callback(void *data, void *arg);
169 static void
170 prune_tree(void *arg);
171 static void
172 free_gluetable(struct cds_lfht *glue_table);
173 
174 static void
175 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp DNS__DB_FLARG);
176 static isc_result_t
177 rdatasetiter_first(dns_rdatasetiter_t *iterator DNS__DB_FLARG);
178 static isc_result_t
179 rdatasetiter_next(dns_rdatasetiter_t *iterator DNS__DB_FLARG);
180 static void
181 rdatasetiter_current(dns_rdatasetiter_t *iterator,
182 		     dns_rdataset_t *rdataset DNS__DB_FLARG);
183 
184 static dns_rdatasetitermethods_t rdatasetiter_methods = {
185 	rdatasetiter_destroy, rdatasetiter_first, rdatasetiter_next,
186 	rdatasetiter_current
187 };
188 
189 typedef struct rbtdb_rdatasetiter {
190 	dns_rdatasetiter_t common;
191 	dns_slabheader_t *current;
192 } rbtdb_rdatasetiter_t;
193 
194 /*
195  * Note that these iterators, unless created with either DNS_DB_NSEC3ONLY or
196  * DNS_DB_NONSEC3, will transparently move between the last node of the
197  * "regular" RBT ("chain" field) and the root node of the NSEC3 RBT
198  * ("nsec3chain" field) of the database in question, as if the latter was a
199  * successor to the former in lexical order.  The "current" field always holds
200  * the address of either "chain" or "nsec3chain", depending on which RBT is
201  * being traversed at given time.
202  */
203 static void
204 dbiterator_destroy(dns_dbiterator_t **iteratorp DNS__DB_FLARG);
205 static isc_result_t
206 dbiterator_first(dns_dbiterator_t *iterator DNS__DB_FLARG);
207 static isc_result_t
208 dbiterator_last(dns_dbiterator_t *iterator DNS__DB_FLARG);
209 static isc_result_t
210 dbiterator_seek(dns_dbiterator_t *iterator,
211 		const dns_name_t *name DNS__DB_FLARG);
212 static isc_result_t
213 dbiterator_prev(dns_dbiterator_t *iterator DNS__DB_FLARG);
214 static isc_result_t
215 dbiterator_next(dns_dbiterator_t *iterator DNS__DB_FLARG);
216 static isc_result_t
217 dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
218 		   dns_name_t *name DNS__DB_FLARG);
219 static isc_result_t
220 dbiterator_pause(dns_dbiterator_t *iterator);
221 static isc_result_t
222 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name);
223 
224 static dns_dbiteratormethods_t dbiterator_methods = {
225 	dbiterator_destroy, dbiterator_first, dbiterator_last,
226 	dbiterator_seek,    dbiterator_prev,  dbiterator_next,
227 	dbiterator_current, dbiterator_pause, dbiterator_origin
228 };
229 
230 /*
231  * If 'paused' is true, then the tree lock is not being held.
232  */
233 typedef struct rbtdb_dbiterator {
234 	dns_dbiterator_t common;
235 	bool paused;
236 	bool new_origin;
237 	isc_rwlocktype_t tree_locked;
238 	isc_result_t result;
239 	dns_fixedname_t name;
240 	dns_fixedname_t origin;
241 	dns_rbtnodechain_t chain;
242 	dns_rbtnodechain_t nsec3chain;
243 	dns_rbtnodechain_t *current;
244 	dns_rbtnode_t *node;
245 	enum { full, nonsec3, nsec3only } nsec3mode;
246 } rbtdb_dbiterator_t;
247 
248 static void
249 free_rbtdb(dns_rbtdb_t *rbtdb, bool log);
250 static void
251 setnsec3parameters(dns_db_t *db, dns_rbtdb_version_t *version);
252 
253 /*%
254  * 'init_count' is used to initialize 'newheader->count' which inturn
255  * is used to determine where in the cycle rrset-order cyclic starts.
256  * We don't lock this as we don't care about simultaneous updates.
257  */
258 static atomic_uint_fast16_t init_count = 0;
259 
260 /*
261  * Locking
262  *
263  * If a routine is going to lock more than one lock in this module, then
264  * the locking must be done in the following order:
265  *
266  *      Tree Lock
267  *
268  *      Node Lock       (Only one from the set may be locked at one time by
269  *                       any caller)
270  *
271  *      Database Lock
272  *
273  * Failure to follow this hierarchy can result in deadlock.
274  */
275 
276 /*
277  * Deleting Nodes
278  *
279  * For zone databases the node for the origin of the zone MUST NOT be deleted.
280  */
281 
282 /*
283  * DB Routines
284  */
285 
286 static void
287 update_rrsetstats(dns_stats_t *stats, const dns_typepair_t htype,
288 		  const uint_least16_t hattributes, const bool increment) {
289 	dns_rdatastatstype_t statattributes = 0;
290 	dns_rdatastatstype_t base = 0;
291 	dns_rdatastatstype_t type;
292 	dns_slabheader_t *header = &(dns_slabheader_t){
293 		.type = htype,
294 		.attributes = hattributes,
295 	};
296 
297 	if (!EXISTS(header) || !STATCOUNT(header)) {
298 		return;
299 	}
300 
301 	if (NEGATIVE(header)) {
302 		if (NXDOMAIN(header)) {
303 			statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
304 		} else {
305 			statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
306 			base = DNS_TYPEPAIR_COVERS(header->type);
307 		}
308 	} else {
309 		base = DNS_TYPEPAIR_TYPE(header->type);
310 	}
311 
312 	if (STALE(header)) {
313 		statattributes |= DNS_RDATASTATSTYPE_ATTR_STALE;
314 	}
315 	if (ANCIENT(header)) {
316 		statattributes |= DNS_RDATASTATSTYPE_ATTR_ANCIENT;
317 	}
318 
319 	type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
320 	if (increment) {
321 		dns_rdatasetstats_increment(stats, type);
322 	} else {
323 		dns_rdatasetstats_decrement(stats, type);
324 	}
325 }
326 
327 void
328 dns__rbtdb_setttl(dns_slabheader_t *header, dns_ttl_t newttl) {
329 	dns_ttl_t oldttl = header->ttl;
330 
331 	header->ttl = newttl;
332 
333 	if (header->db == NULL || !dns_db_iscache(header->db)) {
334 		return;
335 	}
336 
337 	/*
338 	 * This is a cache. Adjust the heaps if necessary.
339 	 */
340 	if (header->heap == NULL || header->heap_index == 0 || newttl == oldttl)
341 	{
342 		return;
343 	}
344 
345 	if (newttl < oldttl) {
346 		isc_heap_increased(header->heap, header->heap_index);
347 	} else {
348 		isc_heap_decreased(header->heap, header->heap_index);
349 	}
350 
351 	if (newttl == 0) {
352 		isc_heap_delete(header->heap, header->heap_index);
353 	}
354 }
355 
356 /*%
357  * These functions allow the heap code to rank the priority of each
358  * element.  It returns true if v1 happens "sooner" than v2.
359  */
360 static bool
361 ttl_sooner(void *v1, void *v2) {
362 	dns_slabheader_t *h1 = v1;
363 	dns_slabheader_t *h2 = v2;
364 
365 	return h1->ttl < h2->ttl;
366 }
367 
368 /*%
369  * Return which RRset should be resigned sooner.  If the RRsets have the
370  * same signing time, prefer the other RRset over the SOA RRset.
371  */
372 static bool
373 resign_sooner(void *v1, void *v2) {
374 	dns_slabheader_t *h1 = v1;
375 	dns_slabheader_t *h2 = v2;
376 
377 	return h1->resign < h2->resign ||
378 	       (h1->resign == h2->resign && h1->resign_lsb < h2->resign_lsb) ||
379 	       (h1->resign == h2->resign && h1->resign_lsb == h2->resign_lsb &&
380 		h2->type == DNS_SIGTYPE(dns_rdatatype_soa));
381 }
382 
383 /*%
384  * This function sets the heap index into the header.
385  */
386 static void
387 set_index(void *what, unsigned int idx) {
388 	dns_slabheader_t *h = what;
389 
390 	h->heap_index = idx;
391 }
392 
393 /*%
394  * Work out how many nodes can be deleted in the time between two
395  * requests to the nameserver.  Smooth the resulting number and use it
396  * as a estimate for the number of nodes to be deleted in the next
397  * iteration.
398  */
399 static unsigned int
400 adjust_quantum(unsigned int old, isc_time_t *start) {
401 	unsigned int pps = dns_pps; /* packets per second */
402 	unsigned int interval;
403 	uint64_t usecs;
404 	isc_time_t end;
405 	unsigned int nodes;
406 
407 	if (pps < 100) {
408 		pps = 100;
409 	}
410 	end = isc_time_now();
411 
412 	interval = 1000000 / pps; /* interval in usec */
413 	if (interval == 0) {
414 		interval = 1;
415 	}
416 	usecs = isc_time_microdiff(&end, start);
417 	if (usecs == 0) {
418 		/*
419 		 * We were unable to measure the amount of time taken.
420 		 * Double the nodes deleted next time.
421 		 */
422 		old *= 2;
423 		if (old > 1000) {
424 			old = 1000;
425 		}
426 		return old;
427 	}
428 	nodes = old * interval;
429 	nodes /= (unsigned int)usecs;
430 	if (nodes == 0) {
431 		nodes = 1;
432 	} else if (nodes > 1000) {
433 		nodes = 1000;
434 	}
435 
436 	/* Smooth */
437 	nodes = (nodes + old * 3) / 4;
438 
439 	if (nodes != old) {
440 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
441 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
442 			      "adjust_quantum: old=%d, new=%d", old, nodes);
443 	}
444 
445 	return nodes;
446 }
447 
448 static void
449 free_rbtdb_callback(void *arg) {
450 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)arg;
451 
452 	free_rbtdb(rbtdb, true);
453 }
454 
455 static void
456 free_rbtdb(dns_rbtdb_t *rbtdb, bool log) {
457 	unsigned int i;
458 	isc_result_t result;
459 	char buf[DNS_NAME_FORMATSIZE];
460 	dns_rbt_t **treep = NULL;
461 	isc_time_t start;
462 
463 	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
464 	REQUIRE(rbtdb->future_version == NULL);
465 
466 	if (rbtdb->current_version != NULL) {
467 		isc_refcount_decrementz(&rbtdb->current_version->references);
468 
469 		isc_refcount_destroy(&rbtdb->current_version->references);
470 		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
471 		isc_rwlock_destroy(&rbtdb->current_version->rwlock);
472 		isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
473 			    sizeof(*rbtdb->current_version));
474 	}
475 
476 	/*
477 	 * We assume the number of remaining dead nodes is reasonably small;
478 	 * the overhead of unlinking all nodes here should be negligible.
479 	 */
480 	for (i = 0; i < rbtdb->node_lock_count; i++) {
481 		dns_rbtnode_t *node = NULL;
482 
483 		node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
484 		while (node != NULL) {
485 			ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
486 			node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
487 		}
488 	}
489 
490 	rbtdb->quantum = (rbtdb->loop != NULL) ? 100 : 0;
491 
492 	for (;;) {
493 		/*
494 		 * pick the next tree to (start to) destroy
495 		 */
496 		treep = &rbtdb->tree;
497 		if (*treep == NULL) {
498 			treep = &rbtdb->nsec;
499 			if (*treep == NULL) {
500 				treep = &rbtdb->nsec3;
501 				/*
502 				 * we're finished after clear cutting
503 				 */
504 				if (*treep == NULL) {
505 					break;
506 				}
507 			}
508 		}
509 
510 		start = isc_time_now();
511 		result = dns_rbt_destroy(treep, rbtdb->quantum);
512 		if (result == ISC_R_QUOTA) {
513 			INSIST(rbtdb->loop != NULL);
514 			if (rbtdb->quantum != 0) {
515 				rbtdb->quantum = adjust_quantum(rbtdb->quantum,
516 								&start);
517 			}
518 			isc_async_run(rbtdb->loop, free_rbtdb_callback, rbtdb);
519 			return;
520 		}
521 		INSIST(result == ISC_R_SUCCESS && *treep == NULL);
522 	}
523 
524 	if (log) {
525 		if (dns_name_dynamic(&rbtdb->common.origin)) {
526 			dns_name_format(&rbtdb->common.origin, buf,
527 					sizeof(buf));
528 		} else {
529 			strlcpy(buf, "<UNKNOWN>", sizeof(buf));
530 		}
531 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
532 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
533 			      "done free_rbtdb(%s)", buf);
534 	}
535 	if (dns_name_dynamic(&rbtdb->common.origin)) {
536 		dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
537 	}
538 	for (i = 0; i < rbtdb->node_lock_count; i++) {
539 		isc_refcount_destroy(&rbtdb->node_locks[i].references);
540 		NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
541 	}
542 
543 	/*
544 	 * Clean up LRU / re-signing order lists.
545 	 */
546 	if (rbtdb->lru != NULL) {
547 		for (i = 0; i < rbtdb->node_lock_count; i++) {
548 			INSIST(ISC_LIST_EMPTY(rbtdb->lru[i]));
549 		}
550 		isc_mem_cput(rbtdb->common.mctx, rbtdb->lru,
551 			     rbtdb->node_lock_count,
552 			     sizeof(dns_slabheaderlist_t));
553 	}
554 	/*
555 	 * Clean up dead node buckets.
556 	 */
557 	if (rbtdb->deadnodes != NULL) {
558 		for (i = 0; i < rbtdb->node_lock_count; i++) {
559 			INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
560 		}
561 		isc_mem_cput(rbtdb->common.mctx, rbtdb->deadnodes,
562 			     rbtdb->node_lock_count, sizeof(dns_rbtnodelist_t));
563 	}
564 	/*
565 	 * Clean up heap objects.
566 	 */
567 	if (rbtdb->heaps != NULL) {
568 		for (i = 0; i < rbtdb->node_lock_count; i++) {
569 			isc_heap_destroy(&rbtdb->heaps[i]);
570 		}
571 		isc_mem_cput(rbtdb->hmctx, rbtdb->heaps, rbtdb->node_lock_count,
572 			     sizeof(isc_heap_t *));
573 	}
574 
575 	if (rbtdb->rrsetstats != NULL) {
576 		dns_stats_detach(&rbtdb->rrsetstats);
577 	}
578 	if (rbtdb->cachestats != NULL) {
579 		isc_stats_detach(&rbtdb->cachestats);
580 	}
581 	if (rbtdb->gluecachestats != NULL) {
582 		isc_stats_detach(&rbtdb->gluecachestats);
583 	}
584 
585 	isc_mem_cput(rbtdb->common.mctx, rbtdb->node_locks,
586 		     rbtdb->node_lock_count, sizeof(db_nodelock_t));
587 	TREE_DESTROYLOCK(&rbtdb->tree_lock);
588 	isc_refcount_destroy(&rbtdb->common.references);
589 	if (rbtdb->loop != NULL) {
590 		isc_loop_detach(&rbtdb->loop);
591 	}
592 
593 	isc_rwlock_destroy(&rbtdb->lock);
594 	rbtdb->common.magic = 0;
595 	rbtdb->common.impmagic = 0;
596 	isc_mem_detach(&rbtdb->hmctx);
597 
598 	if (rbtdb->common.update_listeners != NULL) {
599 		INSIST(!cds_lfht_destroy(rbtdb->common.update_listeners, NULL));
600 	}
601 
602 	isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
603 }
604 
605 void
606 dns__rbtdb_destroy(dns_db_t *arg) {
607 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)arg;
608 	bool want_free = false;
609 	unsigned int i;
610 	unsigned int inactive = 0;
611 
612 	/* XXX check for open versions here */
613 
614 	if (rbtdb->soanode != NULL) {
615 		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
616 	}
617 	if (rbtdb->nsnode != NULL) {
618 		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
619 	}
620 
621 	/*
622 	 * The current version's glue table needs to be freed early
623 	 * so the nodes are dereferenced before we check the active
624 	 * node count below.
625 	 */
626 	if (rbtdb->current_version != NULL) {
627 		free_gluetable(rbtdb->current_version->glue_table);
628 	}
629 
630 	/*
631 	 * Even though there are no external direct references, there still
632 	 * may be nodes in use.
633 	 */
634 	for (i = 0; i < rbtdb->node_lock_count; i++) {
635 		isc_rwlocktype_t nodelock = isc_rwlocktype_none;
636 		NODE_WRLOCK(&rbtdb->node_locks[i].lock, &nodelock);
637 		rbtdb->node_locks[i].exiting = true;
638 		if (isc_refcount_current(&rbtdb->node_locks[i].references) == 0)
639 		{
640 			inactive++;
641 		}
642 		NODE_UNLOCK(&rbtdb->node_locks[i].lock, &nodelock);
643 	}
644 
645 	if (inactive != 0) {
646 		RWLOCK(&rbtdb->lock, isc_rwlocktype_write);
647 		rbtdb->active -= inactive;
648 		if (rbtdb->active == 0) {
649 			want_free = true;
650 		}
651 		RWUNLOCK(&rbtdb->lock, isc_rwlocktype_write);
652 		if (want_free) {
653 			char buf[DNS_NAME_FORMATSIZE];
654 			if (dns_name_dynamic(&rbtdb->common.origin)) {
655 				dns_name_format(&rbtdb->common.origin, buf,
656 						sizeof(buf));
657 			} else {
658 				strlcpy(buf, "<UNKNOWN>", sizeof(buf));
659 			}
660 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
661 				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
662 				      "calling free_rbtdb(%s)", buf);
663 			free_rbtdb(rbtdb, true);
664 		}
665 	}
666 }
667 
668 void
669 dns__rbtdb_currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
670 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
671 	dns_rbtdb_version_t *version = NULL;
672 
673 	REQUIRE(VALID_RBTDB(rbtdb));
674 
675 	RWLOCK(&rbtdb->lock, isc_rwlocktype_read);
676 	version = rbtdb->current_version;
677 	isc_refcount_increment(&version->references);
678 	RWUNLOCK(&rbtdb->lock, isc_rwlocktype_read);
679 
680 	*versionp = (dns_dbversion_t *)version;
681 }
682 
683 static dns_rbtdb_version_t *
684 allocate_version(isc_mem_t *mctx, uint32_t serial, unsigned int references,
685 		 bool writer) {
686 	dns_rbtdb_version_t *version = isc_mem_get(mctx, sizeof(*version));
687 	*version = (dns_rbtdb_version_t){
688 		.serial = serial,
689 		.writer = writer,
690 		.changed_list = ISC_LIST_INITIALIZER,
691 		.resigned_list = ISC_LIST_INITIALIZER,
692 		.link = ISC_LINK_INITIALIZER,
693 		.glue_table = cds_lfht_new(GLUETABLE_INIT_SIZE,
694 					   GLUETABLE_MIN_SIZE, 0,
695 					   CDS_LFHT_AUTO_RESIZE, NULL),
696 	};
697 
698 	isc_rwlock_init(&version->rwlock);
699 	isc_refcount_init(&version->references, references);
700 
701 	return version;
702 }
703 
704 isc_result_t
705 dns__rbtdb_newversion(dns_db_t *db, dns_dbversion_t **versionp) {
706 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
707 	dns_rbtdb_version_t *version = NULL;
708 
709 	REQUIRE(VALID_RBTDB(rbtdb));
710 	REQUIRE(versionp != NULL && *versionp == NULL);
711 	REQUIRE(rbtdb->future_version == NULL);
712 
713 	RWLOCK(&rbtdb->lock, isc_rwlocktype_write);
714 	RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? */
715 	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
716 				   true);
717 	version->rbtdb = rbtdb;
718 	version->commit_ok = true;
719 	version->secure = rbtdb->current_version->secure;
720 	version->havensec3 = rbtdb->current_version->havensec3;
721 	if (version->havensec3) {
722 		version->flags = rbtdb->current_version->flags;
723 		version->iterations = rbtdb->current_version->iterations;
724 		version->hash = rbtdb->current_version->hash;
725 		version->salt_length = rbtdb->current_version->salt_length;
726 		memmove(version->salt, rbtdb->current_version->salt,
727 			version->salt_length);
728 	} else {
729 		version->flags = 0;
730 		version->iterations = 0;
731 		version->hash = 0;
732 		version->salt_length = 0;
733 		memset(version->salt, 0, sizeof(version->salt));
734 	}
735 	RWLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
736 	version->records = rbtdb->current_version->records;
737 	version->xfrsize = rbtdb->current_version->xfrsize;
738 	RWUNLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
739 	rbtdb->next_serial++;
740 	rbtdb->future_version = version;
741 	RWUNLOCK(&rbtdb->lock, isc_rwlocktype_write);
742 
743 	*versionp = version;
744 
745 	return ISC_R_SUCCESS;
746 }
747 
748 void
749 dns__rbtdb_attachversion(dns_db_t *db, dns_dbversion_t *source,
750 			 dns_dbversion_t **targetp) {
751 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
752 	dns_rbtdb_version_t *rbtversion = source;
753 
754 	REQUIRE(VALID_RBTDB(rbtdb));
755 	INSIST(rbtversion != NULL && rbtversion->rbtdb == rbtdb);
756 
757 	isc_refcount_increment(&rbtversion->references);
758 
759 	*targetp = rbtversion;
760 }
761 
762 static rbtdb_changed_t *
763 add_changed(dns_slabheader_t *header,
764 	    dns_rbtdb_version_t *version DNS__DB_FLARG) {
765 	rbtdb_changed_t *changed = NULL;
766 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)header->db;
767 
768 	/*
769 	 * Caller must be holding the node lock if its reference must be
770 	 * protected by the lock.
771 	 */
772 
773 	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
774 
775 	RWLOCK(&rbtdb->lock, isc_rwlocktype_write);
776 
777 	REQUIRE(version->writer);
778 
779 	if (changed != NULL) {
780 		dns_rbtnode_t *node = (dns_rbtnode_t *)header->node;
781 		uint_fast32_t refs = isc_refcount_increment(&node->references);
782 #if DNS_DB_NODETRACE
783 		fprintf(stderr,
784 			"incr:node:%s:%s:%u:%p->references = %" PRIuFAST32 "\n",
785 			func, file, line, node, refs + 1);
786 #else
787 		UNUSED(refs);
788 #endif
789 		changed->node = node;
790 		changed->dirty = false;
791 		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
792 	} else {
793 		version->commit_ok = false;
794 	}
795 
796 	RWUNLOCK(&rbtdb->lock, isc_rwlocktype_write);
797 
798 	return changed;
799 }
800 
801 static void
802 rollback_node(dns_rbtnode_t *node, uint32_t serial) {
803 	dns_slabheader_t *header = NULL, *dcurrent = NULL;
804 	bool make_dirty = false;
805 
806 	/*
807 	 * Caller must hold the node lock.
808 	 */
809 
810 	/*
811 	 * We set the IGNORE attribute on rdatasets with serial number
812 	 * 'serial'.  When the reference count goes to zero, these rdatasets
813 	 * will be cleaned up; until that time, they will be ignored.
814 	 */
815 	for (header = node->data; header != NULL; header = header->next) {
816 		if (header->serial == serial) {
817 			DNS_SLABHEADER_SETATTR(header,
818 					       DNS_SLABHEADERATTR_IGNORE);
819 			make_dirty = true;
820 		}
821 		for (dcurrent = header->down; dcurrent != NULL;
822 		     dcurrent = dcurrent->down)
823 		{
824 			if (dcurrent->serial == serial) {
825 				DNS_SLABHEADER_SETATTR(
826 					dcurrent, DNS_SLABHEADERATTR_IGNORE);
827 				make_dirty = true;
828 			}
829 		}
830 	}
831 	if (make_dirty) {
832 		node->dirty = 1;
833 	}
834 }
835 
836 void
837 dns__rbtdb_mark(dns_slabheader_t *header, uint_least16_t flag) {
838 	uint_least16_t attributes = atomic_load_acquire(&header->attributes);
839 	uint_least16_t newattributes = 0;
840 	dns_stats_t *stats = NULL;
841 
842 	/*
843 	 * If we are already ancient there is nothing to do.
844 	 */
845 	do {
846 		if ((attributes & flag) != 0) {
847 			return;
848 		}
849 		newattributes = attributes | flag;
850 	} while (!atomic_compare_exchange_weak_acq_rel(
851 		&header->attributes, &attributes, newattributes));
852 
853 	/*
854 	 * Decrement and increment the stats counter for the appropriate
855 	 * RRtype.
856 	 */
857 	stats = dns_db_getrrsetstats(header->db);
858 	if (stats != NULL) {
859 		update_rrsetstats(stats, header->type, attributes, false);
860 		update_rrsetstats(stats, header->type, newattributes, true);
861 	}
862 }
863 
864 static void
865 mark_ancient(dns_slabheader_t *header) {
866 	dns__rbtdb_setttl(header, 0);
867 	dns__rbtdb_mark(header, DNS_SLABHEADERATTR_ANCIENT);
868 	RBTDB_HEADERNODE(header)->dirty = 1;
869 }
870 
871 static void
872 clean_stale_headers(dns_slabheader_t *top) {
873 	dns_slabheader_t *d = NULL, *down_next = NULL;
874 
875 	for (d = top->down; d != NULL; d = down_next) {
876 		down_next = d->down;
877 		dns_slabheader_destroy(&d);
878 	}
879 	top->down = NULL;
880 }
881 
882 static void
883 clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
884 	dns_slabheader_t *current = NULL, *top_prev = NULL, *top_next = NULL;
885 
886 	/*
887 	 * Caller must be holding the node lock.
888 	 */
889 
890 	for (current = node->data; current != NULL; current = top_next) {
891 		top_next = current->next;
892 		clean_stale_headers(current);
893 		/*
894 		 * If current is nonexistent, ancient, or stale and
895 		 * we are not keeping stale, we can clean it up.
896 		 */
897 		if (NONEXISTENT(current) || ANCIENT(current) ||
898 		    (STALE(current) && !KEEPSTALE(rbtdb)))
899 		{
900 			if (top_prev != NULL) {
901 				top_prev->next = current->next;
902 			} else {
903 				node->data = current->next;
904 			}
905 			dns_slabheader_destroy(&current);
906 		} else {
907 			top_prev = current;
908 		}
909 	}
910 	node->dirty = 0;
911 }
912 
913 static void
914 clean_zone_node(dns_rbtnode_t *node, uint32_t least_serial) {
915 	dns_slabheader_t *current = NULL, *dcurrent = NULL;
916 	dns_slabheader_t *down_next = NULL, *dparent = NULL;
917 	dns_slabheader_t *top_prev = NULL, *top_next = NULL;
918 	bool still_dirty = false;
919 
920 	/*
921 	 * Caller must be holding the node lock.
922 	 */
923 	REQUIRE(least_serial != 0);
924 
925 	for (current = node->data; current != NULL; current = top_next) {
926 		top_next = current->next;
927 
928 		/*
929 		 * First, we clean up any instances of multiple rdatasets
930 		 * with the same serial number, or that have the IGNORE
931 		 * attribute.
932 		 */
933 		dparent = current;
934 		for (dcurrent = current->down; dcurrent != NULL;
935 		     dcurrent = down_next)
936 		{
937 			down_next = dcurrent->down;
938 			INSIST(dcurrent->serial <= dparent->serial);
939 			if (dcurrent->serial == dparent->serial ||
940 			    IGNORE(dcurrent))
941 			{
942 				if (down_next != NULL) {
943 					down_next->next = dparent;
944 				}
945 				dparent->down = down_next;
946 				dns_slabheader_destroy(&dcurrent);
947 			} else {
948 				dparent = dcurrent;
949 			}
950 		}
951 
952 		/*
953 		 * We've now eliminated all IGNORE datasets with the possible
954 		 * exception of current, which we now check.
955 		 */
956 		if (IGNORE(current)) {
957 			down_next = current->down;
958 			if (down_next == NULL) {
959 				if (top_prev != NULL) {
960 					top_prev->next = current->next;
961 				} else {
962 					node->data = current->next;
963 				}
964 				dns_slabheader_destroy(&current);
965 				/*
966 				 * current no longer exists, so we can
967 				 * just continue with the loop.
968 				 */
969 				continue;
970 			} else {
971 				/*
972 				 * Pull up current->down, making it the new
973 				 * current.
974 				 */
975 				if (top_prev != NULL) {
976 					top_prev->next = down_next;
977 				} else {
978 					node->data = down_next;
979 				}
980 				down_next->next = top_next;
981 				dns_slabheader_destroy(&current);
982 				current = down_next;
983 			}
984 		}
985 
986 		/*
987 		 * We now try to find the first down node less than the
988 		 * least serial.
989 		 */
990 		dparent = current;
991 		for (dcurrent = current->down; dcurrent != NULL;
992 		     dcurrent = down_next)
993 		{
994 			down_next = dcurrent->down;
995 			if (dcurrent->serial < least_serial) {
996 				break;
997 			}
998 			dparent = dcurrent;
999 		}
1000 
1001 		/*
1002 		 * If there is a such an rdataset, delete it and any older
1003 		 * versions.
1004 		 */
1005 		if (dcurrent != NULL) {
1006 			do {
1007 				down_next = dcurrent->down;
1008 				INSIST(dcurrent->serial <= least_serial);
1009 				dns_slabheader_destroy(&dcurrent);
1010 				dcurrent = down_next;
1011 			} while (dcurrent != NULL);
1012 			dparent->down = NULL;
1013 		}
1014 
1015 		/*
1016 		 * Note.  The serial number of 'current' might be less than
1017 		 * least_serial too, but we cannot delete it because it is
1018 		 * the most recent version, unless it is a NONEXISTENT
1019 		 * rdataset.
1020 		 */
1021 		if (current->down != NULL) {
1022 			still_dirty = true;
1023 			top_prev = current;
1024 		} else {
1025 			/*
1026 			 * If this is a NONEXISTENT rdataset, we can delete it.
1027 			 */
1028 			if (NONEXISTENT(current)) {
1029 				if (top_prev != NULL) {
1030 					top_prev->next = current->next;
1031 				} else {
1032 					node->data = current->next;
1033 				}
1034 				dns_slabheader_destroy(&current);
1035 			} else {
1036 				top_prev = current;
1037 			}
1038 		}
1039 	}
1040 	if (!still_dirty) {
1041 		node->dirty = 0;
1042 	}
1043 }
1044 
1045 /*
1046  * tree_lock(write) must be held.
1047  */
1048 static void
1049 delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1050 	dns_rbtnode_t *nsecnode = NULL;
1051 	dns_fixedname_t fname;
1052 	dns_name_t *name = NULL;
1053 	isc_result_t result = ISC_R_UNEXPECTED;
1054 
1055 	INSIST(!ISC_LINK_LINKED(node, deadlink));
1056 
1057 	if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
1058 		char printname[DNS_NAME_FORMATSIZE];
1059 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1060 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1061 			      "delete_node(): %p %s (bucket %d)", node,
1062 			      dns_rbt_formatnodename(node, printname,
1063 						     sizeof(printname)),
1064 			      node->locknum);
1065 	}
1066 
1067 	switch (node->nsec) {
1068 	case DNS_DB_NSEC_NORMAL:
1069 		result = dns_rbt_deletenode(rbtdb->tree, node, false);
1070 		break;
1071 	case DNS_DB_NSEC_HAS_NSEC:
1072 		/*
1073 		 * Though this may be wasteful, it has to be done before
1074 		 * node is deleted.
1075 		 */
1076 		name = dns_fixedname_initname(&fname);
1077 		dns_rbt_fullnamefromnode(node, name);
1078 		/*
1079 		 * Delete the corresponding node from the auxiliary NSEC
1080 		 * tree before deleting from the main tree.
1081 		 */
1082 		nsecnode = NULL;
1083 		result = dns_rbt_findnode(rbtdb->nsec, name, NULL, &nsecnode,
1084 					  NULL, DNS_RBTFIND_EMPTYDATA, NULL,
1085 					  NULL);
1086 		if (result != ISC_R_SUCCESS) {
1087 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1088 				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1089 				      "delete_node: "
1090 				      "dns_rbt_findnode(nsec): %s",
1091 				      isc_result_totext(result));
1092 		} else {
1093 			result = dns_rbt_deletenode(rbtdb->nsec, nsecnode,
1094 						    false);
1095 			if (result != ISC_R_SUCCESS) {
1096 				isc_log_write(
1097 					dns_lctx, DNS_LOGCATEGORY_DATABASE,
1098 					DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1099 					"delete_node(): "
1100 					"dns_rbt_deletenode(nsecnode): %s",
1101 					isc_result_totext(result));
1102 			}
1103 		}
1104 		result = dns_rbt_deletenode(rbtdb->tree, node, false);
1105 		break;
1106 	case DNS_DB_NSEC_NSEC:
1107 		result = dns_rbt_deletenode(rbtdb->nsec, node, false);
1108 		break;
1109 	case DNS_DB_NSEC_NSEC3:
1110 		result = dns_rbt_deletenode(rbtdb->nsec3, node, false);
1111 		break;
1112 	}
1113 	if (result != ISC_R_SUCCESS) {
1114 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1115 			      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1116 			      "delete_node(): "
1117 			      "dns_rbt_deletenode: %s",
1118 			      isc_result_totext(result));
1119 	}
1120 }
1121 
1122 /*
1123  * Caller must be holding the node lock.
1124  */
1125 void
1126 dns__rbtdb_newref(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1127 		  isc_rwlocktype_t nlocktype DNS__DB_FLARG) {
1128 	uint_fast32_t refs;
1129 
1130 	if (nlocktype == isc_rwlocktype_write &&
1131 	    ISC_LINK_LINKED(node, deadlink))
1132 	{
1133 		ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
1134 				deadlink);
1135 	}
1136 
1137 	refs = isc_refcount_increment0(&node->references);
1138 #if DNS_DB_NODETRACE
1139 	fprintf(stderr, "incr:node:%s:%s:%u:%p->references = %" PRIuFAST32 "\n",
1140 		func, file, line, node, refs + 1);
1141 #else
1142 	UNUSED(refs);
1143 #endif
1144 
1145 	if (refs == 0) {
1146 		/* this is the first reference to the node */
1147 		refs = isc_refcount_increment0(
1148 			&rbtdb->node_locks[node->locknum].references);
1149 #if DNS_DB_NODETRACE
1150 		fprintf(stderr,
1151 			"incr:nodelock:%s:%s:%u:%p:%p->references = "
1152 			"%" PRIuFAST32 "\n",
1153 			func, file, line, node,
1154 			&rbtdb->node_locks[node->locknum], refs + 1);
1155 #else
1156 		UNUSED(refs);
1157 #endif
1158 	}
1159 }
1160 
1161 /*%
1162  * The tree lock must be held for the result to be valid.
1163  */
1164 static bool
1165 is_last_node_on_its_level(dns_rbtnode_t *node) {
1166 	return node->parent != NULL && node->parent->down == node &&
1167 	       node->left == NULL && node->right == NULL;
1168 }
1169 
1170 static void
1171 send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1172 		   isc_rwlocktype_t nlocktype DNS__DB_FLARG) {
1173 	rbtdb_prune_t *prune = isc_mem_get(rbtdb->common.mctx, sizeof(*prune));
1174 	*prune = (rbtdb_prune_t){ .node = node };
1175 
1176 	dns_db_attach((dns_db_t *)rbtdb, &prune->db);
1177 	dns__rbtdb_newref(rbtdb, node, nlocktype DNS__DB_FLARG_PASS);
1178 
1179 	isc_async_run(rbtdb->loop, prune_tree, prune);
1180 }
1181 
1182 /*%
1183  * Clean up dead nodes.  These are nodes which have no references, and
1184  * have no data.  They are dead but we could not or chose not to delete
1185  * them when we deleted all the data at that node because we did not want
1186  * to wait for the tree write lock.
1187  *
1188  * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1189  */
1190 static void
1191 cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum DNS__DB_FLARG) {
1192 	dns_rbtnode_t *node = NULL;
1193 	int count = 10; /* XXXJT: should be adjustable */
1194 
1195 	node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1196 	while (node != NULL && count > 0) {
1197 		ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1198 
1199 		/*
1200 		 * We might have reactivated this node without a tree write
1201 		 * lock, so we couldn't remove this node from deadnodes then
1202 		 * and we have to do it now.
1203 		 */
1204 		if (isc_refcount_current(&node->references) != 0 ||
1205 		    node->data != NULL)
1206 		{
1207 			node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1208 			count--;
1209 			continue;
1210 		}
1211 
1212 		if (is_last_node_on_its_level(node) && rbtdb->loop != NULL) {
1213 			send_to_prune_tree(
1214 				rbtdb, node,
1215 				isc_rwlocktype_write DNS__DB_FLARG_PASS);
1216 		} else if (node->down == NULL && node->data == NULL) {
1217 			/*
1218 			 * Not a interior node and not needing to be
1219 			 * reactivated.
1220 			 */
1221 			delete_node(rbtdb, node);
1222 		} else if (node->data == NULL) {
1223 			/*
1224 			 * A interior node without data. Leave linked to
1225 			 * to be cleaned up when node->down becomes NULL.
1226 			 */
1227 			ISC_LIST_APPEND(rbtdb->deadnodes[bucketnum], node,
1228 					deadlink);
1229 		}
1230 		node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1231 		count--;
1232 	}
1233 }
1234 
1235 /*
1236  * This function is assumed to be called when a node is newly referenced
1237  * and can be in the deadnode list.  In that case the node must be retrieved
1238  * from the list because it is going to be used.  In addition, if the caller
1239  * happens to hold a write lock on the tree, it's a good chance to purge dead
1240  * nodes.
1241  * Note: while a new reference is gained in multiple places, there are only very
1242  * few cases where the node can be in the deadnode list (only empty nodes can
1243  * have been added to the list).
1244  */
1245 static void
1246 reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1247 		isc_rwlocktype_t tlocktype DNS__DB_FLARG) {
1248 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
1249 	isc_rwlock_t *nodelock = &rbtdb->node_locks[node->locknum].lock;
1250 	bool maybe_cleanup = false;
1251 
1252 	POST(nlocktype);
1253 
1254 	NODE_RDLOCK(nodelock, &nlocktype);
1255 
1256 	/*
1257 	 * Check if we can possibly cleanup the dead node.  If so, upgrade
1258 	 * the node lock below to perform the cleanup.
1259 	 */
1260 	if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
1261 	    tlocktype == isc_rwlocktype_write)
1262 	{
1263 		maybe_cleanup = true;
1264 	}
1265 
1266 	if (ISC_LINK_LINKED(node, deadlink) || maybe_cleanup) {
1267 		/*
1268 		 * Upgrade the lock and test if we still need to unlink.
1269 		 */
1270 		NODE_FORCEUPGRADE(nodelock, &nlocktype);
1271 		POST(nlocktype);
1272 		if (ISC_LINK_LINKED(node, deadlink)) {
1273 			ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
1274 					deadlink);
1275 		}
1276 		if (maybe_cleanup) {
1277 			cleanup_dead_nodes(rbtdb,
1278 					   node->locknum DNS__DB_FILELINE);
1279 		}
1280 	}
1281 
1282 	dns__rbtdb_newref(rbtdb, node, nlocktype DNS__DB_FLARG_PASS);
1283 
1284 	NODE_UNLOCK(nodelock, &nlocktype);
1285 }
1286 
1287 /*
1288  * Caller must be holding the node lock; either the read or write lock.
1289  * Note that the lock must be held even when node references are
1290  * atomically modified; in that case the decrement operation itself does not
1291  * have to be protected, but we must avoid a race condition where multiple
1292  * threads are decreasing the reference to zero simultaneously and at least
1293  * one of them is going to free the node.
1294  *
1295  * This function returns true if and only if the node reference decreases
1296  * to zero.
1297  *
1298  * NOTE: Decrementing the reference count of a node to zero does not mean it
1299  * will be immediately freed.
1300  */
1301 bool
1302 dns__rbtdb_decref(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1303 		  uint32_t least_serial, isc_rwlocktype_t *nlocktypep,
1304 		  isc_rwlocktype_t *tlocktypep, bool tryupgrade,
1305 		  bool pruning DNS__DB_FLARG) {
1306 	isc_result_t result;
1307 	bool locked = *tlocktypep != isc_rwlocktype_none;
1308 	bool write_locked = false;
1309 	db_nodelock_t *nodelock = NULL;
1310 	int bucket = node->locknum;
1311 	bool no_reference = true;
1312 	uint_fast32_t refs;
1313 
1314 	REQUIRE(*nlocktypep != isc_rwlocktype_none);
1315 
1316 	nodelock = &rbtdb->node_locks[bucket];
1317 
1318 #define KEEP_NODE(n, r, l)                                  \
1319 	((n)->data != NULL || ((l) && (n)->down != NULL) || \
1320 	 (n) == (r)->origin_node || (n) == (r)->nsec3_origin_node)
1321 
1322 	/* Handle easy and typical case first. */
1323 	if (!node->dirty && KEEP_NODE(node, rbtdb, locked)) {
1324 		refs = isc_refcount_decrement(&node->references);
1325 #if DNS_DB_NODETRACE
1326 		fprintf(stderr,
1327 			"decr:node:%s:%s:%u:%p->references = %" PRIuFAST32 "\n",
1328 			func, file, line, node, refs - 1);
1329 #else
1330 		UNUSED(refs);
1331 #endif
1332 		if (refs == 1) {
1333 			refs = isc_refcount_decrement(&nodelock->references);
1334 #if DNS_DB_NODETRACE
1335 			fprintf(stderr,
1336 				"decr:nodelock:%s:%s:%u:%p:%p->references = "
1337 				"%" PRIuFAST32 "\n",
1338 				func, file, line, node, nodelock, refs - 1);
1339 #else
1340 			UNUSED(refs);
1341 #endif
1342 			return true;
1343 		} else {
1344 			return false;
1345 		}
1346 	}
1347 
1348 	/* Upgrade the lock? */
1349 	if (*nlocktypep == isc_rwlocktype_read) {
1350 		NODE_FORCEUPGRADE(&nodelock->lock, nlocktypep);
1351 	}
1352 
1353 	refs = isc_refcount_decrement(&node->references);
1354 #if DNS_DB_NODETRACE
1355 	fprintf(stderr, "decr:node:%s:%s:%u:%p->references = %" PRIuFAST32 "\n",
1356 		func, file, line, node, refs - 1);
1357 #else
1358 	UNUSED(refs);
1359 #endif
1360 	if (refs > 1) {
1361 		return false;
1362 	}
1363 
1364 	if (node->dirty) {
1365 		if (IS_CACHE(rbtdb)) {
1366 			clean_cache_node(rbtdb, node);
1367 		} else {
1368 			if (least_serial == 0) {
1369 				/*
1370 				 * Caller doesn't know the least serial.
1371 				 * Get it.
1372 				 */
1373 				RWLOCK(&rbtdb->lock, isc_rwlocktype_read);
1374 				least_serial = rbtdb->least_serial;
1375 				RWUNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1376 			}
1377 			clean_zone_node(node, least_serial);
1378 		}
1379 	}
1380 
1381 	/*
1382 	 * Attempt to switch to a write lock on the tree.  If this fails,
1383 	 * we will add this node to a linked list of nodes in this locking
1384 	 * bucket which we will free later.
1385 	 *
1386 	 * Locking hierarchy notwithstanding, we don't need to free
1387 	 * the node lock before acquiring the tree write lock because
1388 	 * we only do a trylock.
1389 	 */
1390 	/* We are allowed to upgrade the tree lock */
1391 	switch (*tlocktypep) {
1392 	case isc_rwlocktype_write:
1393 		result = ISC_R_SUCCESS;
1394 		break;
1395 	case isc_rwlocktype_read:
1396 		if (tryupgrade) {
1397 			result = TREE_TRYUPGRADE(&rbtdb->tree_lock, tlocktypep);
1398 		} else {
1399 			result = ISC_R_LOCKBUSY;
1400 		}
1401 		break;
1402 	case isc_rwlocktype_none:
1403 		result = TREE_TRYWRLOCK(&rbtdb->tree_lock, tlocktypep);
1404 		break;
1405 	default:
1406 		UNREACHABLE();
1407 	}
1408 	RUNTIME_CHECK(result == ISC_R_SUCCESS || result == ISC_R_LOCKBUSY);
1409 	if (result == ISC_R_SUCCESS) {
1410 		write_locked = true;
1411 	}
1412 
1413 	refs = isc_refcount_decrement(&nodelock->references);
1414 #if DNS_DB_NODETRACE
1415 	fprintf(stderr,
1416 		"decr:nodelock:%s:%s:%u:%p:%p->references = %" PRIuFAST32 "\n",
1417 		func, file, line, node, nodelock, refs - 1);
1418 #else
1419 	UNUSED(refs);
1420 #endif
1421 
1422 	if (KEEP_NODE(node, rbtdb, locked || write_locked)) {
1423 		goto restore_locks;
1424 	}
1425 
1426 #undef KEEP_NODE
1427 
1428 	if (write_locked) {
1429 		/*
1430 		 * If this node is the only one left on its RBTDB level,
1431 		 * attempt pruning the RBTDB (i.e. deleting empty nodes that
1432 		 * are ancestors of 'node' and are not interior nodes) starting
1433 		 * from this node (see prune_tree()).  The main reason this is
1434 		 * not done immediately, but asynchronously, is that the
1435 		 * ancestors of 'node' are almost guaranteed to belong to
1436 		 * different node buckets and we don't want to do juggle locks
1437 		 * right now.
1438 		 *
1439 		 * Since prune_tree() also calls dns__rbtdb_decref(), check the
1440 		 * value of the 'pruning' parameter (which is only set to
1441 		 * 'true' in the dns__rbtdb_decref() call present in
1442 		 * prune_tree()) to prevent an infinite loop and to allow a
1443 		 * node sent to prune_tree() to be deleted by the delete_node()
1444 		 * call in the code branch below.
1445 		 */
1446 		if (!pruning && is_last_node_on_its_level(node) &&
1447 		    rbtdb->loop != NULL)
1448 		{
1449 			send_to_prune_tree(rbtdb, node,
1450 					   *nlocktypep DNS__DB_FLARG_PASS);
1451 			no_reference = false;
1452 		} else {
1453 			/*
1454 			 * The node can now be deleted.
1455 			 */
1456 			delete_node(rbtdb, node);
1457 		}
1458 	} else {
1459 		INSIST(node->data == NULL);
1460 		if (!ISC_LINK_LINKED(node, deadlink)) {
1461 			ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
1462 					deadlink);
1463 		}
1464 	}
1465 
1466 restore_locks:
1467 	/*
1468 	 * Relock a read lock, or unlock the write lock if no lock was held.
1469 	 */
1470 	if (!locked && write_locked) {
1471 		TREE_UNLOCK(&rbtdb->tree_lock, tlocktypep);
1472 	}
1473 
1474 	return no_reference;
1475 }
1476 
1477 /*
1478  * Prune the RBTDB tree of trees.  Start by attempting to delete a node that is
1479  * the only one left on its RBTDB level (see the send_to_prune_tree() call in
1480  * dns__rbtdb_decref()).  Then, if the node has a parent (which can either
1481  * exist on the same RBTDB level or on an upper RBTDB level), check whether the
1482  * latter is an interior node (i.e. a node with a non-NULL 'down' pointer).  If
1483  * the parent node is not an interior node, attempt deleting the parent node as
1484  * well and then move on to examining the parent node's parent, etc.  Continue
1485  * traversing the RBTDB tree until a node is encountered that is still an
1486  * interior node after the previously-processed node gets deleted.
1487  *
1488  * It is acceptable for a node sent to this function to NOT be deleted in the
1489  * process (e.g. if it gets reactivated in the meantime).  Furthermore, node
1490  * deletion is not a prerequisite for continuing RBTDB traversal.
1491  *
1492  * This function gets called once for every "starting node" and it continues
1493  * traversing the RBTDB until the stop condition is met.  In the worst case,
1494  * the number of nodes processed by a single execution of this function is the
1495  * number of tree levels, which is at most the maximum number of domain name
1496  * labels (127); however, it should be much smaller in practice and deleting
1497  * empty RBTDB nodes is critical to keeping the amount of memory used by the
1498  * cache memory context within the configured limit anyway.
1499  */
1500 static void
1501 prune_tree(void *arg) {
1502 	rbtdb_prune_t *prune = (rbtdb_prune_t *)arg;
1503 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)prune->db;
1504 	dns_rbtnode_t *node = prune->node;
1505 	dns_rbtnode_t *parent = NULL;
1506 	unsigned int locknum = node->locknum;
1507 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
1508 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
1509 
1510 	isc_mem_put(rbtdb->common.mctx, prune, sizeof(*prune));
1511 
1512 	TREE_WRLOCK(&rbtdb->tree_lock, &tlocktype);
1513 	NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
1514 	do {
1515 		parent = node->parent;
1516 		dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &tlocktype, true,
1517 				  true DNS__DB_FILELINE);
1518 
1519 		/*
1520 		 * Check whether the parent is an interior node.  Note that it
1521 		 * might have been one before the dns__rbtdb_decref() call on
1522 		 * the previous line, but decrementing the reference count for
1523 		 * 'node' could have caused 'node->parent->down' to become
1524 		 * NULL.
1525 		 */
1526 		if (parent != NULL && parent->down == NULL) {
1527 			/*
1528 			 * Keep the node lock if possible; otherwise, release
1529 			 * the old lock and acquire one for the parent.
1530 			 */
1531 			if (parent->locknum != locknum) {
1532 				NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
1533 					    &nlocktype);
1534 				locknum = parent->locknum;
1535 				NODE_WRLOCK(&rbtdb->node_locks[locknum].lock,
1536 					    &nlocktype);
1537 			}
1538 
1539 			/*
1540 			 * We need to gain a reference to the parent node
1541 			 * before decrementing it in the next iteration.
1542 			 */
1543 			dns__rbtdb_newref(rbtdb, parent,
1544 					  nlocktype DNS__DB_FLARG_PASS);
1545 		} else {
1546 			parent = NULL;
1547 		}
1548 
1549 		node = parent;
1550 	} while (node != NULL);
1551 	NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
1552 	TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
1553 
1554 	dns_db_detach((dns_db_t **)&rbtdb);
1555 }
1556 
1557 static void
1558 make_least_version(dns_rbtdb_t *rbtdb, dns_rbtdb_version_t *version,
1559 		   rbtdb_changedlist_t *cleanup_list) {
1560 	/*
1561 	 * Caller must be holding the database lock.
1562 	 */
1563 
1564 	rbtdb->least_serial = version->serial;
1565 	*cleanup_list = version->changed_list;
1566 	ISC_LIST_INIT(version->changed_list);
1567 }
1568 
1569 static void
1570 cleanup_nondirty(dns_rbtdb_version_t *version,
1571 		 rbtdb_changedlist_t *cleanup_list) {
1572 	rbtdb_changed_t *changed = NULL, *next_changed = NULL;
1573 
1574 	/*
1575 	 * If the changed record is dirty, then
1576 	 * an update created multiple versions of
1577 	 * a given rdataset.  We keep this list
1578 	 * until we're the least open version, at
1579 	 * which point it's safe to get rid of any
1580 	 * older versions.
1581 	 *
1582 	 * If the changed record isn't dirty, then
1583 	 * we don't need it anymore since we're
1584 	 * committing and not rolling back.
1585 	 *
1586 	 * The caller must be holding the database lock.
1587 	 */
1588 	for (changed = HEAD(version->changed_list); changed != NULL;
1589 	     changed = next_changed)
1590 	{
1591 		next_changed = NEXT(changed, link);
1592 		if (!changed->dirty) {
1593 			UNLINK(version->changed_list, changed, link);
1594 			APPEND(*cleanup_list, changed, link);
1595 		}
1596 	}
1597 }
1598 
1599 void
1600 dns__rbtdb_setsecure(dns_db_t *db, dns_rbtdb_version_t *version,
1601 		     dns_dbnode_t *origin) {
1602 	dns_rdataset_t keyset;
1603 	dns_rdataset_t nsecset, signsecset;
1604 	bool haszonekey = false;
1605 	bool hasnsec = false;
1606 	isc_result_t result;
1607 
1608 	REQUIRE(version != NULL);
1609 
1610 	dns_rdataset_init(&keyset);
1611 	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_dnskey,
1612 				     0, 0, &keyset, NULL);
1613 	if (result == ISC_R_SUCCESS) {
1614 		result = dns_rdataset_first(&keyset);
1615 		while (result == ISC_R_SUCCESS) {
1616 			dns_rdata_t keyrdata = DNS_RDATA_INIT;
1617 			dns_rdataset_current(&keyset, &keyrdata);
1618 			if (dns_zonekey_iszonekey(&keyrdata)) {
1619 				haszonekey = true;
1620 				break;
1621 			}
1622 			result = dns_rdataset_next(&keyset);
1623 		}
1624 		dns_rdataset_disassociate(&keyset);
1625 	}
1626 	if (!haszonekey) {
1627 		version->secure = false;
1628 		version->havensec3 = false;
1629 		return;
1630 	}
1631 
1632 	dns_rdataset_init(&nsecset);
1633 	dns_rdataset_init(&signsecset);
1634 	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_nsec, 0,
1635 				     0, &nsecset, &signsecset);
1636 	if (result == ISC_R_SUCCESS) {
1637 		if (dns_rdataset_isassociated(&signsecset)) {
1638 			hasnsec = true;
1639 			dns_rdataset_disassociate(&signsecset);
1640 		}
1641 		dns_rdataset_disassociate(&nsecset);
1642 	}
1643 
1644 	setnsec3parameters(db, version);
1645 
1646 	/*
1647 	 * Do we have a valid NSEC/NSEC3 chain?
1648 	 */
1649 	if (version->havensec3 || hasnsec) {
1650 		version->secure = true;
1651 	} else {
1652 		version->secure = false;
1653 	}
1654 }
1655 
1656 /*%<
1657  * Walk the origin node looking for NSEC3PARAM records.
1658  * Cache the nsec3 parameters.
1659  */
1660 static void
1661 setnsec3parameters(dns_db_t *db, dns_rbtdb_version_t *version) {
1662 	dns_rbtnode_t *node = NULL;
1663 	dns_rdata_nsec3param_t nsec3param;
1664 	dns_rdata_t rdata = DNS_RDATA_INIT;
1665 	isc_region_t region;
1666 	isc_result_t result;
1667 	dns_slabheader_t *header = NULL, *header_next = NULL;
1668 	unsigned char *raw; /* RDATASLAB */
1669 	unsigned int count, length;
1670 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1671 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
1672 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
1673 
1674 	TREE_RDLOCK(&rbtdb->tree_lock, &tlocktype);
1675 	version->havensec3 = false;
1676 	node = rbtdb->origin_node;
1677 	NODE_RDLOCK(&(rbtdb->node_locks[node->locknum].lock), &nlocktype);
1678 	for (header = node->data; header != NULL; header = header_next) {
1679 		header_next = header->next;
1680 		do {
1681 			if (header->serial <= version->serial &&
1682 			    !IGNORE(header))
1683 			{
1684 				if (NONEXISTENT(header)) {
1685 					header = NULL;
1686 				}
1687 				break;
1688 			} else {
1689 				header = header->down;
1690 			}
1691 		} while (header != NULL);
1692 
1693 		if (header != NULL &&
1694 		    (header->type == dns_rdatatype_nsec3param))
1695 		{
1696 			/*
1697 			 * Find A NSEC3PARAM with a supported algorithm.
1698 			 */
1699 			raw = dns_slabheader_raw(header);
1700 			count = raw[0] * 256 + raw[1]; /* count */
1701 			raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
1702 			while (count-- > 0U) {
1703 				length = raw[0] * 256 + raw[1];
1704 				raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
1705 				region.base = raw;
1706 				region.length = length;
1707 				raw += length;
1708 				dns_rdata_fromregion(
1709 					&rdata, rbtdb->common.rdclass,
1710 					dns_rdatatype_nsec3param, &region);
1711 				result = dns_rdata_tostruct(&rdata, &nsec3param,
1712 							    NULL);
1713 				INSIST(result == ISC_R_SUCCESS);
1714 				dns_rdata_reset(&rdata);
1715 
1716 				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG &&
1717 				    !dns_nsec3_supportedhash(nsec3param.hash))
1718 				{
1719 					continue;
1720 				}
1721 
1722 				if (nsec3param.flags != 0) {
1723 					continue;
1724 				}
1725 
1726 				memmove(version->salt, nsec3param.salt,
1727 					nsec3param.salt_length);
1728 				version->hash = nsec3param.hash;
1729 				version->salt_length = nsec3param.salt_length;
1730 				version->iterations = nsec3param.iterations;
1731 				version->flags = nsec3param.flags;
1732 				version->havensec3 = true;
1733 				/*
1734 				 * Look for a better algorithm than the
1735 				 * unknown test algorithm.
1736 				 */
1737 				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG) {
1738 					goto unlock;
1739 				}
1740 			}
1741 		}
1742 	}
1743 unlock:
1744 	NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), &nlocktype);
1745 	TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
1746 }
1747 
1748 static void
1749 cleanup_dead_nodes_callback(void *arg) {
1750 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)arg;
1751 	bool again = false;
1752 	unsigned int locknum;
1753 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
1754 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
1755 
1756 	TREE_WRLOCK(&rbtdb->tree_lock, &tlocktype);
1757 	for (locknum = 0; locknum < rbtdb->node_lock_count; locknum++) {
1758 		NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
1759 		cleanup_dead_nodes(rbtdb, locknum DNS__DB_FILELINE);
1760 		if (ISC_LIST_HEAD(rbtdb->deadnodes[locknum]) != NULL) {
1761 			again = true;
1762 		}
1763 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
1764 	}
1765 	TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
1766 	if (again) {
1767 		isc_async_run(rbtdb->loop, cleanup_dead_nodes_callback, rbtdb);
1768 	} else {
1769 		dns_db_detach((dns_db_t **)&rbtdb);
1770 	}
1771 }
1772 
1773 void
1774 dns__rbtdb_closeversion(dns_db_t *db, dns_dbversion_t **versionp,
1775 			bool commit DNS__DB_FLARG) {
1776 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1777 	dns_rbtdb_version_t *version = NULL, *cleanup_version = NULL;
1778 	dns_rbtdb_version_t *least_greater = NULL;
1779 	bool rollback = false;
1780 	rbtdb_changedlist_t cleanup_list;
1781 	dns_slabheaderlist_t resigned_list;
1782 	rbtdb_changed_t *changed = NULL, *next_changed = NULL;
1783 	uint32_t serial, least_serial;
1784 	dns_rbtnode_t *rbtnode = NULL;
1785 	dns_slabheader_t *header = NULL;
1786 
1787 	REQUIRE(VALID_RBTDB(rbtdb));
1788 	version = (dns_rbtdb_version_t *)*versionp;
1789 	INSIST(version->rbtdb == rbtdb);
1790 
1791 	ISC_LIST_INIT(cleanup_list);
1792 	ISC_LIST_INIT(resigned_list);
1793 
1794 	if (isc_refcount_decrement(&version->references) > 1) {
1795 		/* typical and easy case first */
1796 		if (commit) {
1797 			RWLOCK(&rbtdb->lock, isc_rwlocktype_read);
1798 			INSIST(!version->writer);
1799 			RWUNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1800 		}
1801 		goto end;
1802 	}
1803 
1804 	/*
1805 	 * Update the zone's secure status in version before making
1806 	 * it the current version.
1807 	 */
1808 	if (version->writer && commit && !IS_CACHE(rbtdb)) {
1809 		dns__rbtdb_setsecure(db, version, rbtdb->origin_node);
1810 	}
1811 
1812 	RWLOCK(&rbtdb->lock, isc_rwlocktype_write);
1813 	serial = version->serial;
1814 	if (version->writer) {
1815 		if (commit) {
1816 			unsigned int cur_ref;
1817 			dns_rbtdb_version_t *cur_version = NULL;
1818 
1819 			INSIST(version->commit_ok);
1820 			INSIST(version == rbtdb->future_version);
1821 			/*
1822 			 * The current version is going to be replaced.
1823 			 * Release the (likely last) reference to it from the
1824 			 * DB itself and unlink it from the open list.
1825 			 */
1826 			cur_version = rbtdb->current_version;
1827 			cur_ref = isc_refcount_decrement(
1828 				&cur_version->references);
1829 			if (cur_ref == 1) {
1830 				(void)isc_refcount_current(
1831 					&cur_version->references);
1832 				if (cur_version->serial == rbtdb->least_serial)
1833 				{
1834 					INSIST(EMPTY(
1835 						cur_version->changed_list));
1836 				}
1837 				UNLINK(rbtdb->open_versions, cur_version, link);
1838 			}
1839 			if (EMPTY(rbtdb->open_versions)) {
1840 				/*
1841 				 * We're going to become the least open
1842 				 * version.
1843 				 */
1844 				make_least_version(rbtdb, version,
1845 						   &cleanup_list);
1846 			} else {
1847 				/*
1848 				 * Some other open version is the
1849 				 * least version.  We can't cleanup
1850 				 * records that were changed in this
1851 				 * version because the older versions
1852 				 * may still be in use by an open
1853 				 * version.
1854 				 *
1855 				 * We can, however, discard the
1856 				 * changed records for things that
1857 				 * we've added that didn't exist in
1858 				 * prior versions.
1859 				 */
1860 				cleanup_nondirty(version, &cleanup_list);
1861 			}
1862 			/*
1863 			 * If the (soon to be former) current version
1864 			 * isn't being used by anyone, we can clean
1865 			 * it up.
1866 			 */
1867 			if (cur_ref == 1) {
1868 				cleanup_version = cur_version;
1869 				APPENDLIST(version->changed_list,
1870 					   cleanup_version->changed_list, link);
1871 			}
1872 			/*
1873 			 * Become the current version.
1874 			 */
1875 			version->writer = false;
1876 			rbtdb->current_version = version;
1877 			rbtdb->current_serial = version->serial;
1878 			rbtdb->future_version = NULL;
1879 
1880 			/*
1881 			 * Keep the current version in the open list, and
1882 			 * gain a reference for the DB itself (see the DB
1883 			 * creation function below).  This must be the only
1884 			 * case where we need to increment the counter from
1885 			 * zero and need to use isc_refcount_increment0().
1886 			 */
1887 			INSIST(isc_refcount_increment0(&version->references) ==
1888 			       0);
1889 			PREPEND(rbtdb->open_versions, rbtdb->current_version,
1890 				link);
1891 			resigned_list = version->resigned_list;
1892 			ISC_LIST_INIT(version->resigned_list);
1893 		} else {
1894 			/*
1895 			 * We're rolling back this transaction.
1896 			 */
1897 			cleanup_list = version->changed_list;
1898 			ISC_LIST_INIT(version->changed_list);
1899 			resigned_list = version->resigned_list;
1900 			ISC_LIST_INIT(version->resigned_list);
1901 			rollback = true;
1902 			cleanup_version = version;
1903 			rbtdb->future_version = NULL;
1904 		}
1905 	} else {
1906 		if (version != rbtdb->current_version) {
1907 			/*
1908 			 * There are no external or internal references
1909 			 * to this version and it can be cleaned up.
1910 			 */
1911 			cleanup_version = version;
1912 
1913 			/*
1914 			 * Find the version with the least serial
1915 			 * number greater than ours.
1916 			 */
1917 			least_greater = PREV(version, link);
1918 			if (least_greater == NULL) {
1919 				least_greater = rbtdb->current_version;
1920 			}
1921 
1922 			INSIST(version->serial < least_greater->serial);
1923 			/*
1924 			 * Is this the least open version?
1925 			 */
1926 			if (version->serial == rbtdb->least_serial) {
1927 				/*
1928 				 * Yes.  Install the new least open
1929 				 * version.
1930 				 */
1931 				make_least_version(rbtdb, least_greater,
1932 						   &cleanup_list);
1933 			} else {
1934 				/*
1935 				 * Add any unexecuted cleanups to
1936 				 * those of the least greater version.
1937 				 */
1938 				APPENDLIST(least_greater->changed_list,
1939 					   version->changed_list, link);
1940 			}
1941 		} else if (version->serial == rbtdb->least_serial) {
1942 			INSIST(EMPTY(version->changed_list));
1943 		}
1944 		UNLINK(rbtdb->open_versions, version, link);
1945 	}
1946 	least_serial = rbtdb->least_serial;
1947 	RWUNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1948 
1949 	if (cleanup_version != NULL) {
1950 		isc_refcount_destroy(&cleanup_version->references);
1951 		INSIST(EMPTY(cleanup_version->changed_list));
1952 		free_gluetable(cleanup_version->glue_table);
1953 		isc_rwlock_destroy(&cleanup_version->rwlock);
1954 		isc_mem_put(rbtdb->common.mctx, cleanup_version,
1955 			    sizeof(*cleanup_version));
1956 	}
1957 
1958 	/*
1959 	 * Commit/rollback re-signed headers.
1960 	 */
1961 	for (header = HEAD(resigned_list); header != NULL;
1962 	     header = HEAD(resigned_list))
1963 	{
1964 		isc_rwlock_t *lock = NULL;
1965 		isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
1966 		isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
1967 
1968 		ISC_LIST_UNLINK(resigned_list, header, link);
1969 
1970 		lock = &rbtdb->node_locks[RBTDB_HEADERNODE(header)->locknum]
1971 				.lock;
1972 		NODE_WRLOCK(lock, &nlocktype);
1973 		if (rollback && !IGNORE(header)) {
1974 			dns__zonerbt_resigninsert(
1975 				rbtdb, RBTDB_HEADERNODE(header)->locknum,
1976 				header);
1977 		}
1978 		dns__rbtdb_decref(rbtdb, RBTDB_HEADERNODE(header), least_serial,
1979 				  &nlocktype, &tlocktype, true,
1980 				  false DNS__DB_FLARG_PASS);
1981 		NODE_UNLOCK(lock, &nlocktype);
1982 		INSIST(tlocktype == isc_rwlocktype_none);
1983 	}
1984 
1985 	if (!EMPTY(cleanup_list)) {
1986 		isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
1987 
1988 		if (rbtdb->loop == NULL) {
1989 			/*
1990 			 * We acquire a tree write lock here in order to make
1991 			 * sure that stale nodes will be removed in
1992 			 * dns__rbtdb_decref().  If we didn't have the lock,
1993 			 * those nodes could miss the chance to be removed
1994 			 * until the server stops.  The write lock is
1995 			 * expensive, but this should be rare enough
1996 			 * to justify the cost.
1997 			 */
1998 			TREE_WRLOCK(&rbtdb->tree_lock, &tlocktype);
1999 		}
2000 
2001 		for (changed = HEAD(cleanup_list); changed != NULL;
2002 		     changed = next_changed)
2003 		{
2004 			isc_rwlock_t *lock = NULL;
2005 			isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
2006 
2007 			next_changed = NEXT(changed, link);
2008 			rbtnode = changed->node;
2009 			lock = &rbtdb->node_locks[rbtnode->locknum].lock;
2010 
2011 			NODE_WRLOCK(lock, &nlocktype);
2012 			/*
2013 			 * This is a good opportunity to purge any dead nodes,
2014 			 * so use it.
2015 			 */
2016 			if (rbtdb->loop == NULL) {
2017 				cleanup_dead_nodes(
2018 					rbtdb,
2019 					rbtnode->locknum DNS__DB_FLARG_PASS);
2020 			}
2021 
2022 			if (rollback) {
2023 				rollback_node(rbtnode, serial);
2024 			}
2025 			dns__rbtdb_decref(rbtdb, rbtnode, least_serial,
2026 					  &nlocktype, &tlocktype, true,
2027 					  false DNS__DB_FILELINE);
2028 
2029 			NODE_UNLOCK(lock, &nlocktype);
2030 
2031 			isc_mem_put(rbtdb->common.mctx, changed,
2032 				    sizeof(*changed));
2033 		}
2034 		if (rbtdb->loop != NULL) {
2035 			dns_db_attach((dns_db_t *)rbtdb, &(dns_db_t *){ NULL });
2036 			isc_async_run(rbtdb->loop, cleanup_dead_nodes_callback,
2037 				      rbtdb);
2038 		} else {
2039 			TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
2040 		}
2041 
2042 		INSIST(tlocktype == isc_rwlocktype_none);
2043 	}
2044 
2045 end:
2046 	*versionp = NULL;
2047 }
2048 
2049 isc_result_t
2050 dns__rbtdb_findnodeintree(dns_rbtdb_t *rbtdb, dns_rbt_t *tree,
2051 			  const dns_name_t *name, bool create,
2052 			  dns_dbnode_t **nodep DNS__DB_FLARG) {
2053 	dns_rbtnode_t *node = NULL;
2054 	dns_name_t nodename;
2055 	isc_result_t result;
2056 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
2057 
2058 	INSIST(tree == rbtdb->tree || tree == rbtdb->nsec3);
2059 
2060 	dns_name_init(&nodename, NULL);
2061 	TREE_RDLOCK(&rbtdb->tree_lock, &tlocktype);
2062 	result = dns_rbt_findnode(tree, name, NULL, &node, NULL,
2063 				  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
2064 	if (result != ISC_R_SUCCESS) {
2065 		if (!create) {
2066 			if (result == DNS_R_PARTIALMATCH) {
2067 				result = ISC_R_NOTFOUND;
2068 			}
2069 			goto unlock;
2070 		}
2071 		/*
2072 		 * Try to upgrade the lock and if that fails unlock then relock.
2073 		 */
2074 		TREE_FORCEUPGRADE(&rbtdb->tree_lock, &tlocktype);
2075 		node = NULL;
2076 		result = dns_rbt_addnode(tree, name, &node);
2077 		if (result == ISC_R_SUCCESS) {
2078 			dns_rbt_namefromnode(node, &nodename);
2079 			node->locknum = node->hashval % rbtdb->node_lock_count;
2080 			if (tree == rbtdb->tree) {
2081 				dns__zonerbt_addwildcards(rbtdb, name, true);
2082 
2083 				if (dns_name_iswildcard(name)) {
2084 					result = dns__zonerbt_wildcardmagic(
2085 						rbtdb, name, true);
2086 					if (result != ISC_R_SUCCESS) {
2087 						goto unlock;
2088 					}
2089 				}
2090 			}
2091 			if (tree == rbtdb->nsec3) {
2092 				node->nsec = DNS_DB_NSEC_NSEC3;
2093 			}
2094 		} else if (result == ISC_R_EXISTS) {
2095 			result = ISC_R_SUCCESS;
2096 		} else {
2097 			goto unlock;
2098 		}
2099 	}
2100 
2101 	if (tree == rbtdb->nsec3) {
2102 		INSIST(node->nsec == DNS_DB_NSEC_NSEC3);
2103 	}
2104 
2105 	reactivate_node(rbtdb, node, tlocktype DNS__DB_FLARG_PASS);
2106 
2107 	*nodep = (dns_dbnode_t *)node;
2108 unlock:
2109 	TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
2110 
2111 	return result;
2112 }
2113 
2114 isc_result_t
2115 dns__rbtdb_findnode(dns_db_t *db, const dns_name_t *name, bool create,
2116 		    dns_dbnode_t **nodep DNS__DB_FLARG) {
2117 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2118 
2119 	REQUIRE(VALID_RBTDB(rbtdb));
2120 
2121 	return dns__rbtdb_findnodeintree(rbtdb, rbtdb->tree, name, create,
2122 					 nodep DNS__DB_FLARG_PASS);
2123 }
2124 
2125 void
2126 dns__rbtdb_bindrdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2127 			dns_slabheader_t *header, isc_stdtime_t now,
2128 			isc_rwlocktype_t locktype,
2129 			dns_rdataset_t *rdataset DNS__DB_FLARG) {
2130 	bool stale = STALE(header);
2131 	bool ancient = ANCIENT(header);
2132 
2133 	/*
2134 	 * Caller must be holding the node reader lock.
2135 	 * XXXJT: technically, we need a writer lock, since we'll increment
2136 	 * the header count below.  However, since the actual counter value
2137 	 * doesn't matter, we prioritize performance here.  (We may want to
2138 	 * use atomic increment when available).
2139 	 */
2140 
2141 	if (rdataset == NULL) {
2142 		return;
2143 	}
2144 
2145 	dns__rbtdb_newref(rbtdb, node, locktype DNS__DB_FLARG_PASS);
2146 
2147 	INSIST(rdataset->methods == NULL); /* We must be disassociated. */
2148 
2149 	/*
2150 	 * Mark header stale or ancient if the RRset is no longer active.
2151 	 */
2152 	if (!ACTIVE(header, now)) {
2153 		dns_ttl_t stale_ttl = header->ttl + STALE_TTL(header, rbtdb);
2154 		/*
2155 		 * If this data is in the stale window keep it and if
2156 		 * DNS_DBFIND_STALEOK is not set we tell the caller to
2157 		 * skip this record.  We skip the records with ZEROTTL
2158 		 * (these records should not be cached anyway).
2159 		 */
2160 
2161 		if (KEEPSTALE(rbtdb) && stale_ttl > now) {
2162 			stale = true;
2163 		} else {
2164 			/*
2165 			 * We are not keeping stale, or it is outside the
2166 			 * stale window. Mark ancient, i.e. ready for cleanup.
2167 			 */
2168 			ancient = true;
2169 		}
2170 	}
2171 
2172 	rdataset->methods = &dns_rdataslab_rdatasetmethods;
2173 	rdataset->rdclass = rbtdb->common.rdclass;
2174 	rdataset->type = DNS_TYPEPAIR_TYPE(header->type);
2175 	rdataset->covers = DNS_TYPEPAIR_COVERS(header->type);
2176 	rdataset->ttl = header->ttl - now;
2177 	rdataset->trust = header->trust;
2178 
2179 	if (NEGATIVE(header)) {
2180 		rdataset->attributes |= DNS_RDATASETATTR_NEGATIVE;
2181 	}
2182 	if (NXDOMAIN(header)) {
2183 		rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
2184 	}
2185 	if (OPTOUT(header)) {
2186 		rdataset->attributes |= DNS_RDATASETATTR_OPTOUT;
2187 	}
2188 	if (PREFETCH(header)) {
2189 		rdataset->attributes |= DNS_RDATASETATTR_PREFETCH;
2190 	}
2191 
2192 	if (stale && !ancient) {
2193 		dns_ttl_t stale_ttl = header->ttl + STALE_TTL(header, rbtdb);
2194 		if (stale_ttl > now) {
2195 			rdataset->ttl = stale_ttl - now;
2196 		} else {
2197 			rdataset->ttl = 0;
2198 		}
2199 		if (STALE_WINDOW(header)) {
2200 			rdataset->attributes |= DNS_RDATASETATTR_STALE_WINDOW;
2201 		}
2202 		rdataset->attributes |= DNS_RDATASETATTR_STALE;
2203 	} else if (IS_CACHE(rbtdb) && !ACTIVE(header, now)) {
2204 		rdataset->attributes |= DNS_RDATASETATTR_ANCIENT;
2205 		rdataset->ttl = header->ttl;
2206 	}
2207 
2208 	rdataset->count = atomic_fetch_add_relaxed(&header->count, 1);
2209 
2210 	rdataset->slab.db = (dns_db_t *)rbtdb;
2211 	rdataset->slab.node = (dns_dbnode_t *)node;
2212 	rdataset->slab.raw = dns_slabheader_raw(header);
2213 	rdataset->slab.iter_pos = NULL;
2214 	rdataset->slab.iter_count = 0;
2215 
2216 	/*
2217 	 * Add noqname proof.
2218 	 */
2219 	rdataset->slab.noqname = header->noqname;
2220 	if (header->noqname != NULL) {
2221 		rdataset->attributes |= DNS_RDATASETATTR_NOQNAME;
2222 	}
2223 	rdataset->slab.closest = header->closest;
2224 	if (header->closest != NULL) {
2225 		rdataset->attributes |= DNS_RDATASETATTR_CLOSEST;
2226 	}
2227 
2228 	/*
2229 	 * Copy out re-signing information.
2230 	 */
2231 	if (RESIGN(header)) {
2232 		rdataset->attributes |= DNS_RDATASETATTR_RESIGN;
2233 		rdataset->resign = (header->resign << 1) | header->resign_lsb;
2234 	} else {
2235 		rdataset->resign = 0;
2236 	}
2237 }
2238 
2239 void
2240 dns__rbtdb_attachnode(dns_db_t *db, dns_dbnode_t *source,
2241 		      dns_dbnode_t **targetp DNS__DB_FLARG) {
2242 	REQUIRE(VALID_RBTDB((dns_rbtdb_t *)db));
2243 	REQUIRE(targetp != NULL && *targetp == NULL);
2244 
2245 	dns_rbtnode_t *node = (dns_rbtnode_t *)source;
2246 	uint_fast32_t refs = isc_refcount_increment(&node->references);
2247 
2248 #if DNS_DB_NODETRACE
2249 	fprintf(stderr, "incr:node:%s:%s:%u:%p->references = %" PRIuFAST32 "\n",
2250 		func, file, line, node, refs + 1);
2251 #else
2252 	UNUSED(refs);
2253 #endif
2254 
2255 	*targetp = source;
2256 }
2257 
2258 void
2259 dns__rbtdb_detachnode(dns_db_t *db, dns_dbnode_t **targetp DNS__DB_FLARG) {
2260 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2261 	dns_rbtnode_t *node = NULL;
2262 	bool want_free = false;
2263 	bool inactive = false;
2264 	db_nodelock_t *nodelock = NULL;
2265 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
2266 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
2267 
2268 	REQUIRE(VALID_RBTDB(rbtdb));
2269 	REQUIRE(targetp != NULL && *targetp != NULL);
2270 
2271 	node = (dns_rbtnode_t *)(*targetp);
2272 	nodelock = &rbtdb->node_locks[node->locknum];
2273 
2274 	NODE_RDLOCK(&nodelock->lock, &nlocktype);
2275 
2276 	if (dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &tlocktype, true,
2277 			      false DNS__DB_FLARG_PASS))
2278 	{
2279 		if (isc_refcount_current(&nodelock->references) == 0 &&
2280 		    nodelock->exiting)
2281 		{
2282 			inactive = true;
2283 		}
2284 	}
2285 
2286 	NODE_UNLOCK(&nodelock->lock, &nlocktype);
2287 	INSIST(tlocktype == isc_rwlocktype_none);
2288 
2289 	*targetp = NULL;
2290 
2291 	if (inactive) {
2292 		RWLOCK(&rbtdb->lock, isc_rwlocktype_write);
2293 		rbtdb->active--;
2294 		if (rbtdb->active == 0) {
2295 			want_free = true;
2296 		}
2297 		RWUNLOCK(&rbtdb->lock, isc_rwlocktype_write);
2298 		if (want_free) {
2299 			char buf[DNS_NAME_FORMATSIZE];
2300 			if (dns_name_dynamic(&rbtdb->common.origin)) {
2301 				dns_name_format(&rbtdb->common.origin, buf,
2302 						sizeof(buf));
2303 			} else {
2304 				strlcpy(buf, "<UNKNOWN>", sizeof(buf));
2305 			}
2306 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
2307 				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
2308 				      "calling free_rbtdb(%s)", buf);
2309 			free_rbtdb(rbtdb, true);
2310 		}
2311 	}
2312 }
2313 
2314 isc_result_t
2315 dns__rbtdb_createiterator(dns_db_t *db, unsigned int options,
2316 			  dns_dbiterator_t **iteratorp) {
2317 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2318 	rbtdb_dbiterator_t *rbtdbiter = NULL;
2319 
2320 	REQUIRE(VALID_RBTDB(rbtdb));
2321 	REQUIRE((options & (DNS_DB_NSEC3ONLY | DNS_DB_NONSEC3)) !=
2322 		(DNS_DB_NSEC3ONLY | DNS_DB_NONSEC3));
2323 
2324 	rbtdbiter = isc_mem_get(rbtdb->common.mctx, sizeof(*rbtdbiter));
2325 
2326 	rbtdbiter->common.methods = &dbiterator_methods;
2327 	rbtdbiter->common.db = NULL;
2328 	dns_db_attach(db, &rbtdbiter->common.db);
2329 	rbtdbiter->common.relative_names = ((options & DNS_DB_RELATIVENAMES) !=
2330 					    0);
2331 	rbtdbiter->common.magic = DNS_DBITERATOR_MAGIC;
2332 	rbtdbiter->paused = true;
2333 	rbtdbiter->tree_locked = isc_rwlocktype_none;
2334 	rbtdbiter->result = ISC_R_SUCCESS;
2335 	dns_fixedname_init(&rbtdbiter->name);
2336 	dns_fixedname_init(&rbtdbiter->origin);
2337 	rbtdbiter->node = NULL;
2338 	if ((options & DNS_DB_NSEC3ONLY) != 0) {
2339 		rbtdbiter->nsec3mode = nsec3only;
2340 	} else if ((options & DNS_DB_NONSEC3) != 0) {
2341 		rbtdbiter->nsec3mode = nonsec3;
2342 	} else {
2343 		rbtdbiter->nsec3mode = full;
2344 	}
2345 	dns_rbtnodechain_init(&rbtdbiter->chain);
2346 	dns_rbtnodechain_init(&rbtdbiter->nsec3chain);
2347 	if (rbtdbiter->nsec3mode == nsec3only) {
2348 		rbtdbiter->current = &rbtdbiter->nsec3chain;
2349 	} else {
2350 		rbtdbiter->current = &rbtdbiter->chain;
2351 	}
2352 
2353 	*iteratorp = (dns_dbiterator_t *)rbtdbiter;
2354 
2355 	return ISC_R_SUCCESS;
2356 }
2357 
2358 isc_result_t
2359 dns__rbtdb_allrdatasets(dns_db_t *db, dns_dbnode_t *node,
2360 			dns_dbversion_t *version, unsigned int options,
2361 			isc_stdtime_t now,
2362 			dns_rdatasetiter_t **iteratorp DNS__DB_FLARG) {
2363 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2364 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
2365 	dns_rbtdb_version_t *rbtversion = version;
2366 	rbtdb_rdatasetiter_t *iterator = NULL;
2367 	uint_fast32_t refs;
2368 
2369 	REQUIRE(VALID_RBTDB(rbtdb));
2370 
2371 	iterator = isc_mem_get(rbtdb->common.mctx, sizeof(*iterator));
2372 
2373 	if ((db->attributes & DNS_DBATTR_CACHE) == 0) {
2374 		now = 0;
2375 		if (rbtversion == NULL) {
2376 			dns__rbtdb_currentversion(
2377 				db, (dns_dbversion_t **)(void *)(&rbtversion));
2378 		} else {
2379 			INSIST(rbtversion->rbtdb == rbtdb);
2380 
2381 			(void)isc_refcount_increment(&rbtversion->references);
2382 		}
2383 	} else {
2384 		if (now == 0) {
2385 			now = isc_stdtime_now();
2386 		}
2387 		rbtversion = NULL;
2388 	}
2389 
2390 	iterator->common.magic = DNS_RDATASETITER_MAGIC;
2391 	iterator->common.methods = &rdatasetiter_methods;
2392 	iterator->common.db = db;
2393 	iterator->common.node = node;
2394 	iterator->common.version = (dns_dbversion_t *)rbtversion;
2395 	iterator->common.options = options;
2396 	iterator->common.now = now;
2397 
2398 	refs = isc_refcount_increment(&rbtnode->references);
2399 #if DNS_DB_NODETRACE
2400 	fprintf(stderr, "incr:node:%s:%s:%u:%p->references = %" PRIuFAST32 "\n",
2401 		func, file, line, node, refs + 1);
2402 #else
2403 	UNUSED(refs);
2404 #endif
2405 
2406 	iterator->current = NULL;
2407 
2408 	*iteratorp = (dns_rdatasetiter_t *)iterator;
2409 
2410 	return ISC_R_SUCCESS;
2411 }
2412 
2413 static bool
2414 cname_and_other_data(dns_rbtnode_t *node, uint32_t serial) {
2415 	dns_slabheader_t *header = NULL, *header_next = NULL;
2416 	bool cname = false, other_data = false;
2417 	dns_rdatatype_t rdtype;
2418 
2419 	/*
2420 	 * The caller must hold the node lock.
2421 	 */
2422 
2423 	/*
2424 	 * Look for CNAME and "other data" rdatasets active in our version.
2425 	 */
2426 	for (header = node->data; header != NULL; header = header_next) {
2427 		header_next = header->next;
2428 		if (!prio_type(header->type)) {
2429 			/*
2430 			 * CNAME is in the priority list, so if we are done
2431 			 * with the priority list, we know there will not be
2432 			 * CNAME, so we are safe to skip the rest of the types.
2433 			 */
2434 			return false;
2435 		}
2436 		if (header->type == dns_rdatatype_cname) {
2437 			/*
2438 			 * Look for an active extant CNAME.
2439 			 */
2440 			do {
2441 				if (header->serial <= serial && !IGNORE(header))
2442 				{
2443 					/*
2444 					 * Is this a "this rdataset doesn't
2445 					 * exist" record?
2446 					 */
2447 					if (NONEXISTENT(header)) {
2448 						header = NULL;
2449 					}
2450 					break;
2451 				} else {
2452 					header = header->down;
2453 				}
2454 			} while (header != NULL);
2455 			if (header != NULL) {
2456 				cname = true;
2457 			}
2458 		} else {
2459 			/*
2460 			 * Look for active extant "other data".
2461 			 *
2462 			 * "Other data" is any rdataset whose type is not
2463 			 * KEY, NSEC, SIG or RRSIG.
2464 			 */
2465 			rdtype = DNS_TYPEPAIR_TYPE(header->type);
2466 			if (rdtype != dns_rdatatype_key &&
2467 			    rdtype != dns_rdatatype_sig &&
2468 			    rdtype != dns_rdatatype_nsec &&
2469 			    rdtype != dns_rdatatype_rrsig)
2470 			{
2471 				/*
2472 				 * Is it active and extant?
2473 				 */
2474 				do {
2475 					if (header->serial <= serial &&
2476 					    !IGNORE(header))
2477 					{
2478 						/*
2479 						 * Is this a "this rdataset
2480 						 * doesn't exist" record?
2481 						 */
2482 						if (NONEXISTENT(header)) {
2483 							header = NULL;
2484 						}
2485 						break;
2486 					} else {
2487 						header = header->down;
2488 					}
2489 				} while (header != NULL);
2490 				if (header != NULL) {
2491 					other_data = true;
2492 				}
2493 			}
2494 		}
2495 		if (cname && other_data) {
2496 			return true;
2497 		}
2498 	}
2499 
2500 	return false;
2501 }
2502 
2503 static uint64_t
2504 recordsize(dns_slabheader_t *header, unsigned int namelen) {
2505 	return dns_rdataslab_rdatasize((unsigned char *)header,
2506 				       sizeof(*header)) +
2507 	       sizeof(dns_ttl_t) + sizeof(dns_rdatatype_t) +
2508 	       sizeof(dns_rdataclass_t) + namelen;
2509 }
2510 
2511 static void
2512 update_recordsandxfrsize(bool add, dns_rbtdb_version_t *rbtversion,
2513 			 dns_slabheader_t *header, unsigned int namelen) {
2514 	unsigned char *hdr = (unsigned char *)header;
2515 	size_t hdrsize = sizeof(*header);
2516 
2517 	RWLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
2518 	if (add) {
2519 		rbtversion->records += dns_rdataslab_count(hdr, hdrsize);
2520 		rbtversion->xfrsize += recordsize(header, namelen);
2521 	} else {
2522 		rbtversion->records -= dns_rdataslab_count(hdr, hdrsize);
2523 		rbtversion->xfrsize -= recordsize(header, namelen);
2524 	}
2525 	RWUNLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
2526 }
2527 
2528 static bool
2529 overmaxtype(dns_rbtdb_t *rbtdb, uint32_t ntypes) {
2530 	if (rbtdb->maxtypepername == 0) {
2531 		return false;
2532 	}
2533 
2534 	return ntypes >= rbtdb->maxtypepername;
2535 }
2536 
2537 static bool
2538 prio_header(dns_slabheader_t *header) {
2539 	if (NEGATIVE(header) && prio_type(DNS_TYPEPAIR_COVERS(header->type))) {
2540 		return true;
2541 	}
2542 
2543 	return prio_type(header->type);
2544 }
2545 
2546 isc_result_t
2547 dns__rbtdb_add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode,
2548 	       const dns_name_t *nodename, dns_rbtdb_version_t *rbtversion,
2549 	       dns_slabheader_t *newheader, unsigned int options, bool loading,
2550 	       dns_rdataset_t *addedrdataset, isc_stdtime_t now DNS__DB_FLARG) {
2551 	rbtdb_changed_t *changed = NULL;
2552 	dns_slabheader_t *topheader = NULL, *topheader_prev = NULL;
2553 	dns_slabheader_t *header = NULL, *sigheader = NULL;
2554 	dns_slabheader_t *prioheader = NULL, *expireheader = NULL;
2555 	unsigned char *merged = NULL;
2556 	isc_result_t result;
2557 	bool header_nx;
2558 	bool newheader_nx;
2559 	bool merge;
2560 	dns_rdatatype_t rdtype, covers;
2561 	dns_typepair_t negtype = 0, sigtype;
2562 	dns_trust_t trust;
2563 	int idx;
2564 	uint32_t ntypes = 0;
2565 
2566 	if ((options & DNS_DBADD_MERGE) != 0) {
2567 		REQUIRE(rbtversion != NULL);
2568 		merge = true;
2569 	} else {
2570 		merge = false;
2571 	}
2572 
2573 	if ((options & DNS_DBADD_FORCE) != 0) {
2574 		trust = dns_trust_ultimate;
2575 	} else {
2576 		trust = newheader->trust;
2577 	}
2578 
2579 	if (rbtversion != NULL && !loading) {
2580 		/*
2581 		 * We always add a changed record, even if no changes end up
2582 		 * being made to this node, because it's harmless and
2583 		 * simplifies the code.
2584 		 */
2585 		changed = add_changed(newheader, rbtversion DNS__DB_FLARG_PASS);
2586 		if (changed == NULL) {
2587 			dns_slabheader_destroy(&newheader);
2588 			return ISC_R_NOMEMORY;
2589 		}
2590 	}
2591 
2592 	newheader_nx = NONEXISTENT(newheader) ? true : false;
2593 	if (rbtversion == NULL && !newheader_nx) {
2594 		rdtype = DNS_TYPEPAIR_TYPE(newheader->type);
2595 		covers = DNS_TYPEPAIR_COVERS(newheader->type);
2596 		sigtype = DNS_SIGTYPE(covers);
2597 		if (NEGATIVE(newheader)) {
2598 			/*
2599 			 * We're adding a negative cache entry.
2600 			 */
2601 			if (covers == dns_rdatatype_any) {
2602 				/*
2603 				 * If we're adding an negative cache entry
2604 				 * which covers all types (NXDOMAIN,
2605 				 * NODATA(QTYPE=ANY)),
2606 				 *
2607 				 * We make all other data ancient so that the
2608 				 * only rdataset that can be found at this
2609 				 * node is the negative cache entry.
2610 				 */
2611 				for (topheader = rbtnode->data;
2612 				     topheader != NULL;
2613 				     topheader = topheader->next)
2614 				{
2615 					mark_ancient(topheader);
2616 				}
2617 				goto find_header;
2618 			}
2619 			/*
2620 			 * Otherwise look for any RRSIGs of the given
2621 			 * type so they can be marked ancient later.
2622 			 */
2623 			for (topheader = rbtnode->data; topheader != NULL;
2624 			     topheader = topheader->next)
2625 			{
2626 				if (topheader->type == sigtype) {
2627 					sigheader = topheader;
2628 					break;
2629 				}
2630 			}
2631 			negtype = DNS_TYPEPAIR_VALUE(covers, 0);
2632 		} else {
2633 			/*
2634 			 * We're adding something that isn't a
2635 			 * negative cache entry.  Look for an extant
2636 			 * non-ancient NXDOMAIN/NODATA(QTYPE=ANY) negative
2637 			 * cache entry.  If we're adding an RRSIG, also
2638 			 * check for an extant non-ancient NODATA ncache
2639 			 * entry which covers the same type as the RRSIG.
2640 			 */
2641 			for (topheader = rbtnode->data; topheader != NULL;
2642 			     topheader = topheader->next)
2643 			{
2644 				if ((topheader->type == RDATATYPE_NCACHEANY) ||
2645 				    (newheader->type == sigtype &&
2646 				     topheader->type ==
2647 					     DNS_TYPEPAIR_VALUE(0, covers)))
2648 				{
2649 					break;
2650 				}
2651 			}
2652 			if (topheader != NULL && EXISTS(topheader) &&
2653 			    ACTIVE(topheader, now))
2654 			{
2655 				/*
2656 				 * Found one.
2657 				 */
2658 				if (trust < topheader->trust) {
2659 					/*
2660 					 * The NXDOMAIN/NODATA(QTYPE=ANY)
2661 					 * is more trusted.
2662 					 */
2663 					dns_slabheader_destroy(&newheader);
2664 					if (addedrdataset != NULL) {
2665 						dns__rbtdb_bindrdataset(
2666 							rbtdb, rbtnode,
2667 							topheader, now,
2668 							isc_rwlocktype_write,
2669 							addedrdataset
2670 								DNS__DB_FLARG_PASS);
2671 					}
2672 					return DNS_R_UNCHANGED;
2673 				}
2674 				/*
2675 				 * The new rdataset is better.  Expire the
2676 				 * ncache entry.
2677 				 */
2678 				mark_ancient(topheader);
2679 				topheader = NULL;
2680 				goto find_header;
2681 			}
2682 			negtype = DNS_TYPEPAIR_VALUE(0, rdtype);
2683 		}
2684 	}
2685 
2686 	for (topheader = rbtnode->data; topheader != NULL;
2687 	     topheader = topheader->next)
2688 	{
2689 		if (IS_CACHE(rbtdb) && ACTIVE(topheader, now)) {
2690 			++ntypes;
2691 			expireheader = topheader;
2692 		} else if (!IS_CACHE(rbtdb)) {
2693 			++ntypes;
2694 		}
2695 		if (prio_header(topheader)) {
2696 			prioheader = topheader;
2697 		}
2698 		if (topheader->type == newheader->type ||
2699 		    topheader->type == negtype)
2700 		{
2701 			break;
2702 		}
2703 		topheader_prev = topheader;
2704 	}
2705 
2706 find_header:
2707 	/*
2708 	 * If header isn't NULL, we've found the right type.  There may be
2709 	 * IGNORE rdatasets between the top of the chain and the first real
2710 	 * data.  We skip over them.
2711 	 */
2712 	header = topheader;
2713 	while (header != NULL && IGNORE(header)) {
2714 		header = header->down;
2715 	}
2716 	if (header != NULL) {
2717 		header_nx = NONEXISTENT(header) ? true : false;
2718 
2719 		/*
2720 		 * Deleting an already non-existent rdataset has no effect.
2721 		 */
2722 		if (header_nx && newheader_nx) {
2723 			dns_slabheader_destroy(&newheader);
2724 			return DNS_R_UNCHANGED;
2725 		}
2726 
2727 		/*
2728 		 * Trying to add an rdataset with lower trust to a cache
2729 		 * DB has no effect, provided that the cache data isn't
2730 		 * stale. If the cache data is stale, new lower trust
2731 		 * data will supersede it below. Unclear what the best
2732 		 * policy is here.
2733 		 */
2734 		if (rbtversion == NULL && trust < header->trust &&
2735 		    (ACTIVE(header, now) || header_nx))
2736 		{
2737 			dns_slabheader_destroy(&newheader);
2738 			if (addedrdataset != NULL) {
2739 				dns__rbtdb_bindrdataset(
2740 					rbtdb, rbtnode, header, now,
2741 					isc_rwlocktype_write,
2742 					addedrdataset DNS__DB_FLARG_PASS);
2743 			}
2744 			return DNS_R_UNCHANGED;
2745 		}
2746 
2747 		/*
2748 		 * Don't merge if a nonexistent rdataset is involved.
2749 		 */
2750 		if (merge && (header_nx || newheader_nx)) {
2751 			merge = false;
2752 		}
2753 
2754 		/*
2755 		 * If 'merge' is true, we'll try to create a new rdataset
2756 		 * that is the union of 'newheader' and 'header'.
2757 		 */
2758 		if (merge) {
2759 			unsigned int flags = 0;
2760 			INSIST(rbtversion->serial >= header->serial);
2761 			merged = NULL;
2762 			result = ISC_R_SUCCESS;
2763 
2764 			if ((options & DNS_DBADD_EXACT) != 0) {
2765 				flags |= DNS_RDATASLAB_EXACT;
2766 			}
2767 			/*
2768 			 * TTL use here is irrelevant to the cache;
2769 			 * merge is only done with zonedbs.
2770 			 */
2771 			if ((options & DNS_DBADD_EXACTTTL) != 0 &&
2772 			    newheader->ttl != header->ttl)
2773 			{
2774 				result = DNS_R_NOTEXACT;
2775 			} else if (newheader->ttl != header->ttl) {
2776 				flags |= DNS_RDATASLAB_FORCE;
2777 			}
2778 			if (result == ISC_R_SUCCESS) {
2779 				result = dns_rdataslab_merge(
2780 					(unsigned char *)header,
2781 					(unsigned char *)newheader,
2782 					(unsigned int)(sizeof(*newheader)),
2783 					rbtdb->common.mctx,
2784 					rbtdb->common.rdclass,
2785 					(dns_rdatatype_t)header->type, flags,
2786 					rbtdb->maxrrperset, &merged);
2787 			}
2788 			if (result == ISC_R_SUCCESS) {
2789 				/*
2790 				 * If 'header' has the same serial number as
2791 				 * we do, we could clean it up now if we knew
2792 				 * that our caller had no references to it.
2793 				 * We don't know this, however, so we leave it
2794 				 * alone.  It will get cleaned up when
2795 				 * clean_zone_node() runs.
2796 				 */
2797 				dns_slabheader_destroy(&newheader);
2798 				newheader = (dns_slabheader_t *)merged;
2799 				dns_slabheader_reset(newheader,
2800 						     (dns_db_t *)rbtdb,
2801 						     (dns_dbnode_t *)rbtnode);
2802 				dns_slabheader_copycase(newheader, header);
2803 				if (loading && RESIGN(newheader) &&
2804 				    RESIGN(header) &&
2805 				    resign_sooner(header, newheader))
2806 				{
2807 					newheader->resign = header->resign;
2808 					newheader->resign_lsb =
2809 						header->resign_lsb;
2810 				}
2811 			} else {
2812 				if (result == DNS_R_TOOMANYRECORDS) {
2813 					dns__db_logtoomanyrecords(
2814 						(dns_db_t *)rbtdb, nodename,
2815 						(dns_rdatatype_t)(header->type),
2816 						"updating", rbtdb->maxrrperset);
2817 				}
2818 				dns_slabheader_destroy(&newheader);
2819 				return result;
2820 			}
2821 		}
2822 		/*
2823 		 * Don't replace existing NS, A and AAAA RRsets in the
2824 		 * cache if they are already exist. This prevents named
2825 		 * being locked to old servers. Don't lower trust of
2826 		 * existing record if the update is forced. Nothing
2827 		 * special to be done w.r.t stale data; it gets replaced
2828 		 * normally further down.
2829 		 */
2830 		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
2831 		    header->type == dns_rdatatype_ns && !header_nx &&
2832 		    !newheader_nx && header->trust >= newheader->trust &&
2833 		    dns_rdataslab_equalx((unsigned char *)header,
2834 					 (unsigned char *)newheader,
2835 					 (unsigned int)(sizeof(*newheader)),
2836 					 rbtdb->common.rdclass,
2837 					 (dns_rdatatype_t)header->type))
2838 		{
2839 			/*
2840 			 * Honour the new ttl if it is less than the
2841 			 * older one.
2842 			 */
2843 			if (header->ttl > newheader->ttl) {
2844 				dns__rbtdb_setttl(header, newheader->ttl);
2845 			}
2846 			if (header->last_used != now) {
2847 				ISC_LIST_UNLINK(
2848 					rbtdb->lru[RBTDB_HEADERNODE(header)
2849 							   ->locknum],
2850 					header, link);
2851 				header->last_used = now;
2852 				ISC_LIST_PREPEND(
2853 					rbtdb->lru[RBTDB_HEADERNODE(header)
2854 							   ->locknum],
2855 					header, link);
2856 			}
2857 			if (header->noqname == NULL &&
2858 			    newheader->noqname != NULL)
2859 			{
2860 				header->noqname = newheader->noqname;
2861 				newheader->noqname = NULL;
2862 			}
2863 			if (header->closest == NULL &&
2864 			    newheader->closest != NULL)
2865 			{
2866 				header->closest = newheader->closest;
2867 				newheader->closest = NULL;
2868 			}
2869 			dns_slabheader_destroy(&newheader);
2870 			if (addedrdataset != NULL) {
2871 				dns__rbtdb_bindrdataset(
2872 					rbtdb, rbtnode, header, now,
2873 					isc_rwlocktype_write,
2874 					addedrdataset DNS__DB_FLARG_PASS);
2875 			}
2876 			return ISC_R_SUCCESS;
2877 		}
2878 
2879 		/*
2880 		 * If we have will be replacing a NS RRset force its TTL
2881 		 * to be no more than the current NS RRset's TTL.  This
2882 		 * ensures the delegations that are withdrawn are honoured.
2883 		 */
2884 		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
2885 		    header->type == dns_rdatatype_ns && !header_nx &&
2886 		    !newheader_nx && header->trust <= newheader->trust)
2887 		{
2888 			if (newheader->ttl > header->ttl) {
2889 				newheader->ttl = header->ttl;
2890 			}
2891 		}
2892 		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
2893 		    (options & DNS_DBADD_PREFETCH) == 0 &&
2894 		    (header->type == dns_rdatatype_a ||
2895 		     header->type == dns_rdatatype_aaaa ||
2896 		     header->type == dns_rdatatype_ds ||
2897 		     header->type == DNS_SIGTYPE(dns_rdatatype_ds)) &&
2898 		    !header_nx && !newheader_nx &&
2899 		    header->trust >= newheader->trust &&
2900 		    dns_rdataslab_equal((unsigned char *)header,
2901 					(unsigned char *)newheader,
2902 					(unsigned int)(sizeof(*newheader))))
2903 		{
2904 			/*
2905 			 * Honour the new ttl if it is less than the
2906 			 * older one.
2907 			 */
2908 			if (header->ttl > newheader->ttl) {
2909 				dns__rbtdb_setttl(header, newheader->ttl);
2910 			}
2911 			if (header->last_used != now) {
2912 				ISC_LIST_UNLINK(
2913 					rbtdb->lru[RBTDB_HEADERNODE(header)
2914 							   ->locknum],
2915 					header, link);
2916 				header->last_used = now;
2917 				ISC_LIST_PREPEND(
2918 					rbtdb->lru[RBTDB_HEADERNODE(header)
2919 							   ->locknum],
2920 					header, link);
2921 			}
2922 			if (header->noqname == NULL &&
2923 			    newheader->noqname != NULL)
2924 			{
2925 				header->noqname = newheader->noqname;
2926 				newheader->noqname = NULL;
2927 			}
2928 			if (header->closest == NULL &&
2929 			    newheader->closest != NULL)
2930 			{
2931 				header->closest = newheader->closest;
2932 				newheader->closest = NULL;
2933 			}
2934 			dns_slabheader_destroy(&newheader);
2935 			if (addedrdataset != NULL) {
2936 				dns__rbtdb_bindrdataset(
2937 					rbtdb, rbtnode, header, now,
2938 					isc_rwlocktype_write,
2939 					addedrdataset DNS__DB_FLARG_PASS);
2940 			}
2941 			return ISC_R_SUCCESS;
2942 		}
2943 		INSIST(rbtversion == NULL ||
2944 		       rbtversion->serial >= topheader->serial);
2945 		if (loading) {
2946 			newheader->down = NULL;
2947 			idx = RBTDB_HEADERNODE(newheader)->locknum;
2948 			if (IS_CACHE(rbtdb)) {
2949 				if (ZEROTTL(newheader)) {
2950 					newheader->last_used =
2951 						rbtdb->last_used + 1;
2952 					ISC_LIST_APPEND(rbtdb->lru[idx],
2953 							newheader, link);
2954 				} else {
2955 					ISC_LIST_PREPEND(rbtdb->lru[idx],
2956 							 newheader, link);
2957 				}
2958 				INSIST(rbtdb->heaps != NULL);
2959 				isc_heap_insert(rbtdb->heaps[idx], newheader);
2960 				newheader->heap = rbtdb->heaps[idx];
2961 			} else if (RESIGN(newheader)) {
2962 				dns__zonerbt_resigninsert(rbtdb, idx,
2963 							  newheader);
2964 				/*
2965 				 * Don't call resigndelete, we don't need
2966 				 * to reverse the delete.  The free_slabheader
2967 				 * call below will clean up the heap entry.
2968 				 */
2969 			}
2970 
2971 			/*
2972 			 * There are no other references to 'header' when
2973 			 * loading, so we MAY clean up 'header' now.
2974 			 * Since we don't generate changed records when
2975 			 * loading, we MUST clean up 'header' now.
2976 			 */
2977 			if (topheader_prev != NULL) {
2978 				topheader_prev->next = newheader;
2979 			} else {
2980 				rbtnode->data = newheader;
2981 			}
2982 			newheader->next = topheader->next;
2983 			if (rbtversion != NULL && !header_nx) {
2984 				update_recordsandxfrsize(false, rbtversion,
2985 							 header,
2986 							 nodename->length);
2987 			}
2988 			dns_slabheader_destroy(&header);
2989 		} else {
2990 			idx = RBTDB_HEADERNODE(newheader)->locknum;
2991 			if (IS_CACHE(rbtdb)) {
2992 				INSIST(rbtdb->heaps != NULL);
2993 				isc_heap_insert(rbtdb->heaps[idx], newheader);
2994 				newheader->heap = rbtdb->heaps[idx];
2995 				if (ZEROTTL(newheader)) {
2996 					newheader->last_used =
2997 						rbtdb->last_used + 1;
2998 					ISC_LIST_APPEND(rbtdb->lru[idx],
2999 							newheader, link);
3000 				} else {
3001 					ISC_LIST_PREPEND(rbtdb->lru[idx],
3002 							 newheader, link);
3003 				}
3004 			} else if (RESIGN(newheader)) {
3005 				dns__zonerbt_resigninsert(rbtdb, idx,
3006 							  newheader);
3007 				dns__zonerbt_resigndelete(
3008 					rbtdb, rbtversion,
3009 					header DNS__DB_FLARG_PASS);
3010 			}
3011 			if (topheader_prev != NULL) {
3012 				topheader_prev->next = newheader;
3013 			} else {
3014 				rbtnode->data = newheader;
3015 			}
3016 			newheader->next = topheader->next;
3017 			newheader->down = topheader;
3018 			topheader->next = newheader;
3019 			rbtnode->dirty = 1;
3020 			if (changed != NULL) {
3021 				changed->dirty = true;
3022 			}
3023 			if (rbtversion == NULL) {
3024 				mark_ancient(header);
3025 				if (sigheader != NULL) {
3026 					mark_ancient(sigheader);
3027 				}
3028 			}
3029 			if (rbtversion != NULL && !header_nx) {
3030 				update_recordsandxfrsize(false, rbtversion,
3031 							 header,
3032 							 nodename->length);
3033 			}
3034 		}
3035 	} else {
3036 		/*
3037 		 * No non-IGNORED rdatasets of the given type exist at
3038 		 * this node.
3039 		 */
3040 
3041 		/*
3042 		 * If we're trying to delete the type, don't bother.
3043 		 */
3044 		if (newheader_nx) {
3045 			dns_slabheader_destroy(&newheader);
3046 			return DNS_R_UNCHANGED;
3047 		}
3048 
3049 		idx = RBTDB_HEADERNODE(newheader)->locknum;
3050 		if (IS_CACHE(rbtdb)) {
3051 			isc_heap_insert(rbtdb->heaps[idx], newheader);
3052 			newheader->heap = rbtdb->heaps[idx];
3053 			if (ZEROTTL(newheader)) {
3054 				ISC_LIST_APPEND(rbtdb->lru[idx], newheader,
3055 						link);
3056 			} else {
3057 				ISC_LIST_PREPEND(rbtdb->lru[idx], newheader,
3058 						 link);
3059 			}
3060 		} else if (RESIGN(newheader)) {
3061 			dns__zonerbt_resigninsert(rbtdb, idx, newheader);
3062 			dns__zonerbt_resigndelete(rbtdb, rbtversion,
3063 						  header DNS__DB_FLARG_PASS);
3064 		}
3065 
3066 		if (topheader != NULL) {
3067 			/*
3068 			 * We have an list of rdatasets of the given type,
3069 			 * but they're all marked IGNORE.  We simply insert
3070 			 * the new rdataset at the head of the list.
3071 			 *
3072 			 * Ignored rdatasets cannot occur during loading, so
3073 			 * we INSIST on it.
3074 			 */
3075 			INSIST(!loading);
3076 			INSIST(rbtversion == NULL ||
3077 			       rbtversion->serial >= topheader->serial);
3078 			if (topheader_prev != NULL) {
3079 				topheader_prev->next = newheader;
3080 			} else {
3081 				rbtnode->data = newheader;
3082 			}
3083 			newheader->next = topheader->next;
3084 			newheader->down = topheader;
3085 			topheader->next = newheader;
3086 			rbtnode->dirty = 1;
3087 			if (changed != NULL) {
3088 				changed->dirty = true;
3089 			}
3090 		} else {
3091 			/*
3092 			 * No rdatasets of the given type exist at the node.
3093 			 */
3094 			INSIST(newheader->down == NULL);
3095 
3096 			if (!IS_CACHE(rbtdb) && overmaxtype(rbtdb, ntypes)) {
3097 				dns_slabheader_destroy(&newheader);
3098 				return DNS_R_TOOMANYRECORDS;
3099 			}
3100 
3101 			if (prio_header(newheader)) {
3102 				/* This is a priority type, prepend it */
3103 				newheader->next = rbtnode->data;
3104 				rbtnode->data = newheader;
3105 			} else if (prioheader != NULL) {
3106 				/* Append after the priority headers */
3107 				newheader->next = prioheader->next;
3108 				prioheader->next = newheader;
3109 			} else {
3110 				/* There were no priority headers */
3111 				newheader->next = rbtnode->data;
3112 				rbtnode->data = newheader;
3113 			}
3114 
3115 			if (IS_CACHE(rbtdb) && overmaxtype(rbtdb, ntypes)) {
3116 				if (expireheader == NULL) {
3117 					expireheader = newheader;
3118 				}
3119 				if (NEGATIVE(newheader) &&
3120 				    !prio_header(newheader))
3121 				{
3122 					/*
3123 					 * Add the new non-priority negative
3124 					 * header to the database only
3125 					 * temporarily.
3126 					 */
3127 					expireheader = newheader;
3128 				}
3129 
3130 				mark_ancient(expireheader);
3131 				/*
3132 				 * FIXME: In theory, we should mark the RRSIG
3133 				 * and the header at the same time, but there is
3134 				 * no direct link between those two header, so
3135 				 * we would have to check the whole list again.
3136 				 */
3137 			}
3138 		}
3139 	}
3140 
3141 	if (rbtversion != NULL && !newheader_nx) {
3142 		update_recordsandxfrsize(true, rbtversion, newheader,
3143 					 nodename->length);
3144 	}
3145 
3146 	/*
3147 	 * Check if the node now contains CNAME and other data.
3148 	 */
3149 	if (rbtversion != NULL &&
3150 	    cname_and_other_data(rbtnode, rbtversion->serial))
3151 	{
3152 		return DNS_R_CNAMEANDOTHER;
3153 	}
3154 
3155 	if (addedrdataset != NULL) {
3156 		dns__rbtdb_bindrdataset(rbtdb, rbtnode, newheader, now,
3157 					isc_rwlocktype_write,
3158 					addedrdataset DNS__DB_FLARG_PASS);
3159 	}
3160 
3161 	return ISC_R_SUCCESS;
3162 }
3163 
3164 static bool
3165 delegating_type(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, dns_typepair_t type) {
3166 	if (IS_CACHE(rbtdb)) {
3167 		if (type == dns_rdatatype_dname) {
3168 			return true;
3169 		} else {
3170 			return false;
3171 		}
3172 	} else if (type == dns_rdatatype_dname ||
3173 		   (type == dns_rdatatype_ns &&
3174 		    (node != rbtdb->origin_node || IS_STUB(rbtdb))))
3175 	{
3176 		return true;
3177 	}
3178 	return false;
3179 }
3180 
3181 static isc_result_t
3182 addnoqname(isc_mem_t *mctx, dns_slabheader_t *newheader, uint32_t maxrrperset,
3183 	   dns_rdataset_t *rdataset) {
3184 	isc_result_t result;
3185 	dns_slabheader_proof_t *noqname = NULL;
3186 	dns_name_t name = DNS_NAME_INITEMPTY;
3187 	dns_rdataset_t neg = DNS_RDATASET_INIT, negsig = DNS_RDATASET_INIT;
3188 	isc_region_t r1, r2;
3189 
3190 	result = dns_rdataset_getnoqname(rdataset, &name, &neg, &negsig);
3191 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
3192 
3193 	result = dns_rdataslab_fromrdataset(&neg, mctx, &r1, 0, maxrrperset);
3194 	if (result != ISC_R_SUCCESS) {
3195 		goto cleanup;
3196 	}
3197 
3198 	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r2, 0, maxrrperset);
3199 	if (result != ISC_R_SUCCESS) {
3200 		goto cleanup;
3201 	}
3202 
3203 	noqname = isc_mem_get(mctx, sizeof(*noqname));
3204 	*noqname = (dns_slabheader_proof_t){
3205 		.neg = r1.base,
3206 		.negsig = r2.base,
3207 		.type = neg.type,
3208 		.name = DNS_NAME_INITEMPTY,
3209 	};
3210 	dns_name_dup(&name, mctx, &noqname->name);
3211 	newheader->noqname = noqname;
3212 
3213 cleanup:
3214 	dns_rdataset_disassociate(&neg);
3215 	dns_rdataset_disassociate(&negsig);
3216 
3217 	return result;
3218 }
3219 
3220 static isc_result_t
3221 addclosest(isc_mem_t *mctx, dns_slabheader_t *newheader, uint32_t maxrrperset,
3222 	   dns_rdataset_t *rdataset) {
3223 	isc_result_t result;
3224 	dns_slabheader_proof_t *closest = NULL;
3225 	dns_name_t name = DNS_NAME_INITEMPTY;
3226 	dns_rdataset_t neg = DNS_RDATASET_INIT, negsig = DNS_RDATASET_INIT;
3227 	isc_region_t r1, r2;
3228 
3229 	result = dns_rdataset_getclosest(rdataset, &name, &neg, &negsig);
3230 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
3231 
3232 	result = dns_rdataslab_fromrdataset(&neg, mctx, &r1, 0, maxrrperset);
3233 	if (result != ISC_R_SUCCESS) {
3234 		goto cleanup;
3235 	}
3236 
3237 	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r2, 0, maxrrperset);
3238 	if (result != ISC_R_SUCCESS) {
3239 		goto cleanup;
3240 	}
3241 
3242 	closest = isc_mem_get(mctx, sizeof(*closest));
3243 	*closest = (dns_slabheader_proof_t){
3244 		.neg = r1.base,
3245 		.negsig = r2.base,
3246 		.name = DNS_NAME_INITEMPTY,
3247 		.type = neg.type,
3248 	};
3249 	dns_name_dup(&name, mctx, &closest->name);
3250 	newheader->closest = closest;
3251 
3252 cleanup:
3253 	dns_rdataset_disassociate(&neg);
3254 	dns_rdataset_disassociate(&negsig);
3255 	return result;
3256 }
3257 
3258 static void
3259 expire_ttl_headers(dns_rbtdb_t *rbtdb, unsigned int locknum,
3260 		   isc_rwlocktype_t *tlocktypep, isc_stdtime_t now,
3261 		   bool cache_is_overmem DNS__DB_FLARG);
3262 
3263 isc_result_t
3264 dns__rbtdb_addrdataset(dns_db_t *db, dns_dbnode_t *node,
3265 		       dns_dbversion_t *version, isc_stdtime_t now,
3266 		       dns_rdataset_t *rdataset, unsigned int options,
3267 		       dns_rdataset_t *addedrdataset DNS__DB_FLARG) {
3268 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
3269 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
3270 	dns_rbtdb_version_t *rbtversion = version;
3271 	isc_region_t region;
3272 	dns_slabheader_t *newheader = NULL;
3273 	isc_result_t result;
3274 	bool delegating;
3275 	bool newnsec;
3276 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
3277 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
3278 	bool cache_is_overmem = false;
3279 	dns_fixedname_t fixed;
3280 	dns_name_t *name = NULL;
3281 
3282 	REQUIRE(VALID_RBTDB(rbtdb));
3283 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
3284 
3285 	if (!IS_CACHE(rbtdb)) {
3286 		/*
3287 		 * SOA records are only allowed at top of zone.
3288 		 */
3289 		if (rdataset->type == dns_rdatatype_soa &&
3290 		    node != rbtdb->origin_node)
3291 		{
3292 			return DNS_R_NOTZONETOP;
3293 		}
3294 		TREE_RDLOCK(&rbtdb->tree_lock, &tlocktype);
3295 		REQUIRE((rbtnode->nsec == DNS_DB_NSEC_NSEC3 &&
3296 			 (rdataset->type == dns_rdatatype_nsec3 ||
3297 			  rdataset->covers == dns_rdatatype_nsec3)) ||
3298 			(rbtnode->nsec != DNS_DB_NSEC_NSEC3 &&
3299 			 rdataset->type != dns_rdatatype_nsec3 &&
3300 			 rdataset->covers != dns_rdatatype_nsec3));
3301 		TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
3302 	}
3303 
3304 	if (rbtversion == NULL) {
3305 		if (now == 0) {
3306 			now = isc_stdtime_now();
3307 		}
3308 	} else {
3309 		now = 0;
3310 	}
3311 
3312 	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
3313 					    &region, sizeof(dns_slabheader_t),
3314 					    rbtdb->maxrrperset);
3315 	if (result != ISC_R_SUCCESS) {
3316 		if (result == DNS_R_TOOMANYRECORDS) {
3317 			name = dns_fixedname_initname(&fixed);
3318 			dns__rbtdb_nodefullname(db, node, name);
3319 			dns__db_logtoomanyrecords((dns_db_t *)rbtdb, name,
3320 						  rdataset->type, "adding",
3321 						  rbtdb->maxrrperset);
3322 		}
3323 		return result;
3324 	}
3325 
3326 	name = dns_fixedname_initname(&fixed);
3327 	dns__rbtdb_nodefullname(db, node, name);
3328 	dns_rdataset_getownercase(rdataset, name);
3329 
3330 	newheader = (dns_slabheader_t *)region.base;
3331 	*newheader = (dns_slabheader_t){
3332 		.type = DNS_TYPEPAIR_VALUE(rdataset->type, rdataset->covers),
3333 		.trust = rdataset->trust,
3334 		.last_used = now,
3335 		.node = rbtnode,
3336 	};
3337 
3338 	dns_slabheader_reset(newheader, db, node);
3339 	dns__rbtdb_setttl(newheader, rdataset->ttl + now);
3340 	if (rdataset->ttl == 0U) {
3341 		DNS_SLABHEADER_SETATTR(newheader, DNS_SLABHEADERATTR_ZEROTTL);
3342 	}
3343 	atomic_init(&newheader->count,
3344 		    atomic_fetch_add_relaxed(&init_count, 1));
3345 	if (rbtversion != NULL) {
3346 		newheader->serial = rbtversion->serial;
3347 		now = 0;
3348 
3349 		if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
3350 			DNS_SLABHEADER_SETATTR(newheader,
3351 					       DNS_SLABHEADERATTR_RESIGN);
3352 			newheader->resign =
3353 				(isc_stdtime_t)(dns_time64_from32(
3354 							rdataset->resign) >>
3355 						1);
3356 			newheader->resign_lsb = rdataset->resign & 0x1;
3357 		}
3358 	} else {
3359 		newheader->serial = 1;
3360 		if ((rdataset->attributes & DNS_RDATASETATTR_PREFETCH) != 0) {
3361 			DNS_SLABHEADER_SETATTR(newheader,
3362 					       DNS_SLABHEADERATTR_PREFETCH);
3363 		}
3364 		if ((rdataset->attributes & DNS_RDATASETATTR_NEGATIVE) != 0) {
3365 			DNS_SLABHEADER_SETATTR(newheader,
3366 					       DNS_SLABHEADERATTR_NEGATIVE);
3367 		}
3368 		if ((rdataset->attributes & DNS_RDATASETATTR_NXDOMAIN) != 0) {
3369 			DNS_SLABHEADER_SETATTR(newheader,
3370 					       DNS_SLABHEADERATTR_NXDOMAIN);
3371 		}
3372 		if ((rdataset->attributes & DNS_RDATASETATTR_OPTOUT) != 0) {
3373 			DNS_SLABHEADER_SETATTR(newheader,
3374 					       DNS_SLABHEADERATTR_OPTOUT);
3375 		}
3376 		if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) {
3377 			result = addnoqname(rbtdb->common.mctx, newheader,
3378 					    rbtdb->maxrrperset, rdataset);
3379 			if (result != ISC_R_SUCCESS) {
3380 				dns_slabheader_destroy(&newheader);
3381 				return result;
3382 			}
3383 		}
3384 		if ((rdataset->attributes & DNS_RDATASETATTR_CLOSEST) != 0) {
3385 			result = addclosest(rbtdb->common.mctx, newheader,
3386 					    rbtdb->maxrrperset, rdataset);
3387 			if (result != ISC_R_SUCCESS) {
3388 				dns_slabheader_destroy(&newheader);
3389 				return result;
3390 			}
3391 		}
3392 	}
3393 
3394 	/*
3395 	 * If we're adding a delegation type (e.g. NS or DNAME for a zone,
3396 	 * just DNAME for the cache), then we need to set the callback bit
3397 	 * on the node.
3398 	 */
3399 	if (delegating_type(rbtdb, rbtnode, rdataset->type)) {
3400 		delegating = true;
3401 	} else {
3402 		delegating = false;
3403 	}
3404 
3405 	/*
3406 	 * Add to the auxiliary NSEC tree if we're adding an NSEC record.
3407 	 */
3408 	TREE_RDLOCK(&rbtdb->tree_lock, &tlocktype);
3409 	if (rbtnode->nsec != DNS_DB_NSEC_HAS_NSEC &&
3410 	    rdataset->type == dns_rdatatype_nsec)
3411 	{
3412 		newnsec = true;
3413 	} else {
3414 		newnsec = false;
3415 	}
3416 	TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
3417 
3418 	/*
3419 	 * If we're adding a delegation type, adding to the auxiliary NSEC
3420 	 * tree, or the DB is a cache in an overmem state, hold an
3421 	 * exclusive lock on the tree.  In the latter case the lock does
3422 	 * not necessarily have to be acquired but it will help purge
3423 	 * ancient entries more effectively.
3424 	 */
3425 	if (IS_CACHE(rbtdb) && isc_mem_isovermem(rbtdb->common.mctx)) {
3426 		cache_is_overmem = true;
3427 	}
3428 	if (delegating || newnsec || cache_is_overmem) {
3429 		TREE_WRLOCK(&rbtdb->tree_lock, &tlocktype);
3430 	}
3431 
3432 	if (cache_is_overmem) {
3433 		dns__cacherbt_overmem(rbtdb, newheader,
3434 				      &tlocktype DNS__DB_FLARG_PASS);
3435 	}
3436 
3437 	NODE_WRLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
3438 
3439 	if (rbtdb->rrsetstats != NULL) {
3440 		DNS_SLABHEADER_SETATTR(newheader, DNS_SLABHEADERATTR_STATCOUNT);
3441 		update_rrsetstats(rbtdb->rrsetstats, newheader->type,
3442 				  atomic_load_acquire(&newheader->attributes),
3443 				  true);
3444 	}
3445 
3446 	if (IS_CACHE(rbtdb)) {
3447 		if (tlocktype == isc_rwlocktype_write) {
3448 			cleanup_dead_nodes(rbtdb,
3449 					   rbtnode->locknum DNS__DB_FLARG_PASS);
3450 		}
3451 
3452 		expire_ttl_headers(rbtdb, rbtnode->locknum, &tlocktype, now,
3453 				   cache_is_overmem DNS__DB_FLARG_PASS);
3454 
3455 		/*
3456 		 * If we've been holding a write lock on the tree just for
3457 		 * cleaning, we can release it now.  However, we still need the
3458 		 * node lock.
3459 		 */
3460 		if (tlocktype == isc_rwlocktype_write && !delegating &&
3461 		    !newnsec)
3462 		{
3463 			TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
3464 		}
3465 	}
3466 
3467 	result = ISC_R_SUCCESS;
3468 	if (newnsec) {
3469 		dns_rbtnode_t *nsecnode = NULL;
3470 
3471 		result = dns_rbt_addnode(rbtdb->nsec, name, &nsecnode);
3472 		if (result == ISC_R_SUCCESS) {
3473 			nsecnode->nsec = DNS_DB_NSEC_NSEC;
3474 			rbtnode->nsec = DNS_DB_NSEC_HAS_NSEC;
3475 		} else if (result == ISC_R_EXISTS) {
3476 			rbtnode->nsec = DNS_DB_NSEC_HAS_NSEC;
3477 			result = ISC_R_SUCCESS;
3478 		}
3479 	}
3480 
3481 	if (result == ISC_R_SUCCESS) {
3482 		result = dns__rbtdb_add(rbtdb, rbtnode, name, rbtversion,
3483 					newheader, options, false,
3484 					addedrdataset, now DNS__DB_FLARG_PASS);
3485 	}
3486 	if (result == ISC_R_SUCCESS && delegating) {
3487 		rbtnode->find_callback = 1;
3488 	}
3489 
3490 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
3491 
3492 	if (tlocktype != isc_rwlocktype_none) {
3493 		TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
3494 	}
3495 	INSIST(tlocktype == isc_rwlocktype_none);
3496 
3497 	return result;
3498 }
3499 
3500 isc_result_t
3501 dns__rbtdb_subtractrdataset(dns_db_t *db, dns_dbnode_t *node,
3502 			    dns_dbversion_t *version, dns_rdataset_t *rdataset,
3503 			    unsigned int options,
3504 			    dns_rdataset_t *newrdataset DNS__DB_FLARG) {
3505 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
3506 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
3507 	dns_rbtdb_version_t *rbtversion = version;
3508 	dns_fixedname_t fname;
3509 	dns_name_t *nodename = dns_fixedname_initname(&fname);
3510 	dns_slabheader_t *topheader = NULL, *topheader_prev = NULL;
3511 	dns_slabheader_t *header = NULL, *newheader = NULL;
3512 	unsigned char *subresult = NULL;
3513 	isc_region_t region;
3514 	isc_result_t result;
3515 	rbtdb_changed_t *changed = NULL;
3516 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
3517 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
3518 
3519 	REQUIRE(VALID_RBTDB(rbtdb));
3520 	REQUIRE(rbtversion != NULL && rbtversion->rbtdb == rbtdb);
3521 
3522 	if (!IS_CACHE(rbtdb)) {
3523 		TREE_RDLOCK(&rbtdb->tree_lock, &tlocktype);
3524 		REQUIRE((rbtnode->nsec == DNS_DB_NSEC_NSEC3 &&
3525 			 (rdataset->type == dns_rdatatype_nsec3 ||
3526 			  rdataset->covers == dns_rdatatype_nsec3)) ||
3527 			(rbtnode->nsec != DNS_DB_NSEC_NSEC3 &&
3528 			 rdataset->type != dns_rdatatype_nsec3 &&
3529 			 rdataset->covers != dns_rdatatype_nsec3));
3530 		TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
3531 	}
3532 
3533 	dns__rbtdb_nodefullname(db, node, nodename);
3534 
3535 	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
3536 					    &region, sizeof(dns_slabheader_t),
3537 					    0);
3538 	if (result != ISC_R_SUCCESS) {
3539 		return result;
3540 	}
3541 
3542 	newheader = (dns_slabheader_t *)region.base;
3543 	dns_slabheader_reset(newheader, db, node);
3544 	dns__rbtdb_setttl(newheader, rdataset->ttl);
3545 	newheader->type = DNS_TYPEPAIR_VALUE(rdataset->type, rdataset->covers);
3546 	atomic_init(&newheader->attributes, 0);
3547 	newheader->serial = rbtversion->serial;
3548 	newheader->trust = 0;
3549 	newheader->noqname = NULL;
3550 	newheader->closest = NULL;
3551 	atomic_init(&newheader->count,
3552 		    atomic_fetch_add_relaxed(&init_count, 1));
3553 	newheader->last_used = 0;
3554 	newheader->node = rbtnode;
3555 	newheader->db = (dns_db_t *)rbtdb;
3556 	if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
3557 		DNS_SLABHEADER_SETATTR(newheader, DNS_SLABHEADERATTR_RESIGN);
3558 		newheader->resign =
3559 			(isc_stdtime_t)(dns_time64_from32(rdataset->resign) >>
3560 					1);
3561 		newheader->resign_lsb = rdataset->resign & 0x1;
3562 	} else {
3563 		newheader->resign = 0;
3564 		newheader->resign_lsb = 0;
3565 	}
3566 
3567 	NODE_WRLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
3568 
3569 	changed = add_changed(newheader, rbtversion DNS__DB_FLARG_PASS);
3570 	if (changed == NULL) {
3571 		dns_slabheader_destroy(&newheader);
3572 		NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
3573 			    &nlocktype);
3574 		return ISC_R_NOMEMORY;
3575 	}
3576 
3577 	for (topheader = rbtnode->data; topheader != NULL;
3578 	     topheader = topheader->next)
3579 	{
3580 		if (topheader->type == newheader->type) {
3581 			break;
3582 		}
3583 		topheader_prev = topheader;
3584 	}
3585 	/*
3586 	 * If header isn't NULL, we've found the right type.  There may be
3587 	 * IGNORE rdatasets between the top of the chain and the first real
3588 	 * data.  We skip over them.
3589 	 */
3590 	header = topheader;
3591 	while (header != NULL && IGNORE(header)) {
3592 		header = header->down;
3593 	}
3594 	if (header != NULL && EXISTS(header)) {
3595 		unsigned int flags = 0;
3596 		subresult = NULL;
3597 		result = ISC_R_SUCCESS;
3598 		if ((options & DNS_DBSUB_EXACT) != 0) {
3599 			flags |= DNS_RDATASLAB_EXACT;
3600 			if (newheader->ttl != header->ttl) {
3601 				result = DNS_R_NOTEXACT;
3602 			}
3603 		}
3604 		if (result == ISC_R_SUCCESS) {
3605 			result = dns_rdataslab_subtract(
3606 				(unsigned char *)header,
3607 				(unsigned char *)newheader,
3608 				(unsigned int)(sizeof(*newheader)),
3609 				rbtdb->common.mctx, rbtdb->common.rdclass,
3610 				(dns_rdatatype_t)header->type, flags,
3611 				&subresult);
3612 		}
3613 		if (result == ISC_R_SUCCESS) {
3614 			dns_slabheader_destroy(&newheader);
3615 			newheader = (dns_slabheader_t *)subresult;
3616 			dns_slabheader_reset(newheader, db, node);
3617 			dns_slabheader_copycase(newheader, header);
3618 			if (RESIGN(header)) {
3619 				DNS_SLABHEADER_SETATTR(
3620 					newheader, DNS_SLABHEADERATTR_RESIGN);
3621 				newheader->resign = header->resign;
3622 				newheader->resign_lsb = header->resign_lsb;
3623 				dns__zonerbt_resigninsert(
3624 					rbtdb, rbtnode->locknum, newheader);
3625 			}
3626 			/*
3627 			 * We have to set the serial since the rdataslab
3628 			 * subtraction routine copies the reserved portion of
3629 			 * header, not newheader.
3630 			 */
3631 			newheader->serial = rbtversion->serial;
3632 			/*
3633 			 * XXXJT: dns_rdataslab_subtract() copied the pointers
3634 			 * to additional info.  We need to clear these fields
3635 			 * to avoid having duplicated references.
3636 			 */
3637 			update_recordsandxfrsize(true, rbtversion, newheader,
3638 						 nodename->length);
3639 		} else if (result == DNS_R_NXRRSET) {
3640 			/*
3641 			 * This subtraction would remove all of the rdata;
3642 			 * add a nonexistent header instead.
3643 			 */
3644 			dns_slabheader_destroy(&newheader);
3645 			newheader = dns_slabheader_new((dns_db_t *)rbtdb,
3646 						       (dns_dbnode_t *)rbtnode);
3647 			dns__rbtdb_setttl(newheader, 0);
3648 			newheader->type = topheader->type;
3649 			atomic_init(&newheader->attributes,
3650 				    DNS_SLABHEADERATTR_NONEXISTENT);
3651 			newheader->serial = rbtversion->serial;
3652 		} else {
3653 			dns_slabheader_destroy(&newheader);
3654 			goto unlock;
3655 		}
3656 
3657 		/*
3658 		 * If we're here, we want to link newheader in front of
3659 		 * topheader.
3660 		 */
3661 		INSIST(rbtversion->serial >= topheader->serial);
3662 		update_recordsandxfrsize(false, rbtversion, header,
3663 					 nodename->length);
3664 		if (topheader_prev != NULL) {
3665 			topheader_prev->next = newheader;
3666 		} else {
3667 			rbtnode->data = newheader;
3668 		}
3669 		newheader->next = topheader->next;
3670 		newheader->down = topheader;
3671 		topheader->next = newheader;
3672 		rbtnode->dirty = 1;
3673 		changed->dirty = true;
3674 		dns__zonerbt_resigndelete(rbtdb, rbtversion,
3675 					  header DNS__DB_FLARG_PASS);
3676 	} else {
3677 		/*
3678 		 * The rdataset doesn't exist, so we don't need to do anything
3679 		 * to satisfy the deletion request.
3680 		 */
3681 		dns_slabheader_destroy(&newheader);
3682 		if ((options & DNS_DBSUB_EXACT) != 0) {
3683 			result = DNS_R_NOTEXACT;
3684 		} else {
3685 			result = DNS_R_UNCHANGED;
3686 		}
3687 	}
3688 
3689 	if (result == ISC_R_SUCCESS && newrdataset != NULL) {
3690 		dns__rbtdb_bindrdataset(rbtdb, rbtnode, newheader, 0,
3691 					isc_rwlocktype_write,
3692 					newrdataset DNS__DB_FLARG_PASS);
3693 	}
3694 
3695 	if (result == DNS_R_NXRRSET && newrdataset != NULL &&
3696 	    (options & DNS_DBSUB_WANTOLD) != 0)
3697 	{
3698 		dns__rbtdb_bindrdataset(rbtdb, rbtnode, header, 0,
3699 					isc_rwlocktype_write,
3700 					newrdataset DNS__DB_FLARG_PASS);
3701 	}
3702 
3703 unlock:
3704 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
3705 
3706 	/*
3707 	 * Update the zone's secure status.  If version is non-NULL
3708 	 * this is deferred until dns__rbtdb_closeversion() is called.
3709 	 */
3710 	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
3711 		RWLOCK(&rbtdb->lock, isc_rwlocktype_read);
3712 		version = rbtdb->current_version;
3713 		RWUNLOCK(&rbtdb->lock, isc_rwlocktype_read);
3714 		dns__rbtdb_setsecure(db, version, rbtdb->origin_node);
3715 	}
3716 
3717 	return result;
3718 }
3719 
3720 isc_result_t
3721 dns__rbtdb_deleterdataset(dns_db_t *db, dns_dbnode_t *node,
3722 			  dns_dbversion_t *version, dns_rdatatype_t type,
3723 			  dns_rdatatype_t covers DNS__DB_FLARG) {
3724 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
3725 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
3726 	dns_rbtdb_version_t *rbtversion = version;
3727 	dns_fixedname_t fname;
3728 	dns_name_t *nodename = dns_fixedname_initname(&fname);
3729 	isc_result_t result;
3730 	dns_slabheader_t *newheader = NULL;
3731 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
3732 
3733 	REQUIRE(VALID_RBTDB(rbtdb));
3734 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
3735 
3736 	if (type == dns_rdatatype_any) {
3737 		return ISC_R_NOTIMPLEMENTED;
3738 	}
3739 	if (type == dns_rdatatype_rrsig && covers == 0) {
3740 		return ISC_R_NOTIMPLEMENTED;
3741 	}
3742 
3743 	newheader = dns_slabheader_new(db, node);
3744 	newheader->type = DNS_TYPEPAIR_VALUE(type, covers);
3745 	dns__rbtdb_setttl(newheader, 0);
3746 	atomic_init(&newheader->attributes, DNS_SLABHEADERATTR_NONEXISTENT);
3747 	if (rbtversion != NULL) {
3748 		newheader->serial = rbtversion->serial;
3749 	}
3750 
3751 	dns__rbtdb_nodefullname(db, node, nodename);
3752 
3753 	NODE_WRLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
3754 	result = dns__rbtdb_add(rbtdb, rbtnode, nodename, rbtversion, newheader,
3755 				DNS_DBADD_FORCE, false, NULL,
3756 				0 DNS__DB_FLARG_PASS);
3757 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
3758 
3759 	/*
3760 	 * Update the zone's secure status.  If version is non-NULL
3761 	 * this is deferred until dns__rbtdb_closeversion() is called.
3762 	 */
3763 	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
3764 		RWLOCK(&rbtdb->lock, isc_rwlocktype_read);
3765 		version = rbtdb->current_version;
3766 		RWUNLOCK(&rbtdb->lock, isc_rwlocktype_read);
3767 		dns__rbtdb_setsecure(db, version, rbtdb->origin_node);
3768 	}
3769 
3770 	return result;
3771 }
3772 
3773 static void
3774 delete_callback(void *data, void *arg) {
3775 	dns_rbtdb_t *rbtdb = arg;
3776 	dns_slabheader_t *current = NULL, *next = NULL;
3777 	unsigned int locknum;
3778 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
3779 
3780 	current = data;
3781 	locknum = RBTDB_HEADERNODE(current)->locknum;
3782 	NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
3783 	while (current != NULL) {
3784 		next = current->next;
3785 		dns_slabheader_destroy(&current);
3786 		current = next;
3787 	}
3788 	NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
3789 }
3790 
3791 unsigned int
3792 dns__rbtdb_nodecount(dns_db_t *db, dns_dbtree_t tree) {
3793 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
3794 	unsigned int count;
3795 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
3796 
3797 	REQUIRE(VALID_RBTDB(rbtdb));
3798 
3799 	TREE_RDLOCK(&rbtdb->tree_lock, &tlocktype);
3800 	switch (tree) {
3801 	case dns_dbtree_main:
3802 		count = dns_rbt_nodecount(rbtdb->tree);
3803 		break;
3804 	case dns_dbtree_nsec:
3805 		count = dns_rbt_nodecount(rbtdb->nsec);
3806 		break;
3807 	case dns_dbtree_nsec3:
3808 		count = dns_rbt_nodecount(rbtdb->nsec3);
3809 		break;
3810 	default:
3811 		UNREACHABLE();
3812 	}
3813 	TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
3814 
3815 	return count;
3816 }
3817 
3818 void
3819 dns__rbtdb_setloop(dns_db_t *db, isc_loop_t *loop) {
3820 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
3821 
3822 	REQUIRE(VALID_RBTDB(rbtdb));
3823 
3824 	RWLOCK(&rbtdb->lock, isc_rwlocktype_write);
3825 	if (rbtdb->loop != NULL) {
3826 		isc_loop_detach(&rbtdb->loop);
3827 	}
3828 	if (loop != NULL) {
3829 		isc_loop_attach(loop, &rbtdb->loop);
3830 	}
3831 	RWUNLOCK(&rbtdb->lock, isc_rwlocktype_write);
3832 }
3833 
3834 isc_result_t
3835 dns__rbtdb_getoriginnode(dns_db_t *db, dns_dbnode_t **nodep DNS__DB_FLARG) {
3836 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
3837 	dns_rbtnode_t *onode = NULL;
3838 	isc_result_t result = ISC_R_SUCCESS;
3839 
3840 	REQUIRE(VALID_RBTDB(rbtdb));
3841 	REQUIRE(nodep != NULL && *nodep == NULL);
3842 
3843 	/* Note that the access to origin_node doesn't require a DB lock */
3844 	onode = (dns_rbtnode_t *)rbtdb->origin_node;
3845 	if (onode != NULL) {
3846 		dns__rbtdb_newref(rbtdb, onode,
3847 				  isc_rwlocktype_none DNS__DB_FLARG_PASS);
3848 		*nodep = rbtdb->origin_node;
3849 	} else {
3850 		INSIST(IS_CACHE(rbtdb));
3851 		result = ISC_R_NOTFOUND;
3852 	}
3853 
3854 	return result;
3855 }
3856 
3857 void
3858 dns__rbtdb_locknode(dns_db_t *db, dns_dbnode_t *node, isc_rwlocktype_t type) {
3859 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
3860 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
3861 
3862 	RWLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, type);
3863 }
3864 
3865 void
3866 dns__rbtdb_unlocknode(dns_db_t *db, dns_dbnode_t *node, isc_rwlocktype_t type) {
3867 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
3868 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
3869 
3870 	RWUNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, type);
3871 }
3872 
3873 isc_result_t
3874 dns__rbtdb_nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name) {
3875 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
3876 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
3877 	isc_result_t result;
3878 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
3879 
3880 	REQUIRE(VALID_RBTDB(rbtdb));
3881 	REQUIRE(node != NULL);
3882 	REQUIRE(name != NULL);
3883 
3884 	TREE_RDLOCK(&rbtdb->tree_lock, &tlocktype);
3885 	result = dns_rbt_fullnamefromnode(rbtnode, name);
3886 	TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
3887 
3888 	return result;
3889 }
3890 
3891 isc_result_t
3892 dns__rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
3893 		  dns_rdataclass_t rdclass, unsigned int argc, char *argv[],
3894 		  void *driverarg ISC_ATTR_UNUSED, dns_db_t **dbp) {
3895 	dns_rbtdb_t *rbtdb = NULL;
3896 	isc_result_t result;
3897 	int i;
3898 	dns_name_t name;
3899 	isc_mem_t *hmctx = mctx;
3900 
3901 	rbtdb = isc_mem_get(mctx, sizeof(*rbtdb));
3902 	*rbtdb = (dns_rbtdb_t){
3903 		.common.origin = DNS_NAME_INITEMPTY,
3904 		.common.rdclass = rdclass,
3905 		.current_serial = 1,
3906 		.least_serial = 1,
3907 		.next_serial = 2,
3908 		.open_versions = ISC_LIST_INITIALIZER,
3909 	};
3910 
3911 	isc_refcount_init(&rbtdb->common.references, 1);
3912 
3913 	/*
3914 	 * If argv[0] exists, it points to a memory context to use for heap
3915 	 */
3916 	if (argc != 0) {
3917 		hmctx = (isc_mem_t *)argv[0];
3918 	}
3919 
3920 	if (type == dns_dbtype_cache) {
3921 		rbtdb->common.methods = &dns__rbtdb_cachemethods;
3922 		rbtdb->common.attributes |= DNS_DBATTR_CACHE;
3923 	} else if (type == dns_dbtype_stub) {
3924 		rbtdb->common.methods = &dns__rbtdb_zonemethods;
3925 		rbtdb->common.attributes |= DNS_DBATTR_STUB;
3926 	} else {
3927 		rbtdb->common.methods = &dns__rbtdb_zonemethods;
3928 	}
3929 
3930 	isc_rwlock_init(&rbtdb->lock);
3931 	TREE_INITLOCK(&rbtdb->tree_lock);
3932 
3933 	/*
3934 	 * Initialize node_lock_count in a generic way to support future
3935 	 * extension which allows the user to specify this value on creation.
3936 	 * Note that when specified for a cache DB it must be larger than 1
3937 	 * as commented with the definition of DEFAULT_CACHE_NODE_LOCK_COUNT.
3938 	 */
3939 	if (rbtdb->node_lock_count == 0) {
3940 		if (IS_CACHE(rbtdb)) {
3941 			rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT;
3942 		} else {
3943 			rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT;
3944 		}
3945 	} else if (rbtdb->node_lock_count < 2 && IS_CACHE(rbtdb)) {
3946 		result = ISC_R_RANGE;
3947 		goto cleanup_tree_lock;
3948 	}
3949 	INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH));
3950 	rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count *
3951 						      sizeof(db_nodelock_t));
3952 
3953 	rbtdb->common.update_listeners = cds_lfht_new(16, 16, 0, 0, NULL);
3954 
3955 	if (IS_CACHE(rbtdb)) {
3956 		dns_rdatasetstats_create(mctx, &rbtdb->rrsetstats);
3957 		rbtdb->lru = isc_mem_get(mctx,
3958 					 rbtdb->node_lock_count *
3959 						 sizeof(dns_slabheaderlist_t));
3960 		for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
3961 			ISC_LIST_INIT(rbtdb->lru[i]);
3962 		}
3963 	}
3964 
3965 	/*
3966 	 * Create the heaps.
3967 	 */
3968 	rbtdb->heaps = isc_mem_get(hmctx, rbtdb->node_lock_count *
3969 						  sizeof(isc_heap_t *));
3970 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
3971 		rbtdb->heaps[i] = NULL;
3972 	}
3973 
3974 	rbtdb->sooner = IS_CACHE(rbtdb) ? ttl_sooner : resign_sooner;
3975 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
3976 		isc_heap_create(hmctx, rbtdb->sooner, set_index, 0,
3977 				&rbtdb->heaps[i]);
3978 	}
3979 
3980 	/*
3981 	 * Create deadnode lists.
3982 	 */
3983 	rbtdb->deadnodes = isc_mem_get(mctx, rbtdb->node_lock_count *
3984 						     sizeof(dns_rbtnodelist_t));
3985 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
3986 		ISC_LIST_INIT(rbtdb->deadnodes[i]);
3987 	}
3988 
3989 	rbtdb->active = rbtdb->node_lock_count;
3990 
3991 	for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
3992 		NODE_INITLOCK(&rbtdb->node_locks[i].lock);
3993 		isc_refcount_init(&rbtdb->node_locks[i].references, 0);
3994 		rbtdb->node_locks[i].exiting = false;
3995 	}
3996 
3997 	/*
3998 	 * Attach to the mctx.  The database will persist so long as there
3999 	 * are references to it, and attaching to the mctx ensures that our
4000 	 * mctx won't disappear out from under us.
4001 	 */
4002 	isc_mem_attach(mctx, &rbtdb->common.mctx);
4003 	isc_mem_attach(hmctx, &rbtdb->hmctx);
4004 
4005 	/*
4006 	 * Make a copy of the origin name.
4007 	 */
4008 	dns_name_dupwithoffsets(origin, mctx, &rbtdb->common.origin);
4009 
4010 	/*
4011 	 * Make the Red-Black Trees.
4012 	 */
4013 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree);
4014 	if (result != ISC_R_SUCCESS) {
4015 		free_rbtdb(rbtdb, false);
4016 		return result;
4017 	}
4018 
4019 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec);
4020 	if (result != ISC_R_SUCCESS) {
4021 		free_rbtdb(rbtdb, false);
4022 		return result;
4023 	}
4024 
4025 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec3);
4026 	if (result != ISC_R_SUCCESS) {
4027 		free_rbtdb(rbtdb, false);
4028 		return result;
4029 	}
4030 
4031 	/*
4032 	 * In order to set the node callback bit correctly in zone databases,
4033 	 * we need to know if the node has the origin name of the zone.
4034 	 * In loading_addrdataset() we could simply compare the new name
4035 	 * to the origin name, but this is expensive.  Also, we don't know the
4036 	 * node name in dns__rbtdb_addrdataset(), so we need another way of
4037 	 * knowing the zone's top.
4038 	 *
4039 	 * We now explicitly create a node for the zone's origin, and then
4040 	 * we simply remember the node's address.  This is safe, because
4041 	 * the top-of-zone node can never be deleted, nor can its address
4042 	 * change.
4043 	 */
4044 	if (!IS_CACHE(rbtdb)) {
4045 		result = dns_rbt_addnode(rbtdb->tree, &rbtdb->common.origin,
4046 					 &rbtdb->origin_node);
4047 		if (result != ISC_R_SUCCESS) {
4048 			INSIST(result != ISC_R_EXISTS);
4049 			free_rbtdb(rbtdb, false);
4050 			return result;
4051 		}
4052 		INSIST(rbtdb->origin_node != NULL);
4053 		rbtdb->origin_node->nsec = DNS_DB_NSEC_NORMAL;
4054 		/*
4055 		 * We need to give the origin node the right locknum.
4056 		 */
4057 		dns_name_init(&name, NULL);
4058 		dns_rbt_namefromnode(rbtdb->origin_node, &name);
4059 		rbtdb->origin_node->locknum = rbtdb->origin_node->hashval %
4060 					      rbtdb->node_lock_count;
4061 		/*
4062 		 * Add an apex node to the NSEC3 tree so that NSEC3 searches
4063 		 * return partial matches when there is only a single NSEC3
4064 		 * record in the tree.
4065 		 */
4066 		result = dns_rbt_addnode(rbtdb->nsec3, &rbtdb->common.origin,
4067 					 &rbtdb->nsec3_origin_node);
4068 		if (result != ISC_R_SUCCESS) {
4069 			INSIST(result != ISC_R_EXISTS);
4070 			free_rbtdb(rbtdb, false);
4071 			return result;
4072 		}
4073 		rbtdb->nsec3_origin_node->nsec = DNS_DB_NSEC_NSEC3;
4074 		/*
4075 		 * We need to give the nsec3 origin node the right locknum.
4076 		 */
4077 		dns_name_init(&name, NULL);
4078 		dns_rbt_namefromnode(rbtdb->nsec3_origin_node, &name);
4079 		rbtdb->nsec3_origin_node->locknum =
4080 			rbtdb->nsec3_origin_node->hashval %
4081 			rbtdb->node_lock_count;
4082 	}
4083 
4084 	/*
4085 	 * Version Initialization.
4086 	 */
4087 	rbtdb->current_version = allocate_version(mctx, 1, 1, false);
4088 	rbtdb->current_version->rbtdb = rbtdb;
4089 
4090 	/*
4091 	 * Keep the current version in the open list so that list operation
4092 	 * won't happen in normal lookup operations.
4093 	 */
4094 	PREPEND(rbtdb->open_versions, rbtdb->current_version, link);
4095 
4096 	rbtdb->common.magic = DNS_DB_MAGIC;
4097 	rbtdb->common.impmagic = RBTDB_MAGIC;
4098 
4099 	*dbp = (dns_db_t *)rbtdb;
4100 
4101 	return ISC_R_SUCCESS;
4102 
4103 cleanup_tree_lock:
4104 	TREE_DESTROYLOCK(&rbtdb->tree_lock);
4105 	isc_rwlock_destroy(&rbtdb->lock);
4106 	isc_mem_put(mctx, rbtdb, sizeof(*rbtdb));
4107 	return result;
4108 }
4109 
4110 /*
4111  * Rdataset Iterator Methods
4112  */
4113 
4114 static void
4115 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp DNS__DB_FLARG) {
4116 	rbtdb_rdatasetiter_t *rbtiterator = NULL;
4117 
4118 	rbtiterator = (rbtdb_rdatasetiter_t *)(*iteratorp);
4119 
4120 	if (rbtiterator->common.version != NULL) {
4121 		dns__rbtdb_closeversion(rbtiterator->common.db,
4122 					&rbtiterator->common.version,
4123 					false DNS__DB_FLARG_PASS);
4124 	}
4125 	dns__db_detachnode(rbtiterator->common.db,
4126 			   &rbtiterator->common.node DNS__DB_FLARG_PASS);
4127 	isc_mem_put(rbtiterator->common.db->mctx, rbtiterator,
4128 		    sizeof(*rbtiterator));
4129 
4130 	*iteratorp = NULL;
4131 }
4132 
4133 static bool
4134 iterator_active(dns_rbtdb_t *rbtdb, rbtdb_rdatasetiter_t *rbtiterator,
4135 		dns_slabheader_t *header) {
4136 	dns_ttl_t stale_ttl = header->ttl + STALE_TTL(header, rbtdb);
4137 
4138 	/*
4139 	 * Is this a "this rdataset doesn't exist" record?
4140 	 */
4141 	if (NONEXISTENT(header)) {
4142 		return false;
4143 	}
4144 
4145 	/*
4146 	 * If this is a zone or this header still active then return it.
4147 	 */
4148 	if (!IS_CACHE(rbtdb) || ACTIVE(header, rbtiterator->common.now)) {
4149 		return true;
4150 	}
4151 
4152 	/*
4153 	 * If we are not returning stale records or the rdataset is
4154 	 * too old don't return it.
4155 	 */
4156 	if (!STALEOK(rbtiterator) || (rbtiterator->common.now > stale_ttl)) {
4157 		return false;
4158 	}
4159 	return true;
4160 }
4161 
4162 static isc_result_t
4163 rdatasetiter_first(dns_rdatasetiter_t *iterator DNS__DB_FLARG) {
4164 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
4165 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
4166 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
4167 	dns_rbtdb_version_t *rbtversion = rbtiterator->common.version;
4168 	dns_slabheader_t *header = NULL, *top_next = NULL;
4169 	uint32_t serial = IS_CACHE(rbtdb) ? 1 : rbtversion->serial;
4170 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
4171 
4172 	NODE_RDLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
4173 
4174 	for (header = rbtnode->data; header != NULL; header = top_next) {
4175 		top_next = header->next;
4176 		do {
4177 			if (EXPIREDOK(rbtiterator)) {
4178 				if (!NONEXISTENT(header)) {
4179 					break;
4180 				}
4181 				header = header->down;
4182 			} else if (header->serial <= serial && !IGNORE(header))
4183 			{
4184 				if (!iterator_active(rbtdb, rbtiterator,
4185 						     header))
4186 				{
4187 					header = NULL;
4188 				}
4189 				break;
4190 			} else {
4191 				header = header->down;
4192 			}
4193 		} while (header != NULL);
4194 		if (header != NULL) {
4195 			break;
4196 		}
4197 	}
4198 
4199 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
4200 
4201 	rbtiterator->current = header;
4202 
4203 	if (header == NULL) {
4204 		return ISC_R_NOMORE;
4205 	}
4206 
4207 	return ISC_R_SUCCESS;
4208 }
4209 
4210 static isc_result_t
4211 rdatasetiter_next(dns_rdatasetiter_t *iterator DNS__DB_FLARG) {
4212 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
4213 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
4214 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
4215 	dns_rbtdb_version_t *rbtversion = rbtiterator->common.version;
4216 	dns_slabheader_t *header = NULL, *top_next = NULL;
4217 	uint32_t serial = IS_CACHE(rbtdb) ? 1 : rbtversion->serial;
4218 	dns_typepair_t type, negtype;
4219 	dns_rdatatype_t rdtype, covers;
4220 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
4221 	bool expiredok = EXPIREDOK(rbtiterator);
4222 
4223 	header = rbtiterator->current;
4224 	if (header == NULL) {
4225 		return ISC_R_NOMORE;
4226 	}
4227 
4228 	NODE_RDLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
4229 
4230 	type = header->type;
4231 	rdtype = DNS_TYPEPAIR_TYPE(header->type);
4232 	if (NEGATIVE(header)) {
4233 		covers = DNS_TYPEPAIR_COVERS(header->type);
4234 		negtype = DNS_TYPEPAIR_VALUE(covers, 0);
4235 	} else {
4236 		negtype = DNS_TYPEPAIR_VALUE(0, rdtype);
4237 	}
4238 
4239 	/*
4240 	 * Find the start of the header chain for the next type
4241 	 * by walking back up the list.
4242 	 */
4243 	top_next = header->next;
4244 	while (top_next != NULL &&
4245 	       (top_next->type == type || top_next->type == negtype))
4246 	{
4247 		top_next = top_next->next;
4248 	}
4249 	if (expiredok) {
4250 		/*
4251 		 * Keep walking down the list if possible or
4252 		 * start the next type.
4253 		 */
4254 		header = header->down != NULL ? header->down : top_next;
4255 	} else {
4256 		header = top_next;
4257 	}
4258 	for (; header != NULL; header = top_next) {
4259 		top_next = header->next;
4260 		do {
4261 			if (expiredok) {
4262 				if (!NONEXISTENT(header)) {
4263 					break;
4264 				}
4265 				header = header->down;
4266 			} else if (header->serial <= serial && !IGNORE(header))
4267 			{
4268 				if (!iterator_active(rbtdb, rbtiterator,
4269 						     header))
4270 				{
4271 					header = NULL;
4272 				}
4273 				break;
4274 			} else {
4275 				header = header->down;
4276 			}
4277 		} while (header != NULL);
4278 		if (header != NULL) {
4279 			break;
4280 		}
4281 		/*
4282 		 * Find the start of the header chain for the next type
4283 		 * by walking back up the list.
4284 		 */
4285 		while (top_next != NULL &&
4286 		       (top_next->type == type || top_next->type == negtype))
4287 		{
4288 			top_next = top_next->next;
4289 		}
4290 	}
4291 
4292 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
4293 
4294 	rbtiterator->current = header;
4295 
4296 	if (header == NULL) {
4297 		return ISC_R_NOMORE;
4298 	}
4299 
4300 	return ISC_R_SUCCESS;
4301 }
4302 
4303 static void
4304 rdatasetiter_current(dns_rdatasetiter_t *iterator,
4305 		     dns_rdataset_t *rdataset DNS__DB_FLARG) {
4306 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
4307 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
4308 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
4309 	dns_slabheader_t *header = NULL;
4310 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
4311 
4312 	header = rbtiterator->current;
4313 	REQUIRE(header != NULL);
4314 
4315 	NODE_RDLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
4316 
4317 	dns__rbtdb_bindrdataset(rbtdb, rbtnode, header, rbtiterator->common.now,
4318 				isc_rwlocktype_read,
4319 				rdataset DNS__DB_FLARG_PASS);
4320 
4321 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
4322 }
4323 
4324 /*
4325  * Database Iterator Methods
4326  */
4327 
4328 static void
4329 reference_iter_node(rbtdb_dbiterator_t *rbtdbiter DNS__DB_FLARG) {
4330 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
4331 	dns_rbtnode_t *node = rbtdbiter->node;
4332 
4333 	if (node == NULL) {
4334 		return;
4335 	}
4336 
4337 	INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none);
4338 	reactivate_node(rbtdb, node, rbtdbiter->tree_locked DNS__DB_FLARG_PASS);
4339 }
4340 
4341 static void
4342 dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter DNS__DB_FLARG) {
4343 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
4344 	dns_rbtnode_t *node = rbtdbiter->node;
4345 	isc_rwlock_t *lock = NULL;
4346 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
4347 	isc_rwlocktype_t tlocktype = rbtdbiter->tree_locked;
4348 
4349 	if (node == NULL) {
4350 		return;
4351 	}
4352 
4353 	REQUIRE(tlocktype != isc_rwlocktype_write);
4354 
4355 	lock = &rbtdb->node_locks[node->locknum].lock;
4356 	NODE_RDLOCK(lock, &nlocktype);
4357 	dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &rbtdbiter->tree_locked,
4358 			  false, false DNS__DB_FLARG_PASS);
4359 	NODE_UNLOCK(lock, &nlocktype);
4360 
4361 	INSIST(rbtdbiter->tree_locked == tlocktype);
4362 
4363 	rbtdbiter->node = NULL;
4364 }
4365 
4366 static void
4367 resume_iteration(rbtdb_dbiterator_t *rbtdbiter) {
4368 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
4369 
4370 	REQUIRE(rbtdbiter->paused);
4371 	REQUIRE(rbtdbiter->tree_locked == isc_rwlocktype_none);
4372 
4373 	TREE_RDLOCK(&rbtdb->tree_lock, &rbtdbiter->tree_locked);
4374 
4375 	rbtdbiter->paused = false;
4376 }
4377 
4378 static void
4379 dbiterator_destroy(dns_dbiterator_t **iteratorp DNS__DB_FLARG) {
4380 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)(*iteratorp);
4381 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
4382 	dns_db_t *db = NULL;
4383 
4384 	if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
4385 		TREE_UNLOCK(&rbtdb->tree_lock, &rbtdbiter->tree_locked);
4386 	}
4387 	INSIST(rbtdbiter->tree_locked == isc_rwlocktype_none);
4388 
4389 	dereference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4390 
4391 	dns_db_attach(rbtdbiter->common.db, &db);
4392 	dns_db_detach(&rbtdbiter->common.db);
4393 
4394 	dns_rbtnodechain_reset(&rbtdbiter->chain);
4395 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
4396 	isc_mem_put(db->mctx, rbtdbiter, sizeof(*rbtdbiter));
4397 	dns_db_detach(&db);
4398 
4399 	*iteratorp = NULL;
4400 }
4401 
4402 static isc_result_t
4403 dbiterator_first(dns_dbiterator_t *iterator DNS__DB_FLARG) {
4404 	isc_result_t result;
4405 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
4406 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
4407 	dns_name_t *name = NULL, *origin = NULL;
4408 
4409 	if (rbtdbiter->result != ISC_R_SUCCESS &&
4410 	    rbtdbiter->result != ISC_R_NOTFOUND &&
4411 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
4412 	    rbtdbiter->result != ISC_R_NOMORE)
4413 	{
4414 		return rbtdbiter->result;
4415 	}
4416 
4417 	if (rbtdbiter->paused) {
4418 		resume_iteration(rbtdbiter);
4419 	}
4420 
4421 	dereference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4422 
4423 	name = dns_fixedname_name(&rbtdbiter->name);
4424 	origin = dns_fixedname_name(&rbtdbiter->origin);
4425 	dns_rbtnodechain_reset(&rbtdbiter->chain);
4426 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
4427 
4428 	switch (rbtdbiter->nsec3mode) {
4429 	case nsec3only:
4430 		rbtdbiter->current = &rbtdbiter->nsec3chain;
4431 		result = dns_rbtnodechain_first(rbtdbiter->current,
4432 						rbtdb->nsec3, name, origin);
4433 		break;
4434 	case nonsec3:
4435 		rbtdbiter->current = &rbtdbiter->chain;
4436 		result = dns_rbtnodechain_first(rbtdbiter->current, rbtdb->tree,
4437 						name, origin);
4438 		break;
4439 	case full:
4440 		rbtdbiter->current = &rbtdbiter->chain;
4441 		result = dns_rbtnodechain_first(rbtdbiter->current, rbtdb->tree,
4442 						name, origin);
4443 		if (result == ISC_R_NOTFOUND) {
4444 			rbtdbiter->current = &rbtdbiter->nsec3chain;
4445 			result = dns_rbtnodechain_first(
4446 				rbtdbiter->current, rbtdb->nsec3, name, origin);
4447 		}
4448 		break;
4449 	default:
4450 		UNREACHABLE();
4451 	}
4452 
4453 	if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
4454 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
4455 						  NULL, &rbtdbiter->node);
4456 
4457 		/* If we're in the NSEC3 tree, skip the origin */
4458 		if (RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, rbtdbiter)) {
4459 			rbtdbiter->node = NULL;
4460 			result = dns_rbtnodechain_next(rbtdbiter->current, name,
4461 						       origin);
4462 			if (result == ISC_R_SUCCESS ||
4463 			    result == DNS_R_NEWORIGIN)
4464 			{
4465 				result = dns_rbtnodechain_current(
4466 					rbtdbiter->current, NULL, NULL,
4467 					&rbtdbiter->node);
4468 			}
4469 		}
4470 		if (result == ISC_R_SUCCESS) {
4471 			rbtdbiter->new_origin = true;
4472 			reference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4473 		}
4474 	} else {
4475 		INSIST(result == ISC_R_NOTFOUND);
4476 		result = ISC_R_NOMORE; /* The tree is empty. */
4477 	}
4478 
4479 	rbtdbiter->result = result;
4480 
4481 	if (result != ISC_R_SUCCESS) {
4482 		ENSURE(!rbtdbiter->paused);
4483 	}
4484 
4485 	return result;
4486 }
4487 
4488 static isc_result_t
4489 dbiterator_last(dns_dbiterator_t *iterator DNS__DB_FLARG) {
4490 	isc_result_t result;
4491 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
4492 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
4493 	dns_name_t *name = NULL, *origin = NULL;
4494 
4495 	if (rbtdbiter->result != ISC_R_SUCCESS &&
4496 	    rbtdbiter->result != ISC_R_NOTFOUND &&
4497 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
4498 	    rbtdbiter->result != ISC_R_NOMORE)
4499 	{
4500 		return rbtdbiter->result;
4501 	}
4502 
4503 	if (rbtdbiter->paused) {
4504 		resume_iteration(rbtdbiter);
4505 	}
4506 
4507 	dereference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4508 
4509 	name = dns_fixedname_name(&rbtdbiter->name);
4510 	origin = dns_fixedname_name(&rbtdbiter->origin);
4511 	dns_rbtnodechain_reset(&rbtdbiter->chain);
4512 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
4513 
4514 	switch (rbtdbiter->nsec3mode) {
4515 	case nsec3only:
4516 		rbtdbiter->current = &rbtdbiter->nsec3chain;
4517 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->nsec3,
4518 					       name, origin);
4519 		break;
4520 	case nonsec3:
4521 		rbtdbiter->current = &rbtdbiter->chain;
4522 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree,
4523 					       name, origin);
4524 		break;
4525 	case full:
4526 		rbtdbiter->current = &rbtdbiter->nsec3chain;
4527 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->nsec3,
4528 					       name, origin);
4529 		if (result == ISC_R_NOTFOUND) {
4530 			rbtdbiter->current = &rbtdbiter->chain;
4531 			result = dns_rbtnodechain_last(
4532 				rbtdbiter->current, rbtdb->tree, name, origin);
4533 		}
4534 		break;
4535 	default:
4536 		UNREACHABLE();
4537 	}
4538 
4539 	if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
4540 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
4541 						  NULL, &rbtdbiter->node);
4542 		if (RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, rbtdbiter)) {
4543 			/*
4544 			 * NSEC3 tree only has an origin node.
4545 			 */
4546 			rbtdbiter->node = NULL;
4547 			switch (rbtdbiter->nsec3mode) {
4548 			case nsec3only:
4549 				result = ISC_R_NOMORE;
4550 				break;
4551 			case nonsec3:
4552 			case full:
4553 				rbtdbiter->current = &rbtdbiter->chain;
4554 				result = dns_rbtnodechain_last(
4555 					rbtdbiter->current, rbtdb->tree, name,
4556 					origin);
4557 				if (result == ISC_R_SUCCESS ||
4558 				    result == DNS_R_NEWORIGIN)
4559 				{
4560 					result = dns_rbtnodechain_current(
4561 						rbtdbiter->current, NULL, NULL,
4562 						&rbtdbiter->node);
4563 				}
4564 				break;
4565 			default:
4566 				UNREACHABLE();
4567 			}
4568 		}
4569 		if (result == ISC_R_SUCCESS) {
4570 			rbtdbiter->new_origin = true;
4571 			reference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4572 		}
4573 	} else {
4574 		INSIST(result == ISC_R_NOTFOUND);
4575 		result = ISC_R_NOMORE; /* The tree is empty. */
4576 	}
4577 
4578 	rbtdbiter->result = result;
4579 
4580 	return result;
4581 }
4582 
4583 static isc_result_t
4584 dbiterator_seek(dns_dbiterator_t *iterator,
4585 		const dns_name_t *name DNS__DB_FLARG) {
4586 	isc_result_t result, tresult;
4587 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
4588 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
4589 	dns_name_t *iname = NULL, *origin = NULL;
4590 
4591 	if (rbtdbiter->result != ISC_R_SUCCESS &&
4592 	    rbtdbiter->result != ISC_R_NOTFOUND &&
4593 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
4594 	    rbtdbiter->result != ISC_R_NOMORE)
4595 	{
4596 		return rbtdbiter->result;
4597 	}
4598 
4599 	if (rbtdbiter->paused) {
4600 		resume_iteration(rbtdbiter);
4601 	}
4602 
4603 	dereference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4604 
4605 	iname = dns_fixedname_name(&rbtdbiter->name);
4606 	origin = dns_fixedname_name(&rbtdbiter->origin);
4607 	dns_rbtnodechain_reset(&rbtdbiter->chain);
4608 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
4609 
4610 	switch (rbtdbiter->nsec3mode) {
4611 	case nsec3only:
4612 		rbtdbiter->current = &rbtdbiter->nsec3chain;
4613 		result = dns_rbt_findnode(rbtdb->nsec3, name, NULL,
4614 					  &rbtdbiter->node, rbtdbiter->current,
4615 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
4616 		break;
4617 	case nonsec3:
4618 		rbtdbiter->current = &rbtdbiter->chain;
4619 		result = dns_rbt_findnode(rbtdb->tree, name, NULL,
4620 					  &rbtdbiter->node, rbtdbiter->current,
4621 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
4622 		break;
4623 	case full:
4624 		/*
4625 		 * Stay on main chain if not found on either chain.
4626 		 */
4627 		rbtdbiter->current = &rbtdbiter->chain;
4628 		result = dns_rbt_findnode(rbtdb->tree, name, NULL,
4629 					  &rbtdbiter->node, rbtdbiter->current,
4630 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
4631 		if (result == DNS_R_PARTIALMATCH) {
4632 			dns_rbtnode_t *node = NULL;
4633 			tresult = dns_rbt_findnode(
4634 				rbtdb->nsec3, name, NULL, &node,
4635 				&rbtdbiter->nsec3chain, DNS_RBTFIND_EMPTYDATA,
4636 				NULL, NULL);
4637 			if (tresult == ISC_R_SUCCESS) {
4638 				rbtdbiter->node = node;
4639 				rbtdbiter->current = &rbtdbiter->nsec3chain;
4640 				result = tresult;
4641 			}
4642 		}
4643 		break;
4644 	default:
4645 		UNREACHABLE();
4646 	}
4647 
4648 	if (result == ISC_R_SUCCESS || result == DNS_R_PARTIALMATCH) {
4649 		tresult = dns_rbtnodechain_current(rbtdbiter->current, iname,
4650 						   origin, NULL);
4651 		if (tresult == ISC_R_SUCCESS) {
4652 			rbtdbiter->new_origin = true;
4653 			reference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4654 		} else {
4655 			result = tresult;
4656 			rbtdbiter->node = NULL;
4657 		}
4658 	} else {
4659 		rbtdbiter->node = NULL;
4660 	}
4661 
4662 	rbtdbiter->result = (result == DNS_R_PARTIALMATCH) ? ISC_R_SUCCESS
4663 							   : result;
4664 
4665 	return result;
4666 }
4667 
4668 static isc_result_t
4669 dbiterator_prev(dns_dbiterator_t *iterator DNS__DB_FLARG) {
4670 	isc_result_t result;
4671 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
4672 	dns_name_t *name = NULL, *origin = NULL;
4673 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
4674 
4675 	REQUIRE(rbtdbiter->node != NULL);
4676 
4677 	if (rbtdbiter->result != ISC_R_SUCCESS) {
4678 		return rbtdbiter->result;
4679 	}
4680 
4681 	if (rbtdbiter->paused) {
4682 		resume_iteration(rbtdbiter);
4683 	}
4684 
4685 	dereference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4686 
4687 	name = dns_fixedname_name(&rbtdbiter->name);
4688 	origin = dns_fixedname_name(&rbtdbiter->origin);
4689 	result = dns_rbtnodechain_prev(rbtdbiter->current, name, origin);
4690 	if (rbtdbiter->current == &rbtdbiter->nsec3chain &&
4691 	    (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN))
4692 	{
4693 		/*
4694 		 * If we're in the NSEC3 tree, it's empty or we've
4695 		 * reached the origin, then we're done with it.
4696 		 */
4697 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
4698 						  NULL, &rbtdbiter->node);
4699 		if (result == ISC_R_NOTFOUND ||
4700 		    RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, rbtdbiter))
4701 		{
4702 			rbtdbiter->node = NULL;
4703 			result = ISC_R_NOMORE;
4704 		}
4705 	}
4706 	if (result == ISC_R_NOMORE && rbtdbiter->nsec3mode != nsec3only &&
4707 	    &rbtdbiter->nsec3chain == rbtdbiter->current)
4708 	{
4709 		rbtdbiter->current = &rbtdbiter->chain;
4710 		dns_rbtnodechain_reset(rbtdbiter->current);
4711 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree,
4712 					       name, origin);
4713 		if (result == ISC_R_NOTFOUND) {
4714 			result = ISC_R_NOMORE;
4715 		}
4716 	}
4717 
4718 	if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
4719 		rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN);
4720 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
4721 						  NULL, &rbtdbiter->node);
4722 	}
4723 
4724 	if (result == ISC_R_SUCCESS) {
4725 		reference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4726 	}
4727 
4728 	rbtdbiter->result = result;
4729 
4730 	return result;
4731 }
4732 
4733 static isc_result_t
4734 dbiterator_next(dns_dbiterator_t *iterator DNS__DB_FLARG) {
4735 	isc_result_t result;
4736 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
4737 	dns_name_t *name = NULL, *origin = NULL;
4738 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
4739 
4740 	REQUIRE(rbtdbiter->node != NULL);
4741 
4742 	if (rbtdbiter->result != ISC_R_SUCCESS) {
4743 		return rbtdbiter->result;
4744 	}
4745 
4746 	if (rbtdbiter->paused) {
4747 		resume_iteration(rbtdbiter);
4748 	}
4749 
4750 	name = dns_fixedname_name(&rbtdbiter->name);
4751 	origin = dns_fixedname_name(&rbtdbiter->origin);
4752 	result = dns_rbtnodechain_next(rbtdbiter->current, name, origin);
4753 	if (result == ISC_R_NOMORE && rbtdbiter->nsec3mode != nonsec3 &&
4754 	    &rbtdbiter->chain == rbtdbiter->current)
4755 	{
4756 		rbtdbiter->current = &rbtdbiter->nsec3chain;
4757 		dns_rbtnodechain_reset(rbtdbiter->current);
4758 		result = dns_rbtnodechain_first(rbtdbiter->current,
4759 						rbtdb->nsec3, name, origin);
4760 		if (result == ISC_R_NOTFOUND) {
4761 			result = ISC_R_NOMORE;
4762 		}
4763 	}
4764 
4765 	dereference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4766 
4767 	if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
4768 		/*
4769 		 * If we've just started the NSEC3 tree,
4770 		 * skip over the origin.
4771 		 */
4772 		rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN);
4773 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
4774 						  NULL, &rbtdbiter->node);
4775 		if (RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, rbtdbiter)) {
4776 			rbtdbiter->node = NULL;
4777 			result = dns_rbtnodechain_next(rbtdbiter->current, name,
4778 						       origin);
4779 			if (result == ISC_R_SUCCESS ||
4780 			    result == DNS_R_NEWORIGIN)
4781 			{
4782 				result = dns_rbtnodechain_current(
4783 					rbtdbiter->current, NULL, NULL,
4784 					&rbtdbiter->node);
4785 			}
4786 		}
4787 	}
4788 	if (result == ISC_R_SUCCESS) {
4789 		reference_iter_node(rbtdbiter DNS__DB_FLARG_PASS);
4790 	}
4791 
4792 	rbtdbiter->result = result;
4793 
4794 	return result;
4795 }
4796 
4797 static isc_result_t
4798 dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
4799 		   dns_name_t *name DNS__DB_FLARG) {
4800 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
4801 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
4802 	dns_rbtnode_t *node = rbtdbiter->node;
4803 	isc_result_t result;
4804 	dns_name_t *nodename = dns_fixedname_name(&rbtdbiter->name);
4805 	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
4806 
4807 	REQUIRE(rbtdbiter->result == ISC_R_SUCCESS);
4808 	REQUIRE(rbtdbiter->node != NULL);
4809 
4810 	if (rbtdbiter->paused) {
4811 		resume_iteration(rbtdbiter);
4812 	}
4813 
4814 	if (name != NULL) {
4815 		if (rbtdbiter->common.relative_names) {
4816 			origin = NULL;
4817 		}
4818 		result = dns_name_concatenate(nodename, origin, name, NULL);
4819 		if (result != ISC_R_SUCCESS) {
4820 			return result;
4821 		}
4822 		if (rbtdbiter->common.relative_names && rbtdbiter->new_origin) {
4823 			result = DNS_R_NEWORIGIN;
4824 		}
4825 	} else {
4826 		result = ISC_R_SUCCESS;
4827 	}
4828 
4829 	dns__rbtdb_newref(rbtdb, node, isc_rwlocktype_none DNS__DB_FLARG_PASS);
4830 
4831 	*nodep = rbtdbiter->node;
4832 
4833 	return result;
4834 }
4835 
4836 static isc_result_t
4837 dbiterator_pause(dns_dbiterator_t *iterator) {
4838 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
4839 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
4840 
4841 	if (rbtdbiter->result != ISC_R_SUCCESS &&
4842 	    rbtdbiter->result != ISC_R_NOTFOUND &&
4843 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
4844 	    rbtdbiter->result != ISC_R_NOMORE)
4845 	{
4846 		return rbtdbiter->result;
4847 	}
4848 
4849 	if (rbtdbiter->paused) {
4850 		return ISC_R_SUCCESS;
4851 	}
4852 
4853 	rbtdbiter->paused = true;
4854 
4855 	if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
4856 		TREE_UNLOCK(&rbtdb->tree_lock, &rbtdbiter->tree_locked);
4857 	}
4858 	INSIST(rbtdbiter->tree_locked == isc_rwlocktype_none);
4859 
4860 	return ISC_R_SUCCESS;
4861 }
4862 
4863 static isc_result_t
4864 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name) {
4865 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
4866 	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
4867 
4868 	if (rbtdbiter->result != ISC_R_SUCCESS) {
4869 		return rbtdbiter->result;
4870 	}
4871 
4872 	dns_name_copy(origin, name);
4873 	return ISC_R_SUCCESS;
4874 }
4875 
4876 static void
4877 freeglue(isc_mem_t *mctx, dns_glue_t *glue) {
4878 	while (glue != NULL) {
4879 		dns_glue_t *next = glue->next;
4880 
4881 		if (dns_rdataset_isassociated(&glue->rdataset_a)) {
4882 			dns_rdataset_disassociate(&glue->rdataset_a);
4883 		}
4884 		if (dns_rdataset_isassociated(&glue->sigrdataset_a)) {
4885 			dns_rdataset_disassociate(&glue->sigrdataset_a);
4886 		}
4887 
4888 		if (dns_rdataset_isassociated(&glue->rdataset_aaaa)) {
4889 			dns_rdataset_disassociate(&glue->rdataset_aaaa);
4890 		}
4891 		if (dns_rdataset_isassociated(&glue->sigrdataset_aaaa)) {
4892 			dns_rdataset_disassociate(&glue->sigrdataset_aaaa);
4893 		}
4894 
4895 		dns_rdataset_invalidate(&glue->rdataset_a);
4896 		dns_rdataset_invalidate(&glue->sigrdataset_a);
4897 		dns_rdataset_invalidate(&glue->rdataset_aaaa);
4898 		dns_rdataset_invalidate(&glue->sigrdataset_aaaa);
4899 
4900 		isc_mem_put(mctx, glue, sizeof(*glue));
4901 
4902 		glue = next;
4903 	}
4904 }
4905 
4906 void
4907 dns__rbtdb_free_gluenode_rcu(struct rcu_head *rcu_head) {
4908 	dns_gluenode_t *gluenode = caa_container_of(rcu_head, dns_gluenode_t,
4909 						    rcu_head);
4910 
4911 	freeglue(gluenode->mctx, gluenode->glue);
4912 
4913 	dns_db_detachnode(gluenode->db, (dns_dbnode_t **)&gluenode->node);
4914 
4915 	isc_mem_putanddetach(&gluenode->mctx, gluenode, sizeof(*gluenode));
4916 }
4917 
4918 void
4919 dns__rbtdb_free_gluenode(dns_gluenode_t *gluenode) {
4920 	call_rcu(&gluenode->rcu_head, dns__rbtdb_free_gluenode_rcu);
4921 }
4922 
4923 static void
4924 free_gluetable(struct cds_lfht *glue_table) {
4925 	struct cds_lfht_iter iter;
4926 	dns_gluenode_t *gluenode = NULL;
4927 
4928 	rcu_read_lock();
4929 	cds_lfht_for_each_entry(glue_table, &iter, gluenode, ht_node) {
4930 		INSIST(!cds_lfht_del(glue_table, &gluenode->ht_node));
4931 		dns__rbtdb_free_gluenode(gluenode);
4932 	}
4933 	rcu_read_unlock();
4934 
4935 	cds_lfht_destroy(glue_table, NULL);
4936 }
4937 
4938 void
4939 dns__rbtdb_deletedata(dns_db_t *db ISC_ATTR_UNUSED,
4940 		      dns_dbnode_t *node ISC_ATTR_UNUSED, void *data) {
4941 	dns_slabheader_t *header = data;
4942 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)header->db;
4943 
4944 	if (header->heap != NULL && header->heap_index != 0) {
4945 		isc_heap_delete(header->heap, header->heap_index);
4946 	}
4947 
4948 	if (IS_CACHE(rbtdb)) {
4949 		update_rrsetstats(rbtdb->rrsetstats, header->type,
4950 				  atomic_load_acquire(&header->attributes),
4951 				  false);
4952 
4953 		if (ISC_LINK_LINKED(header, link)) {
4954 			int idx = RBTDB_HEADERNODE(header)->locknum;
4955 			INSIST(IS_CACHE(rbtdb));
4956 			ISC_LIST_UNLINK(rbtdb->lru[idx], header, link);
4957 		}
4958 
4959 		if (header->noqname != NULL) {
4960 			dns_slabheader_freeproof(db->mctx, &header->noqname);
4961 		}
4962 		if (header->closest != NULL) {
4963 			dns_slabheader_freeproof(db->mctx, &header->closest);
4964 		}
4965 	}
4966 }
4967 
4968 /*
4969  * Caller must be holding the node write lock.
4970  */
4971 static void
4972 expire_ttl_headers(dns_rbtdb_t *rbtdb, unsigned int locknum,
4973 		   isc_rwlocktype_t *tlocktypep, isc_stdtime_t now,
4974 		   bool cache_is_overmem DNS__DB_FLARG) {
4975 	isc_heap_t *heap = rbtdb->heaps[locknum];
4976 
4977 	for (size_t i = 0; i < DNS_RBTDB_EXPIRE_TTL_COUNT; i++) {
4978 		dns_slabheader_t *header = isc_heap_element(heap, 1);
4979 
4980 		if (header == NULL) {
4981 			/* No headers left on this TTL heap; exit cleaning */
4982 			return;
4983 		}
4984 
4985 		dns_ttl_t ttl = header->ttl;
4986 
4987 		if (!cache_is_overmem) {
4988 			/* Only account for stale TTL if cache is not overmem */
4989 			ttl += STALE_TTL(header, rbtdb);
4990 		}
4991 
4992 		if (ttl >= now - RBTDB_VIRTUAL) {
4993 			/*
4994 			 * The header at the top of this TTL heap is not yet
4995 			 * eligible for expiry, so none of the other headers on
4996 			 * the same heap can be eligible for expiry, either;
4997 			 * exit cleaning.
4998 			 */
4999 			return;
5000 		}
5001 
5002 		dns__cacherbt_expireheader(header, tlocktypep,
5003 					   dns_expire_ttl DNS__DB_FLARG_PASS);
5004 	}
5005 }
5006 
5007 void
5008 dns__rbtdb_setmaxrrperset(dns_db_t *db, uint32_t value) {
5009 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5010 
5011 	REQUIRE(VALID_RBTDB(rbtdb));
5012 
5013 	rbtdb->maxrrperset = value;
5014 }
5015 
5016 void
5017 dns__rbtdb_setmaxtypepername(dns_db_t *db, uint32_t maxtypepername) {
5018 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5019 
5020 	REQUIRE(VALID_RBTDB(rbtdb));
5021 
5022 	rbtdb->maxtypepername = maxtypepername;
5023 }
5024