xref: /netbsd-src/external/mpl/bind/dist/lib/dns/rbtdb.c (revision 4ac76180e904e771b9d522c7e57296d371f06499)
1 /*	$NetBSD: rbtdb.c,v 1.17 2023/06/26 22:03:00 christos Exp $	*/
2 
3 /*
4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5  *
6  * SPDX-License-Identifier: MPL-2.0
7  *
8  * This Source Code Form is subject to the terms of the Mozilla Public
9  * License, v. 2.0. If a copy of the MPL was not distributed with this
10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11  *
12  * See the COPYRIGHT file distributed with this work for additional
13  * information regarding copyright ownership.
14  */
15 
16 /*! \file */
17 
18 #include <ctype.h>
19 #include <inttypes.h>
20 #include <stdbool.h>
21 
22 #include <isc/atomic.h>
23 #include <isc/crc64.h>
24 #include <isc/event.h>
25 #include <isc/file.h>
26 #include <isc/hash.h>
27 #include <isc/heap.h>
28 #include <isc/hex.h>
29 #include <isc/mem.h>
30 #include <isc/mutex.h>
31 #include <isc/once.h>
32 #include <isc/platform.h>
33 #include <isc/print.h>
34 #include <isc/random.h>
35 #include <isc/refcount.h>
36 #include <isc/rwlock.h>
37 #include <isc/serial.h>
38 #include <isc/socket.h>
39 #include <isc/stdio.h>
40 #include <isc/string.h>
41 #include <isc/task.h>
42 #include <isc/time.h>
43 #include <isc/util.h>
44 
45 #include <dns/callbacks.h>
46 #include <dns/db.h>
47 #include <dns/dbiterator.h>
48 #include <dns/events.h>
49 #include <dns/fixedname.h>
50 #include <dns/lib.h>
51 #include <dns/log.h>
52 #include <dns/masterdump.h>
53 #include <dns/nsec.h>
54 #include <dns/nsec3.h>
55 #include <dns/rbt.h>
56 #include <dns/rdata.h>
57 #include <dns/rdataset.h>
58 #include <dns/rdatasetiter.h>
59 #include <dns/rdataslab.h>
60 #include <dns/rdatastruct.h>
61 #include <dns/result.h>
62 #include <dns/stats.h>
63 #include <dns/time.h>
64 #include <dns/version.h>
65 #include <dns/view.h>
66 #include <dns/zone.h>
67 #include <dns/zonekey.h>
68 
69 #ifndef WIN32
70 #include <sys/mman.h>
71 #else /* ifndef WIN32 */
72 #define PROT_READ   0x01
73 #define PROT_WRITE  0x02
74 #define MAP_PRIVATE 0x0002
75 #define MAP_FAILED  ((void *)-1)
76 #endif /* ifndef WIN32 */
77 
78 #include "rbtdb.h"
79 
/* Magic number that VALID_RBTDB() expects to find in common.impmagic. */
#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4')

/*
 * Evaluate 'op' into the caller's 'result' variable and jump to the
 * caller's 'failure' label if it is anything other than ISC_R_SUCCESS.
 * The enclosing function must declare both 'result' and 'failure'.
 */
#define CHECK(op)                            \
	do {                                 \
		result = (op);               \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
	} while (0)
88 
/*
 * This is the map file header for RBTDB images.  It is populated, and then
 * written, as the LAST thing done to the file.  Writing this last (with
 * zeros in the header area initially) will ensure that the header is only
 * valid when the RBTDB image is also valid.
 */
typedef struct rbtdb_file_header rbtdb_file_header_t;

/* Header length, always the same size regardless of structure size */
#define RBTDB_HEADER_LENGTH 1024

/*
 * NOTE(review): the per-field notes below are inferred from the field names
 * only; confirm them against the code that writes and validates the header.
 */
struct rbtdb_file_header {
	char version1[32];
	/* presumably the pointer size and byte order of the writer */
	uint32_t ptrsize;
	unsigned int bigendian : 1;
	/* one 64-bit value for each of the three trees */
	uint64_t tree;
	uint64_t nsec;
	uint64_t nsec3;

	char version2[32]; /* repeated; must match version1 */
};
110 
/*%
 * True when 'rbtdb' is non-NULL and carries the RBTDB implementation magic.
 *
 * Note that "impmagic" is not the first four bytes of the struct, so
 * ISC_MAGIC_VALID cannot be used.
 */
#define VALID_RBTDB(rbtdb) \
	((rbtdb) != NULL && (rbtdb)->common.impmagic == RBTDB_MAGIC)
117 
typedef uint32_t rbtdb_serial_t;
typedef uint32_t rbtdb_rdatatype_t;

/*
 * An rbtdb_rdatatype_t packs two 16-bit rdata types into one 32-bit value:
 * the "base" type in the low 16 bits and the "extension" type in the high
 * 16 bits.
 *
 * RBTDB_RDATATYPE_BASE(type)       - extract the low 16 bits.
 * RBTDB_RDATATYPE_EXT(type)        - extract the high 16 bits.
 * RBTDB_RDATATYPE_VALUE(base, ext) - combine both halves; 'base' is masked
 *                                    to 16 bits.
 *
 * Every macro argument is parenthesized so that compound expressions can be
 * passed safely.
 */
#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)  ((dns_rdatatype_t)((type) >> 16))
#define RBTDB_RDATATYPE_VALUE(base, ext)                 \
	((rbtdb_rdatatype_t)((((uint32_t)(ext)) << 16) | \
			     (((uint32_t)(base)) & 0xffff)))
126 
/*
 * Composite type values built with RBTDB_RDATATYPE_VALUE(base, ext).  Each
 * RBTDB_RDATATYPE_SIG* value uses dns_rdatatype_rrsig as the base type and
 * the type named by its suffix as the extension.
 */
#define RBTDB_RDATATYPE_SIGNSEC \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
#define RBTDB_RDATATYPE_SIGNSEC3 \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3)
#define RBTDB_RDATATYPE_SIGNS \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
#define RBTDB_RDATATYPE_SIGCNAME \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
#define RBTDB_RDATATYPE_SIGDNAME \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
#define RBTDB_RDATATYPE_SIGDS \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ds)
#define RBTDB_RDATATYPE_SIGSOA \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_soa)
/* Base type 0 with dns_rdatatype_any as the extension. */
#define RBTDB_RDATATYPE_NCACHEANY RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
142 
/* Thin wrappers around the isc_rwlock primitives. */
#define RBTDB_INITLOCK(l)    isc_rwlock_init((l), 0, 0)
#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l)
#define RBTDB_LOCK(l, t)     RWLOCK((l), (t))
#define RBTDB_UNLOCK(l, t)   RWUNLOCK((l), (t))

/*
 * Since node locking is sensitive to both performance and memory footprint,
 * we need some trick here.  If we have both high-performance rwlock and
 * high performance and small-memory reference counters, we use rwlock for
 * node lock and isc_refcount for node references.  In this case, we don't have
 * to protect the access to the counters by locks.
 * Otherwise, we simply use ordinary mutex lock for node locking, and use
 * simple integers as reference counters which is protected by the lock.
 * In most cases, we can simply use wrapper macros such as NODE_LOCK and
 * NODE_UNLOCK.  In some other cases, however, we need to protect reference
 * counters first and then protect other parts of a node as read-only data.
 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
 * provided for these special cases.  When we can use the efficient backend
 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
 * section including the access to the reference counter.
 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
 * section is also protected by NODE_STRONGLOCK().
 *
 * NOTE(review): only the rwlock variant is defined below, and the
 * NODE_STRONGLOCK()/NODE_WEAKLOCK() macros named above are not defined in
 * this part of the file; the text above may be out of date.
 */
typedef isc_rwlock_t nodelock_t;

#define NODE_INITLOCK(l)    isc_rwlock_init((l), 0, 0)
#define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l)
#define NODE_LOCK(l, t)	    RWLOCK((l), (t))
#define NODE_UNLOCK(l, t)   RWUNLOCK((l), (t))
#define NODE_TRYUPGRADE(l)  isc_rwlock_tryupgrade(l)
#define NODE_DOWNGRADE(l)   isc_rwlock_downgrade(l)
175 
/*%
 * Whether to rate-limit updating the LRU to avoid possible thread contention.
 * Updating LRU requires write locking, so we don't do it every time the
 * record is touched - only after some time passes.
 * The default of 1 can be overridden at compile time.
 */
#ifndef DNS_RBTDB_LIMITLRUUPDATE
#define DNS_RBTDB_LIMITLRUUPDATE 1
#endif

/*% Time after which we update LRU for glue records, 5 minutes */
#define DNS_RBTDB_LRUUPDATE_GLUE 300
/*% Time after which we update LRU for all other records, 10 minutes */
#define DNS_RBTDB_LRUUPDATE_REGULAR 600

/*
 * Allow clients with a virtual time of up to 5 minutes in the past to see
 * records that would otherwise have expired.
 */
#define RBTDB_VIRTUAL 300
195 
/*
 * Record attached to an rdatasetheader_t through its 'noqname' and
 * 'closest' pointers.
 *
 * NOTE(review): 'neg' and 'negsig' are untyped here; presumably they hold
 * the negative-proof data and its signature for 'name' - confirm against
 * the code that fills them in.
 */
struct noqname {
	dns_name_t name;
	void *neg;
	void *negsig;
	dns_rdatatype_t type;
};
202 
/*
 * Header describing one version of one rdataset stored at a tree node.
 * Headers are chained through 'next' (other types at the same node) and
 * 'down' (older versions of the same type).
 */
typedef struct rdatasetheader {
	/*%
	 * Locked by the owning node's lock.
	 */
	rbtdb_serial_t serial;
	dns_ttl_t rdh_ttl;
	rbtdb_rdatatype_t type;
	/* Bit set of RDATASET_ATTR_* values; accessed atomically. */
	atomic_uint_least16_t attributes;
	dns_trust_t trust;
	atomic_uint_fast32_t last_refresh_fail_ts;
	struct noqname *noqname;
	struct noqname *closest;
	unsigned int is_mmapped	      : 1;
	unsigned int next_is_relative : 1;
	unsigned int node_is_relative : 1;
	unsigned int resign_lsb	      : 1;
	/*%<
	 * We don't use the LIST macros, because the LIST structure has
	 * both head and tail pointers, and is doubly linked.
	 */

	struct rdatasetheader *next;
	/*%<
	 * If this is the top header for an rdataset, 'next' points
	 * to the top header for the next rdataset (i.e., the next type).
	 * Otherwise, it points up to the header whose down pointer points
	 * at this header.
	 */

	struct rdatasetheader *down;
	/*%<
	 * Points to the header for the next older version of
	 * this rdataset.
	 */

	atomic_uint_fast32_t count;
	/*%<
	 * Monotonically increased every time this rdataset is bound so that
	 * it is used as the base of the starting point in DNS responses
	 * when the "cyclic" rrset-order is required.
	 */

	dns_rbtnode_t *node;
	isc_stdtime_t last_used;
	ISC_LINK(struct rdatasetheader) link;

	unsigned int heap_index;
	/*%<
	 * Used for TTL-based cache cleaning.
	 */
	isc_stdtime_t resign;
	/*%<
	 * Case vector.  If the bit is set then the corresponding
	 * character in the owner name needs to have bit 0x20 cleared
	 * (i.e. be AND'd with ~0x20), rendering that character upper case.
	 * NOTE(review): the original text said "AND'd with 0x20", which
	 * cannot produce an upper case ASCII letter; confirm against the
	 * owner-case code.
	 */
	unsigned char upper[32];
} rdatasetheader_t;
261 
typedef ISC_LIST(rdatasetheader_t) rdatasetheaderlist_t;
typedef ISC_LIST(dns_rbtnode_t) rbtnodelist_t;

/*
 * Bit values for rdatasetheader_t.attributes.  Each bit has a matching
 * predicate macro (EXISTS(), STALE(), ...) defined after this list.
 */
#define RDATASET_ATTR_NONEXISTENT 0x0001
/*% May be potentially served as stale data. */
#define RDATASET_ATTR_STALE	     0x0002
#define RDATASET_ATTR_IGNORE	     0x0004
#define RDATASET_ATTR_RETAIN	     0x0008
#define RDATASET_ATTR_NXDOMAIN	     0x0010
#define RDATASET_ATTR_RESIGN	     0x0020
#define RDATASET_ATTR_STATCOUNT	     0x0040
#define RDATASET_ATTR_OPTOUT	     0x0080
#define RDATASET_ATTR_NEGATIVE	     0x0100
#define RDATASET_ATTR_PREFETCH	     0x0200
#define RDATASET_ATTR_CASESET	     0x0400
#define RDATASET_ATTR_ZEROTTL	     0x0800
#define RDATASET_ATTR_CASEFULLYLOWER 0x1000
/*% Ancient - awaiting cleanup. */
#define RDATASET_ATTR_ANCIENT	   0x2000
#define RDATASET_ATTR_STALE_WINDOW 0x4000

/*
 * XXX
 * When the cache will pre-expire data (due to memory low or other
 * situations) before the rdataset's TTL has expired, it MUST
 * respect the RETAIN bit and not expire the data until its TTL is
 * expired.
 */

#undef IGNORE /* WIN32 winbase.h defines this. */
292 
/*
 * Attribute predicates.  Each macro performs one atomic acquire-load of
 * 'header->attributes' and tests a single RDATASET_ATTR_* bit.  EXISTS() is
 * true when RDATASET_ATTR_NONEXISTENT is clear; every other macro is true
 * when its bit is set.
 */
#define EXISTS(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_NONEXISTENT) == 0)
#define NONEXISTENT(header)                            \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_NONEXISTENT) != 0)
#define IGNORE(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_IGNORE) != 0)
#define RETAIN(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_RETAIN) != 0)
#define NXDOMAIN(header)                               \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_NXDOMAIN) != 0)
#define STALE(header)                                                          \
	((atomic_load_acquire(&(header)->attributes) & RDATASET_ATTR_STALE) != \
	 0)
#define STALE_WINDOW(header)                           \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_STALE_WINDOW) != 0)
#define RESIGN(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_RESIGN) != 0)
#define OPTOUT(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_OPTOUT) != 0)
#define NEGATIVE(header)                               \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_NEGATIVE) != 0)
#define PREFETCH(header)                               \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_PREFETCH) != 0)
#define CASESET(header)                                \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_CASESET) != 0)
#define ZEROTTL(header)                                \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_ZEROTTL) != 0)
#define CASEFULLYLOWER(header)                         \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_CASEFULLYLOWER) != 0)
#define ANCIENT(header)                                \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_ANCIENT) != 0)
#define STATCOUNT(header)                              \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_STATCOUNT) != 0)
341 
/*
 * Generic accessors for 'header->attributes'.
 *
 * RDATASET_ATTR_GET(header, attribute) - atomically load the attributes and
 *	mask them with 'attribute'; non-zero if any requested bit is set.
 * RDATASET_ATTR_SET(header, attribute) - atomically OR 'attribute' in.
 * RDATASET_ATTR_CLR(header, attribute) - atomically clear 'attribute'.
 *
 * 'attribute' is parenthesized in every expansion so that a combined mask
 * such as (A | B) is applied as a whole rather than being split by
 * operator precedence.
 */
#define RDATASET_ATTR_GET(header, attribute) \
	(atomic_load_acquire(&(header)->attributes) & (attribute))
#define RDATASET_ATTR_SET(header, attribute) \
	atomic_fetch_or_release(&(header)->attributes, (attribute))
#define RDATASET_ATTR_CLR(header, attribute) \
	atomic_fetch_and_release(&(header)->attributes, ~(attribute))
348 
/*
 * True when 'header' has not yet expired relative to 'now': its rdh_ttl is
 * later than 'now', or equal to 'now' with the ZEROTTL attribute set.
 */
#define ACTIVE(header, now)             \
	(((header)->rdh_ttl > (now)) || \
	 ((header)->rdh_ttl == (now) && ZEROTTL(header)))

#define DEFAULT_NODE_LOCK_COUNT	    7 /*%< Should be prime. */
/* Glue table sizing, expressed as a number of hash bits. */
#define RBTDB_GLUE_TABLE_INIT_BITS  2U
#define RBTDB_GLUE_TABLE_MAX_BITS   32U
#define RBTDB_GLUE_TABLE_OVERCOMMIT 3

/* Multiplier used by hash_32(). */
#define GOLDEN_RATIO_32 0x61C88647
/* Number of buckets in a table indexed by 'bits' hash bits. */
#define HASHSIZE(bits)	(UINT64_C(1) << (bits))
360 
361 static uint32_t
362 hash_32(uint32_t val, unsigned int bits) {
363 	REQUIRE(bits <= RBTDB_GLUE_TABLE_MAX_BITS);
364 	/* High bits are more random. */
365 	return (val * GOLDEN_RATIO_32 >> (32 - bits));
366 }
367 
/* True when the iterator was created with the DNS_DB_EXPIREDOK option. */
#define EXPIREDOK(rbtiterator) \
	(((rbtiterator)->common.options & DNS_DB_EXPIREDOK) != 0)

/* True when the iterator was created with the DNS_DB_STALEOK option. */
#define STALEOK(rbtiterator) \
	(((rbtiterator)->common.options & DNS_DB_STALEOK) != 0)

/*%
 * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
 * Choosing this value is a tradeoff: if it is too small, it may cause
 * heavier contention between threads; if it is too large, the LRU purge
 * algorithm won't work well (entries tend to be purged prematurely).
 * The default value should work well for most environments, but this can
 * also be configured at compilation time via the
 * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
 * 1 due to the assumption of overmem_purge().
 */
#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
#else /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */
#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#endif /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */
#else  /* ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
#define DEFAULT_CACHE_NODE_LOCK_COUNT 17
#endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
393 
/*
 * One node-lock bucket: the lock itself, a reference counter and an
 * 'exiting' flag.  struct dns_rbtdb holds an array of these in 'node_locks'.
 */
typedef struct {
	nodelock_t lock;
	/* Protected in the refcount routines. */
	isc_refcount_t references;
	/* Locked by lock. */
	bool exiting;
} rbtdb_nodelock_t;
401 
/*
 * List element naming a node together with a 'dirty' flag; linked into
 * rbtdb_version_t.changed_list.
 * NOTE(review): presumably one entry per node touched by a version -
 * confirm against the code that builds the list.
 */
typedef struct rbtdb_changed {
	dns_rbtnode_t *node;
	bool dirty;
	ISC_LINK(struct rbtdb_changed) link;
} rbtdb_changed_t;

typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t;
409 
/* Value type of rbtdb_version_t.secure. */
typedef enum { dns_db_insecure, dns_db_partial, dns_db_secure } dns_db_secure_t;

typedef struct dns_rbtdb dns_rbtdb_t;

/* Reason for expiring a record from cache */
typedef enum { expire_lru, expire_ttl, expire_flush } expire_t;

typedef struct rbtdb_glue rbtdb_glue_t;

/*
 * Entry of the per-version glue table (rbtdb_version_t.glue_table): a
 * singly linked chain of nodes, each with its own glue list.
 */
typedef struct rbtdb_glue_table_node {
	struct rbtdb_glue_table_node *next;
	dns_rbtnode_t *node;
	rbtdb_glue_t *glue_list;
} rbtdb_glue_table_node_t;

/* Freshness classification of an rdataset's TTL. */
typedef enum {
	rdataset_ttl_fresh,
	rdataset_ttl_stale,
	rdataset_ttl_ancient
} rdataset_ttl_t;
430 
/*
 * One version of the database.  The comments inside the struct state which
 * lock, if any, protects each group of members.
 */
typedef struct rbtdb_version {
	/* Not locked */
	rbtdb_serial_t serial;
	dns_rbtdb_t *rbtdb;
	/*
	 * Protected in the refcount routines.
	 * XXXJT: should we change the lock policy based on the refcount
	 * performance?
	 */
	isc_refcount_t references;
	/* Locked by database lock. */
	bool writer;
	bool commit_ok;
	rbtdb_changedlist_t changed_list;
	rdatasetheaderlist_t resigned_list;
	ISC_LINK(struct rbtdb_version) link;
	dns_db_secure_t secure;
	bool havensec3;
	/* NSEC3 parameters */
	dns_hash_t hash;
	uint8_t flags;
	uint16_t iterations;
	uint8_t salt_length;
	unsigned char salt[DNS_NSEC3_SALTSIZE];

	/*
	 * records and xfrsize are covered by rwlock.
	 */
	isc_rwlock_t rwlock;
	uint64_t records;
	uint64_t xfrsize;

	/*
	 * Glue table; 'glue_table_bits' is its size expressed in hash bits.
	 * NOTE(review): presumably all three members are covered by
	 * glue_rwlock - confirm.
	 */
	isc_rwlock_t glue_rwlock;
	size_t glue_table_bits;
	size_t glue_table_nodecount;
	rbtdb_glue_table_node_t **glue_table;
} rbtdb_version_t;

typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t;
470 
/*
 * The database object itself.  'common' is the generic dns_db_t part; the
 * same structure serves both cache and zone databases (see IS_CACHE()).
 * The comments inside the struct state which lock protects each group of
 * members.
 */
struct dns_rbtdb {
	/* Unlocked. */
	dns_db_t common;
	/* Locks the data in this struct */
	isc_rwlock_t lock;
	/* Locks the tree structure (prevents nodes appearing/disappearing) */
	isc_rwlock_t tree_lock;
	/* Locks for individual tree nodes */
	unsigned int node_lock_count;
	rbtdb_nodelock_t *node_locks;
	dns_rbtnode_t *origin_node;
	dns_rbtnode_t *nsec3_origin_node;
	dns_stats_t *rrsetstats;     /* cache DB only */
	isc_stats_t *cachestats;     /* cache DB only */
	isc_stats_t *gluecachestats; /* zone DB only */
	/* Locked by lock. */
	unsigned int active;
	isc_refcount_t references;
	unsigned int attributes;
	rbtdb_serial_t current_serial;
	rbtdb_serial_t least_serial;
	rbtdb_serial_t next_serial;
	rbtdb_version_t *current_version;
	rbtdb_version_t *future_version;
	rbtdb_versionlist_t open_versions;
	isc_task_t *task;
	dns_dbnode_t *soanode;
	dns_dbnode_t *nsnode;

	/*
	 * Maximum length of time to keep using a stale answer past its
	 * normal TTL expiry.
	 */
	dns_ttl_t serve_stale_ttl;

	/*
	 * The time after a failed lookup, where stale answers from cache
	 * may be used directly in a DNS response without attempting a
	 * new iterative lookup.
	 */
	uint32_t serve_stale_refresh;

	/*
	 * This is a linked list used to implement the LRU cache.  There will
	 * be node_lock_count linked lists here.  Nodes in bucket 1 will be
	 * placed on the linked list rdatasets[1].
	 */
	rdatasetheaderlist_t *rdatasets;

	/*%
	 * Temporary storage for stale cache nodes and dynamically deleted
	 * nodes that await being cleaned up.
	 */
	rbtnodelist_t *deadnodes;

	/*
	 * Heaps.  These are used for TTL based expiry in a cache,
	 * or for zone resigning in a zone DB.  hmctx is the memory
	 * context to use for the heap (which differs from the main
	 * database memory context in the case of a cache).
	 */
	isc_mem_t *hmctx;
	isc_heap_t **heaps;

	/*
	 * Base values for the mmap() code.
	 */
	void *mmap_location;
	size_t mmap_size;

	/* Locked by tree_lock. */
	dns_rbt_t *tree;
	dns_rbt_t *nsec;
	dns_rbt_t *nsec3;

	/* Unlocked */
	unsigned int quantum;
};
549 
/*
 * Attribute bits.
 * NOTE(review): presumably stored in dns_rbtdb.attributes - confirm.
 */
#define RBTDB_ATTR_LOADED  0x01
#define RBTDB_ATTR_LOADING 0x02

/* True when a non-zero serve_stale_ttl has been configured for 'rbtdb'. */
#define KEEPSTALE(rbtdb) ((rbtdb)->serve_stale_ttl > 0)
554 
/*%
 * Search Context
 *
 * Working state for a single lookup: the database and version being
 * searched, the caller's options, the tree chain being walked, and any
 * zone cut recorded so far.
 */
typedef struct {
	dns_rbtdb_t *rbtdb;
	rbtdb_version_t *rbtversion;
	rbtdb_serial_t serial;
	unsigned int options;
	dns_rbtnodechain_t chain;
	bool copy_name;
	bool need_cleanup;
	bool wild;
	dns_rbtnode_t *zonecut;
	rdatasetheader_t *zonecut_rdataset;
	rdatasetheader_t *zonecut_sigrdataset;
	dns_fixedname_t zonecut_name;
	isc_stdtime_t now;
} rbtdb_search_t;

/*%
 * Load Context
 *
 * The database being loaded and a timestamp.
 */
typedef struct {
	dns_rbtdb_t *rbtdb;
	isc_stdtime_t now;
} rbtdb_load_t;
581 
/*
 * Forward declarations of file-local functions.  The rdataset_* functions
 * are the entries of the 'rdataset_methods' and 'slab_methods' tables
 * defined below.
 */
static void
delete_callback(void *data, void *arg);
static void
rdataset_disassociate(dns_rdataset_t *rdataset);
static isc_result_t
rdataset_first(dns_rdataset_t *rdataset);
static isc_result_t
rdataset_next(dns_rdataset_t *rdataset);
static void
rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
static void
rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
static unsigned int
rdataset_count(dns_rdataset_t *rdataset);
static isc_result_t
rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
		    dns_rdataset_t *neg, dns_rdataset_t *negsig);
static isc_result_t
rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name,
		    dns_rdataset_t *neg, dns_rdataset_t *negsig);
static bool
need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now);
static void
update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now);
static void
expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked,
	      expire_t reason);
static void
overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, size_t purgesize,
	      bool tree_locked);
static void
resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader);
static void
resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
	      rdatasetheader_t *header);
static void
prune_tree(isc_task_t *task, isc_event_t *event);
static void
rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust);
static void
rdataset_expire(dns_rdataset_t *rdataset);
static void
rdataset_clearprefetch(dns_rdataset_t *rdataset);
static void
rdataset_setownercase(dns_rdataset_t *rdataset, const dns_name_t *name);
static void
rdataset_getownercase(const dns_rdataset_t *rdataset, dns_name_t *name);
static isc_result_t
rdataset_addglue(dns_rdataset_t *rdataset, dns_dbversion_t *version,
		 dns_message_t *msg);
static void
free_gluetable(rbtdb_version_t *version);
static isc_result_t
nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name);
636 
/*
 * Method table with every slot filled in except addnoqname and addclosest.
 */
static dns_rdatasetmethods_t rdataset_methods = { rdataset_disassociate,
						  rdataset_first,
						  rdataset_next,
						  rdataset_current,
						  rdataset_clone,
						  rdataset_count,
						  NULL, /* addnoqname */
						  rdataset_getnoqname,
						  NULL, /* addclosest */
						  rdataset_getclosest,
						  rdataset_settrust,
						  rdataset_expire,
						  rdataset_clearprefetch,
						  rdataset_setownercase,
						  rdataset_getownercase,
						  rdataset_addglue };

/*
 * Reduced method table: only the first six slots (disassociate, first,
 * next, current, clone, count) are implemented; every other slot is NULL.
 */
static dns_rdatasetmethods_t slab_methods = {
	rdataset_disassociate,
	rdataset_first,
	rdataset_next,
	rdataset_current,
	rdataset_clone,
	rdataset_count,
	NULL, /* addnoqname */
	NULL, /* getnoqname */
	NULL, /* addclosest */
	NULL, /* getclosest */
	NULL, /* settrust */
	NULL, /* expire */
	NULL, /* clearprefetch */
	NULL, /* setownercase */
	NULL, /* getownercase */
	NULL  /* addglue */
};
672 
/* Rdataset iterator methods, collected in 'rdatasetiter_methods' below. */
static void
rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
static isc_result_t
rdatasetiter_first(dns_rdatasetiter_t *iterator);
static isc_result_t
rdatasetiter_next(dns_rdatasetiter_t *iterator);
static void
rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset);

static dns_rdatasetitermethods_t rdatasetiter_methods = {
	rdatasetiter_destroy, rdatasetiter_first, rdatasetiter_next,
	rdatasetiter_current
};

/*
 * Rdataset iterator state: the generic iterator part followed by the
 * header the iterator currently points at.
 */
typedef struct rbtdb_rdatasetiter {
	dns_rdatasetiter_t common;
	rdatasetheader_t *current;
} rbtdb_rdatasetiter_t;
691 
/*
 * Database iterator methods, collected in 'dbiterator_methods' below.
 *
 * Note that these iterators, unless created with either DNS_DB_NSEC3ONLY or
 * DNS_DB_NONSEC3, will transparently move between the last node of the
 * "regular" RBT ("chain" field) and the root node of the NSEC3 RBT
 * ("nsec3chain" field) of the database in question, as if the latter was a
 * successor to the former in lexical order.  The "current" field always holds
 * the address of either "chain" or "nsec3chain", depending on which RBT is
 * being traversed at given time.
 */
static void
dbiterator_destroy(dns_dbiterator_t **iteratorp);
static isc_result_t
dbiterator_first(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_last(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_seek(dns_dbiterator_t *iterator, const dns_name_t *name);
static isc_result_t
dbiterator_prev(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_next(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
		   dns_name_t *name);
static isc_result_t
dbiterator_pause(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name);

static dns_dbiteratormethods_t dbiterator_methods = {
	dbiterator_destroy, dbiterator_first, dbiterator_last,
	dbiterator_seek,    dbiterator_prev,  dbiterator_next,
	dbiterator_current, dbiterator_pause, dbiterator_origin
};
726 
/* Capacity of the 'deletions' array in rbtdb_dbiterator_t. */
#define DELETION_BATCH_MAX 64

/*
 * Database iterator state.  'chain' and 'nsec3chain' are the two tree
 * chains and 'current' points at one of them; 'deletions' holds up to
 * DELETION_BATCH_MAX nodes, counted by 'delcnt'.
 *
 * If 'paused' is true, then the tree lock is not being held.
 */
typedef struct rbtdb_dbiterator {
	dns_dbiterator_t common;
	bool paused;
	bool new_origin;
	isc_rwlocktype_t tree_locked;
	isc_result_t result;
	dns_fixedname_t name;
	dns_fixedname_t origin;
	dns_rbtnodechain_t chain;
	dns_rbtnodechain_t nsec3chain;
	dns_rbtnodechain_t *current;
	dns_rbtnode_t *node;
	dns_rbtnode_t *deletions[DELETION_BATCH_MAX];
	int delcnt;
	bool nsec3only;
	bool nonsec3;
} rbtdb_dbiterator_t;
749 
/* Database kind tests, based on the attribute bits in 'common'. */
#define IS_STUB(rbtdb)	(((rbtdb)->common.attributes & DNS_DBATTR_STUB) != 0)
#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)

/* Further forward declarations of file-local functions. */
static void
free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event);
static void
overmem(dns_db_t *db, bool over);
static void
setnsec3parameters(dns_db_t *db, rbtdb_version_t *version);
static void
setownercase(rdatasetheader_t *header, const dns_name_t *name);

static bool
match_header_version(rbtdb_file_header_t *header);
764 
/* Pad to 32 bytes */
static char FILE_VERSION[32] = "\0";

/*%
 * 'init_count' is used to initialize 'newheader->count' which in turn
 * is used to determine where in the cycle rrset-order cyclic starts.
 * We don't lock this as we don't care about simultaneous updates.
 *
 * Note:
 *      Both init_count and header->count can be UINT32_MAX.
 *      The count on the returned rdataset however can't be as
 *      that indicates that the database does not implement cyclic
 *      processing.
 */
static atomic_uint_fast32_t init_count = 0;
780 
781 /*
782  * Locking
783  *
784  * If a routine is going to lock more than one lock in this module, then
785  * the locking must be done in the following order:
786  *
787  *      Tree Lock
788  *
789  *      Node Lock       (Only one from the set may be locked at one time by
790  *                       any caller)
791  *
792  *      Database Lock
793  *
794  * Failure to follow this hierarchy can result in deadlock.
795  */
796 
797 /*
798  * Deleting Nodes
799  *
800  * For zone databases the node for the origin of the zone MUST NOT be deleted.
801  */
802 
803 /*
804  * Debugging routines
805  */
#ifdef DEBUG
/*
 * Print 'desc', a colon, the 'size' bytes at 'data' rendered as hexadecimal
 * text, and a newline to stderr.  The data is converted in pieces of at
 * most BUFSIZ bytes so that a fixed-size text buffer is sufficient.
 */
static void
hexdump(const char *desc, unsigned char *data, size_t size) {
	char text[BUFSIZ * 2 + 1];

	fprintf(stderr, "%s: ", desc);
	do {
		isc_buffer_t out;
		isc_region_t chunk;
		isc_result_t result;
		size_t step = (size > BUFSIZ) ? BUFSIZ : size;

		isc_buffer_init(&out, text, sizeof(text));
		chunk.base = data;
		chunk.length = step;
		result = isc_hex_totext(&chunk, 0, "", &out);
		RUNTIME_CHECK(result == ISC_R_SUCCESS);
		/* NUL-terminate the text so it can be printed with %s. */
		isc_buffer_putuint8(&out, 0);
		fprintf(stderr, "%s", text);

		data += step;
		size -= step;
	} while (size > 0);
	fprintf(stderr, "\n");
}
#endif /* ifdef DEBUG */
830 
/* Fixed RRSet helper macros */

/*
 * Byte counts used when stepping through raw rdataset data.
 * NOTE(review): presumably LENGTH is the size of a length field, ORDER the
 * size of a per-record order field and COUNT the size of an offset table -
 * confirm against the slab-walking code.
 *
 * DNS_RDATASET_LENGTH must not end in a semicolon: it is a value and has to
 * remain usable inside larger expressions.
 * DNS_RDATASET_COUNT refers to a variable named 'count' at the point of use
 * when DNS_RDATASET_FIXED is enabled.
 */
#define DNS_RDATASET_LENGTH 2

#if DNS_RDATASET_FIXED
#define DNS_RDATASET_ORDER 2
#define DNS_RDATASET_COUNT (count * 4)
#else /* !DNS_RDATASET_FIXED */
#define DNS_RDATASET_ORDER 0
#define DNS_RDATASET_COUNT 0
#endif /* DNS_RDATASET_FIXED */
842 
843 /*
844  * DB Routines
845  */
846 
847 static void
848 attach(dns_db_t *source, dns_db_t **targetp) {
849 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
850 
851 	REQUIRE(VALID_RBTDB(rbtdb));
852 
853 	isc_refcount_increment(&rbtdb->references);
854 
855 	*targetp = source;
856 }
857 
858 static void
859 free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
860 	dns_rbtdb_t *rbtdb = event->ev_arg;
861 
862 	UNUSED(task);
863 
864 	free_rbtdb(rbtdb, true, event);
865 }
866 
867 static void
868 update_cachestats(dns_rbtdb_t *rbtdb, isc_result_t result) {
869 	INSIST(IS_CACHE(rbtdb));
870 
871 	if (rbtdb->cachestats == NULL) {
872 		return;
873 	}
874 
875 	switch (result) {
876 	case ISC_R_SUCCESS:
877 	case DNS_R_CNAME:
878 	case DNS_R_DNAME:
879 	case DNS_R_DELEGATION:
880 	case DNS_R_NCACHENXDOMAIN:
881 	case DNS_R_NCACHENXRRSET:
882 		isc_stats_increment(rbtdb->cachestats,
883 				    dns_cachestatscounter_hits);
884 		break;
885 	default:
886 		isc_stats_increment(rbtdb->cachestats,
887 				    dns_cachestatscounter_misses);
888 	}
889 }
890 
891 static bool
892 do_stats(rdatasetheader_t *header) {
893 	return (EXISTS(header) && STATCOUNT(header));
894 }
895 
896 static void
897 update_rrsetstats(dns_rbtdb_t *rbtdb, const rbtdb_rdatatype_t htype,
898 		  const uint_least16_t hattributes, const bool increment) {
899 	dns_rdatastatstype_t statattributes = 0;
900 	dns_rdatastatstype_t base = 0;
901 	dns_rdatastatstype_t type;
902 	rdatasetheader_t *header = &(rdatasetheader_t){
903 		.type = htype,
904 		.attributes = hattributes,
905 	};
906 
907 	if (!do_stats(header)) {
908 		return;
909 	}
910 
911 	/* At the moment we count statistics only for cache DB */
912 	INSIST(IS_CACHE(rbtdb));
913 
914 	if (NEGATIVE(header)) {
915 		if (NXDOMAIN(header)) {
916 			statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
917 		} else {
918 			statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
919 			base = RBTDB_RDATATYPE_EXT(header->type);
920 		}
921 	} else {
922 		base = RBTDB_RDATATYPE_BASE(header->type);
923 	}
924 
925 	if (STALE(header)) {
926 		statattributes |= DNS_RDATASTATSTYPE_ATTR_STALE;
927 	}
928 	if (ANCIENT(header)) {
929 		statattributes |= DNS_RDATASTATSTYPE_ATTR_ANCIENT;
930 	}
931 
932 	type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
933 	if (increment) {
934 		dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
935 	} else {
936 		dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
937 	}
938 }
939 
940 static void
941 set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
942 	int idx;
943 	isc_heap_t *heap;
944 	dns_ttl_t oldttl;
945 
946 	if (!IS_CACHE(rbtdb)) {
947 		header->rdh_ttl = newttl;
948 		return;
949 	}
950 
951 	oldttl = header->rdh_ttl;
952 	header->rdh_ttl = newttl;
953 
954 	/*
955 	 * It's possible the rbtdb is not a cache.  If this is the case,
956 	 * we will not have a heap, and we move on.  If we do, though,
957 	 * we might need to adjust things.
958 	 */
959 	if (header->heap_index == 0 || newttl == oldttl) {
960 		return;
961 	}
962 	idx = header->node->locknum;
963 	if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL) {
964 		return;
965 	}
966 	heap = rbtdb->heaps[idx];
967 
968 	if (newttl < oldttl) {
969 		isc_heap_increased(heap, header->heap_index);
970 	} else {
971 		isc_heap_decreased(heap, header->heap_index);
972 	}
973 }
974 
975 /*%
976  * These functions allow the heap code to rank the priority of each
977  * element.  It returns true if v1 happens "sooner" than v2.
978  */
979 static bool
980 ttl_sooner(void *v1, void *v2) {
981 	rdatasetheader_t *h1 = v1;
982 	rdatasetheader_t *h2 = v2;
983 
984 	return (h1->rdh_ttl < h2->rdh_ttl);
985 }
986 
987 /*%
988  * Return which RRset should be resigned sooner.  If the RRsets have the
989  * same signing time, prefer the other RRset over the SOA RRset.
990  */
991 static bool
992 resign_sooner(void *v1, void *v2) {
993 	rdatasetheader_t *h1 = v1;
994 	rdatasetheader_t *h2 = v2;
995 
996 	return (h1->resign < h2->resign ||
997 		(h1->resign == h2->resign && h1->resign_lsb < h2->resign_lsb) ||
998 		(h1->resign == h2->resign && h1->resign_lsb == h2->resign_lsb &&
999 		 h2->type == RBTDB_RDATATYPE_SIGSOA));
1000 }
1001 
1002 /*%
1003  * This function sets the heap index into the header.
1004  */
1005 static void
1006 set_index(void *what, unsigned int idx) {
1007 	rdatasetheader_t *h = what;
1008 
1009 	h->heap_index = idx;
1010 }
1011 
1012 /*%
1013  * Work out how many nodes can be deleted in the time between two
1014  * requests to the nameserver.  Smooth the resulting number and use it
1015  * as a estimate for the number of nodes to be deleted in the next
1016  * iteration.
1017  */
1018 static unsigned int
1019 adjust_quantum(unsigned int old, isc_time_t *start) {
1020 	unsigned int pps = dns_pps; /* packets per second */
1021 	unsigned int interval;
1022 	uint64_t usecs;
1023 	isc_time_t end;
1024 	unsigned int nodes;
1025 
1026 	if (pps < 100) {
1027 		pps = 100;
1028 	}
1029 	isc_time_now(&end);
1030 
1031 	interval = 1000000 / pps; /* interval in usec */
1032 	if (interval == 0) {
1033 		interval = 1;
1034 	}
1035 	usecs = isc_time_microdiff(&end, start);
1036 	if (usecs == 0) {
1037 		/*
1038 		 * We were unable to measure the amount of time taken.
1039 		 * Double the nodes deleted next time.
1040 		 */
1041 		old *= 2;
1042 		if (old > 1000) {
1043 			old = 1000;
1044 		}
1045 		return (old);
1046 	}
1047 	nodes = old * interval;
1048 	nodes /= (unsigned int)usecs;
1049 	if (nodes == 0) {
1050 		nodes = 1;
1051 	} else if (nodes > 1000) {
1052 		nodes = 1000;
1053 	}
1054 
1055 	/* Smooth */
1056 	nodes = (nodes + old * 3) / 4;
1057 
1058 	if (nodes != old) {
1059 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1060 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1061 			      "adjust_quantum: old=%d, new=%d", old, nodes);
1062 	}
1063 
1064 	return (nodes);
1065 }
1066 
/*
 * Tear down 'rbtdb' and release the memory it owns.
 *
 * The three trees (tree, nsec, nsec3) are destroyed with
 * dns_rbt_destroy2() using rbtdb->quantum.  When that call returns
 * ISC_R_QUOTA, an event carrying free_rbtdb_callback is sent to
 * rbtdb->task and this function returns early; presumably the callback
 * re-enters this function with the event to continue the work (the
 * callback is not visible here -- confirm).
 *
 * 'log':   when true, a "done free_rbtdb(...)" debug message is written
 *          once all trees are gone.
 * 'event': NULL on a fresh call, in which case the quantum is
 *          (re)initialised; otherwise the event is reused for any
 *          further deferral and freed when the trees are gone.
 */
static void
free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
	unsigned int i;
	isc_result_t result;
	char buf[DNS_NAME_FORMATSIZE];
	dns_rbt_t **treep;
	isc_time_t start;

	/* Note: (bool)-1 converts to true. */
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
		overmem((dns_db_t *)rbtdb, (bool)-1);
	}

	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
	REQUIRE(rbtdb->future_version == NULL);

	/* Drop the remaining reference on the current version and free it. */
	if (rbtdb->current_version != NULL) {
		isc_refcount_decrementz(&rbtdb->current_version->references);
		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
		isc_rwlock_destroy(&rbtdb->current_version->glue_rwlock);
		isc_refcount_destroy(&rbtdb->current_version->references);
		isc_rwlock_destroy(&rbtdb->current_version->rwlock);
		isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
			    sizeof(rbtdb_version_t));
	}

	/*
	 * We assume the number of remaining dead nodes is reasonably small;
	 * the overhead of unlinking all nodes here should be negligible.
	 */
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		dns_rbtnode_t *node;

		node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
		while (node != NULL) {
			ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
			node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
		}
	}

	/*
	 * Fresh call: start with a quantum of 100 when there is a task
	 * to defer work to, or 0 when there is none.
	 */
	if (event == NULL) {
		rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
	}

	for (;;) {
		/*
		 * pick the next tree to (start to) destroy
		 */
		treep = &rbtdb->tree;
		if (*treep == NULL) {
			treep = &rbtdb->nsec;
			if (*treep == NULL) {
				treep = &rbtdb->nsec3;
				/*
				 * we're finished after clear cutting
				 */
				if (*treep == NULL) {
					break;
				}
			}
		}

		isc_time_now(&start);
		result = dns_rbt_destroy2(treep, rbtdb->quantum);
		if (result == ISC_R_QUOTA) {
			/*
			 * The tree was not fully destroyed: retune the
			 * quantum and hand the rest of the work to the task.
			 */
			INSIST(rbtdb->task != NULL);
			if (rbtdb->quantum != 0) {
				rbtdb->quantum = adjust_quantum(rbtdb->quantum,
								&start);
			}
			if (event == NULL) {
				event = isc_event_allocate(
					rbtdb->common.mctx, NULL,
					DNS_EVENT_FREESTORAGE,
					free_rbtdb_callback, rbtdb,
					sizeof(isc_event_t));
			}
			isc_task_send(rbtdb->task, &event);
			return;
		}
		INSIST(result == ISC_R_SUCCESS && *treep == NULL);
	}

	/* All trees are gone; the deferral event is no longer needed. */
	if (event != NULL) {
		isc_event_free(&event);
	}
	if (log) {
		if (dns_name_dynamic(&rbtdb->common.origin)) {
			dns_name_format(&rbtdb->common.origin, buf,
					sizeof(buf));
		} else {
			strlcpy(buf, "<UNKNOWN>", sizeof(buf));
		}
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
			      "done free_rbtdb(%s)", buf);
	}
	if (dns_name_dynamic(&rbtdb->common.origin)) {
		dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
	}
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		isc_refcount_destroy(&rbtdb->node_locks[i].references);
		NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
	}

	/*
	 * Clean up LRU / re-signing order lists.
	 */
	if (rbtdb->rdatasets != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++) {
			INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
		}
		isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
			    rbtdb->node_lock_count *
				    sizeof(rdatasetheaderlist_t));
	}
	/*
	 * Clean up dead node buckets.
	 */
	if (rbtdb->deadnodes != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++) {
			INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
		}
		isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
			    rbtdb->node_lock_count * sizeof(rbtnodelist_t));
	}
	/*
	 * Clean up heap objects.  Note that the heap array is returned
	 * to 'hmctx', not to the common memory context.
	 */
	if (rbtdb->heaps != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++) {
			isc_heap_destroy(&rbtdb->heaps[i]);
		}
		isc_mem_put(rbtdb->hmctx, rbtdb->heaps,
			    rbtdb->node_lock_count * sizeof(isc_heap_t *));
	}

	if (rbtdb->rrsetstats != NULL) {
		dns_stats_detach(&rbtdb->rrsetstats);
	}
	if (rbtdb->cachestats != NULL) {
		isc_stats_detach(&rbtdb->cachestats);
	}
	if (rbtdb->gluecachestats != NULL) {
		isc_stats_detach(&rbtdb->gluecachestats);
	}

	isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
	isc_rwlock_destroy(&rbtdb->tree_lock);
	isc_refcount_destroy(&rbtdb->references);
	if (rbtdb->task != NULL) {
		isc_task_detach(&rbtdb->task);
	}

	RBTDB_DESTROYLOCK(&rbtdb->lock);
	/* Clear the magic numbers so the structure no longer validates. */
	rbtdb->common.magic = 0;
	rbtdb->common.impmagic = 0;
	isc_mem_detach(&rbtdb->hmctx);

	if (rbtdb->mmap_location != NULL) {
		isc_file_munmap(rbtdb->mmap_location, (size_t)rbtdb->mmap_size);
	}

	INSIST(ISC_LIST_EMPTY(rbtdb->common.update_listeners));

	/* Finally release the database structure itself. */
	isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
}
1234 
1235 static void
1236 maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
1237 	bool want_free = false;
1238 	unsigned int i;
1239 	unsigned int inactive = 0;
1240 
1241 	/* XXX check for open versions here */
1242 
1243 	if (rbtdb->soanode != NULL) {
1244 		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
1245 	}
1246 	if (rbtdb->nsnode != NULL) {
1247 		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
1248 	}
1249 
1250 	/*
1251 	 * The current version's glue table needs to be freed early
1252 	 * so the nodes are dereferenced before we check the active
1253 	 * node count below.
1254 	 */
1255 	if (rbtdb->current_version != NULL) {
1256 		free_gluetable(rbtdb->current_version);
1257 	}
1258 
1259 	/*
1260 	 * Even though there are no external direct references, there still
1261 	 * may be nodes in use.
1262 	 */
1263 	for (i = 0; i < rbtdb->node_lock_count; i++) {
1264 		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
1265 		rbtdb->node_locks[i].exiting = true;
1266 		if (isc_refcount_current(&rbtdb->node_locks[i].references) == 0)
1267 		{
1268 			inactive++;
1269 		}
1270 		NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
1271 	}
1272 
1273 	if (inactive != 0) {
1274 		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1275 		rbtdb->active -= inactive;
1276 		if (rbtdb->active == 0) {
1277 			want_free = true;
1278 		}
1279 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1280 		if (want_free) {
1281 			char buf[DNS_NAME_FORMATSIZE];
1282 			if (dns_name_dynamic(&rbtdb->common.origin)) {
1283 				dns_name_format(&rbtdb->common.origin, buf,
1284 						sizeof(buf));
1285 			} else {
1286 				strlcpy(buf, "<UNKNOWN>", sizeof(buf));
1287 			}
1288 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1289 				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1290 				      "calling free_rbtdb(%s)", buf);
1291 			free_rbtdb(rbtdb, true, NULL);
1292 		}
1293 	}
1294 }
1295 
1296 static void
1297 detach(dns_db_t **dbp) {
1298 	REQUIRE(dbp != NULL && VALID_RBTDB((dns_rbtdb_t *)(*dbp)));
1299 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
1300 	*dbp = NULL;
1301 
1302 	if (isc_refcount_decrement(&rbtdb->references) == 1) {
1303 		maybe_free_rbtdb(rbtdb);
1304 	}
1305 }
1306 
1307 static void
1308 currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
1309 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1310 	rbtdb_version_t *version;
1311 
1312 	REQUIRE(VALID_RBTDB(rbtdb));
1313 
1314 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1315 	version = rbtdb->current_version;
1316 	isc_refcount_increment(&version->references);
1317 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1318 
1319 	*versionp = (dns_dbversion_t *)version;
1320 }
1321 
1322 static rbtdb_version_t *
1323 allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
1324 		 unsigned int references, bool writer) {
1325 	rbtdb_version_t *version;
1326 	size_t size;
1327 
1328 	version = isc_mem_get(mctx, sizeof(*version));
1329 	version->serial = serial;
1330 
1331 	isc_refcount_init(&version->references, references);
1332 	isc_rwlock_init(&version->glue_rwlock, 0, 0);
1333 
1334 	version->glue_table_bits = RBTDB_GLUE_TABLE_INIT_BITS;
1335 	version->glue_table_nodecount = 0U;
1336 
1337 	size = HASHSIZE(version->glue_table_bits) *
1338 	       sizeof(version->glue_table[0]);
1339 	version->glue_table = isc_mem_get(mctx, size);
1340 	memset(version->glue_table, 0, size);
1341 
1342 	version->writer = writer;
1343 	version->commit_ok = false;
1344 	ISC_LIST_INIT(version->changed_list);
1345 	ISC_LIST_INIT(version->resigned_list);
1346 	ISC_LINK_INIT(version, link);
1347 
1348 	return (version);
1349 }
1350 
1351 static isc_result_t
1352 newversion(dns_db_t *db, dns_dbversion_t **versionp) {
1353 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1354 	rbtdb_version_t *version;
1355 
1356 	REQUIRE(VALID_RBTDB(rbtdb));
1357 	REQUIRE(versionp != NULL && *versionp == NULL);
1358 	REQUIRE(rbtdb->future_version == NULL);
1359 
1360 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1361 	RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? */
1362 	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
1363 				   true);
1364 	version->rbtdb = rbtdb;
1365 	version->commit_ok = true;
1366 	version->secure = rbtdb->current_version->secure;
1367 	version->havensec3 = rbtdb->current_version->havensec3;
1368 	if (version->havensec3) {
1369 		version->flags = rbtdb->current_version->flags;
1370 		version->iterations = rbtdb->current_version->iterations;
1371 		version->hash = rbtdb->current_version->hash;
1372 		version->salt_length = rbtdb->current_version->salt_length;
1373 		memmove(version->salt, rbtdb->current_version->salt,
1374 			version->salt_length);
1375 	} else {
1376 		version->flags = 0;
1377 		version->iterations = 0;
1378 		version->hash = 0;
1379 		version->salt_length = 0;
1380 		memset(version->salt, 0, sizeof(version->salt));
1381 	}
1382 	isc_rwlock_init(&version->rwlock, 0, 0);
1383 	RWLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
1384 	version->records = rbtdb->current_version->records;
1385 	version->xfrsize = rbtdb->current_version->xfrsize;
1386 	RWUNLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
1387 	rbtdb->next_serial++;
1388 	rbtdb->future_version = version;
1389 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1390 
1391 	*versionp = version;
1392 
1393 	return (ISC_R_SUCCESS);
1394 }
1395 
1396 static void
1397 attachversion(dns_db_t *db, dns_dbversion_t *source,
1398 	      dns_dbversion_t **targetp) {
1399 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1400 	rbtdb_version_t *rbtversion = source;
1401 
1402 	REQUIRE(VALID_RBTDB(rbtdb));
1403 	INSIST(rbtversion != NULL && rbtversion->rbtdb == rbtdb);
1404 
1405 	isc_refcount_increment(&rbtversion->references);
1406 
1407 	*targetp = rbtversion;
1408 }
1409 
1410 static rbtdb_changed_t *
1411 add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version, dns_rbtnode_t *node) {
1412 	rbtdb_changed_t *changed;
1413 
1414 	/*
1415 	 * Caller must be holding the node lock if its reference must be
1416 	 * protected by the lock.
1417 	 */
1418 
1419 	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
1420 
1421 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1422 
1423 	REQUIRE(version->writer);
1424 
1425 	if (changed != NULL) {
1426 		isc_refcount_increment(&node->references);
1427 		changed->node = node;
1428 		changed->dirty = false;
1429 		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
1430 	} else {
1431 		version->commit_ok = false;
1432 	}
1433 
1434 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1435 
1436 	return (changed);
1437 }
1438 
1439 static void
1440 free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
1441 	if (dns_name_dynamic(&(*noqname)->name)) {
1442 		dns_name_free(&(*noqname)->name, mctx);
1443 	}
1444 	if ((*noqname)->neg != NULL) {
1445 		isc_mem_put(mctx, (*noqname)->neg,
1446 			    dns_rdataslab_size((*noqname)->neg, 0));
1447 	}
1448 	if ((*noqname)->negsig != NULL) {
1449 		isc_mem_put(mctx, (*noqname)->negsig,
1450 			    dns_rdataslab_size((*noqname)->negsig, 0));
1451 	}
1452 	isc_mem_put(mctx, *noqname, sizeof(**noqname));
1453 	*noqname = NULL;
1454 }
1455 
1456 static void
1457 init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h) {
1458 	ISC_LINK_INIT(h, link);
1459 	h->heap_index = 0;
1460 	h->is_mmapped = 0;
1461 	h->next_is_relative = 0;
1462 	h->node_is_relative = 0;
1463 	atomic_init(&h->attributes, 0);
1464 	atomic_init(&h->last_refresh_fail_ts, 0);
1465 
1466 	STATIC_ASSERT((sizeof(h->attributes) == 2),
1467 		      "The .attributes field of rdatasetheader_t needs to be "
1468 		      "16-bit int type exactly.");
1469 
1470 #if TRACE_HEADER
1471 	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
1472 		fprintf(stderr, "initialized header: %p\n", h);
1473 	}
1474 #else  /* if TRACE_HEADER */
1475 	UNUSED(rbtdb);
1476 #endif /* if TRACE_HEADER */
1477 }
1478 
1479 /*
1480  * Update the copied values of 'next' and 'node' if they are relative.
1481  */
1482 static void
1483 update_newheader(rdatasetheader_t *newh, rdatasetheader_t *old) {
1484 	char *p;
1485 
1486 	if (old->next_is_relative) {
1487 		p = (char *)old;
1488 		p += (uintptr_t)old->next;
1489 		newh->next = (rdatasetheader_t *)p;
1490 	}
1491 	if (old->node_is_relative) {
1492 		p = (char *)old;
1493 		p += (uintptr_t)old->node;
1494 		newh->node = (dns_rbtnode_t *)p;
1495 	}
1496 	if (CASESET(old)) {
1497 		uint_least16_t attr = RDATASET_ATTR_GET(
1498 			old,
1499 			(RDATASET_ATTR_CASESET | RDATASET_ATTR_CASEFULLYLOWER));
1500 		RDATASET_ATTR_SET(newh, attr);
1501 		memmove(newh->upper, old->upper, sizeof(old->upper));
1502 	}
1503 }
1504 
1505 static rdatasetheader_t *
1506 new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx) {
1507 	rdatasetheader_t *h;
1508 
1509 	h = isc_mem_get(mctx, sizeof(*h));
1510 
1511 #if TRACE_HEADER
1512 	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
1513 		fprintf(stderr, "allocated header: %p\n", h);
1514 	}
1515 #endif /* if TRACE_HEADER */
1516 	memset(h->upper, 0xeb, sizeof(h->upper));
1517 	init_rdataset(rbtdb, h);
1518 	h->rdh_ttl = 0;
1519 	return (h);
1520 }
1521 
1522 static void
1523 free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset) {
1524 	unsigned int size;
1525 	int idx;
1526 
1527 	update_rrsetstats(rbtdb, rdataset->type,
1528 			  atomic_load_acquire(&rdataset->attributes), false);
1529 
1530 	idx = rdataset->node->locknum;
1531 	if (ISC_LINK_LINKED(rdataset, link)) {
1532 		INSIST(IS_CACHE(rbtdb));
1533 		ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, link);
1534 	}
1535 
1536 	if (rdataset->heap_index != 0) {
1537 		isc_heap_delete(rbtdb->heaps[idx], rdataset->heap_index);
1538 	}
1539 	rdataset->heap_index = 0;
1540 
1541 	if (rdataset->noqname != NULL) {
1542 		free_noqname(mctx, &rdataset->noqname);
1543 	}
1544 	if (rdataset->closest != NULL) {
1545 		free_noqname(mctx, &rdataset->closest);
1546 	}
1547 
1548 	if (NONEXISTENT(rdataset)) {
1549 		size = sizeof(*rdataset);
1550 	} else {
1551 		size = dns_rdataslab_size((unsigned char *)rdataset,
1552 					  sizeof(*rdataset));
1553 	}
1554 
1555 	if (rdataset->is_mmapped == 1) {
1556 		return;
1557 	}
1558 
1559 	isc_mem_put(mctx, rdataset, size);
1560 }
1561 
1562 static void
1563 rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
1564 	rdatasetheader_t *header, *dcurrent;
1565 	bool make_dirty = false;
1566 
1567 	/*
1568 	 * Caller must hold the node lock.
1569 	 */
1570 
1571 	/*
1572 	 * We set the IGNORE attribute on rdatasets with serial number
1573 	 * 'serial'.  When the reference count goes to zero, these rdatasets
1574 	 * will be cleaned up; until that time, they will be ignored.
1575 	 */
1576 	for (header = node->data; header != NULL; header = header->next) {
1577 		if (header->serial == serial) {
1578 			RDATASET_ATTR_SET(header, RDATASET_ATTR_IGNORE);
1579 			make_dirty = true;
1580 		}
1581 		for (dcurrent = header->down; dcurrent != NULL;
1582 		     dcurrent = dcurrent->down)
1583 		{
1584 			if (dcurrent->serial == serial) {
1585 				RDATASET_ATTR_SET(dcurrent,
1586 						  RDATASET_ATTR_IGNORE);
1587 				make_dirty = true;
1588 			}
1589 		}
1590 	}
1591 	if (make_dirty) {
1592 		node->dirty = 1;
1593 	}
1594 }
1595 
1596 static void
1597 mark_header_ancient(dns_rbtdb_t *rbtdb, rdatasetheader_t *header) {
1598 	uint_least16_t attributes = atomic_load_acquire(&header->attributes);
1599 	uint_least16_t newattributes = 0;
1600 
1601 	/*
1602 	 * If we are already ancient there is nothing to do.
1603 	 */
1604 	do {
1605 		if ((attributes & RDATASET_ATTR_ANCIENT) != 0) {
1606 			return;
1607 		}
1608 		newattributes = attributes | RDATASET_ATTR_ANCIENT;
1609 	} while (!atomic_compare_exchange_weak_acq_rel(
1610 		&header->attributes, &attributes, newattributes));
1611 
1612 	/*
1613 	 * Decrement the stats counter for the appropriate RRtype.
1614 	 * If the STALE attribute is set, this will decrement the
1615 	 * stale type counter, otherwise it decrements the active
1616 	 * stats type counter.
1617 	 */
1618 	update_rrsetstats(rbtdb, header->type, attributes, false);
1619 	header->node->dirty = 1;
1620 
1621 	/* Increment the stats counter for the ancient RRtype. */
1622 	update_rrsetstats(rbtdb, header->type, newattributes, true);
1623 }
1624 
1625 static void
1626 mark_header_stale(dns_rbtdb_t *rbtdb, rdatasetheader_t *header) {
1627 	uint_least16_t attributes = atomic_load_acquire(&header->attributes);
1628 	uint_least16_t newattributes = 0;
1629 
1630 	INSIST((attributes & RDATASET_ATTR_ZEROTTL) == 0);
1631 
1632 	/*
1633 	 * If we are already stale there is nothing to do.
1634 	 */
1635 	do {
1636 		if ((attributes & RDATASET_ATTR_STALE) != 0) {
1637 			return;
1638 		}
1639 		newattributes = attributes | RDATASET_ATTR_STALE;
1640 	} while (!atomic_compare_exchange_weak_acq_rel(
1641 		&header->attributes, &attributes, newattributes));
1642 
1643 	/* Decrement the stats counter for the appropriate RRtype.
1644 	 * If the ANCIENT attribute is set (although it is very
1645 	 * unlikely that an RRset goes from ANCIENT to STALE), this
1646 	 * will decrement the ancient stale type counter, otherwise it
1647 	 * decrements the active stats type counter.
1648 	 */
1649 
1650 	update_rrsetstats(rbtdb, header->type, attributes, false);
1651 	update_rrsetstats(rbtdb, header->type, newattributes, true);
1652 }
1653 
1654 static void
1655 clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx,
1656 		    rdatasetheader_t *top) {
1657 	rdatasetheader_t *d, *down_next;
1658 
1659 	for (d = top->down; d != NULL; d = down_next) {
1660 		down_next = d->down;
1661 		free_rdataset(rbtdb, mctx, d);
1662 	}
1663 	top->down = NULL;
1664 }
1665 
1666 static void
1667 clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1668 	rdatasetheader_t *current, *top_prev, *top_next;
1669 	isc_mem_t *mctx = rbtdb->common.mctx;
1670 
1671 	/*
1672 	 * Caller must be holding the node lock.
1673 	 */
1674 
1675 	top_prev = NULL;
1676 	for (current = node->data; current != NULL; current = top_next) {
1677 		top_next = current->next;
1678 		clean_stale_headers(rbtdb, mctx, current);
1679 		/*
1680 		 * If current is nonexistent, ancient, or stale and
1681 		 * we are not keeping stale, we can clean it up.
1682 		 */
1683 		if (NONEXISTENT(current) || ANCIENT(current) ||
1684 		    (STALE(current) && !KEEPSTALE(rbtdb)))
1685 		{
1686 			if (top_prev != NULL) {
1687 				top_prev->next = current->next;
1688 			} else {
1689 				node->data = current->next;
1690 			}
1691 			free_rdataset(rbtdb, mctx, current);
1692 		} else {
1693 			top_prev = current;
1694 		}
1695 	}
1696 	node->dirty = 0;
1697 }
1698 
/*
 * Clean up the rdataset list of a zone node.
 *
 * For every top-level header (linked through 'next') and the older
 * versions chained below it (linked through 'down'):
 *   - drop chained versions that duplicate their parent's serial or
 *     carry the IGNORE attribute;
 *   - drop the top-level header itself if it carries IGNORE, pulling
 *     up the next older version in its place when there is one;
 *   - drop every chained version older than 'least_serial';
 *   - drop a remaining top-level header that is NONEXISTENT and has no
 *     older versions left.
 * The node's dirty flag is cleared only if no top-level header is left
 * with a 'down' chain.
 *
 * 'least_serial' must be non-zero.
 */
static void
clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		rbtdb_serial_t least_serial) {
	rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
	rdatasetheader_t *top_prev, *top_next;
	isc_mem_t *mctx = rbtdb->common.mctx;
	bool still_dirty = false;

	/*
	 * Caller must be holding the node lock.
	 */
	REQUIRE(least_serial != 0);

	top_prev = NULL;
	for (current = node->data; current != NULL; current = top_next) {
		/* Saved up front: 'current' may be freed below. */
		top_next = current->next;

		/*
		 * First, we clean up any instances of multiple rdatasets
		 * with the same serial number, or that have the IGNORE
		 * attribute.
		 */
		dparent = current;
		for (dcurrent = current->down; dcurrent != NULL;
		     dcurrent = down_next)
		{
			down_next = dcurrent->down;
			INSIST(dcurrent->serial <= dparent->serial);
			if (dcurrent->serial == dparent->serial ||
			    IGNORE(dcurrent))
			{
				/*
				 * Splice dcurrent out of the chain; the
				 * successor's 'next' is pointed back at
				 * the parent.
				 */
				if (down_next != NULL) {
					down_next->next = dparent;
				}
				dparent->down = down_next;
				free_rdataset(rbtdb, mctx, dcurrent);
			} else {
				dparent = dcurrent;
			}
		}

		/*
		 * We've now eliminated all IGNORE datasets with the possible
		 * exception of current, which we now check.
		 */
		if (IGNORE(current)) {
			down_next = current->down;
			if (down_next == NULL) {
				if (top_prev != NULL) {
					top_prev->next = current->next;
				} else {
					node->data = current->next;
				}
				free_rdataset(rbtdb, mctx, current);
				/*
				 * current no longer exists, so we can
				 * just continue with the loop.
				 */
				continue;
			} else {
				/*
				 * Pull up current->down, making it the new
				 * current.
				 */
				if (top_prev != NULL) {
					top_prev->next = down_next;
				} else {
					node->data = down_next;
				}
				down_next->next = top_next;
				free_rdataset(rbtdb, mctx, current);
				current = down_next;
			}
		}

		/*
		 * We now try to find the first down node less than the
		 * least serial.
		 */
		dparent = current;
		for (dcurrent = current->down; dcurrent != NULL;
		     dcurrent = down_next)
		{
			down_next = dcurrent->down;
			if (dcurrent->serial < least_serial) {
				break;
			}
			dparent = dcurrent;
		}

		/*
		 * If there is a such an rdataset, delete it and any older
		 * versions.
		 */
		if (dcurrent != NULL) {
			do {
				down_next = dcurrent->down;
				INSIST(dcurrent->serial <= least_serial);
				free_rdataset(rbtdb, mctx, dcurrent);
				dcurrent = down_next;
			} while (dcurrent != NULL);
			/* Terminate the chain at the last kept version. */
			dparent->down = NULL;
		}

		/*
		 * Note.  The serial number of 'current' might be less than
		 * least_serial too, but we cannot delete it because it is
		 * the most recent version, unless it is a NONEXISTENT
		 * rdataset.
		 */
		if (current->down != NULL) {
			/* Older versions remain, so the node stays dirty. */
			still_dirty = true;
			top_prev = current;
		} else {
			/*
			 * If this is a NONEXISTENT rdataset, we can delete it.
			 */
			if (NONEXISTENT(current)) {
				if (top_prev != NULL) {
					top_prev->next = current->next;
				} else {
					node->data = current->next;
				}
				free_rdataset(rbtdb, mctx, current);
			} else {
				top_prev = current;
			}
		}
	}
	if (!still_dirty) {
		node->dirty = 0;
	}
}
1832 
/*
 * Delete 'node' from the tree it belongs to, as selected by
 * node->nsec.  For a node of the main tree that has a companion node
 * in the auxiliary NSEC tree, the companion is looked up by name and
 * deleted first.  Failures are only logged as warnings; nothing is
 * returned to the caller.
 *
 * The node must not be linked on a deadnodes list.
 *
 * tree_lock(write) must be held.
 */
static void
delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
	dns_rbtnode_t *nsecnode;
	dns_fixedname_t fname;
	dns_name_t *name;
	/*
	 * There is no default case in the switch below, so an unexpected
	 * node->nsec value leaves this at ISC_R_UNEXPECTED and ends up in
	 * the final warning.
	 */
	isc_result_t result = ISC_R_UNEXPECTED;

	INSIST(!ISC_LINK_LINKED(node, deadlink));

	/* Only format the node name when the message would be logged. */
	if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
		char printname[DNS_NAME_FORMATSIZE];
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
			      "delete_node(): %p %s (bucket %d)", node,
			      dns_rbt_formatnodename(node, printname,
						     sizeof(printname)),
			      node->locknum);
	}

	switch (node->nsec) {
	case DNS_RBT_NSEC_NORMAL:
		/*
		 * Though this may be wasteful, it has to be done before
		 * node is deleted.
		 */
		name = dns_fixedname_initname(&fname);
		dns_rbt_fullnamefromnode(node, name);

		result = dns_rbt_deletenode(rbtdb->tree, node, false);
		break;
	case DNS_RBT_NSEC_HAS_NSEC:
		name = dns_fixedname_initname(&fname);
		dns_rbt_fullnamefromnode(node, name);
		/*
		 * Delete the corresponding node from the auxiliary NSEC
		 * tree before deleting from the main tree.
		 */
		nsecnode = NULL;
		result = dns_rbt_findnode(rbtdb->nsec, name, NULL, &nsecnode,
					  NULL, DNS_RBTFIND_EMPTYDATA, NULL,
					  NULL);
		if (result != ISC_R_SUCCESS) {
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
				      "delete_node: "
				      "dns_rbt_findnode(nsec): %s",
				      isc_result_totext(result));
		} else {
			result = dns_rbt_deletenode(rbtdb->nsec, nsecnode,
						    false);
			if (result != ISC_R_SUCCESS) {
				isc_log_write(
					dns_lctx, DNS_LOGCATEGORY_DATABASE,
					DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
					"delete_node(): "
					"dns_rbt_deletenode(nsecnode): %s",
					isc_result_totext(result));
			}
		}
		/*
		 * The main tree node is deleted whether or not the
		 * companion NSEC node could be removed.
		 */
		result = dns_rbt_deletenode(rbtdb->tree, node, false);
		break;
	case DNS_RBT_NSEC_NSEC:
		result = dns_rbt_deletenode(rbtdb->nsec, node, false);
		break;
	case DNS_RBT_NSEC_NSEC3:
		result = dns_rbt_deletenode(rbtdb->nsec3, node, false);
		break;
	}
	if (result != ISC_R_SUCCESS) {
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
			      "delete_node(): "
			      "dns_rbt_deletenode: %s",
			      isc_result_totext(result));
	}
}
1912 
1913 /*
1914  * Caller must be holding the node lock.
1915  */
1916 static void
1917 new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1918 	      isc_rwlocktype_t locktype) {
1919 	if (locktype == isc_rwlocktype_write && ISC_LINK_LINKED(node, deadlink))
1920 	{
1921 		ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
1922 				deadlink);
1923 	}
1924 	if (isc_refcount_increment0(&node->references) == 0) {
1925 		/* this is the first reference to the node */
1926 		isc_refcount_increment0(
1927 			&rbtdb->node_locks[node->locknum].references);
1928 	}
1929 }
1930 
1931 /*%
1932  * The tree lock must be held for the result to be valid.
1933  */
1934 static bool
1935 is_leaf(dns_rbtnode_t *node) {
1936 	return (node->parent != NULL && node->parent->down == node &&
1937 		node->left == NULL && node->right == NULL);
1938 }
1939 
1940 static void
1941 send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1942 		   isc_rwlocktype_t locktype) {
1943 	isc_event_t *ev;
1944 	dns_db_t *db;
1945 
1946 	ev = isc_event_allocate(rbtdb->common.mctx, NULL, DNS_EVENT_RBTPRUNE,
1947 				prune_tree, node, sizeof(isc_event_t));
1948 	new_reference(rbtdb, node, locktype);
1949 	db = NULL;
1950 	attach((dns_db_t *)rbtdb, &db);
1951 	ev->ev_sender = db;
1952 	isc_task_send(rbtdb->task, &ev);
1953 }
1954 
1955 /*%
1956  * Clean up dead nodes.  These are nodes which have no references, and
1957  * have no data.  They are dead but we could not or chose not to delete
1958  * them when we deleted all the data at that node because we did not want
1959  * to wait for the tree write lock.
1960  *
1961  * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1962  */
1963 static void
1964 cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
1965 	dns_rbtnode_t *node;
1966 	int count = 10; /* XXXJT: should be adjustable */
1967 
1968 	node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1969 	while (node != NULL && count > 0) {
1970 		ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1971 
1972 		/*
1973 		 * We might have reactivated this node without a tree write
1974 		 * lock, so we couldn't remove this node from deadnodes then
1975 		 * and we have to do it now.
1976 		 */
1977 		if (isc_refcount_current(&node->references) != 0 ||
1978 		    node->data != NULL)
1979 		{
1980 			node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1981 			count--;
1982 			continue;
1983 		}
1984 
1985 		if (is_leaf(node) && rbtdb->task != NULL) {
1986 			send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
1987 		} else if (node->down == NULL && node->data == NULL) {
1988 			/*
1989 			 * Not a interior node and not needing to be
1990 			 * reactivated.
1991 			 */
1992 			delete_node(rbtdb, node);
1993 		} else if (node->data == NULL) {
1994 			/*
1995 			 * A interior node without data. Leave linked to
1996 			 * to be cleaned up when node->down becomes NULL.
1997 			 */
1998 			ISC_LIST_APPEND(rbtdb->deadnodes[bucketnum], node,
1999 					deadlink);
2000 		}
2001 		node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
2002 		count--;
2003 	}
2004 }
2005 
2006 /*
2007  * This function is assumed to be called when a node is newly referenced
2008  * and can be in the deadnode list.  In that case the node must be retrieved
2009  * from the list because it is going to be used.  In addition, if the caller
2010  * happens to hold a write lock on the tree, it's a good chance to purge dead
2011  * nodes.
2012  * Note: while a new reference is gained in multiple places, there are only very
2013  * few cases where the node can be in the deadnode list (only empty nodes can
2014  * have been added to the list).
2015  */
2016 static void
2017 reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2018 		isc_rwlocktype_t treelocktype) {
2019 	isc_rwlocktype_t locktype = isc_rwlocktype_read;
2020 	nodelock_t *nodelock = &rbtdb->node_locks[node->locknum].lock;
2021 	bool maybe_cleanup = false;
2022 
2023 	POST(locktype);
2024 
2025 	NODE_LOCK(nodelock, locktype);
2026 
2027 	/*
2028 	 * Check if we can possibly cleanup the dead node.  If so, upgrade
2029 	 * the node lock below to perform the cleanup.
2030 	 */
2031 	if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
2032 	    treelocktype == isc_rwlocktype_write)
2033 	{
2034 		maybe_cleanup = true;
2035 	}
2036 
2037 	if (ISC_LINK_LINKED(node, deadlink) || maybe_cleanup) {
2038 		/*
2039 		 * Upgrade the lock and test if we still need to unlink.
2040 		 */
2041 		NODE_UNLOCK(nodelock, locktype);
2042 		locktype = isc_rwlocktype_write;
2043 		POST(locktype);
2044 		NODE_LOCK(nodelock, locktype);
2045 		if (ISC_LINK_LINKED(node, deadlink)) {
2046 			ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
2047 					deadlink);
2048 		}
2049 		if (maybe_cleanup) {
2050 			cleanup_dead_nodes(rbtdb, node->locknum);
2051 		}
2052 	}
2053 
2054 	new_reference(rbtdb, node, locktype);
2055 
2056 	NODE_UNLOCK(nodelock, locktype);
2057 }
2058 
2059 /*
2060  * Caller must be holding the node lock; either the "strong", read or write
2061  * lock.  Note that the lock must be held even when node references are
2062  * atomically modified; in that case the decrement operation itself does not
2063  * have to be protected, but we must avoid a race condition where multiple
2064  * threads are decreasing the reference to zero simultaneously and at least
2065  * one of them is going to free the node.
2066  *
2067  * This function returns true if and only if the node reference decreases
2068  * to zero.
2069  *
2070  * NOTE: Decrementing the reference count of a node to zero does not mean it
2071  * will be immediately freed.
2072  */
2073 static bool
2074 decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2075 		    rbtdb_serial_t least_serial, isc_rwlocktype_t nlock,
2076 		    isc_rwlocktype_t tlock, bool pruning) {
2077 	isc_result_t result;
2078 	bool write_locked;
2079 	bool locked = tlock != isc_rwlocktype_none;
2080 	rbtdb_nodelock_t *nodelock;
2081 	int bucket = node->locknum;
2082 	bool no_reference = true;
2083 	uint_fast32_t refs;
2084 
2085 	nodelock = &rbtdb->node_locks[bucket];
2086 
2087 #define KEEP_NODE(n, r, l)                                  \
2088 	((n)->data != NULL || ((l) && (n)->down != NULL) || \
2089 	 (n) == (r)->origin_node || (n) == (r)->nsec3_origin_node)
2090 
2091 	/* Handle easy and typical case first. */
2092 	if (!node->dirty && KEEP_NODE(node, rbtdb, locked)) {
2093 		if (isc_refcount_decrement(&node->references) == 1) {
2094 			refs = isc_refcount_decrement(&nodelock->references);
2095 			INSIST(refs > 0);
2096 			return (true);
2097 		} else {
2098 			return (false);
2099 		}
2100 	}
2101 
2102 	/* Upgrade the lock? */
2103 	if (nlock == isc_rwlocktype_read) {
2104 		NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
2105 		NODE_LOCK(&nodelock->lock, isc_rwlocktype_write);
2106 	}
2107 
2108 	if (isc_refcount_decrement(&node->references) > 1) {
2109 		/* Restore the lock? */
2110 		if (nlock == isc_rwlocktype_read) {
2111 			NODE_DOWNGRADE(&nodelock->lock);
2112 		}
2113 		return (false);
2114 	}
2115 
2116 	if (node->dirty) {
2117 		if (IS_CACHE(rbtdb)) {
2118 			clean_cache_node(rbtdb, node);
2119 		} else {
2120 			if (least_serial == 0) {
2121 				/*
2122 				 * Caller doesn't know the least serial.
2123 				 * Get it.
2124 				 */
2125 				RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
2126 				least_serial = rbtdb->least_serial;
2127 				RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
2128 			}
2129 			clean_zone_node(rbtdb, node, least_serial);
2130 		}
2131 	}
2132 
2133 	/*
2134 	 * Attempt to switch to a write lock on the tree.  If this fails,
2135 	 * we will add this node to a linked list of nodes in this locking
2136 	 * bucket which we will free later.
2137 	 */
2138 	if (tlock != isc_rwlocktype_write) {
2139 		/*
2140 		 * Locking hierarchy notwithstanding, we don't need to free
2141 		 * the node lock before acquiring the tree write lock because
2142 		 * we only do a trylock.
2143 		 */
2144 		if (tlock == isc_rwlocktype_read) {
2145 			result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
2146 		} else {
2147 			result = isc_rwlock_trylock(&rbtdb->tree_lock,
2148 						    isc_rwlocktype_write);
2149 		}
2150 		RUNTIME_CHECK(result == ISC_R_SUCCESS ||
2151 			      result == ISC_R_LOCKBUSY);
2152 
2153 		write_locked = (result == ISC_R_SUCCESS);
2154 	} else {
2155 		write_locked = true;
2156 	}
2157 
2158 	refs = isc_refcount_decrement(&nodelock->references);
2159 	INSIST(refs > 0);
2160 
2161 	if (KEEP_NODE(node, rbtdb, locked || write_locked)) {
2162 		goto restore_locks;
2163 	}
2164 
2165 #undef KEEP_NODE
2166 
2167 	if (write_locked) {
2168 		/*
2169 		 * We can now delete the node.
2170 		 */
2171 
2172 		/*
2173 		 * If this node is the only one in the level it's in, deleting
2174 		 * this node may recursively make its parent the only node in
2175 		 * the parent level; if so, and if no one is currently using
2176 		 * the parent node, this is almost the only opportunity to
2177 		 * clean it up.  But the recursive cleanup is not that trivial
2178 		 * since the child and parent may be in different lock buckets,
2179 		 * which would cause a lock order reversal problem.  To avoid
2180 		 * the trouble, we'll dispatch a separate event for batch
2181 		 * cleaning.  We need to check whether we're deleting the node
2182 		 * as a result of pruning to avoid infinite dispatching.
2183 		 * Note: pruning happens only when a task has been set for the
2184 		 * rbtdb.  If the user of the rbtdb chooses not to set a task,
2185 		 * it's their responsibility to purge stale leaves (e.g. by
2186 		 * periodic walk-through).
2187 		 */
2188 		if (!pruning && is_leaf(node) && rbtdb->task != NULL) {
2189 			send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
2190 			no_reference = false;
2191 		} else {
2192 			delete_node(rbtdb, node);
2193 		}
2194 	} else {
2195 		INSIST(node->data == NULL);
2196 		if (!ISC_LINK_LINKED(node, deadlink)) {
2197 			ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
2198 					deadlink);
2199 		}
2200 	}
2201 
2202 restore_locks:
2203 	/* Restore the lock? */
2204 	if (nlock == isc_rwlocktype_read) {
2205 		NODE_DOWNGRADE(&nodelock->lock);
2206 	}
2207 
2208 	/*
2209 	 * Relock a read lock, or unlock the write lock if no lock was held.
2210 	 */
2211 	if (tlock == isc_rwlocktype_none) {
2212 		if (write_locked) {
2213 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2214 		}
2215 	}
2216 
2217 	if (tlock == isc_rwlocktype_read) {
2218 		if (write_locked) {
2219 			isc_rwlock_downgrade(&rbtdb->tree_lock);
2220 		}
2221 	}
2222 
2223 	return (no_reference);
2224 }
2225 
2226 /*
2227  * Prune the tree by recursively cleaning-up single leaves.  In the worst
2228  * case, the number of iteration is the number of tree levels, which is at
2229  * most the maximum number of domain name labels, i.e, 127.  In practice, this
2230  * should be much smaller (only a few times), and even the worst case would be
2231  * acceptable for a single event.
2232  */
2233 static void
2234 prune_tree(isc_task_t *task, isc_event_t *event) {
2235 	dns_rbtdb_t *rbtdb = event->ev_sender;
2236 	dns_rbtnode_t *node = event->ev_arg;
2237 	dns_rbtnode_t *parent;
2238 	unsigned int locknum;
2239 
2240 	UNUSED(task);
2241 
2242 	isc_event_free(&event);
2243 
2244 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2245 	locknum = node->locknum;
2246 	NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
2247 	do {
2248 		parent = node->parent;
2249 		decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
2250 				    isc_rwlocktype_write, true);
2251 
2252 		if (parent != NULL && parent->down == NULL) {
2253 			/*
2254 			 * node was the only down child of the parent and has
2255 			 * just been removed.  We'll then need to examine the
2256 			 * parent.  Keep the lock if possible; otherwise,
2257 			 * release the old lock and acquire one for the parent.
2258 			 */
2259 			if (parent->locknum != locknum) {
2260 				NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
2261 					    isc_rwlocktype_write);
2262 				locknum = parent->locknum;
2263 				NODE_LOCK(&rbtdb->node_locks[locknum].lock,
2264 					  isc_rwlocktype_write);
2265 			}
2266 
2267 			/*
2268 			 * We need to gain a reference to the node before
2269 			 * decrementing it in the next iteration.
2270 			 */
2271 			if (ISC_LINK_LINKED(parent, deadlink)) {
2272 				ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
2273 						parent, deadlink);
2274 			}
2275 			new_reference(rbtdb, parent, isc_rwlocktype_write);
2276 		} else {
2277 			parent = NULL;
2278 		}
2279 
2280 		node = parent;
2281 	} while (node != NULL);
2282 	NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
2283 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2284 
2285 	detach((dns_db_t **)(void *)&rbtdb);
2286 }
2287 
2288 static void
2289 make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
2290 		   rbtdb_changedlist_t *cleanup_list) {
2291 	/*
2292 	 * Caller must be holding the database lock.
2293 	 */
2294 
2295 	rbtdb->least_serial = version->serial;
2296 	*cleanup_list = version->changed_list;
2297 	ISC_LIST_INIT(version->changed_list);
2298 }
2299 
2300 static void
2301 cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
2302 	rbtdb_changed_t *changed, *next_changed;
2303 
2304 	/*
2305 	 * If the changed record is dirty, then
2306 	 * an update created multiple versions of
2307 	 * a given rdataset.  We keep this list
2308 	 * until we're the least open version, at
2309 	 * which point it's safe to get rid of any
2310 	 * older versions.
2311 	 *
2312 	 * If the changed record isn't dirty, then
2313 	 * we don't need it anymore since we're
2314 	 * committing and not rolling back.
2315 	 *
2316 	 * The caller must be holding the database lock.
2317 	 */
2318 	for (changed = HEAD(version->changed_list); changed != NULL;
2319 	     changed = next_changed)
2320 	{
2321 		next_changed = NEXT(changed, link);
2322 		if (!changed->dirty) {
2323 			UNLINK(version->changed_list, changed, link);
2324 			APPEND(*cleanup_list, changed, link);
2325 		}
2326 	}
2327 }
2328 
2329 static void
2330 iszonesecure(dns_db_t *db, rbtdb_version_t *version, dns_dbnode_t *origin) {
2331 	dns_rdataset_t keyset;
2332 	dns_rdataset_t nsecset, signsecset;
2333 	bool haszonekey = false;
2334 	bool hasnsec = false;
2335 	isc_result_t result;
2336 
2337 	dns_rdataset_init(&keyset);
2338 	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_dnskey,
2339 				     0, 0, &keyset, NULL);
2340 	if (result == ISC_R_SUCCESS) {
2341 		result = dns_rdataset_first(&keyset);
2342 		while (result == ISC_R_SUCCESS) {
2343 			dns_rdata_t keyrdata = DNS_RDATA_INIT;
2344 			dns_rdataset_current(&keyset, &keyrdata);
2345 			if (dns_zonekey_iszonekey(&keyrdata)) {
2346 				haszonekey = true;
2347 				break;
2348 			}
2349 			result = dns_rdataset_next(&keyset);
2350 		}
2351 		dns_rdataset_disassociate(&keyset);
2352 	}
2353 	if (!haszonekey) {
2354 		version->secure = dns_db_insecure;
2355 		version->havensec3 = false;
2356 		return;
2357 	}
2358 
2359 	dns_rdataset_init(&nsecset);
2360 	dns_rdataset_init(&signsecset);
2361 	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_nsec, 0,
2362 				     0, &nsecset, &signsecset);
2363 	if (result == ISC_R_SUCCESS) {
2364 		if (dns_rdataset_isassociated(&signsecset)) {
2365 			hasnsec = true;
2366 			dns_rdataset_disassociate(&signsecset);
2367 		}
2368 		dns_rdataset_disassociate(&nsecset);
2369 	}
2370 
2371 	setnsec3parameters(db, version);
2372 
2373 	/*
2374 	 * Do we have a valid NSEC/NSEC3 chain?
2375 	 */
2376 	if (version->havensec3 || hasnsec) {
2377 		version->secure = dns_db_secure;
2378 	} else {
2379 		version->secure = dns_db_insecure;
2380 	}
2381 }
2382 
2383 /*%<
2384  * Walk the origin node looking for NSEC3PARAM records.
2385  * Cache the nsec3 parameters.
2386  */
2387 static void
2388 setnsec3parameters(dns_db_t *db, rbtdb_version_t *version) {
2389 	dns_rbtnode_t *node;
2390 	dns_rdata_nsec3param_t nsec3param;
2391 	dns_rdata_t rdata = DNS_RDATA_INIT;
2392 	isc_region_t region;
2393 	isc_result_t result;
2394 	rdatasetheader_t *header, *header_next;
2395 	unsigned char *raw; /* RDATASLAB */
2396 	unsigned int count, length;
2397 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2398 
2399 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
2400 	version->havensec3 = false;
2401 	node = rbtdb->origin_node;
2402 	NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2403 		  isc_rwlocktype_read);
2404 	for (header = node->data; header != NULL; header = header_next) {
2405 		header_next = header->next;
2406 		do {
2407 			if (header->serial <= version->serial &&
2408 			    !IGNORE(header))
2409 			{
2410 				if (NONEXISTENT(header)) {
2411 					header = NULL;
2412 				}
2413 				break;
2414 			} else {
2415 				header = header->down;
2416 			}
2417 		} while (header != NULL);
2418 
2419 		if (header != NULL &&
2420 		    (header->type == dns_rdatatype_nsec3param))
2421 		{
2422 			/*
2423 			 * Find A NSEC3PARAM with a supported algorithm.
2424 			 */
2425 			raw = (unsigned char *)header + sizeof(*header);
2426 			count = raw[0] * 256 + raw[1]; /* count */
2427 			raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
2428 			while (count-- > 0U) {
2429 				length = raw[0] * 256 + raw[1];
2430 				raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
2431 				region.base = raw;
2432 				region.length = length;
2433 				raw += length;
2434 				dns_rdata_fromregion(
2435 					&rdata, rbtdb->common.rdclass,
2436 					dns_rdatatype_nsec3param, &region);
2437 				result = dns_rdata_tostruct(&rdata, &nsec3param,
2438 							    NULL);
2439 				INSIST(result == ISC_R_SUCCESS);
2440 				dns_rdata_reset(&rdata);
2441 
2442 				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG &&
2443 				    !dns_nsec3_supportedhash(nsec3param.hash))
2444 				{
2445 					continue;
2446 				}
2447 
2448 				if (nsec3param.flags != 0) {
2449 					continue;
2450 				}
2451 
2452 				memmove(version->salt, nsec3param.salt,
2453 					nsec3param.salt_length);
2454 				version->hash = nsec3param.hash;
2455 				version->salt_length = nsec3param.salt_length;
2456 				version->iterations = nsec3param.iterations;
2457 				version->flags = nsec3param.flags;
2458 				version->havensec3 = true;
2459 				/*
2460 				 * Look for a better algorithm than the
2461 				 * unknown test algorithm.
2462 				 */
2463 				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG) {
2464 					goto unlock;
2465 				}
2466 			}
2467 		}
2468 	}
2469 unlock:
2470 	NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2471 		    isc_rwlocktype_read);
2472 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
2473 }
2474 
2475 static void
2476 cleanup_dead_nodes_callback(isc_task_t *task, isc_event_t *event) {
2477 	dns_rbtdb_t *rbtdb = event->ev_arg;
2478 	bool again = false;
2479 	unsigned int locknum;
2480 
2481 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2482 	for (locknum = 0; locknum < rbtdb->node_lock_count; locknum++) {
2483 		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
2484 			  isc_rwlocktype_write);
2485 		cleanup_dead_nodes(rbtdb, locknum);
2486 		if (ISC_LIST_HEAD(rbtdb->deadnodes[locknum]) != NULL) {
2487 			again = true;
2488 		}
2489 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
2490 			    isc_rwlocktype_write);
2491 	}
2492 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2493 	if (again) {
2494 		isc_task_send(task, &event);
2495 	} else {
2496 		isc_event_free(&event);
2497 		if (isc_refcount_decrement(&rbtdb->references) == 1) {
2498 			(void)isc_refcount_current(&rbtdb->references);
2499 			maybe_free_rbtdb(rbtdb);
2500 		}
2501 	}
2502 }
2503 
2504 static void
2505 closeversion(dns_db_t *db, dns_dbversion_t **versionp, bool commit) {
2506 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2507 	rbtdb_version_t *version, *cleanup_version, *least_greater;
2508 	bool rollback = false;
2509 	rbtdb_changedlist_t cleanup_list;
2510 	rdatasetheaderlist_t resigned_list;
2511 	rbtdb_changed_t *changed, *next_changed;
2512 	rbtdb_serial_t serial, least_serial;
2513 	dns_rbtnode_t *rbtnode;
2514 	rdatasetheader_t *header;
2515 
2516 	REQUIRE(VALID_RBTDB(rbtdb));
2517 	version = (rbtdb_version_t *)*versionp;
2518 	INSIST(version->rbtdb == rbtdb);
2519 
2520 	cleanup_version = NULL;
2521 	ISC_LIST_INIT(cleanup_list);
2522 	ISC_LIST_INIT(resigned_list);
2523 
2524 	if (isc_refcount_decrement(&version->references) > 1) {
2525 		/* typical and easy case first */
2526 		if (commit) {
2527 			RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
2528 			INSIST(!version->writer);
2529 			RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
2530 		}
2531 		goto end;
2532 	}
2533 
2534 	/*
2535 	 * Update the zone's secure status in version before making
2536 	 * it the current version.
2537 	 */
2538 	if (version->writer && commit && !IS_CACHE(rbtdb)) {
2539 		iszonesecure(db, version, rbtdb->origin_node);
2540 	}
2541 
2542 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
2543 	serial = version->serial;
2544 	if (version->writer) {
2545 		if (commit) {
2546 			unsigned cur_ref;
2547 			rbtdb_version_t *cur_version;
2548 
2549 			INSIST(version->commit_ok);
2550 			INSIST(version == rbtdb->future_version);
2551 			/*
2552 			 * The current version is going to be replaced.
2553 			 * Release the (likely last) reference to it from the
2554 			 * DB itself and unlink it from the open list.
2555 			 */
2556 			cur_version = rbtdb->current_version;
2557 			cur_ref = isc_refcount_decrement(
2558 				&cur_version->references);
2559 			if (cur_ref == 1) {
2560 				(void)isc_refcount_current(
2561 					&cur_version->references);
2562 				if (cur_version->serial == rbtdb->least_serial)
2563 				{
2564 					INSIST(EMPTY(
2565 						cur_version->changed_list));
2566 				}
2567 				UNLINK(rbtdb->open_versions, cur_version, link);
2568 			}
2569 			if (EMPTY(rbtdb->open_versions)) {
2570 				/*
2571 				 * We're going to become the least open
2572 				 * version.
2573 				 */
2574 				make_least_version(rbtdb, version,
2575 						   &cleanup_list);
2576 			} else {
2577 				/*
2578 				 * Some other open version is the
2579 				 * least version.  We can't cleanup
2580 				 * records that were changed in this
2581 				 * version because the older versions
2582 				 * may still be in use by an open
2583 				 * version.
2584 				 *
2585 				 * We can, however, discard the
2586 				 * changed records for things that
2587 				 * we've added that didn't exist in
2588 				 * prior versions.
2589 				 */
2590 				cleanup_nondirty(version, &cleanup_list);
2591 			}
2592 			/*
2593 			 * If the (soon to be former) current version
2594 			 * isn't being used by anyone, we can clean
2595 			 * it up.
2596 			 */
2597 			if (cur_ref == 1) {
2598 				cleanup_version = cur_version;
2599 				APPENDLIST(version->changed_list,
2600 					   cleanup_version->changed_list, link);
2601 			}
2602 			/*
2603 			 * Become the current version.
2604 			 */
2605 			version->writer = false;
2606 			rbtdb->current_version = version;
2607 			rbtdb->current_serial = version->serial;
2608 			rbtdb->future_version = NULL;
2609 
2610 			/*
2611 			 * Keep the current version in the open list, and
2612 			 * gain a reference for the DB itself (see the DB
2613 			 * creation function below).  This must be the only
2614 			 * case where we need to increment the counter from
2615 			 * zero and need to use isc_refcount_increment0().
2616 			 */
2617 			INSIST(isc_refcount_increment0(&version->references) ==
2618 			       0);
2619 			PREPEND(rbtdb->open_versions, rbtdb->current_version,
2620 				link);
2621 			resigned_list = version->resigned_list;
2622 			ISC_LIST_INIT(version->resigned_list);
2623 		} else {
2624 			/*
2625 			 * We're rolling back this transaction.
2626 			 */
2627 			cleanup_list = version->changed_list;
2628 			ISC_LIST_INIT(version->changed_list);
2629 			resigned_list = version->resigned_list;
2630 			ISC_LIST_INIT(version->resigned_list);
2631 			rollback = true;
2632 			cleanup_version = version;
2633 			rbtdb->future_version = NULL;
2634 		}
2635 	} else {
2636 		if (version != rbtdb->current_version) {
2637 			/*
2638 			 * There are no external or internal references
2639 			 * to this version and it can be cleaned up.
2640 			 */
2641 			cleanup_version = version;
2642 
2643 			/*
2644 			 * Find the version with the least serial
2645 			 * number greater than ours.
2646 			 */
2647 			least_greater = PREV(version, link);
2648 			if (least_greater == NULL) {
2649 				least_greater = rbtdb->current_version;
2650 			}
2651 
2652 			INSIST(version->serial < least_greater->serial);
2653 			/*
2654 			 * Is this the least open version?
2655 			 */
2656 			if (version->serial == rbtdb->least_serial) {
2657 				/*
2658 				 * Yes.  Install the new least open
2659 				 * version.
2660 				 */
2661 				make_least_version(rbtdb, least_greater,
2662 						   &cleanup_list);
2663 			} else {
2664 				/*
2665 				 * Add any unexecuted cleanups to
2666 				 * those of the least greater version.
2667 				 */
2668 				APPENDLIST(least_greater->changed_list,
2669 					   version->changed_list, link);
2670 			}
2671 		} else if (version->serial == rbtdb->least_serial) {
2672 			INSIST(EMPTY(version->changed_list));
2673 		}
2674 		UNLINK(rbtdb->open_versions, version, link);
2675 	}
2676 	least_serial = rbtdb->least_serial;
2677 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
2678 
2679 	if (cleanup_version != NULL) {
2680 		INSIST(EMPTY(cleanup_version->changed_list));
2681 		free_gluetable(cleanup_version);
2682 		isc_rwlock_destroy(&cleanup_version->glue_rwlock);
2683 		isc_rwlock_destroy(&cleanup_version->rwlock);
2684 		isc_mem_put(rbtdb->common.mctx, cleanup_version,
2685 			    sizeof(*cleanup_version));
2686 	}
2687 
2688 	/*
2689 	 * Commit/rollback re-signed headers.
2690 	 */
2691 	for (header = HEAD(resigned_list); header != NULL;
2692 	     header = HEAD(resigned_list))
2693 	{
2694 		nodelock_t *lock;
2695 
2696 		ISC_LIST_UNLINK(resigned_list, header, link);
2697 
2698 		lock = &rbtdb->node_locks[header->node->locknum].lock;
2699 		NODE_LOCK(lock, isc_rwlocktype_write);
2700 		if (rollback && !IGNORE(header)) {
2701 			resign_insert(rbtdb, header->node->locknum, header);
2702 		}
2703 		decrement_reference(rbtdb, header->node, least_serial,
2704 				    isc_rwlocktype_write, isc_rwlocktype_none,
2705 				    false);
2706 		NODE_UNLOCK(lock, isc_rwlocktype_write);
2707 	}
2708 
2709 	if (!EMPTY(cleanup_list)) {
2710 		isc_event_t *event = NULL;
2711 		isc_rwlocktype_t tlock = isc_rwlocktype_none;
2712 
2713 		if (rbtdb->task != NULL) {
2714 			event = isc_event_allocate(rbtdb->common.mctx, NULL,
2715 						   DNS_EVENT_RBTDEADNODES,
2716 						   cleanup_dead_nodes_callback,
2717 						   rbtdb, sizeof(isc_event_t));
2718 		}
2719 		if (event == NULL) {
2720 			/*
2721 			 * We acquire a tree write lock here in order to make
2722 			 * sure that stale nodes will be removed in
2723 			 * decrement_reference().  If we didn't have the lock,
2724 			 * those nodes could miss the chance to be removed
2725 			 * until the server stops.  The write lock is
2726 			 * expensive, but this event should be rare enough
2727 			 * to justify the cost.
2728 			 */
2729 			RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2730 			tlock = isc_rwlocktype_write;
2731 		}
2732 
2733 		for (changed = HEAD(cleanup_list); changed != NULL;
2734 		     changed = next_changed)
2735 		{
2736 			nodelock_t *lock;
2737 
2738 			next_changed = NEXT(changed, link);
2739 			rbtnode = changed->node;
2740 			lock = &rbtdb->node_locks[rbtnode->locknum].lock;
2741 
2742 			NODE_LOCK(lock, isc_rwlocktype_write);
2743 			/*
2744 			 * This is a good opportunity to purge any dead nodes,
2745 			 * so use it.
2746 			 */
2747 			if (event == NULL) {
2748 				cleanup_dead_nodes(rbtdb, rbtnode->locknum);
2749 			}
2750 
2751 			if (rollback) {
2752 				rollback_node(rbtnode, serial);
2753 			}
2754 			decrement_reference(rbtdb, rbtnode, least_serial,
2755 					    isc_rwlocktype_write, tlock, false);
2756 
2757 			NODE_UNLOCK(lock, isc_rwlocktype_write);
2758 
2759 			isc_mem_put(rbtdb->common.mctx, changed,
2760 				    sizeof(*changed));
2761 		}
2762 		if (event != NULL) {
2763 			isc_refcount_increment(&rbtdb->references);
2764 			isc_task_send(rbtdb->task, &event);
2765 		} else {
2766 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2767 		}
2768 	}
2769 
2770 end:
2771 	*versionp = NULL;
2772 }
2773 
2774 /*
2775  * Add the necessary magic for the wildcard name 'name'
2776  * to be found in 'rbtdb'.
2777  *
2778  * In order for wildcard matching to work correctly in
2779  * zone_find(), we must ensure that a node for the wildcarding
2780  * level exists in the database, and has its 'find_callback'
2781  * and 'wild' bits set.
2782  *
2783  * E.g. if the wildcard name is "*.sub.example." then we
2784  * must ensure that "sub.example." exists and is marked as
2785  * a wildcard level.
2786  *
2787  * tree_lock(write) must be held.
2788  */
2789 static isc_result_t
2790 add_wildcard_magic(dns_rbtdb_t *rbtdb, const dns_name_t *name, bool lock) {
2791 	isc_result_t result;
2792 	dns_name_t foundname;
2793 	dns_offsets_t offsets;
2794 	unsigned int n;
2795 	dns_rbtnode_t *node = NULL;
2796 
2797 	dns_name_init(&foundname, offsets);
2798 	n = dns_name_countlabels(name);
2799 	INSIST(n >= 2);
2800 	n--;
2801 	dns_name_getlabelsequence(name, 1, n, &foundname);
2802 	result = dns_rbt_addnode(rbtdb->tree, &foundname, &node);
2803 	if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
2804 		return (result);
2805 	}
2806 	if (result == ISC_R_SUCCESS) {
2807 		node->nsec = DNS_RBT_NSEC_NORMAL;
2808 	}
2809 	node->find_callback = 1;
2810 	if (lock) {
2811 		NODE_LOCK(&rbtdb->node_locks[node->locknum].lock,
2812 			  isc_rwlocktype_write);
2813 	}
2814 	node->wild = 1;
2815 	if (lock) {
2816 		NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
2817 			    isc_rwlocktype_write);
2818 	}
2819 	return (ISC_R_SUCCESS);
2820 }
2821 
2822 /*
2823  * tree_lock(write) must be held.
2824  */
2825 static isc_result_t
2826 add_empty_wildcards(dns_rbtdb_t *rbtdb, const dns_name_t *name, bool lock) {
2827 	isc_result_t result;
2828 	dns_name_t foundname;
2829 	dns_offsets_t offsets;
2830 	unsigned int n, l, i;
2831 
2832 	dns_name_init(&foundname, offsets);
2833 	n = dns_name_countlabels(name);
2834 	l = dns_name_countlabels(&rbtdb->common.origin);
2835 	i = l + 1;
2836 	while (i < n) {
2837 		dns_rbtnode_t *node = NULL; /* dummy */
2838 		dns_name_getlabelsequence(name, n - i, i, &foundname);
2839 		if (dns_name_iswildcard(&foundname)) {
2840 			result = add_wildcard_magic(rbtdb, &foundname, lock);
2841 			if (result != ISC_R_SUCCESS) {
2842 				return (result);
2843 			}
2844 			result = dns_rbt_addnode(rbtdb->tree, &foundname,
2845 						 &node);
2846 			if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
2847 				return (result);
2848 			}
2849 			if (result == ISC_R_SUCCESS) {
2850 				node->nsec = DNS_RBT_NSEC_NORMAL;
2851 			}
2852 		}
2853 		i++;
2854 	}
2855 	return (ISC_R_SUCCESS);
2856 }
2857 
2858 static isc_result_t
2859 findnodeintree(dns_rbtdb_t *rbtdb, dns_rbt_t *tree, const dns_name_t *name,
2860 	       bool create, dns_dbnode_t **nodep) {
2861 	dns_rbtnode_t *node = NULL;
2862 	dns_name_t nodename;
2863 	isc_result_t result;
2864 	isc_rwlocktype_t locktype = isc_rwlocktype_read;
2865 
2866 	INSIST(tree == rbtdb->tree || tree == rbtdb->nsec3);
2867 
2868 	dns_name_init(&nodename, NULL);
2869 	RWLOCK(&rbtdb->tree_lock, locktype);
2870 	result = dns_rbt_findnode(tree, name, NULL, &node, NULL,
2871 				  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
2872 	if (result != ISC_R_SUCCESS) {
2873 		RWUNLOCK(&rbtdb->tree_lock, locktype);
2874 		if (!create) {
2875 			if (result == DNS_R_PARTIALMATCH) {
2876 				result = ISC_R_NOTFOUND;
2877 			}
2878 			return (result);
2879 		}
2880 		/*
2881 		 * It would be nice to try to upgrade the lock instead of
2882 		 * unlocking then relocking.
2883 		 */
2884 		locktype = isc_rwlocktype_write;
2885 		RWLOCK(&rbtdb->tree_lock, locktype);
2886 		node = NULL;
2887 		result = dns_rbt_addnode(tree, name, &node);
2888 		if (result == ISC_R_SUCCESS) {
2889 			dns_rbt_namefromnode(node, &nodename);
2890 			node->locknum = node->hashval % rbtdb->node_lock_count;
2891 			if (tree == rbtdb->tree) {
2892 				add_empty_wildcards(rbtdb, name, true);
2893 
2894 				if (dns_name_iswildcard(name)) {
2895 					result = add_wildcard_magic(rbtdb, name,
2896 								    true);
2897 					if (result != ISC_R_SUCCESS) {
2898 						RWUNLOCK(&rbtdb->tree_lock,
2899 							 locktype);
2900 						return (result);
2901 					}
2902 				}
2903 			}
2904 			if (tree == rbtdb->nsec3) {
2905 				node->nsec = DNS_RBT_NSEC_NSEC3;
2906 			}
2907 		} else if (result != ISC_R_EXISTS) {
2908 			RWUNLOCK(&rbtdb->tree_lock, locktype);
2909 			return (result);
2910 		}
2911 	}
2912 
2913 	if (tree == rbtdb->nsec3) {
2914 		INSIST(node->nsec == DNS_RBT_NSEC_NSEC3);
2915 	}
2916 
2917 	reactivate_node(rbtdb, node, locktype);
2918 
2919 	RWUNLOCK(&rbtdb->tree_lock, locktype);
2920 
2921 	*nodep = (dns_dbnode_t *)node;
2922 
2923 	return (ISC_R_SUCCESS);
2924 }
2925 
2926 static isc_result_t
2927 findnode(dns_db_t *db, const dns_name_t *name, bool create,
2928 	 dns_dbnode_t **nodep) {
2929 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2930 
2931 	REQUIRE(VALID_RBTDB(rbtdb));
2932 
2933 	return (findnodeintree(rbtdb, rbtdb->tree, name, create, nodep));
2934 }
2935 
2936 static isc_result_t
2937 findnsec3node(dns_db_t *db, const dns_name_t *name, bool create,
2938 	      dns_dbnode_t **nodep) {
2939 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2940 
2941 	REQUIRE(VALID_RBTDB(rbtdb));
2942 
2943 	return (findnodeintree(rbtdb, rbtdb->nsec3, name, create, nodep));
2944 }
2945 
2946 static isc_result_t
2947 zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
2948 	rbtdb_search_t *search = arg;
2949 	rdatasetheader_t *header, *header_next;
2950 	rdatasetheader_t *dname_header, *sigdname_header, *ns_header;
2951 	rdatasetheader_t *found;
2952 	isc_result_t result;
2953 	dns_rbtnode_t *onode;
2954 
2955 	/*
2956 	 * We only want to remember the topmost zone cut, since it's the one
2957 	 * that counts, so we'll just continue if we've already found a
2958 	 * zonecut.
2959 	 */
2960 	if (search->zonecut != NULL) {
2961 		return (DNS_R_CONTINUE);
2962 	}
2963 
2964 	found = NULL;
2965 	result = DNS_R_CONTINUE;
2966 	onode = search->rbtdb->origin_node;
2967 
2968 	NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2969 		  isc_rwlocktype_read);
2970 
2971 	/*
2972 	 * Look for an NS or DNAME rdataset active in our version.
2973 	 */
2974 	ns_header = NULL;
2975 	dname_header = NULL;
2976 	sigdname_header = NULL;
2977 	for (header = node->data; header != NULL; header = header_next) {
2978 		header_next = header->next;
2979 		if (header->type == dns_rdatatype_ns ||
2980 		    header->type == dns_rdatatype_dname ||
2981 		    header->type == RBTDB_RDATATYPE_SIGDNAME)
2982 		{
2983 			do {
2984 				if (header->serial <= search->serial &&
2985 				    !IGNORE(header))
2986 				{
2987 					/*
2988 					 * Is this a "this rdataset doesn't
2989 					 * exist" record?
2990 					 */
2991 					if (NONEXISTENT(header)) {
2992 						header = NULL;
2993 					}
2994 					break;
2995 				} else {
2996 					header = header->down;
2997 				}
2998 			} while (header != NULL);
2999 			if (header != NULL) {
3000 				if (header->type == dns_rdatatype_dname) {
3001 					dname_header = header;
3002 				} else if (header->type ==
3003 					   RBTDB_RDATATYPE_SIGDNAME)
3004 				{
3005 					sigdname_header = header;
3006 				} else if (node != onode ||
3007 					   IS_STUB(search->rbtdb))
3008 				{
3009 					/*
3010 					 * We've found an NS rdataset that
3011 					 * isn't at the origin node.  We check
3012 					 * that they're not at the origin node,
3013 					 * because otherwise we'd erroneously
3014 					 * treat the zone top as if it were
3015 					 * a delegation.
3016 					 */
3017 					ns_header = header;
3018 				}
3019 			}
3020 		}
3021 	}
3022 
3023 	/*
3024 	 * Did we find anything?
3025 	 */
3026 	if (!IS_CACHE(search->rbtdb) && !IS_STUB(search->rbtdb) &&
3027 	    ns_header != NULL)
3028 	{
3029 		/*
3030 		 * Note that NS has precedence over DNAME if both exist
3031 		 * in a zone.  Otherwise DNAME take precedence over NS.
3032 		 */
3033 		found = ns_header;
3034 		search->zonecut_sigrdataset = NULL;
3035 	} else if (dname_header != NULL) {
3036 		found = dname_header;
3037 		search->zonecut_sigrdataset = sigdname_header;
3038 	} else if (ns_header != NULL) {
3039 		found = ns_header;
3040 		search->zonecut_sigrdataset = NULL;
3041 	}
3042 
3043 	if (found != NULL) {
3044 		/*
3045 		 * We increment the reference count on node to ensure that
3046 		 * search->zonecut_rdataset will still be valid later.
3047 		 */
3048 		new_reference(search->rbtdb, node, isc_rwlocktype_read);
3049 		search->zonecut = node;
3050 		search->zonecut_rdataset = found;
3051 		search->need_cleanup = true;
3052 		/*
3053 		 * Since we've found a zonecut, anything beneath it is
3054 		 * glue and is not subject to wildcard matching, so we
3055 		 * may clear search->wild.
3056 		 */
3057 		search->wild = false;
3058 		if ((search->options & DNS_DBFIND_GLUEOK) == 0) {
3059 			/*
3060 			 * If the caller does not want to find glue, then
3061 			 * this is the best answer and the search should
3062 			 * stop now.
3063 			 */
3064 			result = DNS_R_PARTIALMATCH;
3065 		} else {
3066 			dns_name_t *zcname;
3067 
3068 			/*
3069 			 * The search will continue beneath the zone cut.
3070 			 * This may or may not be the best match.  In case it
3071 			 * is, we need to remember the node name.
3072 			 */
3073 			zcname = dns_fixedname_name(&search->zonecut_name);
3074 			dns_name_copynf(name, zcname);
3075 			search->copy_name = true;
3076 		}
3077 	} else {
3078 		/*
3079 		 * There is no zonecut at this node which is active in this
3080 		 * version.
3081 		 *
3082 		 * If this is a "wild" node and the caller hasn't disabled
3083 		 * wildcard matching, remember that we've seen a wild node
3084 		 * in case we need to go searching for wildcard matches
3085 		 * later on.
3086 		 */
3087 		if (node->wild && (search->options & DNS_DBFIND_NOWILD) == 0) {
3088 			search->wild = true;
3089 		}
3090 	}
3091 
3092 	NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3093 		    isc_rwlocktype_read);
3094 
3095 	return (result);
3096 }
3097 
3098 static void
3099 bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rdatasetheader_t *header,
3100 	      isc_stdtime_t now, isc_rwlocktype_t locktype,
3101 	      dns_rdataset_t *rdataset) {
3102 	unsigned char *raw; /* RDATASLAB */
3103 	bool stale = STALE(header);
3104 	bool ancient = ANCIENT(header);
3105 
3106 	/*
3107 	 * Caller must be holding the node reader lock.
3108 	 * XXXJT: technically, we need a writer lock, since we'll increment
3109 	 * the header count below.  However, since the actual counter value
3110 	 * doesn't matter, we prioritize performance here.  (We may want to
3111 	 * use atomic increment when available).
3112 	 */
3113 
3114 	if (rdataset == NULL) {
3115 		return;
3116 	}
3117 
3118 	new_reference(rbtdb, node, locktype);
3119 
3120 	INSIST(rdataset->methods == NULL); /* We must be disassociated. */
3121 
3122 	/*
3123 	 * Mark header stale or ancient if the RRset is no longer active.
3124 	 */
3125 	if (!ACTIVE(header, now)) {
3126 		dns_ttl_t stale_ttl = header->rdh_ttl + rbtdb->serve_stale_ttl;
3127 		/*
3128 		 * If this data is in the stale window keep it and if
3129 		 * DNS_DBFIND_STALEOK is not set we tell the caller to
3130 		 * skip this record.  We skip the records with ZEROTTL
3131 		 * (these records should not be cached anyway).
3132 		 */
3133 
3134 		if (KEEPSTALE(rbtdb) && stale_ttl > now) {
3135 			stale = true;
3136 		} else {
3137 			/*
3138 			 * We are not keeping stale, or it is outside the
3139 			 * stale window. Mark ancient, i.e. ready for cleanup.
3140 			 */
3141 			ancient = true;
3142 		}
3143 	}
3144 
3145 	rdataset->methods = &rdataset_methods;
3146 	rdataset->rdclass = rbtdb->common.rdclass;
3147 	rdataset->type = RBTDB_RDATATYPE_BASE(header->type);
3148 	rdataset->covers = RBTDB_RDATATYPE_EXT(header->type);
3149 	rdataset->ttl = header->rdh_ttl - now;
3150 	rdataset->trust = header->trust;
3151 
3152 	if (NEGATIVE(header)) {
3153 		rdataset->attributes |= DNS_RDATASETATTR_NEGATIVE;
3154 	}
3155 	if (NXDOMAIN(header)) {
3156 		rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
3157 	}
3158 	if (OPTOUT(header)) {
3159 		rdataset->attributes |= DNS_RDATASETATTR_OPTOUT;
3160 	}
3161 	if (PREFETCH(header)) {
3162 		rdataset->attributes |= DNS_RDATASETATTR_PREFETCH;
3163 	}
3164 
3165 	if (stale && !ancient) {
3166 		dns_ttl_t stale_ttl = header->rdh_ttl + rbtdb->serve_stale_ttl;
3167 		if (stale_ttl > now) {
3168 			rdataset->ttl = stale_ttl - now;
3169 		} else {
3170 			rdataset->ttl = 0;
3171 		}
3172 		if (STALE_WINDOW(header)) {
3173 			rdataset->attributes |= DNS_RDATASETATTR_STALE_WINDOW;
3174 		}
3175 		rdataset->attributes |= DNS_RDATASETATTR_STALE;
3176 	} else if (IS_CACHE(rbtdb) && !ACTIVE(header, now)) {
3177 		rdataset->attributes |= DNS_RDATASETATTR_ANCIENT;
3178 		rdataset->ttl = header->rdh_ttl;
3179 	}
3180 
3181 	rdataset->private1 = rbtdb;
3182 	rdataset->private2 = node;
3183 	raw = (unsigned char *)header + sizeof(*header);
3184 	rdataset->private3 = raw;
3185 	rdataset->count = atomic_fetch_add_relaxed(&header->count, 1);
3186 	if (rdataset->count == UINT32_MAX) {
3187 		rdataset->count = 0;
3188 	}
3189 
3190 	/*
3191 	 * Reset iterator state.
3192 	 */
3193 	rdataset->privateuint4 = 0;
3194 	rdataset->private5 = NULL;
3195 
3196 	/*
3197 	 * Add noqname proof.
3198 	 */
3199 	rdataset->private6 = header->noqname;
3200 	if (rdataset->private6 != NULL) {
3201 		rdataset->attributes |= DNS_RDATASETATTR_NOQNAME;
3202 	}
3203 	rdataset->private7 = header->closest;
3204 	if (rdataset->private7 != NULL) {
3205 		rdataset->attributes |= DNS_RDATASETATTR_CLOSEST;
3206 	}
3207 
3208 	/*
3209 	 * Copy out re-signing information.
3210 	 */
3211 	if (RESIGN(header)) {
3212 		rdataset->attributes |= DNS_RDATASETATTR_RESIGN;
3213 		rdataset->resign = (header->resign << 1) | header->resign_lsb;
3214 	} else {
3215 		rdataset->resign = 0;
3216 	}
3217 }
3218 
3219 static isc_result_t
3220 setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
3221 		 dns_name_t *foundname, dns_rdataset_t *rdataset,
3222 		 dns_rdataset_t *sigrdataset) {
3223 	dns_name_t *zcname;
3224 	rbtdb_rdatatype_t type;
3225 	dns_rbtnode_t *node;
3226 
3227 	/*
3228 	 * The caller MUST NOT be holding any node locks.
3229 	 */
3230 
3231 	node = search->zonecut;
3232 	type = search->zonecut_rdataset->type;
3233 
3234 	/*
3235 	 * If we have to set foundname, we do it before anything else.
3236 	 * If we were to set foundname after we had set nodep or bound the
3237 	 * rdataset, then we'd have to undo that work if dns_name_copy()
3238 	 * failed.  By setting foundname first, there's nothing to undo if
3239 	 * we have trouble.
3240 	 */
3241 	if (foundname != NULL && search->copy_name) {
3242 		zcname = dns_fixedname_name(&search->zonecut_name);
3243 		dns_name_copynf(zcname, foundname);
3244 	}
3245 	if (nodep != NULL) {
3246 		/*
3247 		 * Note that we don't have to increment the node's reference
3248 		 * count here because we're going to use the reference we
3249 		 * already have in the search block.
3250 		 */
3251 		*nodep = node;
3252 		search->need_cleanup = false;
3253 	}
3254 	if (rdataset != NULL) {
3255 		NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3256 			  isc_rwlocktype_read);
3257 		bind_rdataset(search->rbtdb, node, search->zonecut_rdataset,
3258 			      search->now, isc_rwlocktype_read, rdataset);
3259 		if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL)
3260 		{
3261 			bind_rdataset(search->rbtdb, node,
3262 				      search->zonecut_sigrdataset, search->now,
3263 				      isc_rwlocktype_read, sigrdataset);
3264 		}
3265 		NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3266 			    isc_rwlocktype_read);
3267 	}
3268 
3269 	if (type == dns_rdatatype_dname) {
3270 		return (DNS_R_DNAME);
3271 	}
3272 	return (DNS_R_DELEGATION);
3273 }
3274 
3275 static bool
3276 valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type,
3277 	   dns_rbtnode_t *node) {
3278 	unsigned char *raw; /* RDATASLAB */
3279 	unsigned int count, size;
3280 	dns_name_t ns_name;
3281 	bool valid = false;
3282 	dns_offsets_t offsets;
3283 	isc_region_t region;
3284 	rdatasetheader_t *header;
3285 
3286 	/*
3287 	 * No additional locking is required.
3288 	 */
3289 
3290 	/*
3291 	 * Valid glue types are A, AAAA, A6.  NS is also a valid glue type
3292 	 * if it occurs at a zone cut, but is not valid below it.
3293 	 */
3294 	if (type == dns_rdatatype_ns) {
3295 		if (node != search->zonecut) {
3296 			return (false);
3297 		}
3298 	} else if (type != dns_rdatatype_a && type != dns_rdatatype_aaaa &&
3299 		   type != dns_rdatatype_a6)
3300 	{
3301 		return (false);
3302 	}
3303 
3304 	header = search->zonecut_rdataset;
3305 	raw = (unsigned char *)header + sizeof(*header);
3306 	count = raw[0] * 256 + raw[1];
3307 	raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
3308 
3309 	while (count > 0) {
3310 		count--;
3311 		size = raw[0] * 256 + raw[1];
3312 		raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
3313 		region.base = raw;
3314 		region.length = size;
3315 		raw += size;
3316 		/*
3317 		 * XXX Until we have rdata structures, we have no choice but
3318 		 * to directly access the rdata format.
3319 		 */
3320 		dns_name_init(&ns_name, offsets);
3321 		dns_name_fromregion(&ns_name, &region);
3322 		if (dns_name_compare(&ns_name, name) == 0) {
3323 			valid = true;
3324 			break;
3325 		}
3326 	}
3327 
3328 	return (valid);
3329 }
3330 
3331 static bool
3332 activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain,
3333 	    const dns_name_t *name) {
3334 	dns_fixedname_t fnext;
3335 	dns_fixedname_t forigin;
3336 	dns_name_t *next;
3337 	dns_name_t *origin;
3338 	dns_name_t prefix;
3339 	dns_rbtdb_t *rbtdb;
3340 	dns_rbtnode_t *node;
3341 	isc_result_t result;
3342 	bool answer = false;
3343 	rdatasetheader_t *header;
3344 
3345 	rbtdb = search->rbtdb;
3346 
3347 	dns_name_init(&prefix, NULL);
3348 	next = dns_fixedname_initname(&fnext);
3349 	origin = dns_fixedname_initname(&forigin);
3350 
3351 	result = dns_rbtnodechain_next(chain, NULL, NULL);
3352 	while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
3353 		node = NULL;
3354 		result = dns_rbtnodechain_current(chain, &prefix, origin,
3355 						  &node);
3356 		if (result != ISC_R_SUCCESS) {
3357 			break;
3358 		}
3359 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3360 			  isc_rwlocktype_read);
3361 		for (header = node->data; header != NULL; header = header->next)
3362 		{
3363 			if (header->serial <= search->serial &&
3364 			    !IGNORE(header) && EXISTS(header))
3365 			{
3366 				break;
3367 			}
3368 		}
3369 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3370 			    isc_rwlocktype_read);
3371 		if (header != NULL) {
3372 			break;
3373 		}
3374 		result = dns_rbtnodechain_next(chain, NULL, NULL);
3375 	}
3376 	if (result == ISC_R_SUCCESS) {
3377 		result = dns_name_concatenate(&prefix, origin, next, NULL);
3378 	}
3379 	if (result == ISC_R_SUCCESS && dns_name_issubdomain(next, name)) {
3380 		answer = true;
3381 	}
3382 	return (answer);
3383 }
3384 
3385 static bool
3386 activeemptynode(rbtdb_search_t *search, const dns_name_t *qname,
3387 		dns_name_t *wname) {
3388 	dns_fixedname_t fnext;
3389 	dns_fixedname_t forigin;
3390 	dns_fixedname_t fprev;
3391 	dns_name_t *next;
3392 	dns_name_t *origin;
3393 	dns_name_t *prev;
3394 	dns_name_t name;
3395 	dns_name_t rname;
3396 	dns_name_t tname;
3397 	dns_rbtdb_t *rbtdb;
3398 	dns_rbtnode_t *node;
3399 	dns_rbtnodechain_t chain;
3400 	bool check_next = true;
3401 	bool check_prev = true;
3402 	bool answer = false;
3403 	isc_result_t result;
3404 	rdatasetheader_t *header;
3405 	unsigned int n;
3406 
3407 	rbtdb = search->rbtdb;
3408 
3409 	dns_name_init(&name, NULL);
3410 	dns_name_init(&tname, NULL);
3411 	dns_name_init(&rname, NULL);
3412 	next = dns_fixedname_initname(&fnext);
3413 	prev = dns_fixedname_initname(&fprev);
3414 	origin = dns_fixedname_initname(&forigin);
3415 
3416 	/*
3417 	 * Find if qname is at or below a empty node.
3418 	 * Use our own copy of the chain.
3419 	 */
3420 
3421 	chain = search->chain;
3422 	do {
3423 		node = NULL;
3424 		result = dns_rbtnodechain_current(&chain, &name, origin, &node);
3425 		if (result != ISC_R_SUCCESS) {
3426 			break;
3427 		}
3428 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3429 			  isc_rwlocktype_read);
3430 		for (header = node->data; header != NULL; header = header->next)
3431 		{
3432 			if (header->serial <= search->serial &&
3433 			    !IGNORE(header) && EXISTS(header))
3434 			{
3435 				break;
3436 			}
3437 		}
3438 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3439 			    isc_rwlocktype_read);
3440 		if (header != NULL) {
3441 			break;
3442 		}
3443 		result = dns_rbtnodechain_prev(&chain, NULL, NULL);
3444 	} while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN);
3445 	if (result == ISC_R_SUCCESS) {
3446 		result = dns_name_concatenate(&name, origin, prev, NULL);
3447 	}
3448 	if (result != ISC_R_SUCCESS) {
3449 		check_prev = false;
3450 	}
3451 
3452 	result = dns_rbtnodechain_next(&chain, NULL, NULL);
3453 	while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
3454 		node = NULL;
3455 		result = dns_rbtnodechain_current(&chain, &name, origin, &node);
3456 		if (result != ISC_R_SUCCESS) {
3457 			break;
3458 		}
3459 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3460 			  isc_rwlocktype_read);
3461 		for (header = node->data; header != NULL; header = header->next)
3462 		{
3463 			if (header->serial <= search->serial &&
3464 			    !IGNORE(header) && EXISTS(header))
3465 			{
3466 				break;
3467 			}
3468 		}
3469 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3470 			    isc_rwlocktype_read);
3471 		if (header != NULL) {
3472 			break;
3473 		}
3474 		result = dns_rbtnodechain_next(&chain, NULL, NULL);
3475 	}
3476 	if (result == ISC_R_SUCCESS) {
3477 		result = dns_name_concatenate(&name, origin, next, NULL);
3478 	}
3479 	if (result != ISC_R_SUCCESS) {
3480 		check_next = false;
3481 	}
3482 
3483 	dns_name_clone(qname, &rname);
3484 
3485 	/*
3486 	 * Remove the wildcard label to find the terminal name.
3487 	 */
3488 	n = dns_name_countlabels(wname);
3489 	dns_name_getlabelsequence(wname, 1, n - 1, &tname);
3490 
3491 	do {
3492 		if ((check_prev && dns_name_issubdomain(prev, &rname)) ||
3493 		    (check_next && dns_name_issubdomain(next, &rname)))
3494 		{
3495 			answer = true;
3496 			break;
3497 		}
3498 		/*
3499 		 * Remove the left hand label.
3500 		 */
3501 		n = dns_name_countlabels(&rname);
3502 		dns_name_getlabelsequence(&rname, 1, n - 1, &rname);
3503 	} while (!dns_name_equal(&rname, &tname));
3504 	return (answer);
3505 }
3506 
3507 static isc_result_t
3508 find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep,
3509 	      const dns_name_t *qname) {
3510 	unsigned int i, j;
3511 	dns_rbtnode_t *node, *level_node, *wnode;
3512 	rdatasetheader_t *header;
3513 	isc_result_t result = ISC_R_NOTFOUND;
3514 	dns_name_t name;
3515 	dns_name_t *wname;
3516 	dns_fixedname_t fwname;
3517 	dns_rbtdb_t *rbtdb;
3518 	bool done, wild, active;
3519 	dns_rbtnodechain_t wchain;
3520 
3521 	/*
3522 	 * Caller must be holding the tree lock and MUST NOT be holding
3523 	 * any node locks.
3524 	 */
3525 
3526 	/*
3527 	 * Examine each ancestor level.  If the level's wild bit
3528 	 * is set, then construct the corresponding wildcard name and
3529 	 * search for it.  If the wildcard node exists, and is active in
3530 	 * this version, we're done.  If not, then we next check to see
3531 	 * if the ancestor is active in this version.  If so, then there
3532 	 * can be no possible wildcard match and again we're done.  If not,
3533 	 * continue the search.
3534 	 */
3535 
3536 	rbtdb = search->rbtdb;
3537 	i = search->chain.level_matches;
3538 	done = false;
3539 	node = *nodep;
3540 	do {
3541 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3542 			  isc_rwlocktype_read);
3543 
3544 		/*
3545 		 * First we try to figure out if this node is active in
3546 		 * the search's version.  We do this now, even though we
3547 		 * may not need the information, because it simplifies the
3548 		 * locking and code flow.
3549 		 */
3550 		for (header = node->data; header != NULL; header = header->next)
3551 		{
3552 			if (header->serial <= search->serial &&
3553 			    !IGNORE(header) && EXISTS(header) &&
3554 			    !ANCIENT(header))
3555 			{
3556 				break;
3557 			}
3558 		}
3559 		if (header != NULL) {
3560 			active = true;
3561 		} else {
3562 			active = false;
3563 		}
3564 
3565 		if (node->wild) {
3566 			wild = true;
3567 		} else {
3568 			wild = false;
3569 		}
3570 
3571 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3572 			    isc_rwlocktype_read);
3573 
3574 		if (wild) {
3575 			/*
3576 			 * Construct the wildcard name for this level.
3577 			 */
3578 			dns_name_init(&name, NULL);
3579 			dns_rbt_namefromnode(node, &name);
3580 			wname = dns_fixedname_initname(&fwname);
3581 			result = dns_name_concatenate(dns_wildcardname, &name,
3582 						      wname, NULL);
3583 			j = i;
3584 			while (result == ISC_R_SUCCESS && j != 0) {
3585 				j--;
3586 				level_node = search->chain.levels[j];
3587 				dns_name_init(&name, NULL);
3588 				dns_rbt_namefromnode(level_node, &name);
3589 				result = dns_name_concatenate(wname, &name,
3590 							      wname, NULL);
3591 			}
3592 			if (result != ISC_R_SUCCESS) {
3593 				break;
3594 			}
3595 
3596 			wnode = NULL;
3597 			dns_rbtnodechain_init(&wchain);
3598 			result = dns_rbt_findnode(
3599 				rbtdb->tree, wname, NULL, &wnode, &wchain,
3600 				DNS_RBTFIND_EMPTYDATA, NULL, NULL);
3601 			if (result == ISC_R_SUCCESS) {
3602 				nodelock_t *lock;
3603 
3604 				/*
3605 				 * We have found the wildcard node.  If it
3606 				 * is active in the search's version, we're
3607 				 * done.
3608 				 */
3609 				lock = &rbtdb->node_locks[wnode->locknum].lock;
3610 				NODE_LOCK(lock, isc_rwlocktype_read);
3611 				for (header = wnode->data; header != NULL;
3612 				     header = header->next)
3613 				{
3614 					if (header->serial <= search->serial &&
3615 					    !IGNORE(header) && EXISTS(header) &&
3616 					    !ANCIENT(header))
3617 					{
3618 						break;
3619 					}
3620 				}
3621 				NODE_UNLOCK(lock, isc_rwlocktype_read);
3622 				if (header != NULL ||
3623 				    activeempty(search, &wchain, wname))
3624 				{
3625 					if (activeemptynode(search, qname,
3626 							    wname))
3627 					{
3628 						return (ISC_R_NOTFOUND);
3629 					}
3630 					/*
3631 					 * The wildcard node is active!
3632 					 *
3633 					 * Note: result is still ISC_R_SUCCESS
3634 					 * so we don't have to set it.
3635 					 */
3636 					*nodep = wnode;
3637 					break;
3638 				}
3639 			} else if (result != ISC_R_NOTFOUND &&
3640 				   result != DNS_R_PARTIALMATCH)
3641 			{
3642 				/*
3643 				 * An error has occurred.  Bail out.
3644 				 */
3645 				break;
3646 			}
3647 		}
3648 
3649 		if (active) {
3650 			/*
3651 			 * The level node is active.  Any wildcarding
3652 			 * present at higher levels has no
3653 			 * effect and we're done.
3654 			 */
3655 			result = ISC_R_NOTFOUND;
3656 			break;
3657 		}
3658 
3659 		if (i > 0) {
3660 			i--;
3661 			node = search->chain.levels[i];
3662 		} else {
3663 			done = true;
3664 		}
3665 	} while (!done);
3666 
3667 	return (result);
3668 }
3669 
3670 static bool
3671 matchparams(rdatasetheader_t *header, rbtdb_search_t *search) {
3672 	dns_rdata_t rdata = DNS_RDATA_INIT;
3673 	dns_rdata_nsec3_t nsec3;
3674 	unsigned char *raw; /* RDATASLAB */
3675 	unsigned int rdlen, count;
3676 	isc_region_t region;
3677 	isc_result_t result;
3678 
3679 	REQUIRE(header->type == dns_rdatatype_nsec3);
3680 
3681 	raw = (unsigned char *)header + sizeof(*header);
3682 	count = raw[0] * 256 + raw[1]; /* count */
3683 	raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
3684 
3685 	while (count-- > 0) {
3686 		rdlen = raw[0] * 256 + raw[1];
3687 		raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
3688 		region.base = raw;
3689 		region.length = rdlen;
3690 		dns_rdata_fromregion(&rdata, search->rbtdb->common.rdclass,
3691 				     dns_rdatatype_nsec3, &region);
3692 		raw += rdlen;
3693 		result = dns_rdata_tostruct(&rdata, &nsec3, NULL);
3694 		INSIST(result == ISC_R_SUCCESS);
3695 		if (nsec3.hash == search->rbtversion->hash &&
3696 		    nsec3.iterations == search->rbtversion->iterations &&
3697 		    nsec3.salt_length == search->rbtversion->salt_length &&
3698 		    memcmp(nsec3.salt, search->rbtversion->salt,
3699 			   nsec3.salt_length) == 0)
3700 		{
3701 			return (true);
3702 		}
3703 		dns_rdata_reset(&rdata);
3704 	}
3705 	return (false);
3706 }
3707 
3708 /*
3709  * Find node of the NSEC/NSEC3 record that is 'name'.
3710  */
3711 static isc_result_t
3712 previous_closest_nsec(dns_rdatatype_t type, rbtdb_search_t *search,
3713 		      dns_name_t *name, dns_name_t *origin,
3714 		      dns_rbtnode_t **nodep, dns_rbtnodechain_t *nsecchain,
3715 		      bool *firstp) {
3716 	dns_fixedname_t ftarget;
3717 	dns_name_t *target;
3718 	dns_rbtnode_t *nsecnode;
3719 	isc_result_t result;
3720 
3721 	REQUIRE(nodep != NULL && *nodep == NULL);
3722 	REQUIRE(type == dns_rdatatype_nsec3 || firstp != NULL);
3723 
3724 	if (type == dns_rdatatype_nsec3) {
3725 		result = dns_rbtnodechain_prev(&search->chain, NULL, NULL);
3726 		if (result != ISC_R_SUCCESS && result != DNS_R_NEWORIGIN) {
3727 			return (result);
3728 		}
3729 		result = dns_rbtnodechain_current(&search->chain, name, origin,
3730 						  nodep);
3731 		return (result);
3732 	}
3733 
3734 	target = dns_fixedname_initname(&ftarget);
3735 
3736 	for (;;) {
3737 		if (*firstp) {
3738 			/*
3739 			 * Construct the name of the second node to check.
3740 			 * It is the first node sought in the NSEC tree.
3741 			 */
3742 			*firstp = false;
3743 			dns_rbtnodechain_init(nsecchain);
3744 			result = dns_name_concatenate(name, origin, target,
3745 						      NULL);
3746 			if (result != ISC_R_SUCCESS) {
3747 				return (result);
3748 			}
3749 			nsecnode = NULL;
3750 			result = dns_rbt_findnode(
3751 				search->rbtdb->nsec, target, NULL, &nsecnode,
3752 				nsecchain, DNS_RBTFIND_EMPTYDATA, NULL, NULL);
3753 			if (result == ISC_R_SUCCESS) {
3754 				/*
3755 				 * Since this was the first loop, finding the
3756 				 * name in the NSEC tree implies that the first
3757 				 * node checked in the main tree had an
3758 				 * unacceptable NSEC record.
3759 				 * Try the previous node in the NSEC tree.
3760 				 */
3761 				result = dns_rbtnodechain_prev(nsecchain, name,
3762 							       origin);
3763 				if (result == DNS_R_NEWORIGIN) {
3764 					result = ISC_R_SUCCESS;
3765 				}
3766 			} else if (result == ISC_R_NOTFOUND ||
3767 				   result == DNS_R_PARTIALMATCH)
3768 			{
3769 				result = dns_rbtnodechain_current(
3770 					nsecchain, name, origin, NULL);
3771 				if (result == ISC_R_NOTFOUND) {
3772 					result = ISC_R_NOMORE;
3773 				}
3774 			}
3775 		} else {
3776 			/*
3777 			 * This is a second or later trip through the auxiliary
3778 			 * tree for the name of a third or earlier NSEC node in
3779 			 * the main tree.  Previous trips through the NSEC tree
3780 			 * must have found nodes in the main tree with NSEC
3781 			 * records.  Perhaps they lacked signature records.
3782 			 */
3783 			result = dns_rbtnodechain_prev(nsecchain, name, origin);
3784 			if (result == DNS_R_NEWORIGIN) {
3785 				result = ISC_R_SUCCESS;
3786 			}
3787 		}
3788 		if (result != ISC_R_SUCCESS) {
3789 			return (result);
3790 		}
3791 
3792 		/*
3793 		 * Construct the name to seek in the main tree.
3794 		 */
3795 		result = dns_name_concatenate(name, origin, target, NULL);
3796 		if (result != ISC_R_SUCCESS) {
3797 			return (result);
3798 		}
3799 
3800 		*nodep = NULL;
3801 		result = dns_rbt_findnode(search->rbtdb->tree, target, NULL,
3802 					  nodep, &search->chain,
3803 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
3804 		if (result == ISC_R_SUCCESS) {
3805 			return (result);
3806 		}
3807 
3808 		/*
3809 		 * There should always be a node in the main tree with the
3810 		 * same name as the node in the auxiliary NSEC tree, except for
3811 		 * nodes in the auxiliary tree that are awaiting deletion.
3812 		 */
3813 		if (result != DNS_R_PARTIALMATCH && result != ISC_R_NOTFOUND) {
3814 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
3815 				      DNS_LOGMODULE_CACHE, ISC_LOG_ERROR,
3816 				      "previous_closest_nsec(): %s",
3817 				      isc_result_totext(result));
3818 			return (DNS_R_BADDB);
3819 		}
3820 	}
3821 }
3822 
3823 /*
3824  * Find the NSEC/NSEC3 which is or before the current point on the
3825  * search chain.  For NSEC3 records only NSEC3 records that match the
3826  * current NSEC3PARAM record are considered.
3827  */
3828 static isc_result_t
3829 find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
3830 		  dns_name_t *foundname, dns_rdataset_t *rdataset,
3831 		  dns_rdataset_t *sigrdataset, dns_rbt_t *tree,
3832 		  dns_db_secure_t secure) {
3833 	dns_rbtnode_t *node, *prevnode;
3834 	rdatasetheader_t *header, *header_next, *found, *foundsig;
3835 	dns_rbtnodechain_t nsecchain;
3836 	bool empty_node;
3837 	isc_result_t result;
3838 	dns_fixedname_t fname, forigin;
3839 	dns_name_t *name, *origin;
3840 	dns_rdatatype_t type;
3841 	rbtdb_rdatatype_t sigtype;
3842 	bool wraps;
3843 	bool first = true;
3844 	bool need_sig = (secure == dns_db_secure);
3845 
3846 	if (tree == search->rbtdb->nsec3) {
3847 		type = dns_rdatatype_nsec3;
3848 		sigtype = RBTDB_RDATATYPE_SIGNSEC3;
3849 		wraps = true;
3850 	} else {
3851 		type = dns_rdatatype_nsec;
3852 		sigtype = RBTDB_RDATATYPE_SIGNSEC;
3853 		wraps = false;
3854 	}
3855 
3856 	/*
3857 	 * Use the auxiliary tree only starting with the second node in the
3858 	 * hope that the original node will be right much of the time.
3859 	 */
3860 	name = dns_fixedname_initname(&fname);
3861 	origin = dns_fixedname_initname(&forigin);
3862 again:
3863 	node = NULL;
3864 	prevnode = NULL;
3865 	result = dns_rbtnodechain_current(&search->chain, name, origin, &node);
3866 	if (result != ISC_R_SUCCESS) {
3867 		return (result);
3868 	}
3869 	do {
3870 		NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3871 			  isc_rwlocktype_read);
3872 		found = NULL;
3873 		foundsig = NULL;
3874 		empty_node = true;
3875 		for (header = node->data; header != NULL; header = header_next)
3876 		{
3877 			header_next = header->next;
3878 			/*
3879 			 * Look for an active, extant NSEC or RRSIG NSEC.
3880 			 */
3881 			do {
3882 				if (header->serial <= search->serial &&
3883 				    !IGNORE(header))
3884 				{
3885 					/*
3886 					 * Is this a "this rdataset doesn't
3887 					 * exist" record?
3888 					 */
3889 					if (NONEXISTENT(header)) {
3890 						header = NULL;
3891 					}
3892 					break;
3893 				} else {
3894 					header = header->down;
3895 				}
3896 			} while (header != NULL);
3897 			if (header != NULL) {
3898 				/*
3899 				 * We now know that there is at least one
3900 				 * active rdataset at this node.
3901 				 */
3902 				empty_node = false;
3903 				if (header->type == type) {
3904 					found = header;
3905 					if (foundsig != NULL) {
3906 						break;
3907 					}
3908 				} else if (header->type == sigtype) {
3909 					foundsig = header;
3910 					if (found != NULL) {
3911 						break;
3912 					}
3913 				}
3914 			}
3915 		}
3916 		if (!empty_node) {
3917 			if (found != NULL && search->rbtversion->havensec3 &&
3918 			    found->type == dns_rdatatype_nsec3 &&
3919 			    !matchparams(found, search))
3920 			{
3921 				empty_node = true;
3922 				found = NULL;
3923 				foundsig = NULL;
3924 				result = previous_closest_nsec(
3925 					type, search, name, origin, &prevnode,
3926 					NULL, NULL);
3927 			} else if (found != NULL &&
3928 				   (foundsig != NULL || !need_sig))
3929 			{
3930 				/*
3931 				 * We've found the right NSEC/NSEC3 record.
3932 				 *
3933 				 * Note: for this to really be the right
3934 				 * NSEC record, it's essential that the NSEC
3935 				 * records of any nodes obscured by a zone
3936 				 * cut have been removed; we assume this is
3937 				 * the case.
3938 				 */
3939 				result = dns_name_concatenate(name, origin,
3940 							      foundname, NULL);
3941 				if (result == ISC_R_SUCCESS) {
3942 					if (nodep != NULL) {
3943 						new_reference(
3944 							search->rbtdb, node,
3945 							isc_rwlocktype_read);
3946 						*nodep = node;
3947 					}
3948 					bind_rdataset(search->rbtdb, node,
3949 						      found, search->now,
3950 						      isc_rwlocktype_read,
3951 						      rdataset);
3952 					if (foundsig != NULL) {
3953 						bind_rdataset(
3954 							search->rbtdb, node,
3955 							foundsig, search->now,
3956 							isc_rwlocktype_read,
3957 							sigrdataset);
3958 					}
3959 				}
3960 			} else if (found == NULL && foundsig == NULL) {
3961 				/*
3962 				 * This node is active, but has no NSEC or
3963 				 * RRSIG NSEC.  That means it's glue or
3964 				 * other obscured zone data that isn't
3965 				 * relevant for our search.  Treat the
3966 				 * node as if it were empty and keep looking.
3967 				 */
3968 				empty_node = true;
3969 				result = previous_closest_nsec(
3970 					type, search, name, origin, &prevnode,
3971 					&nsecchain, &first);
3972 			} else {
3973 				/*
3974 				 * We found an active node, but either the
3975 				 * NSEC or the RRSIG NSEC is missing.  This
3976 				 * shouldn't happen.
3977 				 */
3978 				result = DNS_R_BADDB;
3979 			}
3980 		} else {
3981 			/*
3982 			 * This node isn't active.  We've got to keep
3983 			 * looking.
3984 			 */
3985 			result = previous_closest_nsec(type, search, name,
3986 						       origin, &prevnode,
3987 						       &nsecchain, &first);
3988 		}
3989 		NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3990 			    isc_rwlocktype_read);
3991 		node = prevnode;
3992 		prevnode = NULL;
3993 	} while (empty_node && result == ISC_R_SUCCESS);
3994 
3995 	if (!first) {
3996 		dns_rbtnodechain_invalidate(&nsecchain);
3997 	}
3998 
3999 	if (result == ISC_R_NOMORE && wraps) {
4000 		result = dns_rbtnodechain_last(&search->chain, tree, NULL,
4001 					       NULL);
4002 		if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
4003 			wraps = false;
4004 			goto again;
4005 		}
4006 	}
4007 
4008 	/*
4009 	 * If the result is ISC_R_NOMORE, then we got to the beginning of
4010 	 * the database and didn't find a NSEC record.  This shouldn't
4011 	 * happen.
4012 	 */
4013 	if (result == ISC_R_NOMORE) {
4014 		result = DNS_R_BADDB;
4015 	}
4016 
4017 	return (result);
4018 }
4019 
/*
 * Zone-database lookup of 'type' at 'name'.
 *
 * 'version' selects the version searched; when it is NULL the current
 * version is attached here and closed again before returning.  'now' is
 * ignored.  The lookup runs in the NSEC3 tree when DNS_DBFIND_FORCENSEC3
 * is set in 'options', otherwise in the main tree, and holds the tree
 * lock for reading from the dns_rbt_findnode() call until 'tree_exit'.
 *
 * Results assigned directly in this function:
 *	ISC_R_SUCCESS	the requested type was found (also used for NSEC,
 *			NSEC3 and KEY found at the zone cut node itself)
 *	DNS_R_CNAME	a CNAME was found instead of the requested type
 *	DNS_R_GLUE	the data found is at or below a zone cut
 *	DNS_R_ZONECUT	an ANY query matched the zone cut node itself
 *	DNS_R_NXRRSET	the node has active data but not the requested type
 *	DNS_R_NXDOMAIN / DNS_R_EMPTYNAME
 *			no usable exact match; EMPTYNAME when activeempty()
 *			reported true
 *	DNS_R_EMPTYWILD	wildcard match lacking NSEC/RRSIG NSEC in a secure
 *			zone, after find_closest_nsec() succeeded
 *	DNS_R_BADDB	an NSEC (or its signature) that must be present is
 *			missing
 * Any other value comes unchanged from dns_rbt_findnode(),
 * setup_delegation(), find_wildcard() or find_closest_nsec().
 *
 * When 'nodep' is non-NULL it receives a referenced node on the
 * found and NXRRSET paths handled here.  If a zone cut reference was
 * taken but not handed to the caller, it is released after 'tree_exit'.
 */
4020 static isc_result_t
4021 zone_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
4022 	  dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
4023 	  dns_dbnode_t **nodep, dns_name_t *foundname, dns_rdataset_t *rdataset,
4024 	  dns_rdataset_t *sigrdataset) {
4025 	dns_rbtnode_t *node = NULL;
4026 	isc_result_t result;
4027 	rbtdb_search_t search;
4028 	bool cname_ok = true;
4029 	bool close_version = false;
4030 	bool maybe_zonecut = false;
4031 	bool at_zonecut = false;
4032 	bool wild;
4033 	bool empty_node;
4034 	rdatasetheader_t *header, *header_next, *found, *nsecheader;
4035 	rdatasetheader_t *foundsig, *cnamesig, *nsecsig;
4036 	rbtdb_rdatatype_t sigtype;
4037 	bool active;
4038 	nodelock_t *lock;
4039 	dns_rbt_t *tree;
4040 
4041 	search.rbtdb = (dns_rbtdb_t *)db;
4042 
4043 	REQUIRE(VALID_RBTDB(search.rbtdb));
4044 	INSIST(version == NULL ||
4045 	       ((rbtdb_version_t *)version)->rbtdb == (dns_rbtdb_t *)db);
4046 
4047 	/*
4048 	 * We don't care about 'now'.
4049 	 */
4050 	UNUSED(now);
4051 
4052 	/*
4053 	 * If the caller didn't supply a version, attach to the current
4054 	 * version.
4055 	 */
4056 	if (version == NULL) {
4057 		currentversion(db, &version);
4058 		close_version = true;
4059 	}
4060 
4061 	search.rbtversion = version;
4062 	search.serial = search.rbtversion->serial;
4063 	search.options = options;
4064 	search.copy_name = false;
4065 	search.need_cleanup = false;
4066 	search.wild = false;
4067 	search.zonecut = NULL;
4068 	dns_fixedname_init(&search.zonecut_name);
4069 	dns_rbtnodechain_init(&search.chain);
4070 	search.now = 0;
4071 
4072 	/*
4073 	 * 'wild' will be true iff. we've matched a wildcard.
4074 	 */
4075 	wild = false;
4076 
4077 	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4078 
4079 	/*
4080 	 * Search down from the root of the tree.  If, while going down, we
4081 	 * encounter a callback node, zone_zonecut_callback() will search the
4082 	 * rdatasets at the zone cut for active DNAME or NS rdatasets.
4083 	 */
4084 	tree = (options & DNS_DBFIND_FORCENSEC3) != 0 ? search.rbtdb->nsec3
4085 						      : search.rbtdb->tree;
4086 	result = dns_rbt_findnode(tree, name, foundname, &node, &search.chain,
4087 				  DNS_RBTFIND_EMPTYDATA, zone_zonecut_callback,
4088 				  &search);
4089 
4090 	if (result == DNS_R_PARTIALMATCH) {
	/*
	 * Also entered by 'goto partial_match' from the exact-match code
	 * below (empty node, or NSEC3 parameters that do not match); those
	 * jumps release the node lock first.
	 */
4091 	partial_match:
4092 		if (search.zonecut != NULL) {
4093 			result = setup_delegation(&search, nodep, foundname,
4094 						  rdataset, sigrdataset);
4095 			goto tree_exit;
4096 		}
4097 
4098 		if (search.wild) {
4099 			/*
4100 			 * At least one of the levels in the search chain
4101 			 * potentially has a wildcard.  For each such level,
4102 			 * we must see if there's a matching wildcard active
4103 			 * in the current version.
4104 			 */
4105 			result = find_wildcard(&search, &node, name);
4106 			if (result == ISC_R_SUCCESS) {
4107 				dns_name_copynf(name, foundname);
4108 				wild = true;
4109 				goto found;
4110 			} else if (result != ISC_R_NOTFOUND) {
4111 				goto tree_exit;
4112 			}
4113 		}
4114 
4115 		active = false;
4116 		if ((options & DNS_DBFIND_FORCENSEC3) == 0) {
4117 			/*
4118 			 * The NSEC3 tree won't have empty nodes,
4119 			 * so it isn't necessary to check for them.
4120 			 */
4121 			dns_rbtnodechain_t chain = search.chain;
4122 			active = activeempty(&search, &chain, name);
4123 		}
4124 
4125 		/*
4126 		 * If we're here, then the name does not exist, is not
4127 		 * beneath a zonecut, and there's no matching wildcard.
4128 		 */
4129 		if ((search.rbtversion->secure == dns_db_secure &&
4130 		     !search.rbtversion->havensec3) ||
4131 		    (search.options & DNS_DBFIND_FORCENSEC) != 0 ||
4132 		    (search.options & DNS_DBFIND_FORCENSEC3) != 0)
4133 		{
4134 			result = find_closest_nsec(&search, nodep, foundname,
4135 						   rdataset, sigrdataset, tree,
4136 						   search.rbtversion->secure);
4137 			if (result == ISC_R_SUCCESS) {
4138 				result = active ? DNS_R_EMPTYNAME
4139 						: DNS_R_NXDOMAIN;
4140 			}
4141 		} else {
4142 			result = active ? DNS_R_EMPTYNAME : DNS_R_NXDOMAIN;
4143 		}
4144 		goto tree_exit;
4145 	} else if (result != ISC_R_SUCCESS) {
4146 		goto tree_exit;
4147 	}
4148 
4149 found:
4150 	/*
4151 	 * We have found a node whose name is the desired name, or we
4152 	 * have matched a wildcard.
4153 	 */
4154 
4155 	if (search.zonecut != NULL) {
4156 		/*
4157 		 * If we're beneath a zone cut, we don't want to look for
4158 		 * CNAMEs because they're not legitimate zone glue.
4159 		 */
4160 		cname_ok = false;
4161 	} else {
4162 		/*
4163 		 * The node may be a zone cut itself.  If it might be one,
4164 		 * make sure we check for it later.
4165 		 *
4166 		 * DS records live above the zone cut in ordinary zone so
4167 		 * we want to ignore any referral.
4168 		 *
4169 		 * Stub zones don't have anything "above" the delegation so
4170 		 * we always return a referral.
4171 		 */
4172 		if (node->find_callback &&
4173 		    ((node != search.rbtdb->origin_node &&
4174 		      !dns_rdatatype_atparent(type)) ||
4175 		     IS_STUB(search.rbtdb)))
4176 		{
4177 			maybe_zonecut = true;
4178 		}
4179 	}
4180 
4181 	/*
4182 	 * Certain DNSSEC types are not subject to CNAME matching
4183 	 * (RFC4035, section 2.5 and RFC3007).
4184 	 *
4185 	 * We don't check for RRSIG, because we don't store RRSIG records
4186 	 * directly.
4187 	 */
4188 	if (type == dns_rdatatype_key || type == dns_rdatatype_nsec) {
4189 		cname_ok = false;
4190 	}
4191 
4192 	/*
4193 	 * We now go looking for rdata...
4194 	 */
4195 
4196 	lock = &search.rbtdb->node_locks[node->locknum].lock;
4197 	NODE_LOCK(lock, isc_rwlocktype_read);
4198 
4199 	found = NULL;
4200 	foundsig = NULL;
4201 	sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
4202 	nsecheader = NULL;
4203 	nsecsig = NULL;
4204 	cnamesig = NULL;
4205 	empty_node = true;
4206 	for (header = node->data; header != NULL; header = header_next) {
4207 		header_next = header->next;
4208 		/*
4209 		 * Look for an active, extant rdataset.
4210 		 */
		/*
		 * Follow the 'down' chain until reaching a header whose
		 * serial is not newer than the version being searched and
		 * that is not marked IGNORE; 'header' ends up NULL when no
		 * such header exists or when it is a NONEXISTENT marker.
		 */
4211 		do {
4212 			if (header->serial <= search.serial && !IGNORE(header))
4213 			{
4214 				/*
4215 				 * Is this a "this rdataset doesn't
4216 				 * exist" record?
4217 				 */
4218 				if (NONEXISTENT(header)) {
4219 					header = NULL;
4220 				}
4221 				break;
4222 			} else {
4223 				header = header->down;
4224 			}
4225 		} while (header != NULL);
4226 		if (header != NULL) {
4227 			/*
4228 			 * We now know that there is at least one active
4229 			 * rdataset at this node.
4230 			 */
4231 			empty_node = false;
4232 
4233 			/*
4234 			 * Do special zone cut handling, if requested.
4235 			 */
4236 			if (maybe_zonecut && header->type == dns_rdatatype_ns) {
4237 				/*
4238 				 * We increment the reference count on node to
4239 				 * ensure that search->zonecut_rdataset will
4240 				 * still be valid later.
4241 				 */
4242 				new_reference(search.rbtdb, node,
4243 					      isc_rwlocktype_read);
4244 				search.zonecut = node;
4245 				search.zonecut_rdataset = header;
4246 				search.zonecut_sigrdataset = NULL;
4247 				search.need_cleanup = true;
4248 				maybe_zonecut = false;
4249 				at_zonecut = true;
4250 				/*
4251 				 * It is not clear if KEY should still be
4252 				 * allowed at the parent side of the zone
4253 				 * cut or not.  It is needed for RFC3007
4254 				 * validated updates.
4255 				 */
4256 				if ((search.options & DNS_DBFIND_GLUEOK) == 0 &&
4257 				    type != dns_rdatatype_nsec &&
4258 				    type != dns_rdatatype_key)
4259 				{
4260 					/*
4261 					 * Glue is not OK, but any answer we
4262 					 * could return would be glue.  Return
4263 					 * the delegation.
4264 					 */
4265 					found = NULL;
4266 					break;
4267 				}
4268 				if (found != NULL && foundsig != NULL) {
4269 					break;
4270 				}
4271 			}
4272 
4273 			/*
4274 			 * If the NSEC3 record doesn't match the chain
4275 			 * we are using behave as if it isn't here.
4276 			 */
4277 			if (header->type == dns_rdatatype_nsec3 &&
4278 			    !matchparams(header, &search))
4279 			{
4280 				NODE_UNLOCK(lock, isc_rwlocktype_read);
4281 				goto partial_match;
4282 			}
4283 			/*
4284 			 * If we found a type we were looking for,
4285 			 * remember it.
4286 			 */
4287 			if (header->type == type || type == dns_rdatatype_any ||
4288 			    (header->type == dns_rdatatype_cname && cname_ok))
4289 			{
4290 				/*
4291 				 * We've found the answer!
4292 				 */
4293 				found = header;
4294 				if (header->type == dns_rdatatype_cname &&
4295 				    cname_ok)
4296 				{
4297 					/*
4298 					 * We may be finding a CNAME instead
4299 					 * of the desired type.
4300 					 *
4301 					 * If we've already got the CNAME RRSIG,
4302 					 * use it, otherwise change sigtype
4303 					 * so that we find it.
4304 					 */
4305 					if (cnamesig != NULL) {
4306 						foundsig = cnamesig;
4307 					} else {
4308 						sigtype =
4309 							RBTDB_RDATATYPE_SIGCNAME;
4310 					}
4311 				}
4312 				/*
4313 				 * If we've got all we need, end the search.
4314 				 */
4315 				if (!maybe_zonecut && foundsig != NULL) {
4316 					break;
4317 				}
4318 			} else if (header->type == sigtype) {
4319 				/*
4320 				 * We've found the RRSIG rdataset for our
4321 				 * target type.  Remember it.
4322 				 */
4323 				foundsig = header;
4324 				/*
4325 				 * If we've got all we need, end the search.
4326 				 */
4327 				if (!maybe_zonecut && found != NULL) {
4328 					break;
4329 				}
4330 			} else if (header->type == dns_rdatatype_nsec &&
4331 				   !search.rbtversion->havensec3)
4332 			{
4333 				/*
4334 				 * Remember a NSEC rdataset even if we're
4335 				 * not specifically looking for it, because
4336 				 * we might need it later.
4337 				 */
4338 				nsecheader = header;
4339 			} else if (header->type == RBTDB_RDATATYPE_SIGNSEC &&
4340 				   !search.rbtversion->havensec3)
4341 			{
4342 				/*
4343 				 * If we need the NSEC rdataset, we'll also
4344 				 * need its signature.
4345 				 */
4346 				nsecsig = header;
4347 			} else if (cname_ok &&
4348 				   header->type == RBTDB_RDATATYPE_SIGCNAME)
4349 			{
4350 				/*
4351 				 * If we get a CNAME match, we'll also need
4352 				 * its signature.
4353 				 */
4354 				cnamesig = header;
4355 			}
4356 		}
4357 	}
4358 
4359 	if (empty_node) {
4360 		/*
4361 		 * We have an exact match for the name, but there are no
4362 		 * active rdatasets in the desired version.  That means that
4363 		 * this node doesn't exist in the desired version, and that
4364 		 * we really have a partial match.
4365 		 */
4366 		if (!wild) {
4367 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4368 			goto partial_match;
4369 		}
4370 	}
4371 
4372 	/*
4373 	 * If we didn't find what we were looking for...
4374 	 */
4375 	if (found == NULL) {
4376 		if (search.zonecut != NULL) {
4377 			/*
4378 			 * We were trying to find glue at a node beneath a
4379 			 * zone cut, but didn't.
4380 			 *
4381 			 * Return the delegation.
4382 			 */
4383 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4384 			result = setup_delegation(&search, nodep, foundname,
4385 						  rdataset, sigrdataset);
4386 			goto tree_exit;
4387 		}
4388 		/*
4389 		 * The desired type doesn't exist.
4390 		 */
4391 		result = DNS_R_NXRRSET;
4392 		if (search.rbtversion->secure == dns_db_secure &&
4393 		    !search.rbtversion->havensec3 &&
4394 		    (nsecheader == NULL || nsecsig == NULL))
4395 		{
4396 			/*
4397 			 * The zone is secure but there's no NSEC,
4398 			 * or the NSEC has no signature!
4399 			 */
4400 			if (!wild) {
4401 				result = DNS_R_BADDB;
4402 				goto node_exit;
4403 			}
4404 
4405 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4406 			result = find_closest_nsec(&search, nodep, foundname,
4407 						   rdataset, sigrdataset,
4408 						   search.rbtdb->tree,
4409 						   search.rbtversion->secure);
4410 			if (result == ISC_R_SUCCESS) {
4411 				result = DNS_R_EMPTYWILD;
4412 			}
4413 			goto tree_exit;
4414 		}
4415 		if ((search.options & DNS_DBFIND_FORCENSEC) != 0 &&
4416 		    nsecheader == NULL)
4417 		{
4418 			/*
4419 			 * There's no NSEC record, and we were told
4420 			 * to find one.
4421 			 */
4422 			result = DNS_R_BADDB;
4423 			goto node_exit;
4424 		}
4425 		if (nodep != NULL) {
4426 			new_reference(search.rbtdb, node, isc_rwlocktype_read);
4427 			*nodep = node;
4428 		}
4429 		if ((search.rbtversion->secure == dns_db_secure &&
4430 		     !search.rbtversion->havensec3) ||
4431 		    (search.options & DNS_DBFIND_FORCENSEC) != 0)
4432 		{
4433 			bind_rdataset(search.rbtdb, node, nsecheader, 0,
4434 				      isc_rwlocktype_read, rdataset);
4435 			if (nsecsig != NULL) {
4436 				bind_rdataset(search.rbtdb, node, nsecsig, 0,
4437 					      isc_rwlocktype_read, sigrdataset);
4438 			}
4439 		}
4440 		if (wild) {
4441 			foundname->attributes |= DNS_NAMEATTR_WILDCARD;
4442 		}
4443 		goto node_exit;
4444 	}
4445 
4446 	/*
4447 	 * We found what we were looking for, or we found a CNAME.
4448 	 */
4449 
4450 	if (type != found->type && type != dns_rdatatype_any &&
4451 	    found->type == dns_rdatatype_cname)
4452 	{
4453 		/*
4454 		 * We weren't doing an ANY query and we found a CNAME instead
4455 		 * of the type we were looking for, so we need to indicate
4456 		 * that result to the caller.
4457 		 */
4458 		result = DNS_R_CNAME;
4459 	} else if (search.zonecut != NULL) {
4460 		/*
4461 		 * If we're beneath a zone cut, we must indicate that the
4462 		 * result is glue, unless we're actually at the zone cut
4463 		 * and the type is NSEC or KEY.
4464 		 */
4465 		if (search.zonecut == node) {
4466 			/*
4467 			 * It is not clear if KEY should still be
4468 			 * allowed at the parent side of the zone
4469 			 * cut or not.  It is needed for RFC3007
4470 			 * validated updates.
4471 			 */
4472 			if (type == dns_rdatatype_nsec ||
4473 			    type == dns_rdatatype_nsec3 ||
4474 			    type == dns_rdatatype_key)
4475 			{
4476 				result = ISC_R_SUCCESS;
4477 			} else if (type == dns_rdatatype_any) {
4478 				result = DNS_R_ZONECUT;
4479 			} else {
4480 				result = DNS_R_GLUE;
4481 			}
4482 		} else {
4483 			result = DNS_R_GLUE;
4484 		}
4485 		/*
4486 		 * We might have found data that isn't glue, but was occluded
4487 		 * by a dynamic update.  If the caller cares about this, they
4488 		 * will have told us to validate glue.
4489 		 *
4490 		 * XXX We should cache the glue validity state!
4491 		 */
4492 		if (result == DNS_R_GLUE &&
4493 		    (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 &&
4494 		    !valid_glue(&search, foundname, type, node))
4495 		{
4496 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4497 			result = setup_delegation(&search, nodep, foundname,
4498 						  rdataset, sigrdataset);
4499 			goto tree_exit;
4500 		}
4501 	} else {
4502 		/*
4503 		 * An ordinary successful query!
4504 		 */
4505 		result = ISC_R_SUCCESS;
4506 	}
4507 
4508 	if (nodep != NULL) {
		/*
		 * When this node was recorded as the zone cut in the loop
		 * above, a reference was already taken on it; that
		 * reference is handed to the caller through *nodep, so the
		 * cleanup after 'tree_exit' must not drop it.
		 */
4509 		if (!at_zonecut) {
4510 			new_reference(search.rbtdb, node, isc_rwlocktype_read);
4511 		} else {
4512 			search.need_cleanup = false;
4513 		}
4514 		*nodep = node;
4515 	}
4516 
4517 	if (type != dns_rdatatype_any) {
4518 		bind_rdataset(search.rbtdb, node, found, 0, isc_rwlocktype_read,
4519 			      rdataset);
4520 		if (foundsig != NULL) {
4521 			bind_rdataset(search.rbtdb, node, foundsig, 0,
4522 				      isc_rwlocktype_read, sigrdataset);
4523 		}
4524 	}
4525 
4526 	if (wild) {
4527 		foundname->attributes |= DNS_NAMEATTR_WILDCARD;
4528 	}
4529 
4530 node_exit:
4531 	NODE_UNLOCK(lock, isc_rwlocktype_read);
4532 
4533 tree_exit:
4534 	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4535 
4536 	/*
4537 	 * If we found a zonecut but aren't going to use it, we have to
4538 	 * let go of it.
4539 	 */
4540 	if (search.need_cleanup) {
4541 		node = search.zonecut;
4542 		INSIST(node != NULL);
4543 		lock = &(search.rbtdb->node_locks[node->locknum].lock);
4544 
4545 		NODE_LOCK(lock, isc_rwlocktype_read);
4546 		decrement_reference(search.rbtdb, node, 0, isc_rwlocktype_read,
4547 				    isc_rwlocktype_none, false);
4548 		NODE_UNLOCK(lock, isc_rwlocktype_read);
4549 	}
4550 
4551 	if (close_version) {
4552 		closeversion(db, &version, false);
4553 	}
4554 
4555 	dns_rbtnodechain_reset(&search.chain);
4556 
4557 	return (result);
4558 }
4559 
4560 static isc_result_t
4561 zone_findzonecut(dns_db_t *db, const dns_name_t *name, unsigned int options,
4562 		 isc_stdtime_t now, dns_dbnode_t **nodep, dns_name_t *foundname,
4563 		 dns_name_t *dcname, dns_rdataset_t *rdataset,
4564 		 dns_rdataset_t *sigrdataset) {
4565 	UNUSED(db);
4566 	UNUSED(name);
4567 	UNUSED(options);
4568 	UNUSED(now);
4569 	UNUSED(nodep);
4570 	UNUSED(foundname);
4571 	UNUSED(dcname);
4572 	UNUSED(rdataset);
4573 	UNUSED(sigrdataset);
4574 
4575 	FATAL_ERROR(__FILE__, __LINE__, "zone_findzonecut() called!");
4576 
4577 	UNREACHABLE();
4578 	return (ISC_R_NOTIMPLEMENTED);
4579 }
4580 
4581 static bool
4582 check_stale_header(dns_rbtnode_t *node, rdatasetheader_t *header,
4583 		   isc_rwlocktype_t *locktype, nodelock_t *lock,
4584 		   rbtdb_search_t *search, rdatasetheader_t **header_prev) {
4585 	if (!ACTIVE(header, search->now)) {
4586 		dns_ttl_t stale = header->rdh_ttl +
4587 				  search->rbtdb->serve_stale_ttl;
4588 		/*
4589 		 * If this data is in the stale window keep it and if
4590 		 * DNS_DBFIND_STALEOK is not set we tell the caller to
4591 		 * skip this record.  We skip the records with ZEROTTL
4592 		 * (these records should not be cached anyway).
4593 		 */
4594 
4595 		RDATASET_ATTR_CLR(header, RDATASET_ATTR_STALE_WINDOW);
4596 		if (!ZEROTTL(header) && KEEPSTALE(search->rbtdb) &&
4597 		    stale > search->now)
4598 		{
4599 			mark_header_stale(search->rbtdb, header);
4600 			*header_prev = header;
4601 			/*
4602 			 * If DNS_DBFIND_STALESTART is set then it means we
4603 			 * failed to resolve the name during recursion, in
4604 			 * this case we mark the time in which the refresh
4605 			 * failed.
4606 			 */
4607 			if ((search->options & DNS_DBFIND_STALESTART) != 0) {
4608 				atomic_store_release(
4609 					&header->last_refresh_fail_ts,
4610 					search->now);
4611 			} else if ((search->options &
4612 				    DNS_DBFIND_STALEENABLED) != 0 &&
4613 				   search->now <
4614 					   (atomic_load_acquire(
4615 						    &header->last_refresh_fail_ts) +
4616 					    search->rbtdb->serve_stale_refresh))
4617 			{
4618 				/*
4619 				 * If we are within interval between last
4620 				 * refresh failure time + 'stale-refresh-time',
4621 				 * then don't skip this stale entry but use it
4622 				 * instead.
4623 				 */
4624 				RDATASET_ATTR_SET(header,
4625 						  RDATASET_ATTR_STALE_WINDOW);
4626 				return (false);
4627 			} else if ((search->options &
4628 				    DNS_DBFIND_STALETIMEOUT) != 0)
4629 			{
4630 				/*
4631 				 * We want stale RRset due to timeout, so we
4632 				 * don't skip it.
4633 				 */
4634 				return (false);
4635 			}
4636 			return ((search->options & DNS_DBFIND_STALEOK) == 0);
4637 		}
4638 
4639 		/*
4640 		 * This rdataset is stale.  If no one else is using the
4641 		 * node, we can clean it up right now, otherwise we mark
4642 		 * it as ancient, and the node as dirty, so it will get
4643 		 * cleaned up later.
4644 		 */
4645 		if ((header->rdh_ttl < search->now - RBTDB_VIRTUAL) &&
4646 		    (*locktype == isc_rwlocktype_write ||
4647 		     NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS))
4648 		{
4649 			/*
4650 			 * We update the node's status only when we can
4651 			 * get write access; otherwise, we leave others
4652 			 * to this work.  Periodical cleaning will
4653 			 * eventually take the job as the last resort.
4654 			 * We won't downgrade the lock, since other
4655 			 * rdatasets are probably stale, too.
4656 			 */
4657 			*locktype = isc_rwlocktype_write;
4658 
4659 			if (isc_refcount_current(&node->references) == 0) {
4660 				isc_mem_t *mctx;
4661 
4662 				/*
4663 				 * header->down can be non-NULL if the
4664 				 * refcount has just decremented to 0
4665 				 * but decrement_reference() has not
4666 				 * performed clean_cache_node(), in
4667 				 * which case we need to purge the stale
4668 				 * headers first.
4669 				 */
4670 				mctx = search->rbtdb->common.mctx;
4671 				clean_stale_headers(search->rbtdb, mctx,
4672 						    header);
4673 				if (*header_prev != NULL) {
4674 					(*header_prev)->next = header->next;
4675 				} else {
4676 					node->data = header->next;
4677 				}
4678 				free_rdataset(search->rbtdb, mctx, header);
4679 			} else {
4680 				mark_header_ancient(search->rbtdb, header);
4681 				*header_prev = header;
4682 			}
4683 		} else {
4684 			*header_prev = header;
4685 		}
4686 		return (true);
4687 	}
4688 	return (false);
4689 }
4690 
4691 static isc_result_t
4692 cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
4693 	rbtdb_search_t *search = arg;
4694 	rdatasetheader_t *header, *header_prev, *header_next;
4695 	rdatasetheader_t *dname_header, *sigdname_header;
4696 	isc_result_t result;
4697 	nodelock_t *lock;
4698 	isc_rwlocktype_t locktype;
4699 
4700 	/* XXX comment */
4701 
4702 	REQUIRE(search->zonecut == NULL);
4703 
4704 	/*
4705 	 * Keep compiler silent.
4706 	 */
4707 	UNUSED(name);
4708 
4709 	lock = &(search->rbtdb->node_locks[node->locknum].lock);
4710 	locktype = isc_rwlocktype_read;
4711 	NODE_LOCK(lock, locktype);
4712 
4713 	/*
4714 	 * Look for a DNAME or RRSIG DNAME rdataset.
4715 	 */
4716 	dname_header = NULL;
4717 	sigdname_header = NULL;
4718 	header_prev = NULL;
4719 	for (header = node->data; header != NULL; header = header_next) {
4720 		header_next = header->next;
4721 		if (check_stale_header(node, header, &locktype, lock, search,
4722 				       &header_prev))
4723 		{
4724 			/* Do nothing. */
4725 		} else if (header->type == dns_rdatatype_dname &&
4726 			   EXISTS(header) && !ANCIENT(header))
4727 		{
4728 			dname_header = header;
4729 			header_prev = header;
4730 		} else if (header->type == RBTDB_RDATATYPE_SIGDNAME &&
4731 			   EXISTS(header) && !ANCIENT(header))
4732 		{
4733 			sigdname_header = header;
4734 			header_prev = header;
4735 		} else {
4736 			header_prev = header;
4737 		}
4738 	}
4739 
4740 	if (dname_header != NULL &&
4741 	    (!DNS_TRUST_PENDING(dname_header->trust) ||
4742 	     (search->options & DNS_DBFIND_PENDINGOK) != 0))
4743 	{
4744 		/*
4745 		 * We increment the reference count on node to ensure that
4746 		 * search->zonecut_rdataset will still be valid later.
4747 		 */
4748 		new_reference(search->rbtdb, node, locktype);
4749 		search->zonecut = node;
4750 		search->zonecut_rdataset = dname_header;
4751 		search->zonecut_sigrdataset = sigdname_header;
4752 		search->need_cleanup = true;
4753 		result = DNS_R_PARTIALMATCH;
4754 	} else {
4755 		result = DNS_R_CONTINUE;
4756 	}
4757 
4758 	NODE_UNLOCK(lock, locktype);
4759 
4760 	return (result);
4761 }
4762 
/*
 * Find the closest enclosing NS rdataset in the cache, starting at
 * 'node' and then moving up through search->chain.levels[] from index
 * search->chain.level_matches - 1 down to 0.
 *
 * At each node the headers are scanned for NS and RRSIG NS rdatasets
 * that exist, are not ancient, and are not rejected by
 * check_stale_header().  The first node with an NS rdataset ends the
 * walk:
 *   - 'foundname' (if non-NULL) is set to the node's name concatenated
 *     with the names of the remaining chain levels;
 *   - '*nodep' (if 'nodep' is non-NULL) receives a new reference to
 *     the node;
 *   - 'rdataset' / 'sigrdataset' are bound to the NS / RRSIG NS
 *     headers;
 *   - the headers are passed to update_header() when
 *     need_headerupdate() says so.
 *
 * Returns DNS_R_DELEGATION on success, ISC_R_NOTFOUND when no level
 * has an NS rdataset, or the dns_name_concatenate() failure code (in
 * which case '*nodep' is set to NULL when 'nodep' is non-NULL).
 */
4763 static isc_result_t
4764 find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node,
4765 		     dns_dbnode_t **nodep, dns_name_t *foundname,
4766 		     dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
4767 	unsigned int i;
4768 	dns_rbtnode_t *level_node;
4769 	rdatasetheader_t *header, *header_prev, *header_next;
4770 	rdatasetheader_t *found, *foundsig;
4771 	isc_result_t result = ISC_R_NOTFOUND;
4772 	dns_name_t name;
4773 	dns_rbtdb_t *rbtdb;
4774 	bool done;
4775 	nodelock_t *lock;
4776 	isc_rwlocktype_t locktype;
4777 
4778 	/*
4779 	 * Caller must be holding the tree lock.
4780 	 */
4781 
4782 	rbtdb = search->rbtdb;
4783 	i = search->chain.level_matches;
4784 	done = false;
4785 	do {
4786 		locktype = isc_rwlocktype_read;
4787 		lock = &rbtdb->node_locks[node->locknum].lock;
4788 		NODE_LOCK(lock, locktype);
4789 
4790 		/*
4791 		 * Look for NS and RRSIG NS rdatasets.
4792 		 */
4793 		found = NULL;
4794 		foundsig = NULL;
4795 		header_prev = NULL;
4796 		for (header = node->data; header != NULL; header = header_next)
4797 		{
4798 			header_next = header->next;
4799 			if (check_stale_header(node, header, &locktype, lock,
4800 					       search, &header_prev))
4801 			{
4802 				/* Do nothing. */
4803 			} else if (EXISTS(header) && !ANCIENT(header)) {
4804 				/*
4805 				 * We've found an extant rdataset.  See if
4806 				 * we're interested in it.
4807 				 */
4808 				if (header->type == dns_rdatatype_ns) {
4809 					found = header;
4810 					if (foundsig != NULL) {
4811 						break;
4812 					}
4813 				} else if (header->type ==
4814 					   RBTDB_RDATATYPE_SIGNS)
4815 				{
4816 					foundsig = header;
4817 					if (found != NULL) {
4818 						break;
4819 					}
4820 				}
4821 				header_prev = header;
4822 			} else {
4823 				header_prev = header;
4824 			}
4825 		}
4826 
4827 		if (found != NULL) {
4828 			/*
4829 			 * If we have to set foundname, we do it before
4830 			 * anything else.  If we were to set foundname after
4831 			 * we had set nodep or bound the rdataset, then we'd
4832 			 * have to undo that work if dns_name_concatenate()
4833 			 * failed.  By setting foundname first, there's
4834 			 * nothing to undo if we have trouble.
4835 			 */
4836 			if (foundname != NULL) {
4837 				dns_name_init(&name, NULL);
4838 				dns_rbt_namefromnode(node, &name);
4839 				dns_name_copynf(&name, foundname);
				/*
				 * Append the names of the remaining chain
				 * levels.  This consumes 'i', which is
				 * fine because 'found' is set and the
				 * outer loop ends after this pass.
				 */
4840 				while (i > 0) {
4841 					i--;
4842 					level_node = search->chain.levels[i];
4843 					dns_name_init(&name, NULL);
4844 					dns_rbt_namefromnode(level_node, &name);
4845 					result = dns_name_concatenate(
4846 						foundname, &name, foundname,
4847 						NULL);
4848 					if (result != ISC_R_SUCCESS) {
4849 						if (nodep != NULL) {
4850 							*nodep = NULL;
4851 						}
4852 						goto node_exit;
4853 					}
4854 				}
4855 			}
4856 			result = DNS_R_DELEGATION;
4857 			if (nodep != NULL) {
4858 				new_reference(search->rbtdb, node, locktype);
4859 				*nodep = node;
4860 			}
4861 			bind_rdataset(search->rbtdb, node, found, search->now,
4862 				      locktype, rdataset);
4863 			if (foundsig != NULL) {
4864 				bind_rdataset(search->rbtdb, node, foundsig,
4865 					      search->now, locktype,
4866 					      sigrdataset);
4867 			}
4868 			if (need_headerupdate(found, search->now) ||
4869 			    (foundsig != NULL &&
4870 			     need_headerupdate(foundsig, search->now)))
4871 			{
				/*
				 * update_header() is only called with the
				 * node write lock held.  The read lock is
				 * dropped and the lock re-acquired for
				 * writing, and need_headerupdate() is
				 * checked again afterwards.
				 */
4872 				if (locktype != isc_rwlocktype_write) {
4873 					NODE_UNLOCK(lock, locktype);
4874 					NODE_LOCK(lock, isc_rwlocktype_write);
4875 					locktype = isc_rwlocktype_write;
4876 					POST(locktype);
4877 				}
4878 				if (need_headerupdate(found, search->now)) {
4879 					update_header(search->rbtdb, found,
4880 						      search->now);
4881 				}
4882 				if (foundsig != NULL &&
4883 				    need_headerupdate(foundsig, search->now))
4884 				{
4885 					update_header(search->rbtdb, foundsig,
4886 						      search->now);
4887 				}
4888 			}
4889 		}
4890 
4891 	node_exit:
4892 		NODE_UNLOCK(lock, locktype);
4893 
		/*
		 * No NS here: move up one level if any remain, otherwise
		 * stop.
		 */
4894 		if (found == NULL && i > 0) {
4895 			i--;
4896 			node = search->chain.levels[i];
4897 		} else {
4898 			done = true;
4899 		}
4900 	} while (!done);
4901 
4902 	return (result);
4903 }
4904 
4905 static isc_result_t
4906 find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
4907 		  isc_stdtime_t now, dns_name_t *foundname,
4908 		  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
4909 	dns_rbtnode_t *node;
4910 	rdatasetheader_t *header, *header_next, *header_prev;
4911 	rdatasetheader_t *found, *foundsig;
4912 	bool empty_node;
4913 	isc_result_t result;
4914 	dns_fixedname_t fname, forigin;
4915 	dns_name_t *name, *origin;
4916 	rbtdb_rdatatype_t matchtype, sigmatchtype;
4917 	nodelock_t *lock;
4918 	isc_rwlocktype_t locktype;
4919 	dns_rbtnodechain_t chain;
4920 
4921 	chain = search->chain;
4922 
4923 	matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0);
4924 	sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig,
4925 					     dns_rdatatype_nsec);
4926 
4927 	do {
4928 		node = NULL;
4929 		name = dns_fixedname_initname(&fname);
4930 		origin = dns_fixedname_initname(&forigin);
4931 		result = dns_rbtnodechain_current(&chain, name, origin, &node);
4932 		if (result != ISC_R_SUCCESS) {
4933 			return (result);
4934 		}
4935 		locktype = isc_rwlocktype_read;
4936 		lock = &(search->rbtdb->node_locks[node->locknum].lock);
4937 		NODE_LOCK(lock, locktype);
4938 		found = NULL;
4939 		foundsig = NULL;
4940 		empty_node = true;
4941 		header_prev = NULL;
4942 		for (header = node->data; header != NULL; header = header_next)
4943 		{
4944 			header_next = header->next;
4945 			if (check_stale_header(node, header, &locktype, lock,
4946 					       search, &header_prev))
4947 			{
4948 				continue;
4949 			}
4950 			if (NONEXISTENT(header) ||
4951 			    RBTDB_RDATATYPE_BASE(header->type) == 0)
4952 			{
4953 				header_prev = header;
4954 				continue;
4955 			}
4956 			/*
4957 			 * Don't stop on provable noqname / RRSIG.
4958 			 */
4959 			if (header->noqname == NULL &&
4960 			    RBTDB_RDATATYPE_BASE(header->type) !=
4961 				    dns_rdatatype_rrsig)
4962 			{
4963 				empty_node = false;
4964 			}
4965 			if (header->type == matchtype) {
4966 				found = header;
4967 			} else if (header->type == sigmatchtype) {
4968 				foundsig = header;
4969 			}
4970 			header_prev = header;
4971 		}
4972 		if (found != NULL) {
4973 			result = dns_name_concatenate(name, origin, foundname,
4974 						      NULL);
4975 			if (result != ISC_R_SUCCESS) {
4976 				goto unlock_node;
4977 			}
4978 			bind_rdataset(search->rbtdb, node, found, now, locktype,
4979 				      rdataset);
4980 			if (foundsig != NULL) {
4981 				bind_rdataset(search->rbtdb, node, foundsig,
4982 					      now, locktype, sigrdataset);
4983 			}
4984 			new_reference(search->rbtdb, node, locktype);
4985 			*nodep = node;
4986 			result = DNS_R_COVERINGNSEC;
4987 		} else if (!empty_node) {
4988 			result = ISC_R_NOTFOUND;
4989 		} else {
4990 			result = dns_rbtnodechain_prev(&chain, NULL, NULL);
4991 		}
4992 	unlock_node:
4993 		NODE_UNLOCK(lock, locktype);
4994 	} while (empty_node && result == ISC_R_SUCCESS);
4995 	return (result);
4996 }
4997 
/*
 * cache_find() -- look up 'name' and 'type' in a cache database.
 *
 * 'version' must be NULL.  If 'now' is 0 the current time is obtained
 * with isc_stdtime_get().  The tree lock is held for reading for the
 * whole search and is released at 'tree_exit'.
 *
 * Result codes assigned directly in this function:
 *	DNS_R_COVERINGNSEC	an NSEC rdataset was bound (only when
 *				DNS_DBFIND_COVERINGNSEC is set in 'options').
 *	DNS_R_DELEGATION	the requested type was not found (or its
 *				trust level was not acceptable for
 *				'options') but the node holds an NS
 *				rdataset, which is bound instead.
 *	DNS_R_NCACHENXDOMAIN,
 *	DNS_R_NCACHENXRRSET	a negative cache entry was found.
 *	DNS_R_CNAME		a CNAME was found instead of 'type'.
 *	ISC_R_SUCCESS		the requested type was found.
 * Otherwise the result of dns_rbt_findnode(), setup_delegation() or
 * find_deepest_zonecut() is returned unchanged.
 *
 * The final result is also passed to update_cachestats().
 */
4998 static isc_result_t
4999 cache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
5000 	   dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
5001 	   dns_dbnode_t **nodep, dns_name_t *foundname,
5002 	   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
5003 	dns_rbtnode_t *node = NULL;
5004 	isc_result_t result;
5005 	rbtdb_search_t search;
5006 	bool cname_ok = true;
5007 	bool empty_node;
5008 	nodelock_t *lock;
5009 	isc_rwlocktype_t locktype;
5010 	rdatasetheader_t *header, *header_prev, *header_next;
5011 	rdatasetheader_t *found, *nsheader;
5012 	rdatasetheader_t *foundsig, *nssig, *cnamesig;
5013 	rdatasetheader_t *update, *updatesig;
5014 	rdatasetheader_t *nsecheader, *nsecsig;
5015 	rbtdb_rdatatype_t sigtype, negtype;
5016 
5017 	UNUSED(version);
5018 
5019 	search.rbtdb = (dns_rbtdb_t *)db;
5020 
5021 	REQUIRE(VALID_RBTDB(search.rbtdb));
5022 	REQUIRE(version == NULL);
5023 
5024 	if (now == 0) {
5025 		isc_stdtime_get(&now);
5026 	}
5027 
5028 	search.rbtversion = NULL;
5029 	search.serial = 1;
5030 	search.options = options;
5031 	search.copy_name = false;
5032 	search.need_cleanup = false;
5033 	search.wild = false;
5034 	search.zonecut = NULL;
5035 	dns_fixedname_init(&search.zonecut_name);
5036 	dns_rbtnodechain_init(&search.chain);
5037 	search.now = now;
5038 	update = NULL;
5039 	updatesig = NULL;
5040 
5041 	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5042 
5043 	/*
5044 	 * Search down from the root of the tree.  If, while going down, we
5045 	 * encounter a callback node, cache_zonecut_callback() will search the
5046 	 * rdatasets at the zone cut for a DNAME rdataset.
5047 	 */
5048 	result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
5049 				  &search.chain, DNS_RBTFIND_EMPTYDATA,
5050 				  cache_zonecut_callback, &search);
5051 
5052 	if (result == DNS_R_PARTIALMATCH) {
5053 		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0) {
5054 			result = find_coveringnsec(&search, nodep, now,
5055 						   foundname, rdataset,
5056 						   sigrdataset);
5057 			if (result == DNS_R_COVERINGNSEC) {
5058 				goto tree_exit;
5059 			}
5060 		}
5061 		if (search.zonecut != NULL) {
5062 			result = setup_delegation(&search, nodep, foundname,
5063 						  rdataset, sigrdataset);
5064 			goto tree_exit;
5065 		} else {
			/*
			 * This label is also the target of the
			 * 'goto find_ns' statements further down, which
			 * are taken after the node lock has been released
			 * when the exact-match node is empty or holds
			 * nothing usable.
			 */
5066 		find_ns:
5067 			result = find_deepest_zonecut(&search, node, nodep,
5068 						      foundname, rdataset,
5069 						      sigrdataset);
5070 			goto tree_exit;
5071 		}
5072 	} else if (result != ISC_R_SUCCESS) {
5073 		goto tree_exit;
5074 	}
5075 
5076 	/*
5077 	 * Certain DNSSEC types are not subject to CNAME matching
5078 	 * (RFC4035, section 2.5 and RFC3007).
5079 	 *
5080 	 * We don't check for RRSIG, because we don't store RRSIG records
5081 	 * directly.
5082 	 */
5083 	if (type == dns_rdatatype_key || type == dns_rdatatype_nsec) {
5084 		cname_ok = false;
5085 	}
5086 
5087 	/*
5088 	 * We now go looking for rdata...
5089 	 */
5090 
5091 	lock = &(search.rbtdb->node_locks[node->locknum].lock);
5092 	locktype = isc_rwlocktype_read;
5093 	NODE_LOCK(lock, locktype);
5094 
5095 	found = NULL;
5096 	foundsig = NULL;
5097 	sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
5098 	negtype = RBTDB_RDATATYPE_VALUE(0, type);
5099 	nsheader = NULL;
5100 	nsecheader = NULL;
5101 	nssig = NULL;
5102 	nsecsig = NULL;
5103 	cnamesig = NULL;
5104 	empty_node = true;
5105 	header_prev = NULL;
	/*
	 * 'header_next' is saved before check_stale_header() is called;
	 * that helper receives 'locktype' and 'header_prev' by address.
	 */
5106 	for (header = node->data; header != NULL; header = header_next) {
5107 		header_next = header->next;
5108 		if (check_stale_header(node, header, &locktype, lock, &search,
5109 				       &header_prev))
5110 		{
5111 			/* Do nothing. */
5112 		} else if (EXISTS(header) && !ANCIENT(header)) {
5113 			/*
5114 			 * We now know that there is at least one active
5115 			 * non-stale rdataset at this node.
5116 			 */
5117 			empty_node = false;
5118 
5119 			/*
5120 			 * If we found a type we were looking for, remember
5121 			 * it.
5122 			 */
5123 			if (header->type == type ||
5124 			    (type == dns_rdatatype_any &&
5125 			     RBTDB_RDATATYPE_BASE(header->type) != 0) ||
5126 			    (cname_ok && header->type == dns_rdatatype_cname))
5127 			{
5128 				/*
5129 				 * We've found the answer.
5130 				 */
5131 				found = header;
5132 				if (header->type == dns_rdatatype_cname &&
5133 				    cname_ok && cnamesig != NULL)
5134 				{
5135 					/*
5136 					 * If we've already got the
5137 					 * CNAME RRSIG, use it.
5138 					 */
5139 					foundsig = cnamesig;
5140 				}
5141 			} else if (header->type == sigtype) {
5142 				/*
5143 				 * We've found the RRSIG rdataset for our
5144 				 * target type.  Remember it.
5145 				 */
5146 				foundsig = header;
5147 			} else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
5148 				   header->type == negtype)
5149 			{
5150 				/*
5151 				 * We've found a negative cache entry.
5152 				 */
5153 				found = header;
5154 			} else if (header->type == dns_rdatatype_ns) {
5155 				/*
5156 				 * Remember a NS rdataset even if we're
5157 				 * not specifically looking for it, because
5158 				 * we might need it later.
5159 				 */
5160 				nsheader = header;
5161 			} else if (header->type == RBTDB_RDATATYPE_SIGNS) {
5162 				/*
5163 				 * If we need the NS rdataset, we'll also
5164 				 * need its signature.
5165 				 */
5166 				nssig = header;
5167 			} else if (header->type == dns_rdatatype_nsec) {
5168 				nsecheader = header;
5169 			} else if (header->type == RBTDB_RDATATYPE_SIGNSEC) {
5170 				nsecsig = header;
5171 			} else if (cname_ok &&
5172 				   header->type == RBTDB_RDATATYPE_SIGCNAME)
5173 			{
5174 				/*
5175 				 * If we get a CNAME match, we'll also need
5176 				 * its signature.
5177 				 */
5178 				cnamesig = header;
5179 			}
5180 			header_prev = header;
5181 		} else {
5182 			header_prev = header;
5183 		}
5184 	}
5185 
5186 	if (empty_node) {
5187 		/*
5188 		 * We have an exact match for the name, but there are no
5189 		 * extant rdatasets.  That means that this node doesn't
5190 		 * meaningfully exist, and that we really have a partial match.
5191 		 */
5192 		NODE_UNLOCK(lock, locktype);
5193 		goto find_ns;
5194 	}
5195 
5196 	/*
5197 	 * If we didn't find what we were looking for...
5198 	 */
	/*
	 * ...which includes the case where something was found but its
	 * trust level is additional, glue or pending and the matching
	 * DNS_DBFIND_*OK flag is not set in 'options'.
	 */
5199 	if (found == NULL ||
5200 	    (DNS_TRUST_ADDITIONAL(found->trust) &&
5201 	     ((options & DNS_DBFIND_ADDITIONALOK) == 0)) ||
5202 	    (found->trust == dns_trust_glue &&
5203 	     ((options & DNS_DBFIND_GLUEOK) == 0)) ||
5204 	    (DNS_TRUST_PENDING(found->trust) &&
5205 	     ((options & DNS_DBFIND_PENDINGOK) == 0)))
5206 	{
5207 		/*
5208 		 * Return covering NODATA NSEC record.
5209 		 */
5210 		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0 &&
5211 		    nsecheader != NULL)
5212 		{
5213 			if (nodep != NULL) {
5214 				new_reference(search.rbtdb, node, locktype);
5215 				*nodep = node;
5216 			}
5217 			bind_rdataset(search.rbtdb, node, nsecheader,
5218 				      search.now, locktype, rdataset);
5219 			if (need_headerupdate(nsecheader, search.now)) {
5220 				update = nsecheader;
5221 			}
5222 			if (nsecsig != NULL) {
5223 				bind_rdataset(search.rbtdb, node, nsecsig,
5224 					      search.now, locktype,
5225 					      sigrdataset);
5226 				if (need_headerupdate(nsecsig, search.now)) {
5227 					updatesig = nsecsig;
5228 				}
5229 			}
5230 			result = DNS_R_COVERINGNSEC;
5231 			goto node_exit;
5232 		}
5233 
5234 		/*
5235 		 * If there is an NS rdataset at this node, then this is the
5236 		 * deepest zone cut.
5237 		 */
5238 		if (nsheader != NULL) {
5239 			if (nodep != NULL) {
5240 				new_reference(search.rbtdb, node, locktype);
5241 				*nodep = node;
5242 			}
5243 			bind_rdataset(search.rbtdb, node, nsheader, search.now,
5244 				      locktype, rdataset);
5245 			if (need_headerupdate(nsheader, search.now)) {
5246 				update = nsheader;
5247 			}
5248 			if (nssig != NULL) {
5249 				bind_rdataset(search.rbtdb, node, nssig,
5250 					      search.now, locktype,
5251 					      sigrdataset);
5252 				if (need_headerupdate(nssig, search.now)) {
5253 					updatesig = nssig;
5254 				}
5255 			}
5256 			result = DNS_R_DELEGATION;
5257 			goto node_exit;
5258 		}
5259 
5260 		/*
5261 		 * Go find the deepest zone cut.
5262 		 */
5263 		NODE_UNLOCK(lock, locktype);
5264 		goto find_ns;
5265 	}
5266 
5267 	/*
5268 	 * We found what we were looking for, or we found a CNAME.
5269 	 */
5270 
5271 	if (nodep != NULL) {
5272 		new_reference(search.rbtdb, node, locktype);
5273 		*nodep = node;
5274 	}
5275 
5276 	if (NEGATIVE(found)) {
5277 		/*
5278 		 * We found a negative cache entry.
5279 		 */
5280 		if (NXDOMAIN(found)) {
5281 			result = DNS_R_NCACHENXDOMAIN;
5282 		} else {
5283 			result = DNS_R_NCACHENXRRSET;
5284 		}
5285 	} else if (type != found->type && type != dns_rdatatype_any &&
5286 		   found->type == dns_rdatatype_cname)
5287 	{
5288 		/*
5289 		 * We weren't doing an ANY query and we found a CNAME instead
5290 		 * of the type we were looking for, so we need to indicate
5291 		 * that result to the caller.
5292 		 */
5293 		result = DNS_R_CNAME;
5294 	} else {
5295 		/*
5296 		 * An ordinary successful query!
5297 		 */
5298 		result = ISC_R_SUCCESS;
5299 	}
5300 
	/*
	 * For an ANY query that did not hit a negative cache entry no
	 * rdataset is bound here; the caller only gets the node through
	 * 'nodep'.
	 */
5301 	if (type != dns_rdatatype_any || result == DNS_R_NCACHENXDOMAIN ||
5302 	    result == DNS_R_NCACHENXRRSET)
5303 	{
5304 		bind_rdataset(search.rbtdb, node, found, search.now, locktype,
5305 			      rdataset);
5306 		if (need_headerupdate(found, search.now)) {
5307 			update = found;
5308 		}
5309 		if (!NEGATIVE(found) && foundsig != NULL) {
5310 			bind_rdataset(search.rbtdb, node, foundsig, search.now,
5311 				      locktype, sigrdataset);
5312 			if (need_headerupdate(foundsig, search.now)) {
5313 				updatesig = foundsig;
5314 			}
5315 		}
5316 	}
5317 
5318 node_exit:
	/*
	 * update_header() is only called with the node lock held for
	 * writing.  If we still hold it for reading, it is released and
	 * re-acquired as a write lock, and need_headerupdate() is
	 * evaluated again below before each update.
	 */
5319 	if ((update != NULL || updatesig != NULL) &&
5320 	    locktype != isc_rwlocktype_write)
5321 	{
5322 		NODE_UNLOCK(lock, locktype);
5323 		NODE_LOCK(lock, isc_rwlocktype_write);
5324 		locktype = isc_rwlocktype_write;
5325 		POST(locktype);
5326 	}
5327 	if (update != NULL && need_headerupdate(update, search.now)) {
5328 		update_header(search.rbtdb, update, search.now);
5329 	}
5330 	if (updatesig != NULL && need_headerupdate(updatesig, search.now)) {
5331 		update_header(search.rbtdb, updatesig, search.now);
5332 	}
5333 
5334 	NODE_UNLOCK(lock, locktype);
5335 
5336 tree_exit:
5337 	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5338 
5339 	/*
5340 	 * If we found a zonecut but aren't going to use it, we have to
5341 	 * let go of it.
5342 	 */
5343 	if (search.need_cleanup) {
5344 		node = search.zonecut;
5345 		INSIST(node != NULL);
5346 		lock = &(search.rbtdb->node_locks[node->locknum].lock);
5347 
5348 		NODE_LOCK(lock, isc_rwlocktype_read);
5349 		decrement_reference(search.rbtdb, node, 0, isc_rwlocktype_read,
5350 				    isc_rwlocktype_none, false);
5351 		NODE_UNLOCK(lock, isc_rwlocktype_read);
5352 	}
5353 
5354 	dns_rbtnodechain_reset(&search.chain);
5355 
5356 	update_cachestats(search.rbtdb, result);
5357 	return (result);
5358 }
5359 
/*
 * cache_findzonecut() -- find an NS rdataset (zone cut) for 'name' in a
 * cache database.
 *
 * If 'now' is 0 the current time is obtained with isc_stdtime_get().
 * If 'dcname' is NULL, 'foundname' is passed to dns_rbt_findnode() in
 * its place.  DNS_DBFIND_NOEXACT in 'options' is translated to
 * DNS_RBTFIND_NOEXACT for the tree search.
 *
 * When the node returned by the tree search holds an NS rdataset that
 * passes check_stale_header(), it is bound to 'rdataset' (and its RRSIG,
 * if present, to 'sigrdataset').  On a partial match, a stale header, or
 * a node without NS, the answer comes from find_deepest_zonecut().
 *
 * A DNS_R_DELEGATION result is converted to ISC_R_SUCCESS before
 * returning; any other result is returned unchanged.
 */
5360 static isc_result_t
5361 cache_findzonecut(dns_db_t *db, const dns_name_t *name, unsigned int options,
5362 		  isc_stdtime_t now, dns_dbnode_t **nodep,
5363 		  dns_name_t *foundname, dns_name_t *dcname,
5364 		  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
5365 	dns_rbtnode_t *node = NULL;
5366 	nodelock_t *lock;
5367 	isc_result_t result;
5368 	rbtdb_search_t search;
5369 	rdatasetheader_t *header, *header_prev, *header_next;
5370 	rdatasetheader_t *found, *foundsig;
5371 	unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA;
5372 	isc_rwlocktype_t locktype;
5373 	bool dcnull = (dcname == NULL);
5374 
5375 	search.rbtdb = (dns_rbtdb_t *)db;
5376 
5377 	REQUIRE(VALID_RBTDB(search.rbtdb));
5378 
5379 	if (now == 0) {
5380 		isc_stdtime_get(&now);
5381 	}
5382 
5383 	search.rbtversion = NULL;
5384 	search.serial = 1;
5385 	search.options = options;
5386 	search.copy_name = false;
5387 	search.need_cleanup = false;
5388 	search.wild = false;
5389 	search.zonecut = NULL;
5390 	dns_fixedname_init(&search.zonecut_name);
5391 	dns_rbtnodechain_init(&search.chain);
5392 	search.now = now;
5393 
	/*
	 * With no separate 'dcname' supplied, 'dcname' aliases
	 * 'foundname' for the rest of the function.
	 */
5394 	if (dcnull) {
5395 		dcname = foundname;
5396 	}
5397 
5398 	if ((options & DNS_DBFIND_NOEXACT) != 0) {
5399 		rbtoptions |= DNS_RBTFIND_NOEXACT;
5400 	}
5401 
5402 	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5403 
5404 	/*
5405 	 * Search down from the root of the tree.
5406 	 */
5407 	result = dns_rbt_findnode(search.rbtdb->tree, name, dcname, &node,
5408 				  &search.chain, rbtoptions, NULL, &search);
5409 
5410 	if (result == DNS_R_PARTIALMATCH) {
5411 		result = find_deepest_zonecut(&search, node, nodep, foundname,
5412 					      rdataset, sigrdataset);
5413 		goto tree_exit;
5414 	} else if (result != ISC_R_SUCCESS) {
5415 		goto tree_exit;
5416 	} else if (!dcnull) {
		/*
		 * Exact match and the caller supplied its own 'dcname':
		 * copy the name found by the tree search into 'foundname'.
		 */
5417 		dns_name_copynf(dcname, foundname);
5418 	}
5419 
5420 	/*
5421 	 * We now go looking for an NS rdataset at the node.
5422 	 */
5423 
5424 	lock = &(search.rbtdb->node_locks[node->locknum].lock);
5425 	locktype = isc_rwlocktype_read;
5426 	NODE_LOCK(lock, locktype);
5427 
5428 	found = NULL;
5429 	foundsig = NULL;
5430 	header_prev = NULL;
5431 	for (header = node->data; header != NULL; header = header_next) {
5432 		header_next = header->next;
5433 		if (check_stale_header(node, header, &locktype, lock, &search,
5434 				       &header_prev))
5435 		{
5436 			/*
5437 			 * The function dns_rbt_findnode found us a matching
5438 			 * node for 'name' and stored the result in 'dcname'.
5439 			 * This is the deepest known zonecut in our database.
5440 			 * However, this node may be stale and if serve-stale
5441 			 * is not enabled (in other words 'stale-answer-enable'
5442 			 * is set to no), this node may not be used as a
5443 			 * zonecut we know about. If so, find the deepest
5444 			 * zonecut from this node up and return that instead.
5445 			 */
5446 			NODE_UNLOCK(lock, locktype);
5447 			result = find_deepest_zonecut(&search, node, nodep,
5448 						      foundname, rdataset,
5449 						      sigrdataset);
5450 			dns_name_copynf(foundname, dcname);
5451 			goto tree_exit;
5452 		} else if (EXISTS(header) && !ANCIENT(header)) {
5453 			/*
5454 			 * If we found a type we were looking for, remember
5455 			 * it.
5456 			 */
5457 			if (header->type == dns_rdatatype_ns) {
5458 				/*
5459 				 * Remember a NS rdataset even if we're
5460 				 * not specifically looking for it, because
5461 				 * we might need it later.
5462 				 */
5463 				found = header;
5464 			} else if (header->type == RBTDB_RDATATYPE_SIGNS) {
5465 				/*
5466 				 * If we need the NS rdataset, we'll also
5467 				 * need its signature.
5468 				 */
5469 				foundsig = header;
5470 			}
5471 			header_prev = header;
5472 		} else {
5473 			header_prev = header;
5474 		}
5475 	}
5476 
5477 	if (found == NULL) {
5478 		/*
5479 		 * No NS records here.
5480 		 */
5481 		NODE_UNLOCK(lock, locktype);
5482 		result = find_deepest_zonecut(&search, node, nodep, foundname,
5483 					      rdataset, sigrdataset);
5484 		goto tree_exit;
5485 	}
5486 
5487 	if (nodep != NULL) {
5488 		new_reference(search.rbtdb, node, locktype);
5489 		*nodep = node;
5490 	}
5491 
5492 	bind_rdataset(search.rbtdb, node, found, search.now, locktype,
5493 		      rdataset);
5494 	if (foundsig != NULL) {
5495 		bind_rdataset(search.rbtdb, node, foundsig, search.now,
5496 			      locktype, sigrdataset);
5497 	}
5498 
	/*
	 * update_header() is only called with the node lock held for
	 * writing.  If the lock has to be dropped and re-acquired,
	 * need_headerupdate() is evaluated again before each update.
	 */
5499 	if (need_headerupdate(found, search.now) ||
5500 	    (foundsig != NULL && need_headerupdate(foundsig, search.now)))
5501 	{
5502 		if (locktype != isc_rwlocktype_write) {
5503 			NODE_UNLOCK(lock, locktype);
5504 			NODE_LOCK(lock, isc_rwlocktype_write);
5505 			locktype = isc_rwlocktype_write;
5506 			POST(locktype);
5507 		}
5508 		if (need_headerupdate(found, search.now)) {
5509 			update_header(search.rbtdb, found, search.now);
5510 		}
5511 		if (foundsig != NULL && need_headerupdate(foundsig, search.now))
5512 		{
5513 			update_header(search.rbtdb, foundsig, search.now);
5514 		}
5515 	}
5516 
5517 	NODE_UNLOCK(lock, locktype);
5518 
5519 tree_exit:
5520 	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5521 
	/*
	 * The tree search above is given no callback (NULL); this
	 * assertion checks that no zonecut reference is left to clean up.
	 */
5522 	INSIST(!search.need_cleanup);
5523 
5524 	dns_rbtnodechain_reset(&search.chain);
5525 
5526 	if (result == DNS_R_DELEGATION) {
5527 		result = ISC_R_SUCCESS;
5528 	}
5529 
5530 	return (result);
5531 }
5532 
5533 static void
5534 attachnode(dns_db_t *db, dns_dbnode_t *source, dns_dbnode_t **targetp) {
5535 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5536 	dns_rbtnode_t *node = (dns_rbtnode_t *)source;
5537 
5538 	REQUIRE(VALID_RBTDB(rbtdb));
5539 	REQUIRE(targetp != NULL && *targetp == NULL);
5540 
5541 	isc_refcount_increment(&node->references);
5542 
5543 	*targetp = source;
5544 }
5545 
5546 static void
5547 detachnode(dns_db_t *db, dns_dbnode_t **targetp) {
5548 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5549 	dns_rbtnode_t *node;
5550 	bool want_free = false;
5551 	bool inactive = false;
5552 	rbtdb_nodelock_t *nodelock;
5553 
5554 	REQUIRE(VALID_RBTDB(rbtdb));
5555 	REQUIRE(targetp != NULL && *targetp != NULL);
5556 
5557 	node = (dns_rbtnode_t *)(*targetp);
5558 	nodelock = &rbtdb->node_locks[node->locknum];
5559 
5560 	NODE_LOCK(&nodelock->lock, isc_rwlocktype_read);
5561 
5562 	if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
5563 				isc_rwlocktype_none, false))
5564 	{
5565 		if (isc_refcount_current(&nodelock->references) == 0 &&
5566 		    nodelock->exiting)
5567 		{
5568 			inactive = true;
5569 		}
5570 	}
5571 
5572 	NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
5573 
5574 	*targetp = NULL;
5575 
5576 	if (inactive) {
5577 		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5578 		rbtdb->active--;
5579 		if (rbtdb->active == 0) {
5580 			want_free = true;
5581 		}
5582 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5583 		if (want_free) {
5584 			char buf[DNS_NAME_FORMATSIZE];
5585 			if (dns_name_dynamic(&rbtdb->common.origin)) {
5586 				dns_name_format(&rbtdb->common.origin, buf,
5587 						sizeof(buf));
5588 			} else {
5589 				strlcpy(buf, "<UNKNOWN>", sizeof(buf));
5590 			}
5591 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
5592 				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
5593 				      "calling free_rbtdb(%s)", buf);
5594 			free_rbtdb(rbtdb, true, NULL);
5595 		}
5596 	}
5597 }
5598 
5599 static isc_result_t
5600 expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) {
5601 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5602 	dns_rbtnode_t *rbtnode = node;
5603 	rdatasetheader_t *header;
5604 	bool force_expire = false;
5605 	/*
5606 	 * These are the category and module used by the cache cleaner.
5607 	 */
5608 	bool log = false;
5609 	isc_logcategory_t *category = DNS_LOGCATEGORY_DATABASE;
5610 	isc_logmodule_t *module = DNS_LOGMODULE_CACHE;
5611 	int level = ISC_LOG_DEBUG(2);
5612 	char printname[DNS_NAME_FORMATSIZE];
5613 
5614 	REQUIRE(VALID_RBTDB(rbtdb));
5615 
5616 	/*
5617 	 * Caller must hold a tree lock.
5618 	 */
5619 
5620 	if (now == 0) {
5621 		isc_stdtime_get(&now);
5622 	}
5623 
5624 	if (isc_mem_isovermem(rbtdb->common.mctx)) {
5625 		/*
5626 		 * Force expire with 25% probability.
5627 		 * XXXDCL Could stand to have a better policy, like LRU.
5628 		 */
5629 		force_expire = (rbtnode->down == NULL &&
5630 				(isc_random32() % 4) == 0);
5631 
5632 		/*
5633 		 * Note that 'log' can be true IFF overmem is also true.
5634 		 * overmem can currently only be true for cache
5635 		 * databases -- hence all of the "overmem cache" log strings.
5636 		 */
5637 		log = isc_log_wouldlog(dns_lctx, level);
5638 		if (log) {
5639 			isc_log_write(
5640 				dns_lctx, category, module, level,
5641 				"overmem cache: %s %s",
5642 				force_expire ? "FORCE" : "check",
5643 				dns_rbt_formatnodename(rbtnode, printname,
5644 						       sizeof(printname)));
5645 		}
5646 	}
5647 
5648 	/*
5649 	 * We may not need write access, but this code path is not performance
5650 	 * sensitive, so it should be okay to always lock as a writer.
5651 	 */
5652 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5653 		  isc_rwlocktype_write);
5654 
5655 	for (header = rbtnode->data; header != NULL; header = header->next) {
5656 		if (header->rdh_ttl + rbtdb->serve_stale_ttl <=
5657 		    now - RBTDB_VIRTUAL)
5658 		{
5659 			/*
5660 			 * We don't check if refcurrent(rbtnode) == 0 and try
5661 			 * to free like we do in cache_find(), because
5662 			 * refcurrent(rbtnode) must be non-zero.  This is so
5663 			 * because 'node' is an argument to the function.
5664 			 */
5665 			mark_header_ancient(rbtdb, header);
5666 			if (log) {
5667 				isc_log_write(dns_lctx, category, module, level,
5668 					      "overmem cache: ancient %s",
5669 					      printname);
5670 			}
5671 		} else if (force_expire) {
5672 			if (!RETAIN(header)) {
5673 				set_ttl(rbtdb, header, 0);
5674 				mark_header_ancient(rbtdb, header);
5675 			} else if (log) {
5676 				isc_log_write(dns_lctx, category, module, level,
5677 					      "overmem cache: "
5678 					      "reprieve by RETAIN() %s",
5679 					      printname);
5680 			}
5681 		} else if (isc_mem_isovermem(rbtdb->common.mctx) && log) {
5682 			isc_log_write(dns_lctx, category, module, level,
5683 				      "overmem cache: saved %s", printname);
5684 		}
5685 	}
5686 
5687 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5688 		    isc_rwlocktype_write);
5689 
5690 	return (ISC_R_SUCCESS);
5691 }
5692 
5693 static void
5694 overmem(dns_db_t *db, bool over) {
5695 	/* This is an empty callback.  See adb.c:water() */
5696 
5697 	UNUSED(db);
5698 	UNUSED(over);
5699 
5700 	return;
5701 }
5702 
5703 static void
5704 printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) {
5705 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5706 	dns_rbtnode_t *rbtnode = node;
5707 	bool first;
5708 	uint32_t refs;
5709 
5710 	REQUIRE(VALID_RBTDB(rbtdb));
5711 
5712 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5713 		  isc_rwlocktype_read);
5714 
5715 	refs = isc_refcount_current(&rbtnode->references);
5716 	fprintf(out, "node %p, %" PRIu32 " references, locknum = %u\n", rbtnode,
5717 		refs, rbtnode->locknum);
5718 	if (rbtnode->data != NULL) {
5719 		rdatasetheader_t *current, *top_next;
5720 
5721 		for (current = rbtnode->data; current != NULL;
5722 		     current = top_next)
5723 		{
5724 			top_next = current->next;
5725 			first = true;
5726 			fprintf(out, "\ttype %u", current->type);
5727 			do {
5728 				uint_least16_t attributes = atomic_load_acquire(
5729 					&current->attributes);
5730 				if (!first) {
5731 					fprintf(out, "\t");
5732 				}
5733 				first = false;
5734 				fprintf(out,
5735 					"\tserial = %lu, ttl = %u, "
5736 					"trust = %u, attributes = %" PRIuLEAST16
5737 					", "
5738 					"resign = %u\n",
5739 					(unsigned long)current->serial,
5740 					current->rdh_ttl, current->trust,
5741 					attributes,
5742 					(current->resign << 1) |
5743 						current->resign_lsb);
5744 				current = current->down;
5745 			} while (current != NULL);
5746 		}
5747 	} else {
5748 		fprintf(out, "(empty)\n");
5749 	}
5750 
5751 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5752 		    isc_rwlocktype_read);
5753 }
5754 
5755 static isc_result_t
5756 createiterator(dns_db_t *db, unsigned int options,
5757 	       dns_dbiterator_t **iteratorp) {
5758 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5759 	rbtdb_dbiterator_t *rbtdbiter;
5760 
5761 	REQUIRE(VALID_RBTDB(rbtdb));
5762 
5763 	rbtdbiter = isc_mem_get(rbtdb->common.mctx, sizeof(*rbtdbiter));
5764 
5765 	rbtdbiter->common.methods = &dbiterator_methods;
5766 	rbtdbiter->common.db = NULL;
5767 	dns_db_attach(db, &rbtdbiter->common.db);
5768 	rbtdbiter->common.relative_names = ((options & DNS_DB_RELATIVENAMES) !=
5769 					    0);
5770 	rbtdbiter->common.magic = DNS_DBITERATOR_MAGIC;
5771 	rbtdbiter->common.cleaning = false;
5772 	rbtdbiter->paused = true;
5773 	rbtdbiter->tree_locked = isc_rwlocktype_none;
5774 	rbtdbiter->result = ISC_R_SUCCESS;
5775 	dns_fixedname_init(&rbtdbiter->name);
5776 	dns_fixedname_init(&rbtdbiter->origin);
5777 	rbtdbiter->node = NULL;
5778 	rbtdbiter->delcnt = 0;
5779 	rbtdbiter->nsec3only = ((options & DNS_DB_NSEC3ONLY) != 0);
5780 	rbtdbiter->nonsec3 = ((options & DNS_DB_NONSEC3) != 0);
5781 	memset(rbtdbiter->deletions, 0, sizeof(rbtdbiter->deletions));
5782 	dns_rbtnodechain_init(&rbtdbiter->chain);
5783 	dns_rbtnodechain_init(&rbtdbiter->nsec3chain);
5784 	if (rbtdbiter->nsec3only) {
5785 		rbtdbiter->current = &rbtdbiter->nsec3chain;
5786 	} else {
5787 		rbtdbiter->current = &rbtdbiter->chain;
5788 	}
5789 
5790 	*iteratorp = (dns_dbiterator_t *)rbtdbiter;
5791 
5792 	return (ISC_R_SUCCESS);
5793 }
5794 
5795 static isc_result_t
5796 zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5797 		  dns_rdatatype_t type, dns_rdatatype_t covers,
5798 		  isc_stdtime_t now, dns_rdataset_t *rdataset,
5799 		  dns_rdataset_t *sigrdataset) {
5800 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5801 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5802 	rdatasetheader_t *header, *header_next, *found, *foundsig;
5803 	rbtdb_serial_t serial;
5804 	rbtdb_version_t *rbtversion = version;
5805 	bool close_version = false;
5806 	rbtdb_rdatatype_t matchtype, sigmatchtype;
5807 
5808 	REQUIRE(VALID_RBTDB(rbtdb));
5809 	REQUIRE(type != dns_rdatatype_any);
5810 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
5811 
5812 	if (rbtversion == NULL) {
5813 		currentversion(db, (dns_dbversion_t **)(void *)(&rbtversion));
5814 		close_version = true;
5815 	}
5816 	serial = rbtversion->serial;
5817 	now = 0;
5818 
5819 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5820 		  isc_rwlocktype_read);
5821 
5822 	found = NULL;
5823 	foundsig = NULL;
5824 	matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
5825 	if (covers == 0) {
5826 		sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
5827 	} else {
5828 		sigmatchtype = 0;
5829 	}
5830 
5831 	for (header = rbtnode->data; header != NULL; header = header_next) {
5832 		header_next = header->next;
5833 		do {
5834 			if (header->serial <= serial && !IGNORE(header)) {
5835 				/*
5836 				 * Is this a "this rdataset doesn't
5837 				 * exist" record?
5838 				 */
5839 				if (NONEXISTENT(header)) {
5840 					header = NULL;
5841 				}
5842 				break;
5843 			} else {
5844 				header = header->down;
5845 			}
5846 		} while (header != NULL);
5847 		if (header != NULL) {
5848 			/*
5849 			 * We have an active, extant rdataset.  If it's a
5850 			 * type we're looking for, remember it.
5851 			 */
5852 			if (header->type == matchtype) {
5853 				found = header;
5854 				if (foundsig != NULL) {
5855 					break;
5856 				}
5857 			} else if (header->type == sigmatchtype) {
5858 				foundsig = header;
5859 				if (found != NULL) {
5860 					break;
5861 				}
5862 			}
5863 		}
5864 	}
5865 	if (found != NULL) {
5866 		bind_rdataset(rbtdb, rbtnode, found, now, isc_rwlocktype_read,
5867 			      rdataset);
5868 		if (foundsig != NULL) {
5869 			bind_rdataset(rbtdb, rbtnode, foundsig, now,
5870 				      isc_rwlocktype_read, sigrdataset);
5871 		}
5872 	}
5873 
5874 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5875 		    isc_rwlocktype_read);
5876 
5877 	if (close_version) {
5878 		closeversion(db, (dns_dbversion_t **)(void *)(&rbtversion),
5879 			     false);
5880 	}
5881 
5882 	if (found == NULL) {
5883 		return (ISC_R_NOTFOUND);
5884 	}
5885 
5886 	return (ISC_R_SUCCESS);
5887 }
5888 
5889 static isc_result_t
5890 cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5891 		   dns_rdatatype_t type, dns_rdatatype_t covers,
5892 		   isc_stdtime_t now, dns_rdataset_t *rdataset,
5893 		   dns_rdataset_t *sigrdataset) {
5894 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5895 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5896 	rdatasetheader_t *header, *header_next, *found, *foundsig;
5897 	rbtdb_rdatatype_t matchtype, sigmatchtype, negtype;
5898 	isc_result_t result;
5899 	nodelock_t *lock;
5900 	isc_rwlocktype_t locktype;
5901 
5902 	REQUIRE(VALID_RBTDB(rbtdb));
5903 	REQUIRE(type != dns_rdatatype_any);
5904 
5905 	UNUSED(version);
5906 
5907 	result = ISC_R_SUCCESS;
5908 
5909 	if (now == 0) {
5910 		isc_stdtime_get(&now);
5911 	}
5912 
5913 	lock = &rbtdb->node_locks[rbtnode->locknum].lock;
5914 	locktype = isc_rwlocktype_read;
5915 	NODE_LOCK(lock, locktype);
5916 
5917 	found = NULL;
5918 	foundsig = NULL;
5919 	matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
5920 	negtype = RBTDB_RDATATYPE_VALUE(0, type);
5921 	if (covers == 0) {
5922 		sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
5923 	} else {
5924 		sigmatchtype = 0;
5925 	}
5926 
5927 	for (header = rbtnode->data; header != NULL; header = header_next) {
5928 		header_next = header->next;
5929 		if (!ACTIVE(header, now)) {
5930 			if ((header->rdh_ttl + rbtdb->serve_stale_ttl <
5931 			     now - RBTDB_VIRTUAL) &&
5932 			    (locktype == isc_rwlocktype_write ||
5933 			     NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS))
5934 			{
5935 				/*
5936 				 * We update the node's status only when we
5937 				 * can get write access.
5938 				 */
5939 				locktype = isc_rwlocktype_write;
5940 
5941 				/*
5942 				 * We don't check if refcurrent(rbtnode) == 0
5943 				 * and try to free like we do in cache_find(),
5944 				 * because refcurrent(rbtnode) must be
5945 				 * non-zero.  This is so because 'node' is an
5946 				 * argument to the function.
5947 				 */
5948 				mark_header_ancient(rbtdb, header);
5949 			}
5950 		} else if (EXISTS(header) && !ANCIENT(header)) {
5951 			if (header->type == matchtype) {
5952 				found = header;
5953 			} else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
5954 				   header->type == negtype)
5955 			{
5956 				found = header;
5957 			} else if (header->type == sigmatchtype) {
5958 				foundsig = header;
5959 			}
5960 		}
5961 	}
5962 	if (found != NULL) {
5963 		bind_rdataset(rbtdb, rbtnode, found, now, locktype, rdataset);
5964 		if (!NEGATIVE(found) && foundsig != NULL) {
5965 			bind_rdataset(rbtdb, rbtnode, foundsig, now, locktype,
5966 				      sigrdataset);
5967 		}
5968 	}
5969 
5970 	NODE_UNLOCK(lock, locktype);
5971 
5972 	if (found == NULL) {
5973 		return (ISC_R_NOTFOUND);
5974 	}
5975 
5976 	if (NEGATIVE(found)) {
5977 		/*
5978 		 * We found a negative cache entry.
5979 		 */
5980 		if (NXDOMAIN(found)) {
5981 			result = DNS_R_NCACHENXDOMAIN;
5982 		} else {
5983 			result = DNS_R_NCACHENXRRSET;
5984 		}
5985 	}
5986 
5987 	update_cachestats(rbtdb, result);
5988 
5989 	return (result);
5990 }
5991 
5992 static isc_result_t
5993 allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5994 	     unsigned int options, isc_stdtime_t now,
5995 	     dns_rdatasetiter_t **iteratorp) {
5996 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5997 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5998 	rbtdb_version_t *rbtversion = version;
5999 	rbtdb_rdatasetiter_t *iterator;
6000 
6001 	REQUIRE(VALID_RBTDB(rbtdb));
6002 
6003 	iterator = isc_mem_get(rbtdb->common.mctx, sizeof(*iterator));
6004 
6005 	if ((db->attributes & DNS_DBATTR_CACHE) == 0) {
6006 		now = 0;
6007 		if (rbtversion == NULL) {
6008 			currentversion(
6009 				db, (dns_dbversion_t **)(void *)(&rbtversion));
6010 		} else {
6011 			INSIST(rbtversion->rbtdb == rbtdb);
6012 
6013 			(void)isc_refcount_increment(&rbtversion->references);
6014 		}
6015 	} else {
6016 		if (now == 0) {
6017 			isc_stdtime_get(&now);
6018 		}
6019 		rbtversion = NULL;
6020 	}
6021 
6022 	iterator->common.magic = DNS_RDATASETITER_MAGIC;
6023 	iterator->common.methods = &rdatasetiter_methods;
6024 	iterator->common.db = db;
6025 	iterator->common.node = node;
6026 	iterator->common.version = (dns_dbversion_t *)rbtversion;
6027 	iterator->common.options = options;
6028 	iterator->common.now = now;
6029 
6030 	isc_refcount_increment(&rbtnode->references);
6031 
6032 	iterator->current = NULL;
6033 
6034 	*iteratorp = (dns_rdatasetiter_t *)iterator;
6035 
6036 	return (ISC_R_SUCCESS);
6037 }
6038 
6039 static bool
6040 cname_and_other_data(dns_rbtnode_t *node, rbtdb_serial_t serial) {
6041 	rdatasetheader_t *header, *header_next;
6042 	bool cname, other_data;
6043 	dns_rdatatype_t rdtype;
6044 
6045 	/*
6046 	 * The caller must hold the node lock.
6047 	 */
6048 
6049 	/*
6050 	 * Look for CNAME and "other data" rdatasets active in our version.
6051 	 */
6052 	cname = false;
6053 	other_data = false;
6054 	for (header = node->data; header != NULL; header = header_next) {
6055 		header_next = header->next;
6056 		if (header->type == dns_rdatatype_cname) {
6057 			/*
6058 			 * Look for an active extant CNAME.
6059 			 */
6060 			do {
6061 				if (header->serial <= serial && !IGNORE(header))
6062 				{
6063 					/*
6064 					 * Is this a "this rdataset doesn't
6065 					 * exist" record?
6066 					 */
6067 					if (NONEXISTENT(header)) {
6068 						header = NULL;
6069 					}
6070 					break;
6071 				} else {
6072 					header = header->down;
6073 				}
6074 			} while (header != NULL);
6075 			if (header != NULL) {
6076 				cname = true;
6077 			}
6078 		} else {
6079 			/*
6080 			 * Look for active extant "other data".
6081 			 *
6082 			 * "Other data" is any rdataset whose type is not
6083 			 * KEY, NSEC, SIG or RRSIG.
6084 			 */
6085 			rdtype = RBTDB_RDATATYPE_BASE(header->type);
6086 			if (rdtype != dns_rdatatype_key &&
6087 			    rdtype != dns_rdatatype_sig &&
6088 			    rdtype != dns_rdatatype_nsec &&
6089 			    rdtype != dns_rdatatype_rrsig)
6090 			{
6091 				/*
6092 				 * Is it active and extant?
6093 				 */
6094 				do {
6095 					if (header->serial <= serial &&
6096 					    !IGNORE(header))
6097 					{
6098 						/*
6099 						 * Is this a "this rdataset
6100 						 * doesn't exist" record?
6101 						 */
6102 						if (NONEXISTENT(header)) {
6103 							header = NULL;
6104 						}
6105 						break;
6106 					} else {
6107 						header = header->down;
6108 					}
6109 				} while (header != NULL);
6110 				if (header != NULL) {
6111 					other_data = true;
6112 				}
6113 			}
6114 		}
6115 	}
6116 
6117 	if (cname && other_data) {
6118 		return (true);
6119 	}
6120 
6121 	return (false);
6122 }
6123 
6124 static void
6125 resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader) {
6126 	INSIST(!IS_CACHE(rbtdb));
6127 	INSIST(newheader->heap_index == 0);
6128 	INSIST(!ISC_LINK_LINKED(newheader, link));
6129 
6130 	isc_heap_insert(rbtdb->heaps[idx], newheader);
6131 }
6132 
6133 /*
6134  * node write lock must be held.
6135  */
6136 static void
6137 resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
6138 	      rdatasetheader_t *header) {
6139 	/*
6140 	 * Remove the old header from the heap
6141 	 */
6142 	if (header != NULL && header->heap_index != 0) {
6143 		isc_heap_delete(rbtdb->heaps[header->node->locknum],
6144 				header->heap_index);
6145 		header->heap_index = 0;
6146 		if (version != NULL) {
6147 			new_reference(rbtdb, header->node,
6148 				      isc_rwlocktype_write);
6149 			ISC_LIST_APPEND(version->resigned_list, header, link);
6150 		}
6151 	}
6152 }
6153 
6154 static uint64_t
6155 recordsize(rdatasetheader_t *header, unsigned int namelen) {
6156 	return (dns_rdataslab_rdatasize((unsigned char *)header,
6157 					sizeof(*header)) +
6158 		sizeof(dns_ttl_t) + sizeof(dns_rdatatype_t) +
6159 		sizeof(dns_rdataclass_t) + namelen);
6160 }
6161 
6162 static void
6163 update_recordsandxfrsize(bool add, rbtdb_version_t *rbtversion,
6164 			 rdatasetheader_t *header, unsigned int namelen) {
6165 	unsigned char *hdr = (unsigned char *)header;
6166 	size_t hdrsize = sizeof(*header);
6167 
6168 	RWLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
6169 	if (add) {
6170 		rbtversion->records += dns_rdataslab_count(hdr, hdrsize);
6171 		rbtversion->xfrsize += recordsize(header, namelen);
6172 	} else {
6173 		rbtversion->records -= dns_rdataslab_count(hdr, hdrsize);
6174 		rbtversion->xfrsize -= recordsize(header, namelen);
6175 	}
6176 	RWUNLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
6177 }
6178 
/*
 * Add the rdataset header 'newheader' to 'rbtnode'.
 *
 * The write lock on rbtnode must be held.
 *
 * 'rbtversion' is the version being modified.  When it is NULL the code
 * below applies trust comparison, negative-cache handling and expiry of
 * superseded headers; when it is non-NULL the version's record and
 * transfer-size counters are kept up to date and the node is checked for
 * CNAME-and-other-data.  'nodename' is used only for its length when
 * updating those counters.  'options' is a mask of DNS_DBADD_* flags; the
 * ones examined here are MERGE, FORCE, EXACT, EXACTTTL and PREFETCH.
 * 'loading' selects the path that replaces and frees an existing header
 * immediately instead of chaining it below the new one, and suppresses the
 * creation of a changed record.
 *
 * If 'addedrdataset' is non-NULL it is bound to the new header on the
 * final ISC_R_SUCCESS return, or to the existing header on the paths where
 * that header is retained because it is more trusted or carries equal
 * data.  It is not bound on the other return paths.
 *
 * 'newheader' is consumed: it is either linked into the node (possibly
 * after being replaced by a merged slab) or released with free_rdataset()
 * before returning.
 *
 * Returns:
 *	ISC_R_SUCCESS		the header was added, or equal data was
 *				already present and was kept
 *	DNS_R_UNCHANGED		nothing was changed (deleting something
 *				that does not exist, or more trusted data
 *				is already present)
 *	DNS_R_NOTEXACT		DNS_DBADD_EXACTTTL was set and the TTLs
 *				differ
 *	DNS_R_CNAMEANDOTHER	the header was added and the node now holds
 *				a CNAME together with other data
 *	ISC_R_NOMEMORY		add_changed() returned NULL
 *	other			failure code from dns_rdataslab_merge()
 */
6182 static isc_result_t
6183 add32(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, const dns_name_t *nodename,
6184       rbtdb_version_t *rbtversion, rdatasetheader_t *newheader,
6185       unsigned int options, bool loading, dns_rdataset_t *addedrdataset,
6186       isc_stdtime_t now) {
6187 	rbtdb_changed_t *changed = NULL;
6188 	rdatasetheader_t *topheader = NULL, *topheader_prev = NULL;
6189 	rdatasetheader_t *header = NULL, *sigheader = NULL;
6190 	unsigned char *merged = NULL;
6191 	isc_result_t result;
6192 	bool header_nx;
6193 	bool newheader_nx;
6194 	bool merge;
6195 	dns_rdatatype_t rdtype, covers;
6196 	rbtdb_rdatatype_t negtype, sigtype;
6197 	dns_trust_t trust;
6198 	int idx;
6199
6200 	/*
6201 	 * Add an rdatasetheader_t to a node.
6202 	 */
6203
6204 	/*
6205 	 * Caller must be holding the node lock.
6206 	 */
6207
6208 	if ((options & DNS_DBADD_MERGE) != 0) {
6209 		REQUIRE(rbtversion != NULL);
6210 		merge = true;
6211 	} else {
6212 		merge = false;
6213 	}
6214
	/*
	 * A forced add is compared against existing data as if it had
	 * ultimate trust; newheader->trust itself is left untouched.
	 */
6215 	if ((options & DNS_DBADD_FORCE) != 0) {
6216 		trust = dns_trust_ultimate;
6217 	} else {
6218 		trust = newheader->trust;
6219 	}
6220
6221 	if (rbtversion != NULL && !loading) {
6222 		/*
6223 		 * We always add a changed record, even if no changes end up
6224 		 * being made to this node, because it's harmless and
6225 		 * simplifies the code.
6226 		 */
6227 		changed = add_changed(rbtdb, rbtversion, rbtnode);
6228 		if (changed == NULL) {
6229 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6230 			return (ISC_R_NOMEMORY);
6231 		}
6232 	}
6233
6234 	newheader_nx = NONEXISTENT(newheader) ? true : false;
6235 	topheader_prev = NULL;
6236 	sigheader = NULL;
6237 	negtype = 0;
	/*
	 * With no version, work out 'negtype': the counterpart type that
	 * must also be matched when searching for an existing header.  For
	 * a negative entry it is the positive type it covers; for positive
	 * data it is the negative entry covering that type.
	 */
6238 	if (rbtversion == NULL && !newheader_nx) {
6239 		rdtype = RBTDB_RDATATYPE_BASE(newheader->type);
6240 		covers = RBTDB_RDATATYPE_EXT(newheader->type);
6241 		sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, covers);
6242 		if (NEGATIVE(newheader)) {
6243 			/*
6244 			 * We're adding a negative cache entry.
6245 			 */
6246 			if (covers == dns_rdatatype_any) {
6247 				/*
6248 				 * If we're adding a negative cache entry
6249 				 * which covers all types (NXDOMAIN,
6250 				 * NODATA(QTYPE=ANY)),
6251 				 *
6252 				 * We make all other data ancient so that the
6253 				 * only rdataset that can be found at this
6254 				 * node is the negative cache entry.
6255 				 */
6256 				for (topheader = rbtnode->data;
6257 				     topheader != NULL;
6258 				     topheader = topheader->next)
6259 				{
6260 					set_ttl(rbtdb, topheader, 0);
6261 					mark_header_ancient(rbtdb, topheader);
6262 				}
				/*
				 * The loop above ran to completion, so
				 * topheader is NULL here and find_header
				 * takes the "nothing of this type" path.
				 */
6263 				goto find_header;
6264 			}
6265 			/*
6266 			 * Otherwise look for any RRSIGs of the given
6267 			 * type so they can be marked ancient later.
6268 			 */
6269 			for (topheader = rbtnode->data; topheader != NULL;
6270 			     topheader = topheader->next)
6271 			{
6272 				if (topheader->type == sigtype) {
6273 					sigheader = topheader;
6274 				}
6275 			}
6276 			negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
6277 		} else {
6278 			/*
6279 			 * We're adding something that isn't a
6280 			 * negative cache entry.  Look for an extant
6281 			 * non-ancient NXDOMAIN/NODATA(QTYPE=ANY) negative
6282 			 * cache entry.  If we're adding an RRSIG, also
6283 			 * check for an extant non-ancient NODATA ncache
6284 			 * entry which covers the same type as the RRSIG.
6285 			 */
6286 			for (topheader = rbtnode->data; topheader != NULL;
6287 			     topheader = topheader->next)
6288 			{
6289 				if ((topheader->type ==
6290 				     RBTDB_RDATATYPE_NCACHEANY) ||
6291 				    (newheader->type == sigtype &&
6292 				     topheader->type ==
6293 					     RBTDB_RDATATYPE_VALUE(0, covers)))
6294 				{
6295 					break;
6296 				}
6297 			}
6298 			if (topheader != NULL && EXISTS(topheader) &&
6299 			    ACTIVE(topheader, now))
6300 			{
6301 				/*
6302 				 * Found one.
6303 				 */
6304 				if (trust < topheader->trust) {
6305 					/*
6306 					 * The NXDOMAIN/NODATA(QTYPE=ANY)
6307 					 * is more trusted.
6308 					 */
6309 					free_rdataset(rbtdb, rbtdb->common.mctx,
6310 						      newheader);
6311 					if (addedrdataset != NULL) {
6312 						bind_rdataset(
6313 							rbtdb, rbtnode,
6314 							topheader, now,
6315 							isc_rwlocktype_write,
6316 							addedrdataset);
6317 					}
6318 					return (DNS_R_UNCHANGED);
6319 				}
6320 				/*
6321 				 * The new rdataset is better.  Expire the
6322 				 * ncache entry.
6323 				 */
6324 				set_ttl(rbtdb, topheader, 0);
6325 				mark_header_ancient(rbtdb, topheader);
6326 				topheader = NULL;
6327 				goto find_header;
6328 			}
6329 			negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
6330 		}
6331 	}
6332
	/*
	 * Locate the top header of the chain for this type (or for its
	 * negative/positive counterpart), remembering its predecessor so
	 * the new header can be spliced into the node's list.
	 */
6333 	for (topheader = rbtnode->data; topheader != NULL;
6334 	     topheader = topheader->next)
6335 	{
6336 		if (topheader->type == newheader->type ||
6337 		    topheader->type == negtype)
6338 		{
6339 			break;
6340 		}
6341 		topheader_prev = topheader;
6342 	}
6343
6344 find_header:
6345 	/*
6346 	 * If header isn't NULL, we've found the right type.  There may be
6347 	 * IGNORE rdatasets between the top of the chain and the first real
6348 	 * data.  We skip over them.
6349 	 */
6350 	header = topheader;
6351 	while (header != NULL && IGNORE(header)) {
6352 		header = header->down;
6353 	}
6354 	if (header != NULL) {
6355 		header_nx = NONEXISTENT(header) ? true : false;
6356
6357 		/*
6358 		 * Deleting an already non-existent rdataset has no effect.
6359 		 */
6360 		if (header_nx && newheader_nx) {
6361 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6362 			return (DNS_R_UNCHANGED);
6363 		}
6364
6365 		/*
6366 		 * Trying to add an rdataset with lower trust to a cache
6367 		 * DB has no effect, provided that the cache data isn't
6368 		 * stale. If the cache data is stale, new lower trust
6369 		 * data will supersede it below. Unclear what the best
6370 		 * policy is here.
6371 		 */
6372 		if (rbtversion == NULL && trust < header->trust &&
6373 		    (ACTIVE(header, now) || header_nx))
6374 		{
6375 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6376 			if (addedrdataset != NULL) {
6377 				bind_rdataset(rbtdb, rbtnode, header, now,
6378 					      isc_rwlocktype_write,
6379 					      addedrdataset);
6380 			}
6381 			return (DNS_R_UNCHANGED);
6382 		}
6383
6384 		/*
6385 		 * Don't merge if a nonexistent rdataset is involved.
6386 		 */
6387 		if (merge && (header_nx || newheader_nx)) {
6388 			merge = false;
6389 		}
6390
6391 		/*
6392 		 * If 'merge' is true, we'll try to create a new rdataset
6393 		 * that is the union of 'newheader' and 'header'.
6394 		 */
6395 		if (merge) {
6396 			unsigned int flags = 0;
6397 			INSIST(rbtversion->serial >= header->serial);
6398 			merged = NULL;
6399 			result = ISC_R_SUCCESS;
6400
6401 			if ((options & DNS_DBADD_EXACT) != 0) {
6402 				flags |= DNS_RDATASLAB_EXACT;
6403 			}
6404 			/*
6405 			 * TTL use here is irrelevant to the cache;
6406 			 * merge is only done with zonedbs.
6407 			 */
6408 			if ((options & DNS_DBADD_EXACTTTL) != 0 &&
6409 			    newheader->rdh_ttl != header->rdh_ttl)
6410 			{
6411 				result = DNS_R_NOTEXACT;
6412 			} else if (newheader->rdh_ttl != header->rdh_ttl) {
6413 				flags |= DNS_RDATASLAB_FORCE;
6414 			}
6415 			if (result == ISC_R_SUCCESS) {
6416 				result = dns_rdataslab_merge(
6417 					(unsigned char *)header,
6418 					(unsigned char *)newheader,
6419 					(unsigned int)(sizeof(*newheader)),
6420 					rbtdb->common.mctx,
6421 					rbtdb->common.rdclass,
6422 					(dns_rdatatype_t)header->type, flags,
6423 					&merged);
6424 			}
6425 			if (result == ISC_R_SUCCESS) {
6426 				/*
6427 				 * If 'header' has the same serial number as
6428 				 * we do, we could clean it up now if we knew
6429 				 * that our caller had no references to it.
6430 				 * We don't know this, however, so we leave it
6431 				 * alone.  It will get cleaned up when
6432 				 * clean_zone_node() runs.
6433 				 */
6434 				free_rdataset(rbtdb, rbtdb->common.mctx,
6435 					      newheader);
				/*
				 * From here on 'newheader' refers to the
				 * merged slab, not the caller's header.
				 */
6436 				newheader = (rdatasetheader_t *)merged;
6437 				init_rdataset(rbtdb, newheader);
6438 				update_newheader(newheader, header);
6439 				if (loading && RESIGN(newheader) &&
6440 				    RESIGN(header) &&
6441 				    resign_sooner(header, newheader))
6442 				{
6443 					newheader->resign = header->resign;
6444 					newheader->resign_lsb =
6445 						header->resign_lsb;
6446 				}
6447 			} else {
6448 				free_rdataset(rbtdb, rbtdb->common.mctx,
6449 					      newheader);
6450 				return (result);
6451 			}
6452 		}
6453 		/*
6454 		 * Don't replace existing NS, A and AAAA RRsets in the
6455 		 * cache if they already exist. This prevents named
6456 		 * being locked to old servers. Don't lower trust of
6457 		 * existing record if the update is forced. Nothing
6458 		 * special to be done w.r.t stale data; it gets replaced
6459 		 * normally further down.
6460 		 */
6461 		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
6462 		    header->type == dns_rdatatype_ns && !header_nx &&
6463 		    !newheader_nx && header->trust >= newheader->trust &&
6464 		    dns_rdataslab_equalx((unsigned char *)header,
6465 					 (unsigned char *)newheader,
6466 					 (unsigned int)(sizeof(*newheader)),
6467 					 rbtdb->common.rdclass,
6468 					 (dns_rdatatype_t)header->type))
6469 		{
6470 			/*
6471 			 * Honour the new ttl if it is less than the
6472 			 * older one.
6473 			 */
6474 			if (header->rdh_ttl > newheader->rdh_ttl) {
6475 				set_ttl(rbtdb, header, newheader->rdh_ttl);
6476 			}
			/*
			 * Move any noqname/closest data the old header
			 * lacks over from the new one before the new one
			 * is freed.
			 */
6477 			if (header->noqname == NULL &&
6478 			    newheader->noqname != NULL)
6479 			{
6480 				header->noqname = newheader->noqname;
6481 				newheader->noqname = NULL;
6482 			}
6483 			if (header->closest == NULL &&
6484 			    newheader->closest != NULL)
6485 			{
6486 				header->closest = newheader->closest;
6487 				newheader->closest = NULL;
6488 			}
6489 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6490 			if (addedrdataset != NULL) {
6491 				bind_rdataset(rbtdb, rbtnode, header, now,
6492 					      isc_rwlocktype_write,
6493 					      addedrdataset);
6494 			}
6495 			return (ISC_R_SUCCESS);
6496 		}
6497 		/*
6498 		 * If we will be replacing an NS RRset, force its TTL
6499 		 * to be no more than the current NS RRset's TTL.  This
6500 		 * ensures the delegations that are withdrawn are honoured.
6501 		 */
6502 		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
6503 		    header->type == dns_rdatatype_ns && !header_nx &&
6504 		    !newheader_nx && header->trust <= newheader->trust)
6505 		{
6506 			if (newheader->rdh_ttl > header->rdh_ttl) {
6507 				newheader->rdh_ttl = header->rdh_ttl;
6508 			}
6509 		}
		/*
		 * Same "keep the existing header when the data is equal"
		 * treatment as for NS above, applied to A, AAAA, DS and
		 * RRSIG(DS), unless this add is a prefetch.
		 */
6510 		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
6511 		    (options & DNS_DBADD_PREFETCH) == 0 &&
6512 		    (header->type == dns_rdatatype_a ||
6513 		     header->type == dns_rdatatype_aaaa ||
6514 		     header->type == dns_rdatatype_ds ||
6515 		     header->type == RBTDB_RDATATYPE_SIGDS) &&
6516 		    !header_nx && !newheader_nx &&
6517 		    header->trust >= newheader->trust &&
6518 		    dns_rdataslab_equal((unsigned char *)header,
6519 					(unsigned char *)newheader,
6520 					(unsigned int)(sizeof(*newheader))))
6521 		{
6522 			/*
6523 			 * Honour the new ttl if it is less than the
6524 			 * older one.
6525 			 */
6526 			if (header->rdh_ttl > newheader->rdh_ttl) {
6527 				set_ttl(rbtdb, header, newheader->rdh_ttl);
6528 			}
6529 			if (header->noqname == NULL &&
6530 			    newheader->noqname != NULL)
6531 			{
6532 				header->noqname = newheader->noqname;
6533 				newheader->noqname = NULL;
6534 			}
6535 			if (header->closest == NULL &&
6536 			    newheader->closest != NULL)
6537 			{
6538 				header->closest = newheader->closest;
6539 				newheader->closest = NULL;
6540 			}
6541 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6542 			if (addedrdataset != NULL) {
6543 				bind_rdataset(rbtdb, rbtnode, header, now,
6544 					      isc_rwlocktype_write,
6545 					      addedrdataset);
6546 			}
6547 			return (ISC_R_SUCCESS);
6548 		}
6549 		INSIST(rbtversion == NULL ||
6550 		       rbtversion->serial >= topheader->serial);
6551 		if (loading) {
6552 			newheader->down = NULL;
6553 			idx = newheader->node->locknum;
6554 			if (IS_CACHE(rbtdb)) {
6555 				if (ZEROTTL(newheader)) {
6556 					ISC_LIST_APPEND(rbtdb->rdatasets[idx],
6557 							newheader, link);
6558 				} else {
6559 					ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
6560 							 newheader, link);
6561 				}
6562 				INSIST(rbtdb->heaps != NULL);
6563 				isc_heap_insert(rbtdb->heaps[idx], newheader);
6564 			} else if (RESIGN(newheader)) {
6565 				resign_insert(rbtdb, idx, newheader);
6566 				/*
6567 				 * Don't call resign_delete as we don't need
6568 				 * to reverse the delete.  The free_rdataset
6569 				 * call below will clean up the heap entry.
6570 				 */
6571 			}
6572
6573 			/*
6574 			 * There are no other references to 'header' when
6575 			 * loading, so we MAY clean up 'header' now.
6576 			 * Since we don't generate changed records when
6577 			 * loading, we MUST clean up 'header' now.
6578 			 */
6579 			if (topheader_prev != NULL) {
6580 				topheader_prev->next = newheader;
6581 			} else {
6582 				rbtnode->data = newheader;
6583 			}
6584 			newheader->next = topheader->next;
6585 			if (rbtversion != NULL && !header_nx) {
6586 				update_recordsandxfrsize(false, rbtversion,
6587 							 header,
6588 							 nodename->length);
6589 			}
6590 			free_rdataset(rbtdb, rbtdb->common.mctx, header);
6591 		} else {
6592 			idx = newheader->node->locknum;
6593 			if (IS_CACHE(rbtdb)) {
6594 				INSIST(rbtdb->heaps != NULL);
6595 				isc_heap_insert(rbtdb->heaps[idx], newheader);
6596 				if (ZEROTTL(newheader)) {
6597 					ISC_LIST_APPEND(rbtdb->rdatasets[idx],
6598 							newheader, link);
6599 				} else {
6600 					ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
6601 							 newheader, link);
6602 				}
6603 			} else if (RESIGN(newheader)) {
6604 				resign_insert(rbtdb, idx, newheader);
6605 				resign_delete(rbtdb, rbtversion, header);
6606 			}
			/*
			 * Make newheader the top of this type's chain: it
			 * takes topheader's place in the node's list, the
			 * old top becomes its 'down' entry, and the old
			 * top's 'next' is pointed at the new top.
			 */
6607 			if (topheader_prev != NULL) {
6608 				topheader_prev->next = newheader;
6609 			} else {
6610 				rbtnode->data = newheader;
6611 			}
6612 			newheader->next = topheader->next;
6613 			newheader->down = topheader;
6614 			topheader->next = newheader;
6615 			rbtnode->dirty = 1;
6616 			if (changed != NULL) {
6617 				changed->dirty = true;
6618 			}
6619 			if (rbtversion == NULL) {
6620 				set_ttl(rbtdb, header, 0);
6621 				mark_header_ancient(rbtdb, header);
6622 				if (sigheader != NULL) {
6623 					set_ttl(rbtdb, sigheader, 0);
6624 					mark_header_ancient(rbtdb, sigheader);
6625 				}
6626 			}
6627 			if (rbtversion != NULL && !header_nx) {
6628 				update_recordsandxfrsize(false, rbtversion,
6629 							 header,
6630 							 nodename->length);
6631 			}
6632 		}
6633 	} else {
6634 		/*
6635 		 * No non-IGNORED rdatasets of the given type exist at
6636 		 * this node.
6637 		 */
6638
6639 		/*
6640 		 * If we're trying to delete the type, don't bother.
6641 		 */
6642 		if (newheader_nx) {
6643 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6644 			return (DNS_R_UNCHANGED);
6645 		}
6646
6647 		idx = newheader->node->locknum;
6648 		if (IS_CACHE(rbtdb)) {
6649 			isc_heap_insert(rbtdb->heaps[idx], newheader);
6650 			if (ZEROTTL(newheader)) {
6651 				ISC_LIST_APPEND(rbtdb->rdatasets[idx],
6652 						newheader, link);
6653 			} else {
6654 				ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
6655 						 newheader, link);
6656 			}
6657 		} else if (RESIGN(newheader)) {
6658 			resign_insert(rbtdb, idx, newheader);
			/*
			 * 'header' is NULL on this branch, so this call
			 * does nothing (resign_delete() ignores NULL).
			 */
6659 			resign_delete(rbtdb, rbtversion, header);
6660 		}
6661
6662 		if (topheader != NULL) {
6663 			/*
6664 			 * We have a list of rdatasets of the given type,
6665 			 * but they're all marked IGNORE.  We simply insert
6666 			 * the new rdataset at the head of the list.
6667 			 *
6668 			 * Ignored rdatasets cannot occur during loading, so
6669 			 * we INSIST on it.
6670 			 */
6671 			INSIST(!loading);
6672 			INSIST(rbtversion == NULL ||
6673 			       rbtversion->serial >= topheader->serial);
6674 			if (topheader_prev != NULL) {
6675 				topheader_prev->next = newheader;
6676 			} else {
6677 				rbtnode->data = newheader;
6678 			}
6679 			newheader->next = topheader->next;
6680 			newheader->down = topheader;
6681 			topheader->next = newheader;
6682 			rbtnode->dirty = 1;
6683 			if (changed != NULL) {
6684 				changed->dirty = true;
6685 			}
6686 		} else {
6687 			/*
6688 			 * No rdatasets of the given type exist at the node.
6689 			 */
6690 			newheader->next = rbtnode->data;
6691 			newheader->down = NULL;
6692 			rbtnode->data = newheader;
6693 		}
6694 	}
6695
6696 	if (rbtversion != NULL && !newheader_nx) {
6697 		update_recordsandxfrsize(true, rbtversion, newheader,
6698 					 nodename->length);
6699 	}
6700
6701 	/*
6702 	 * Check if the node now contains CNAME and other data.
	 * The new header stays linked into the node on this return path.
6703 	 */
6704 	if (rbtversion != NULL &&
6705 	    cname_and_other_data(rbtnode, rbtversion->serial))
6706 	{
6707 		return (DNS_R_CNAMEANDOTHER);
6708 	}
6709
6710 	if (addedrdataset != NULL) {
6711 		bind_rdataset(rbtdb, rbtnode, newheader, now,
6712 			      isc_rwlocktype_write, addedrdataset);
6713 	}
6714
6715 	return (ISC_R_SUCCESS);
6716 }
6717 
6718 static bool
6719 delegating_type(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
6720 		rbtdb_rdatatype_t type) {
6721 	if (IS_CACHE(rbtdb)) {
6722 		if (type == dns_rdatatype_dname) {
6723 			return (true);
6724 		} else {
6725 			return (false);
6726 		}
6727 	} else if (type == dns_rdatatype_dname ||
6728 		   (type == dns_rdatatype_ns &&
6729 		    (node != rbtdb->origin_node || IS_STUB(rbtdb))))
6730 	{
6731 		return (true);
6732 	}
6733 	return (false);
6734 }
6735 
6736 static isc_result_t
6737 addnoqname(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
6738 	   dns_rdataset_t *rdataset) {
6739 	struct noqname *noqname;
6740 	isc_mem_t *mctx = rbtdb->common.mctx;
6741 	dns_name_t name;
6742 	dns_rdataset_t neg, negsig;
6743 	isc_result_t result;
6744 	isc_region_t r;
6745 
6746 	dns_name_init(&name, NULL);
6747 	dns_rdataset_init(&neg);
6748 	dns_rdataset_init(&negsig);
6749 
6750 	result = dns_rdataset_getnoqname(rdataset, &name, &neg, &negsig);
6751 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
6752 
6753 	noqname = isc_mem_get(mctx, sizeof(*noqname));
6754 	dns_name_init(&noqname->name, NULL);
6755 	noqname->neg = NULL;
6756 	noqname->negsig = NULL;
6757 	noqname->type = neg.type;
6758 	dns_name_dup(&name, mctx, &noqname->name);
6759 	result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0);
6760 	if (result != ISC_R_SUCCESS) {
6761 		goto cleanup;
6762 	}
6763 	noqname->neg = r.base;
6764 	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0);
6765 	if (result != ISC_R_SUCCESS) {
6766 		goto cleanup;
6767 	}
6768 	noqname->negsig = r.base;
6769 	dns_rdataset_disassociate(&neg);
6770 	dns_rdataset_disassociate(&negsig);
6771 	newheader->noqname = noqname;
6772 	return (ISC_R_SUCCESS);
6773 
6774 cleanup:
6775 	dns_rdataset_disassociate(&neg);
6776 	dns_rdataset_disassociate(&negsig);
6777 	free_noqname(mctx, &noqname);
6778 	return (result);
6779 }
6780 
6781 static isc_result_t
6782 addclosest(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
6783 	   dns_rdataset_t *rdataset) {
6784 	struct noqname *closest;
6785 	isc_mem_t *mctx = rbtdb->common.mctx;
6786 	dns_name_t name;
6787 	dns_rdataset_t neg, negsig;
6788 	isc_result_t result;
6789 	isc_region_t r;
6790 
6791 	dns_name_init(&name, NULL);
6792 	dns_rdataset_init(&neg);
6793 	dns_rdataset_init(&negsig);
6794 
6795 	result = dns_rdataset_getclosest(rdataset, &name, &neg, &negsig);
6796 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
6797 
6798 	closest = isc_mem_get(mctx, sizeof(*closest));
6799 	dns_name_init(&closest->name, NULL);
6800 	closest->neg = NULL;
6801 	closest->negsig = NULL;
6802 	closest->type = neg.type;
6803 	dns_name_dup(&name, mctx, &closest->name);
6804 	result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0);
6805 	if (result != ISC_R_SUCCESS) {
6806 		goto cleanup;
6807 	}
6808 	closest->neg = r.base;
6809 	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0);
6810 	if (result != ISC_R_SUCCESS) {
6811 		goto cleanup;
6812 	}
6813 	closest->negsig = r.base;
6814 	dns_rdataset_disassociate(&neg);
6815 	dns_rdataset_disassociate(&negsig);
6816 	newheader->closest = closest;
6817 	return (ISC_R_SUCCESS);
6818 
6819 cleanup:
6820 	dns_rdataset_disassociate(&neg);
6821 	dns_rdataset_disassociate(&negsig);
6822 	free_noqname(mctx, &closest);
6823 	return (result);
6824 }
6825 
/*
 * Method table for zone databases.  Declared ahead of use so that
 * addrdataset() can compare a database's 'common.methods' pointer with
 * its address.
 * NOTE(review): the initialised definition is presumably further down
 * this file -- confirm.
 */
6826 static dns_dbmethods_t zone_methods;
6827 
6828 static size_t
6829 rdataset_size(rdatasetheader_t *header) {
6830 	if (!NONEXISTENT(header)) {
6831 		return (dns_rdataslab_size((unsigned char *)header,
6832 					   sizeof(*header)));
6833 	}
6834 
6835 	return (sizeof(*header));
6836 }
6837 
/*
 * Add 'rdataset' to 'node'.
 *
 * The rdataset is converted into a slab with room for an rdatasetheader_t
 * in front of it; the header is filled in from the rdataset (type, TTL,
 * trust, attributes, re-sign time, noqname/closest data) and then handed
 * to add32() while the node's lock is held for writing.
 *
 * 'version' selects the mode.  When it is NULL, a 'now' of 0 is replaced
 * by the current time, the stored TTL is rdataset->ttl + now, and the
 * PREFETCH/NEGATIVE/NXDOMAIN/OPTOUT/NOQNAME/CLOSEST attributes are
 * carried over.  When it is non-NULL, 'now' is forced to 0, the header
 * takes the version's serial, and only the RESIGN attribute is carried
 * over.
 *
 * For a database using the zone method table, an SOA anywhere but the
 * origin node is rejected, and NSEC3-related data is REQUIREd to match
 * the kind of node it is being added to.
 *
 * The tree lock is taken for writing when a delegating type is added,
 * when this is the first NSEC at the node, or when the database is a
 * cache that is over its memory limit; in the last case it is dropped
 * again once cleaning is finished unless one of the other reasons also
 * applies.
 *
 * 'options' and 'addedrdataset' are passed through to add32().
 *
 * Returns:
 *	DNS_R_NOTZONETOP	SOA added away from the zone origin
 *	failure codes from dns_rdataslab_fromrdataset(), addnoqname(),
 *	addclosest() or dns_rbt_addnode()
 *	otherwise whatever add32() returns
 */
6838 static isc_result_t
6839 addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
6840 	    isc_stdtime_t now, dns_rdataset_t *rdataset, unsigned int options,
6841 	    dns_rdataset_t *addedrdataset) {
6842 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
6843 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
6844 	rbtdb_version_t *rbtversion = version;
6845 	isc_region_t region;
6846 	rdatasetheader_t *newheader;
6847 	rdatasetheader_t *header;
6848 	isc_result_t result;
6849 	bool delegating;
6850 	bool newnsec;
6851 	bool tree_locked = false;
6852 	bool cache_is_overmem = false;
6853 	dns_fixedname_t fixed;
6854 	dns_name_t *name;
6855
6856 	REQUIRE(VALID_RBTDB(rbtdb));
6857 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
6858
6859 	if (rbtdb->common.methods == &zone_methods) {
6860 		/*
6861 		 * SOA records are only allowed at top of zone.
6862 		 */
6863 		if (rdataset->type == dns_rdatatype_soa &&
6864 		    node != rbtdb->origin_node)
6865 		{
6866 			return (DNS_R_NOTZONETOP);
6867 		}
		/*
		 * NSEC3 records (and RRSIGs covering NSEC3) may only go on
		 * a node flagged DNS_RBT_NSEC_NSEC3, and nothing else may
		 * go on such a node.  The flag is read under the tree lock.
		 */
6868 		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6869 		REQUIRE(((rbtnode->nsec == DNS_RBT_NSEC_NSEC3 &&
6870 			  (rdataset->type == dns_rdatatype_nsec3 ||
6871 			   rdataset->covers == dns_rdatatype_nsec3)) ||
6872 			 (rbtnode->nsec != DNS_RBT_NSEC_NSEC3 &&
6873 			  rdataset->type != dns_rdatatype_nsec3 &&
6874 			  rdataset->covers != dns_rdatatype_nsec3)));
6875 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6876 	}
6877
6878 	if (rbtversion == NULL) {
6879 		if (now == 0) {
6880 			isc_stdtime_get(&now);
6881 		}
6882 	} else {
6883 		now = 0;
6884 	}
6885
	/*
	 * Build the slab, reserving sizeof(rdatasetheader_t) bytes at the
	 * front for the header that is initialised below.
	 */
6886 	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
6887 					    &region, sizeof(rdatasetheader_t));
6888 	if (result != ISC_R_SUCCESS) {
6889 		return (result);
6890 	}
6891
6892 	name = dns_fixedname_initname(&fixed);
6893 	nodefullname(db, node, name);
6894 	dns_rdataset_getownercase(rdataset, name);
6895
6896 	newheader = (rdatasetheader_t *)region.base;
6897 	init_rdataset(rbtdb, newheader);
6898 	setownercase(newheader, name);
6899 	set_ttl(rbtdb, newheader, rdataset->ttl + now);
6900 	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
6901 						rdataset->covers);
6902 	atomic_init(&newheader->attributes, 0);
6903 	if (rdataset->ttl == 0U) {
6904 		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_ZEROTTL);
6905 	}
6906 	newheader->noqname = NULL;
6907 	newheader->closest = NULL;
6908 	atomic_init(&newheader->count,
6909 		    atomic_fetch_add_relaxed(&init_count, 1));
6910 	newheader->trust = rdataset->trust;
6911 	newheader->last_used = now;
6912 	newheader->node = rbtnode;
6913 	if (rbtversion != NULL) {
6914 		newheader->serial = rbtversion->serial;
6915 		now = 0;
6916
6917 		if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
6918 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
			/*
			 * The re-sign time is stored as the 64-bit time
			 * shifted right by one bit, with the low bit of
			 * rdataset->resign kept separately in resign_lsb.
			 */
6919 			newheader->resign =
6920 				(isc_stdtime_t)(dns_time64_from32(
6921 							rdataset->resign) >>
6922 						1);
6923 			newheader->resign_lsb = rdataset->resign & 0x1;
6924 		} else {
6925 			newheader->resign = 0;
6926 			newheader->resign_lsb = 0;
6927 		}
6928 	} else {
6929 		newheader->serial = 1;
6930 		newheader->resign = 0;
6931 		newheader->resign_lsb = 0;
6932 		if ((rdataset->attributes & DNS_RDATASETATTR_PREFETCH) != 0) {
6933 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_PREFETCH);
6934 		}
6935 		if ((rdataset->attributes & DNS_RDATASETATTR_NEGATIVE) != 0) {
6936 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_NEGATIVE);
6937 		}
6938 		if ((rdataset->attributes & DNS_RDATASETATTR_NXDOMAIN) != 0) {
6939 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_NXDOMAIN);
6940 		}
6941 		if ((rdataset->attributes & DNS_RDATASETATTR_OPTOUT) != 0) {
6942 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_OPTOUT);
6943 		}
6944 		if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) {
6945 			result = addnoqname(rbtdb, newheader, rdataset);
6946 			if (result != ISC_R_SUCCESS) {
6947 				free_rdataset(rbtdb, rbtdb->common.mctx,
6948 					      newheader);
6949 				return (result);
6950 			}
6951 		}
6952 		if ((rdataset->attributes & DNS_RDATASETATTR_CLOSEST) != 0) {
6953 			result = addclosest(rbtdb, newheader, rdataset);
6954 			if (result != ISC_R_SUCCESS) {
6955 				free_rdataset(rbtdb, rbtdb->common.mctx,
6956 					      newheader);
6957 				return (result);
6958 			}
6959 		}
6960 	}
6961
6962 	/*
6963 	 * If we're adding a delegation type (e.g. NS or DNAME for a zone,
6964 	 * just DNAME for the cache), then we need to set the callback bit
6965 	 * on the node.
6966 	 */
6967 	if (delegating_type(rbtdb, rbtnode, rdataset->type)) {
6968 		delegating = true;
6969 	} else {
6970 		delegating = false;
6971 	}
6972
6973 	/*
6974 	 * Add to the auxiliary NSEC tree if we're adding an NSEC record.
6975 	 */
6976 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6977 	if (rbtnode->nsec != DNS_RBT_NSEC_HAS_NSEC &&
6978 	    rdataset->type == dns_rdatatype_nsec)
6979 	{
6980 		newnsec = true;
6981 	} else {
6982 		newnsec = false;
6983 	}
6984 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6985
6986 	/*
6987 	 * If we're adding a delegation type, adding to the auxiliary NSEC
6988 	 * tree, or the DB is a cache in an overmem state, hold an
6989 	 * exclusive lock on the tree.  In the latter case the lock does
6990 	 * not necessarily have to be acquired but it will help purge
6991 	 * ancient entries more effectively.
6992 	 */
6993 	if (IS_CACHE(rbtdb) && isc_mem_isovermem(rbtdb->common.mctx)) {
6994 		cache_is_overmem = true;
6995 	}
6996 	if (delegating || newnsec || cache_is_overmem) {
6997 		tree_locked = true;
6998 		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
6999 	}
7000
	/*
	 * The purge runs before the node lock is taken below.
	 */
7001 	if (cache_is_overmem) {
7002 		overmem_purge(rbtdb, rbtnode->locknum, rdataset_size(newheader),
7003 			      tree_locked);
7004 	}
7005
7006 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7007 		  isc_rwlocktype_write);
7008
7009 	if (rbtdb->rrsetstats != NULL) {
7010 		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_STATCOUNT);
7011 		update_rrsetstats(rbtdb, newheader->type,
7012 				  atomic_load_acquire(&newheader->attributes),
7013 				  true);
7014 	}
7015
7016 	if (IS_CACHE(rbtdb)) {
7017 		if (tree_locked) {
7018 			cleanup_dead_nodes(rbtdb, rbtnode->locknum);
7019 		}
7020
		/*
		 * Look at element 1 of this bucket's heap (presumably the
		 * entry due to expire soonest -- confirm against the heap's
		 * ordering) and expire it if its TTL has passed.
		 */
7021 		header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
7022 		if (header != NULL) {
7023 			dns_ttl_t rdh_ttl = header->rdh_ttl;
7024
7025 			/* Only account for stale TTL if cache is not overmem */
7026 			if (!cache_is_overmem) {
7027 				rdh_ttl += rbtdb->serve_stale_ttl;
7028 			}
7029
7030 			if (rdh_ttl < now - RBTDB_VIRTUAL) {
7031 				expire_header(rbtdb, header, tree_locked,
7032 					      expire_ttl);
7033 			}
7034 		}
7035
7036 		/*
7037 		 * If we've been holding a write lock on the tree just for
7038 		 * cleaning, we can release it now.  However, we still need the
7039 		 * node lock.
7040 		 */
7041 		if (tree_locked && !delegating && !newnsec) {
7042 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
7043 			tree_locked = false;
7044 		}
7045 	}
7046
7047 	result = ISC_R_SUCCESS;
7048 	if (newnsec) {
7049 		dns_rbtnode_t *nsecnode;
7050
7051 		nsecnode = NULL;
7052 		result = dns_rbt_addnode(rbtdb->nsec, name, &nsecnode);
7053 		if (result == ISC_R_SUCCESS) {
7054 			nsecnode->nsec = DNS_RBT_NSEC_NSEC;
7055 			rbtnode->nsec = DNS_RBT_NSEC_HAS_NSEC;
7056 		} else if (result == ISC_R_EXISTS) {
7057 			rbtnode->nsec = DNS_RBT_NSEC_HAS_NSEC;
7058 			result = ISC_R_SUCCESS;
7059 		}
		/*
		 * NOTE(review): on any other failure add32() is skipped and
		 * nothing visible in this function releases 'newheader' or
		 * undoes the rrsetstats update above -- confirm whether
		 * this path can leak.
		 */
7060 	}
7061
7062 	if (result == ISC_R_SUCCESS) {
7063 		result = add32(rbtdb, rbtnode, name, rbtversion, newheader,
7064 			       options, false, addedrdataset, now);
7065 	}
7066 	if (result == ISC_R_SUCCESS && delegating) {
7067 		rbtnode->find_callback = 1;
7068 	}
7069
7070 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7071 		    isc_rwlocktype_write);
7072
7073 	if (tree_locked) {
7074 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
7075 	}
7076
7077 	/*
7078 	 * Update the zone's secure status.  If version is non-NULL
7079 	 * this is deferred until closeversion() is called.
7080 	 */
7081 	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
7082 		iszonesecure(db, version, rbtdb->origin_node);
7083 	}
7084
7085 	return (result);
7086 }
7087 
7088 static isc_result_t
7089 subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
7090 		 dns_rdataset_t *rdataset, unsigned int options,
7091 		 dns_rdataset_t *newrdataset) {
7092 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
7093 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
7094 	rbtdb_version_t *rbtversion = version;
7095 	dns_fixedname_t fname;
7096 	dns_name_t *nodename = dns_fixedname_initname(&fname);
7097 	rdatasetheader_t *topheader, *topheader_prev, *header, *newheader;
7098 	unsigned char *subresult;
7099 	isc_region_t region;
7100 	isc_result_t result;
7101 	rbtdb_changed_t *changed;
7102 
7103 	REQUIRE(VALID_RBTDB(rbtdb));
7104 	REQUIRE(rbtversion != NULL && rbtversion->rbtdb == rbtdb);
7105 
7106 	if (rbtdb->common.methods == &zone_methods) {
7107 		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7108 		REQUIRE(((rbtnode->nsec == DNS_RBT_NSEC_NSEC3 &&
7109 			  (rdataset->type == dns_rdatatype_nsec3 ||
7110 			   rdataset->covers == dns_rdatatype_nsec3)) ||
7111 			 (rbtnode->nsec != DNS_RBT_NSEC_NSEC3 &&
7112 			  rdataset->type != dns_rdatatype_nsec3 &&
7113 			  rdataset->covers != dns_rdatatype_nsec3)));
7114 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7115 	}
7116 
7117 	nodefullname(db, node, nodename);
7118 
7119 	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
7120 					    &region, sizeof(rdatasetheader_t));
7121 	if (result != ISC_R_SUCCESS) {
7122 		return (result);
7123 	}
7124 	newheader = (rdatasetheader_t *)region.base;
7125 	init_rdataset(rbtdb, newheader);
7126 	set_ttl(rbtdb, newheader, rdataset->ttl);
7127 	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
7128 						rdataset->covers);
7129 	atomic_init(&newheader->attributes, 0);
7130 	newheader->serial = rbtversion->serial;
7131 	newheader->trust = 0;
7132 	newheader->noqname = NULL;
7133 	newheader->closest = NULL;
7134 	atomic_init(&newheader->count,
7135 		    atomic_fetch_add_relaxed(&init_count, 1));
7136 	newheader->last_used = 0;
7137 	newheader->node = rbtnode;
7138 	if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
7139 		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
7140 		newheader->resign =
7141 			(isc_stdtime_t)(dns_time64_from32(rdataset->resign) >>
7142 					1);
7143 		newheader->resign_lsb = rdataset->resign & 0x1;
7144 	} else {
7145 		newheader->resign = 0;
7146 		newheader->resign_lsb = 0;
7147 	}
7148 
7149 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7150 		  isc_rwlocktype_write);
7151 
7152 	changed = add_changed(rbtdb, rbtversion, rbtnode);
7153 	if (changed == NULL) {
7154 		free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
7155 		NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7156 			    isc_rwlocktype_write);
7157 		return (ISC_R_NOMEMORY);
7158 	}
7159 
7160 	topheader_prev = NULL;
7161 	for (topheader = rbtnode->data; topheader != NULL;
7162 	     topheader = topheader->next)
7163 	{
7164 		if (topheader->type == newheader->type) {
7165 			break;
7166 		}
7167 		topheader_prev = topheader;
7168 	}
7169 	/*
7170 	 * If header isn't NULL, we've found the right type.  There may be
7171 	 * IGNORE rdatasets between the top of the chain and the first real
7172 	 * data.  We skip over them.
7173 	 */
7174 	header = topheader;
7175 	while (header != NULL && IGNORE(header)) {
7176 		header = header->down;
7177 	}
7178 	if (header != NULL && EXISTS(header)) {
7179 		unsigned int flags = 0;
7180 		subresult = NULL;
7181 		result = ISC_R_SUCCESS;
7182 		if ((options & DNS_DBSUB_EXACT) != 0) {
7183 			flags |= DNS_RDATASLAB_EXACT;
7184 			if (newheader->rdh_ttl != header->rdh_ttl) {
7185 				result = DNS_R_NOTEXACT;
7186 			}
7187 		}
7188 		if (result == ISC_R_SUCCESS) {
7189 			result = dns_rdataslab_subtract(
7190 				(unsigned char *)header,
7191 				(unsigned char *)newheader,
7192 				(unsigned int)(sizeof(*newheader)),
7193 				rbtdb->common.mctx, rbtdb->common.rdclass,
7194 				(dns_rdatatype_t)header->type, flags,
7195 				&subresult);
7196 		}
7197 		if (result == ISC_R_SUCCESS) {
7198 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
7199 			newheader = (rdatasetheader_t *)subresult;
7200 			init_rdataset(rbtdb, newheader);
7201 			update_newheader(newheader, header);
7202 			if (RESIGN(header)) {
7203 				RDATASET_ATTR_SET(newheader,
7204 						  RDATASET_ATTR_RESIGN);
7205 				newheader->resign = header->resign;
7206 				newheader->resign_lsb = header->resign_lsb;
7207 				resign_insert(rbtdb, rbtnode->locknum,
7208 					      newheader);
7209 			}
7210 			/*
7211 			 * We have to set the serial since the rdataslab
7212 			 * subtraction routine copies the reserved portion of
7213 			 * header, not newheader.
7214 			 */
7215 			newheader->serial = rbtversion->serial;
7216 			/*
7217 			 * XXXJT: dns_rdataslab_subtract() copied the pointers
7218 			 * to additional info.  We need to clear these fields
7219 			 * to avoid having duplicated references.
7220 			 */
7221 			update_recordsandxfrsize(true, rbtversion, newheader,
7222 						 nodename->length);
7223 		} else if (result == DNS_R_NXRRSET) {
7224 			/*
7225 			 * This subtraction would remove all of the rdata;
7226 			 * add a nonexistent header instead.
7227 			 */
7228 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
7229 			newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
7230 			if (newheader == NULL) {
7231 				result = ISC_R_NOMEMORY;
7232 				goto unlock;
7233 			}
7234 			init_rdataset(rbtdb, newheader);
7235 			set_ttl(rbtdb, newheader, 0);
7236 			newheader->type = topheader->type;
7237 			atomic_init(&newheader->attributes,
7238 				    RDATASET_ATTR_NONEXISTENT);
7239 			newheader->trust = 0;
7240 			newheader->serial = rbtversion->serial;
7241 			newheader->noqname = NULL;
7242 			newheader->closest = NULL;
7243 			atomic_init(&newheader->count, 0);
7244 			newheader->node = rbtnode;
7245 			newheader->resign = 0;
7246 			newheader->resign_lsb = 0;
7247 			newheader->last_used = 0;
7248 		} else {
7249 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
7250 			goto unlock;
7251 		}
7252 
7253 		/*
7254 		 * If we're here, we want to link newheader in front of
7255 		 * topheader.
7256 		 */
7257 		INSIST(rbtversion->serial >= topheader->serial);
7258 		update_recordsandxfrsize(false, rbtversion, header,
7259 					 nodename->length);
7260 		if (topheader_prev != NULL) {
7261 			topheader_prev->next = newheader;
7262 		} else {
7263 			rbtnode->data = newheader;
7264 		}
7265 		newheader->next = topheader->next;
7266 		newheader->down = topheader;
7267 		topheader->next = newheader;
7268 		rbtnode->dirty = 1;
7269 		changed->dirty = true;
7270 		resign_delete(rbtdb, rbtversion, header);
7271 	} else {
7272 		/*
7273 		 * The rdataset doesn't exist, so we don't need to do anything
7274 		 * to satisfy the deletion request.
7275 		 */
7276 		free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
7277 		if ((options & DNS_DBSUB_EXACT) != 0) {
7278 			result = DNS_R_NOTEXACT;
7279 		} else {
7280 			result = DNS_R_UNCHANGED;
7281 		}
7282 	}
7283 
7284 	if (result == ISC_R_SUCCESS && newrdataset != NULL) {
7285 		bind_rdataset(rbtdb, rbtnode, newheader, 0,
7286 			      isc_rwlocktype_write, newrdataset);
7287 	}
7288 
7289 	if (result == DNS_R_NXRRSET && newrdataset != NULL &&
7290 	    (options & DNS_DBSUB_WANTOLD) != 0)
7291 	{
7292 		bind_rdataset(rbtdb, rbtnode, header, 0, isc_rwlocktype_write,
7293 			      newrdataset);
7294 	}
7295 
7296 unlock:
7297 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7298 		    isc_rwlocktype_write);
7299 
7300 	/*
7301 	 * Update the zone's secure status.  If version is non-NULL
7302 	 * this is deferred until closeversion() is called.
7303 	 */
7304 	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
7305 		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
7306 		version = rbtdb->current_version;
7307 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
7308 		iszonesecure(db, version, rbtdb->origin_node);
7309 	}
7310 
7311 	return (result);
7312 }
7313 
7314 static isc_result_t
7315 deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
7316 	       dns_rdatatype_t type, dns_rdatatype_t covers) {
7317 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
7318 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
7319 	rbtdb_version_t *rbtversion = version;
7320 	dns_fixedname_t fname;
7321 	dns_name_t *nodename = dns_fixedname_initname(&fname);
7322 	isc_result_t result;
7323 	rdatasetheader_t *newheader;
7324 
7325 	REQUIRE(VALID_RBTDB(rbtdb));
7326 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
7327 
7328 	if (type == dns_rdatatype_any) {
7329 		return (ISC_R_NOTIMPLEMENTED);
7330 	}
7331 	if (type == dns_rdatatype_rrsig && covers == 0) {
7332 		return (ISC_R_NOTIMPLEMENTED);
7333 	}
7334 
7335 	newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
7336 	if (newheader == NULL) {
7337 		return (ISC_R_NOMEMORY);
7338 	}
7339 	init_rdataset(rbtdb, newheader);
7340 	set_ttl(rbtdb, newheader, 0);
7341 	newheader->type = RBTDB_RDATATYPE_VALUE(type, covers);
7342 	atomic_init(&newheader->attributes, RDATASET_ATTR_NONEXISTENT);
7343 	newheader->trust = 0;
7344 	newheader->noqname = NULL;
7345 	newheader->closest = NULL;
7346 	if (rbtversion != NULL) {
7347 		newheader->serial = rbtversion->serial;
7348 	} else {
7349 		newheader->serial = 0;
7350 	}
7351 	atomic_init(&newheader->count, 0);
7352 	newheader->last_used = 0;
7353 	newheader->node = rbtnode;
7354 
7355 	nodefullname(db, node, nodename);
7356 
7357 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7358 		  isc_rwlocktype_write);
7359 	result = add32(rbtdb, rbtnode, nodename, rbtversion, newheader,
7360 		       DNS_DBADD_FORCE, false, NULL, 0);
7361 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7362 		    isc_rwlocktype_write);
7363 
7364 	/*
7365 	 * Update the zone's secure status.  If version is non-NULL
7366 	 * this is deferred until closeversion() is called.
7367 	 */
7368 	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
7369 		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
7370 		version = rbtdb->current_version;
7371 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
7372 		iszonesecure(db, version, rbtdb->origin_node);
7373 	}
7374 
7375 	return (result);
7376 }
7377 
7378 /*
7379  * load a non-NSEC3 node in the main tree and optionally to the auxiliary NSEC
7380  */
7381 static isc_result_t
7382 loadnode(dns_rbtdb_t *rbtdb, const dns_name_t *name, dns_rbtnode_t **nodep,
7383 	 bool hasnsec) {
7384 	isc_result_t noderesult, nsecresult, tmpresult;
7385 	dns_rbtnode_t *nsecnode = NULL, *node = NULL;
7386 
7387 	noderesult = dns_rbt_addnode(rbtdb->tree, name, &node);
7388 	if (!hasnsec) {
7389 		goto done;
7390 	}
7391 	if (noderesult == ISC_R_EXISTS) {
7392 		/*
7393 		 * Add a node to the auxiliary NSEC tree for an old node
7394 		 * just now getting an NSEC record.
7395 		 */
7396 		if (node->nsec == DNS_RBT_NSEC_HAS_NSEC) {
7397 			goto done;
7398 		}
7399 	} else if (noderesult != ISC_R_SUCCESS) {
7400 		goto done;
7401 	}
7402 
7403 	/*
7404 	 * Build the auxiliary tree for NSECs as we go.
7405 	 * This tree speeds searches for closest NSECs that would otherwise
7406 	 * need to examine many irrelevant nodes in large TLDs.
7407 	 *
7408 	 * Add nodes to the auxiliary tree after corresponding nodes have
7409 	 * been added to the main tree.
7410 	 */
7411 	nsecresult = dns_rbt_addnode(rbtdb->nsec, name, &nsecnode);
7412 	if (nsecresult == ISC_R_SUCCESS) {
7413 		nsecnode->nsec = DNS_RBT_NSEC_NSEC;
7414 		node->nsec = DNS_RBT_NSEC_HAS_NSEC;
7415 		goto done;
7416 	}
7417 
7418 	if (nsecresult == ISC_R_EXISTS) {
7419 #if 1 /* 0 */
7420 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
7421 			      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
7422 			      "addnode: NSEC node already exists");
7423 #endif /* if 1 */
7424 		node->nsec = DNS_RBT_NSEC_HAS_NSEC;
7425 		goto done;
7426 	}
7427 
7428 	if (noderesult == ISC_R_SUCCESS) {
7429 		/*
7430 		 * Remove the node we just added above.
7431 		 */
7432 		tmpresult = dns_rbt_deletenode(rbtdb->tree, node, false);
7433 		if (tmpresult != ISC_R_SUCCESS) {
7434 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
7435 				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
7436 				      "loading_addrdataset: "
7437 				      "dns_rbt_deletenode: %s after "
7438 				      "dns_rbt_addnode(NSEC): %s",
7439 				      isc_result_totext(tmpresult),
7440 				      isc_result_totext(noderesult));
7441 		}
7442 	}
7443 
7444 	/*
7445 	 * Set the error condition to be returned.
7446 	 */
7447 	noderesult = nsecresult;
7448 
7449 done:
7450 	if (noderesult == ISC_R_SUCCESS || noderesult == ISC_R_EXISTS) {
7451 		*nodep = node;
7452 	}
7453 
7454 	return (noderesult);
7455 }
7456 
7457 static isc_result_t
7458 loading_addrdataset(void *arg, const dns_name_t *name,
7459 		    dns_rdataset_t *rdataset) {
7460 	rbtdb_load_t *loadctx = arg;
7461 	dns_rbtdb_t *rbtdb = loadctx->rbtdb;
7462 	dns_rbtnode_t *node;
7463 	isc_result_t result;
7464 	isc_region_t region;
7465 	rdatasetheader_t *newheader;
7466 
7467 	REQUIRE(rdataset->rdclass == rbtdb->common.rdclass);
7468 
7469 	/*
7470 	 * SOA records are only allowed at top of zone.
7471 	 */
7472 	if (rdataset->type == dns_rdatatype_soa && !IS_CACHE(rbtdb) &&
7473 	    !dns_name_equal(name, &rbtdb->common.origin))
7474 	{
7475 		return (DNS_R_NOTZONETOP);
7476 	}
7477 
7478 	if (rdataset->type != dns_rdatatype_nsec3 &&
7479 	    rdataset->covers != dns_rdatatype_nsec3)
7480 	{
7481 		add_empty_wildcards(rbtdb, name, false);
7482 	}
7483 
7484 	if (dns_name_iswildcard(name)) {
7485 		/*
7486 		 * NS record owners cannot legally be wild cards.
7487 		 */
7488 		if (rdataset->type == dns_rdatatype_ns) {
7489 			return (DNS_R_INVALIDNS);
7490 		}
7491 		/*
7492 		 * NSEC3 record owners cannot legally be wild cards.
7493 		 */
7494 		if (rdataset->type == dns_rdatatype_nsec3) {
7495 			return (DNS_R_INVALIDNSEC3);
7496 		}
7497 		result = add_wildcard_magic(rbtdb, name, false);
7498 		if (result != ISC_R_SUCCESS) {
7499 			return (result);
7500 		}
7501 	}
7502 
7503 	node = NULL;
7504 	if (rdataset->type == dns_rdatatype_nsec3 ||
7505 	    rdataset->covers == dns_rdatatype_nsec3)
7506 	{
7507 		result = dns_rbt_addnode(rbtdb->nsec3, name, &node);
7508 		if (result == ISC_R_SUCCESS) {
7509 			node->nsec = DNS_RBT_NSEC_NSEC3;
7510 		}
7511 	} else if (rdataset->type == dns_rdatatype_nsec) {
7512 		result = loadnode(rbtdb, name, &node, true);
7513 	} else {
7514 		result = loadnode(rbtdb, name, &node, false);
7515 	}
7516 	if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
7517 		return (result);
7518 	}
7519 	if (result == ISC_R_SUCCESS) {
7520 		node->locknum = node->hashval % rbtdb->node_lock_count;
7521 	}
7522 
7523 	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
7524 					    &region, sizeof(rdatasetheader_t));
7525 	if (result != ISC_R_SUCCESS) {
7526 		return (result);
7527 	}
7528 	newheader = (rdatasetheader_t *)region.base;
7529 	init_rdataset(rbtdb, newheader);
7530 	set_ttl(rbtdb, newheader, rdataset->ttl + loadctx->now); /* XXX overflow
7531 								  * check */
7532 	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
7533 						rdataset->covers);
7534 	atomic_init(&newheader->attributes, 0);
7535 	newheader->trust = rdataset->trust;
7536 	newheader->serial = 1;
7537 	newheader->noqname = NULL;
7538 	newheader->closest = NULL;
7539 	atomic_init(&newheader->count,
7540 		    atomic_fetch_add_relaxed(&init_count, 1));
7541 	newheader->last_used = 0;
7542 	newheader->node = node;
7543 	setownercase(newheader, name);
7544 
7545 	if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
7546 		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
7547 		newheader->resign =
7548 			(isc_stdtime_t)(dns_time64_from32(rdataset->resign) >>
7549 					1);
7550 		newheader->resign_lsb = rdataset->resign & 0x1;
7551 	} else {
7552 		newheader->resign = 0;
7553 		newheader->resign_lsb = 0;
7554 	}
7555 
7556 	NODE_LOCK(&rbtdb->node_locks[node->locknum].lock, isc_rwlocktype_write);
7557 	result = add32(rbtdb, node, name, rbtdb->current_version, newheader,
7558 		       DNS_DBADD_MERGE, true, NULL, 0);
7559 	NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
7560 		    isc_rwlocktype_write);
7561 
7562 	if (result == ISC_R_SUCCESS &&
7563 	    delegating_type(rbtdb, node, rdataset->type))
7564 	{
7565 		node->find_callback = 1;
7566 	} else if (result == DNS_R_UNCHANGED) {
7567 		result = ISC_R_SUCCESS;
7568 	}
7569 
7570 	return (result);
7571 }
7572 
7573 static isc_result_t
7574 rbt_datafixer(dns_rbtnode_t *rbtnode, void *base, size_t filesize, void *arg,
7575 	      uint64_t *crc) {
7576 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)arg;
7577 	rdatasetheader_t *header;
7578 	unsigned char *limit = ((unsigned char *)base) + filesize;
7579 
7580 	REQUIRE(rbtnode != NULL);
7581 	REQUIRE(VALID_RBTDB(rbtdb));
7582 
7583 	for (header = rbtnode->data; header != NULL; header = header->next) {
7584 		unsigned char *p = (unsigned char *)header;
7585 		size_t size = dns_rdataslab_size(p, sizeof(*header));
7586 		isc_crc64_update(crc, p, size);
7587 #ifdef DEBUG
7588 		hexdump("hashing header", p, sizeof(rdatasetheader_t));
7589 		hexdump("hashing slab", p + sizeof(rdatasetheader_t),
7590 			size - sizeof(rdatasetheader_t));
7591 #endif /* ifdef DEBUG */
7592 		header->serial = 1;
7593 		header->is_mmapped = 1;
7594 		header->node = rbtnode;
7595 		header->node_is_relative = 0;
7596 
7597 		if (RESIGN(header) &&
7598 		    (header->resign != 0 || header->resign_lsb != 0))
7599 		{
7600 			int idx = header->node->locknum;
7601 			isc_heap_insert(rbtdb->heaps[idx], header);
7602 		}
7603 
7604 		if (header->next != NULL) {
7605 			size_t cooked = dns_rbt_serialize_align(size);
7606 			if ((uintptr_t)header->next !=
7607 			    (p - (unsigned char *)base) + cooked)
7608 			{
7609 				return (ISC_R_INVALIDFILE);
7610 			}
7611 			header->next = (rdatasetheader_t *)(p + cooked);
7612 			header->next_is_relative = 0;
7613 			if ((header->next < (rdatasetheader_t *)base) ||
7614 			    (header->next > (rdatasetheader_t *)limit))
7615 			{
7616 				return (ISC_R_INVALIDFILE);
7617 			}
7618 		}
7619 
7620 		update_recordsandxfrsize(true, rbtdb->current_version, header,
7621 					 rbtnode->fullnamelen);
7622 	}
7623 
7624 	/* We're done deserializing; clear fullnamelen */
7625 	rbtnode->fullnamelen = 0;
7626 
7627 	return (ISC_R_SUCCESS);
7628 }
7629 
7630 /*
7631  * Load the RBT database from the image in 'f'
7632  */
7633 static isc_result_t
7634 deserialize(void *arg, FILE *f, off_t offset) {
7635 	isc_result_t result;
7636 	rbtdb_load_t *loadctx = arg;
7637 	dns_rbtdb_t *rbtdb = loadctx->rbtdb;
7638 	rbtdb_file_header_t *header;
7639 	int fd;
7640 	off_t filesize = 0;
7641 	char *base;
7642 	dns_rbt_t *tree = NULL, *nsec = NULL, *nsec3 = NULL;
7643 	int protect, flags;
7644 	dns_rbtnode_t *origin_node = NULL;
7645 
7646 	REQUIRE(VALID_RBTDB(rbtdb));
7647 
7648 	/*
7649 	 * TODO CKB: since this is read-write (had to be to add nodes later)
7650 	 * we will need to lock the file or the nodes in it before modifying
7651 	 * the nodes in the file.
7652 	 */
7653 
7654 	/* Map in the whole file in one go */
7655 	fd = fileno(f);
7656 	isc_file_getsizefd(fd, &filesize);
7657 	protect = PROT_READ | PROT_WRITE;
7658 	flags = MAP_PRIVATE;
7659 #ifdef MAP_FILE
7660 	flags |= MAP_FILE;
7661 #endif /* ifdef MAP_FILE */
7662 
7663 	base = isc_file_mmap(NULL, filesize, protect, flags, fd, 0);
7664 	if (base == NULL || base == MAP_FAILED) {
7665 		return (ISC_R_FAILURE);
7666 	}
7667 
7668 	header = (rbtdb_file_header_t *)(base + offset);
7669 	if (!match_header_version(header)) {
7670 		result = ISC_R_INVALIDFILE;
7671 		goto cleanup;
7672 	}
7673 
7674 	if (header->tree != 0) {
7675 		result = dns_rbt_deserialize_tree(
7676 			base, filesize, (off_t)header->tree, rbtdb->common.mctx,
7677 			delete_callback, rbtdb, rbt_datafixer, rbtdb, NULL,
7678 			&tree);
7679 		if (result != ISC_R_SUCCESS) {
7680 			goto cleanup;
7681 		}
7682 
7683 		result = dns_rbt_findnode(tree, &rbtdb->common.origin, NULL,
7684 					  &origin_node, NULL,
7685 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
7686 		if (result != ISC_R_SUCCESS) {
7687 			goto cleanup;
7688 		}
7689 	}
7690 
7691 	if (header->nsec != 0) {
7692 		result = dns_rbt_deserialize_tree(
7693 			base, filesize, (off_t)header->nsec, rbtdb->common.mctx,
7694 			delete_callback, rbtdb, rbt_datafixer, rbtdb, NULL,
7695 			&nsec);
7696 		if (result != ISC_R_SUCCESS) {
7697 			goto cleanup;
7698 		}
7699 	}
7700 
7701 	if (header->nsec3 != 0) {
7702 		result = dns_rbt_deserialize_tree(
7703 			base, filesize, (off_t)header->nsec3,
7704 			rbtdb->common.mctx, delete_callback, rbtdb,
7705 			rbt_datafixer, rbtdb, NULL, &nsec3);
7706 		if (result != ISC_R_SUCCESS) {
7707 			goto cleanup;
7708 		}
7709 	}
7710 
7711 	/*
7712 	 * We have a successfully loaded all the rbt trees now update
7713 	 * rbtdb to use them.
7714 	 */
7715 
7716 	rbtdb->mmap_location = base;
7717 	rbtdb->mmap_size = (size_t)filesize;
7718 
7719 	if (tree != NULL) {
7720 		dns_rbt_destroy(&rbtdb->tree);
7721 		rbtdb->tree = tree;
7722 		rbtdb->origin_node = origin_node;
7723 	}
7724 
7725 	if (nsec != NULL) {
7726 		dns_rbt_destroy(&rbtdb->nsec);
7727 		rbtdb->nsec = nsec;
7728 	}
7729 
7730 	if (nsec3 != NULL) {
7731 		dns_rbt_destroy(&rbtdb->nsec3);
7732 		rbtdb->nsec3 = nsec3;
7733 	}
7734 
7735 	return (ISC_R_SUCCESS);
7736 
7737 cleanup:
7738 	if (tree != NULL) {
7739 		dns_rbt_destroy(&tree);
7740 	}
7741 	if (nsec != NULL) {
7742 		dns_rbt_destroy(&nsec);
7743 	}
7744 	if (nsec3 != NULL) {
7745 		dns_rbt_destroy(&nsec3);
7746 	}
7747 	isc_file_munmap(base, (size_t)filesize);
7748 	return (result);
7749 }
7750 
7751 static isc_result_t
7752 beginload(dns_db_t *db, dns_rdatacallbacks_t *callbacks) {
7753 	rbtdb_load_t *loadctx;
7754 	dns_rbtdb_t *rbtdb;
7755 	rbtdb = (dns_rbtdb_t *)db;
7756 
7757 	REQUIRE(DNS_CALLBACK_VALID(callbacks));
7758 	REQUIRE(VALID_RBTDB(rbtdb));
7759 
7760 	loadctx = isc_mem_get(rbtdb->common.mctx, sizeof(*loadctx));
7761 
7762 	loadctx->rbtdb = rbtdb;
7763 	if (IS_CACHE(rbtdb)) {
7764 		isc_stdtime_get(&loadctx->now);
7765 	} else {
7766 		loadctx->now = 0;
7767 	}
7768 
7769 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
7770 
7771 	REQUIRE((rbtdb->attributes &
7772 		 (RBTDB_ATTR_LOADED | RBTDB_ATTR_LOADING)) == 0);
7773 	rbtdb->attributes |= RBTDB_ATTR_LOADING;
7774 
7775 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7776 
7777 	callbacks->add = loading_addrdataset;
7778 	callbacks->add_private = loadctx;
7779 	callbacks->deserialize = deserialize;
7780 	callbacks->deserialize_private = loadctx;
7781 
7782 	return (ISC_R_SUCCESS);
7783 }
7784 
7785 static isc_result_t
7786 endload(dns_db_t *db, dns_rdatacallbacks_t *callbacks) {
7787 	rbtdb_load_t *loadctx;
7788 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
7789 
7790 	REQUIRE(VALID_RBTDB(rbtdb));
7791 	REQUIRE(DNS_CALLBACK_VALID(callbacks));
7792 	loadctx = callbacks->add_private;
7793 	REQUIRE(loadctx != NULL);
7794 	REQUIRE(loadctx->rbtdb == rbtdb);
7795 
7796 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
7797 
7798 	REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADING) != 0);
7799 	REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADED) == 0);
7800 
7801 	rbtdb->attributes &= ~RBTDB_ATTR_LOADING;
7802 	rbtdb->attributes |= RBTDB_ATTR_LOADED;
7803 
7804 	/*
7805 	 * If there's a KEY rdataset at the zone origin containing a
7806 	 * zone key, we consider the zone secure.
7807 	 */
7808 	if (!IS_CACHE(rbtdb) && rbtdb->origin_node != NULL) {
7809 		dns_dbversion_t *version = rbtdb->current_version;
7810 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7811 		iszonesecure(db, version, rbtdb->origin_node);
7812 	} else {
7813 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7814 	}
7815 
7816 	callbacks->add = NULL;
7817 	callbacks->add_private = NULL;
7818 	callbacks->deserialize = NULL;
7819 	callbacks->deserialize_private = NULL;
7820 
7821 	isc_mem_put(rbtdb->common.mctx, loadctx, sizeof(*loadctx));
7822 
7823 	return (ISC_R_SUCCESS);
7824 }
7825 
7826 /*
7827  * helper function to handle writing out the rdataset data pointed to
7828  * by the void *data pointer in the dns_rbtnode
7829  */
7830 static isc_result_t
7831 rbt_datawriter(FILE *rbtfile, unsigned char *data, void *arg, uint64_t *crc) {
7832 	rbtdb_version_t *version = (rbtdb_version_t *)arg;
7833 	rbtdb_serial_t serial;
7834 	rdatasetheader_t newheader;
7835 	rdatasetheader_t *header = (rdatasetheader_t *)data, *next;
7836 	off_t where;
7837 	size_t cooked, size;
7838 	unsigned char *p;
7839 	isc_result_t result = ISC_R_SUCCESS;
7840 	char pad[sizeof(char *)];
7841 	uintptr_t off;
7842 
7843 	REQUIRE(rbtfile != NULL);
7844 	REQUIRE(data != NULL);
7845 	REQUIRE(version != NULL);
7846 
7847 	serial = version->serial;
7848 
7849 	for (; header != NULL; header = next) {
7850 		next = header->next;
7851 		do {
7852 			if (header->serial <= serial && !IGNORE(header)) {
7853 				if (NONEXISTENT(header)) {
7854 					header = NULL;
7855 				}
7856 				break;
7857 			} else {
7858 				header = header->down;
7859 			}
7860 		} while (header != NULL);
7861 
7862 		if (header == NULL) {
7863 			continue;
7864 		}
7865 
7866 		CHECK(isc_stdio_tell(rbtfile, &where));
7867 		size = dns_rdataslab_size((unsigned char *)header,
7868 					  sizeof(rdatasetheader_t));
7869 
7870 		p = (unsigned char *)header;
7871 		memmove(&newheader, p, sizeof(rdatasetheader_t));
7872 		newheader.down = NULL;
7873 		newheader.next = NULL;
7874 		off = where;
7875 		if ((off_t)off != where) {
7876 			return (ISC_R_RANGE);
7877 		}
7878 		newheader.node = (dns_rbtnode_t *)off;
7879 		newheader.node_is_relative = 1;
7880 		newheader.serial = 1;
7881 
7882 		/*
7883 		 * Round size up to the next pointer sized offset so it
7884 		 * will be properly aligned when read back in.
7885 		 */
7886 		cooked = dns_rbt_serialize_align(size);
7887 		if (next != NULL) {
7888 			newheader.next = (rdatasetheader_t *)(off + cooked);
7889 			newheader.next_is_relative = 1;
7890 		}
7891 
7892 #ifdef DEBUG
7893 		hexdump("writing header", (unsigned char *)&newheader,
7894 			sizeof(rdatasetheader_t));
7895 		hexdump("writing slab", p + sizeof(rdatasetheader_t),
7896 			size - sizeof(rdatasetheader_t));
7897 #endif /* ifdef DEBUG */
7898 		isc_crc64_update(crc, (unsigned char *)&newheader,
7899 				 sizeof(rdatasetheader_t));
7900 		CHECK(isc_stdio_write(&newheader, sizeof(rdatasetheader_t), 1,
7901 				      rbtfile, NULL));
7902 
7903 		isc_crc64_update(crc, p + sizeof(rdatasetheader_t),
7904 				 size - sizeof(rdatasetheader_t));
7905 		CHECK(isc_stdio_write(p + sizeof(rdatasetheader_t),
7906 				      size - sizeof(rdatasetheader_t), 1,
7907 				      rbtfile, NULL));
7908 		/*
7909 		 * Pad to force alignment.
7910 		 */
7911 		if (size != (size_t)cooked) {
7912 			memset(pad, 0, sizeof(pad));
7913 			CHECK(isc_stdio_write(pad, cooked - size, 1, rbtfile,
7914 					      NULL));
7915 		}
7916 	}
7917 
7918 failure:
7919 	return (result);
7920 }
7921 
7922 /*
7923  * Write out a zeroed header as a placeholder.  Doing this ensures
7924  * that the file will not read while it is partially written, should
7925  * writing fail or be interrupted.
7926  */
7927 static isc_result_t
7928 rbtdb_zero_header(FILE *rbtfile) {
7929 	char buffer[RBTDB_HEADER_LENGTH];
7930 	isc_result_t result;
7931 
7932 	memset(buffer, 0, RBTDB_HEADER_LENGTH);
7933 	result = isc_stdio_write(buffer, 1, RBTDB_HEADER_LENGTH, rbtfile, NULL);
7934 	fflush(rbtfile);
7935 
7936 	return (result);
7937 }
7938 
7939 static isc_once_t once = ISC_ONCE_INIT;
7940 
7941 static void
7942 init_file_version(void) {
7943 	int n;
7944 
7945 	memset(FILE_VERSION, 0, sizeof(FILE_VERSION));
7946 	n = snprintf(FILE_VERSION, sizeof(FILE_VERSION), "RBTDB Image %s %s",
7947 		     dns_major, dns_mapapi);
7948 	INSIST(n > 0 && (unsigned int)n < sizeof(FILE_VERSION));
7949 }
7950 
7951 /*
7952  * Write the file header out, recording the locations of the three
7953  * RBT's used in the rbtdb: tree, nsec, and nsec3, and including NodeDump
7954  * version information and any information stored in the rbtdb object
7955  * itself that should be stored here.
7956  */
7957 static isc_result_t
7958 rbtdb_write_header(FILE *rbtfile, off_t tree_location, off_t nsec_location,
7959 		   off_t nsec3_location) {
7960 	rbtdb_file_header_t header;
7961 	isc_result_t result;
7962 
7963 	RUNTIME_CHECK(isc_once_do(&once, init_file_version) == ISC_R_SUCCESS);
7964 
7965 	memset(&header, 0, sizeof(rbtdb_file_header_t));
7966 	memmove(header.version1, FILE_VERSION, sizeof(header.version1));
7967 	memmove(header.version2, FILE_VERSION, sizeof(header.version2));
7968 	header.ptrsize = (uint32_t)sizeof(void *);
7969 	header.bigendian = (1 == htonl(1)) ? 1 : 0;
7970 	header.tree = (uint64_t)tree_location;
7971 	header.nsec = (uint64_t)nsec_location;
7972 	header.nsec3 = (uint64_t)nsec3_location;
7973 	result = isc_stdio_write(&header, 1, sizeof(rbtdb_file_header_t),
7974 				 rbtfile, NULL);
7975 	fflush(rbtfile);
7976 
7977 	return (result);
7978 }
7979 
7980 static bool
7981 match_header_version(rbtdb_file_header_t *header) {
7982 	RUNTIME_CHECK(isc_once_do(&once, init_file_version) == ISC_R_SUCCESS);
7983 
7984 	if (memcmp(header->version1, FILE_VERSION, sizeof(header->version1)) !=
7985 		    0 ||
7986 	    memcmp(header->version2, FILE_VERSION, sizeof(header->version1)) !=
7987 		    0)
7988 	{
7989 		return (false);
7990 	}
7991 
7992 	return (true);
7993 }
7994 
7995 static isc_result_t
7996 serialize(dns_db_t *db, dns_dbversion_t *ver, FILE *rbtfile) {
7997 	rbtdb_version_t *version = (rbtdb_version_t *)ver;
7998 	dns_rbtdb_t *rbtdb;
7999 	isc_result_t result;
8000 	off_t tree_location, nsec_location, nsec3_location, header_location;
8001 
8002 	rbtdb = (dns_rbtdb_t *)db;
8003 
8004 	REQUIRE(VALID_RBTDB(rbtdb));
8005 	REQUIRE(rbtfile != NULL);
8006 
8007 	/* Ensure we're writing to a plain file */
8008 	CHECK(isc_file_isplainfilefd(fileno(rbtfile)));
8009 
8010 	/*
8011 	 * first, write out a zeroed header to store rbtdb information
8012 	 *
8013 	 * then for each of the three trees, store the current position
8014 	 * in the file and call dns_rbt_serialize_tree
8015 	 *
8016 	 * finally, write out the rbtdb header, storing the locations of the
8017 	 * rbtheaders
8018 	 *
8019 	 * NOTE: need to do something better with the return codes, &= will
8020 	 * not work.
8021 	 */
8022 	CHECK(isc_stdio_tell(rbtfile, &header_location));
8023 	CHECK(rbtdb_zero_header(rbtfile));
8024 	CHECK(dns_rbt_serialize_tree(rbtfile, rbtdb->tree, rbt_datawriter,
8025 				     version, &tree_location));
8026 	CHECK(dns_rbt_serialize_tree(rbtfile, rbtdb->nsec, rbt_datawriter,
8027 				     version, &nsec_location));
8028 	CHECK(dns_rbt_serialize_tree(rbtfile, rbtdb->nsec3, rbt_datawriter,
8029 				     version, &nsec3_location));
8030 
8031 	CHECK(isc_stdio_seek(rbtfile, header_location, SEEK_SET));
8032 	CHECK(rbtdb_write_header(rbtfile, tree_location, nsec_location,
8033 				 nsec3_location));
8034 failure:
8035 	return (result);
8036 }
8037 
8038 static isc_result_t
8039 dump(dns_db_t *db, dns_dbversion_t *version, const char *filename,
8040      dns_masterformat_t masterformat) {
8041 	dns_rbtdb_t *rbtdb;
8042 	rbtdb_version_t *rbtversion = version;
8043 
8044 	rbtdb = (dns_rbtdb_t *)db;
8045 
8046 	REQUIRE(VALID_RBTDB(rbtdb));
8047 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
8048 
8049 	return (dns_master_dump(rbtdb->common.mctx, db, version,
8050 				&dns_master_style_default, filename,
8051 				masterformat, NULL));
8052 }
8053 
8054 static void
8055 delete_callback(void *data, void *arg) {
8056 	dns_rbtdb_t *rbtdb = arg;
8057 	rdatasetheader_t *current, *next;
8058 	unsigned int locknum;
8059 
8060 	current = data;
8061 	locknum = current->node->locknum;
8062 	NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
8063 	while (current != NULL) {
8064 		next = current->next;
8065 		free_rdataset(rbtdb, rbtdb->common.mctx, current);
8066 		current = next;
8067 	}
8068 	NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
8069 }
8070 
8071 static bool
8072 issecure(dns_db_t *db) {
8073 	dns_rbtdb_t *rbtdb;
8074 	bool secure;
8075 
8076 	rbtdb = (dns_rbtdb_t *)db;
8077 
8078 	REQUIRE(VALID_RBTDB(rbtdb));
8079 
8080 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
8081 	secure = (rbtdb->current_version->secure == dns_db_secure);
8082 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
8083 
8084 	return (secure);
8085 }
8086 
8087 static bool
8088 isdnssec(dns_db_t *db) {
8089 	dns_rbtdb_t *rbtdb;
8090 	bool dnssec;
8091 
8092 	rbtdb = (dns_rbtdb_t *)db;
8093 
8094 	REQUIRE(VALID_RBTDB(rbtdb));
8095 
8096 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
8097 	dnssec = (rbtdb->current_version->secure != dns_db_insecure);
8098 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
8099 
8100 	return (dnssec);
8101 }
8102 
8103 static unsigned int
8104 nodecount(dns_db_t *db) {
8105 	dns_rbtdb_t *rbtdb;
8106 	unsigned int count;
8107 
8108 	rbtdb = (dns_rbtdb_t *)db;
8109 
8110 	REQUIRE(VALID_RBTDB(rbtdb));
8111 
8112 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8113 	count = dns_rbt_nodecount(rbtdb->tree);
8114 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8115 
8116 	return (count);
8117 }
8118 
8119 static size_t
8120 hashsize(dns_db_t *db) {
8121 	dns_rbtdb_t *rbtdb;
8122 	size_t size;
8123 
8124 	rbtdb = (dns_rbtdb_t *)db;
8125 
8126 	REQUIRE(VALID_RBTDB(rbtdb));
8127 
8128 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8129 	size = dns_rbt_hashsize(rbtdb->tree);
8130 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8131 
8132 	return (size);
8133 }
8134 
8135 static isc_result_t
8136 adjusthashsize(dns_db_t *db, size_t size) {
8137 	isc_result_t result;
8138 	dns_rbtdb_t *rbtdb;
8139 
8140 	rbtdb = (dns_rbtdb_t *)db;
8141 
8142 	REQUIRE(VALID_RBTDB(rbtdb));
8143 
8144 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8145 	result = dns_rbt_adjusthashsize(rbtdb->tree, size);
8146 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8147 
8148 	return (result);
8149 }
8150 
8151 static void
8152 settask(dns_db_t *db, isc_task_t *task) {
8153 	dns_rbtdb_t *rbtdb;
8154 
8155 	rbtdb = (dns_rbtdb_t *)db;
8156 
8157 	REQUIRE(VALID_RBTDB(rbtdb));
8158 
8159 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
8160 	if (rbtdb->task != NULL) {
8161 		isc_task_detach(&rbtdb->task);
8162 	}
8163 	if (task != NULL) {
8164 		isc_task_attach(task, &rbtdb->task);
8165 	}
8166 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
8167 }
8168 
8169 static bool
8170 ispersistent(dns_db_t *db) {
8171 	UNUSED(db);
8172 	return (false);
8173 }
8174 
8175 static isc_result_t
8176 getoriginnode(dns_db_t *db, dns_dbnode_t **nodep) {
8177 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8178 	dns_rbtnode_t *onode;
8179 	isc_result_t result = ISC_R_SUCCESS;
8180 
8181 	REQUIRE(VALID_RBTDB(rbtdb));
8182 	REQUIRE(nodep != NULL && *nodep == NULL);
8183 
8184 	/* Note that the access to origin_node doesn't require a DB lock */
8185 	onode = (dns_rbtnode_t *)rbtdb->origin_node;
8186 	if (onode != NULL) {
8187 		new_reference(rbtdb, onode, isc_rwlocktype_none);
8188 		*nodep = rbtdb->origin_node;
8189 	} else {
8190 		INSIST(IS_CACHE(rbtdb));
8191 		result = ISC_R_NOTFOUND;
8192 	}
8193 
8194 	return (result);
8195 }
8196 
8197 static isc_result_t
8198 getnsec3parameters(dns_db_t *db, dns_dbversion_t *version, dns_hash_t *hash,
8199 		   uint8_t *flags, uint16_t *iterations, unsigned char *salt,
8200 		   size_t *salt_length) {
8201 	dns_rbtdb_t *rbtdb;
8202 	isc_result_t result = ISC_R_NOTFOUND;
8203 	rbtdb_version_t *rbtversion = version;
8204 
8205 	rbtdb = (dns_rbtdb_t *)db;
8206 
8207 	REQUIRE(VALID_RBTDB(rbtdb));
8208 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
8209 
8210 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
8211 	if (rbtversion == NULL) {
8212 		rbtversion = rbtdb->current_version;
8213 	}
8214 
8215 	if (rbtversion->havensec3) {
8216 		if (hash != NULL) {
8217 			*hash = rbtversion->hash;
8218 		}
8219 		if (salt != NULL && salt_length != NULL) {
8220 			REQUIRE(*salt_length >= rbtversion->salt_length);
8221 			memmove(salt, rbtversion->salt,
8222 				rbtversion->salt_length);
8223 		}
8224 		if (salt_length != NULL) {
8225 			*salt_length = rbtversion->salt_length;
8226 		}
8227 		if (iterations != NULL) {
8228 			*iterations = rbtversion->iterations;
8229 		}
8230 		if (flags != NULL) {
8231 			*flags = rbtversion->flags;
8232 		}
8233 		result = ISC_R_SUCCESS;
8234 	}
8235 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
8236 
8237 	return (result);
8238 }
8239 
8240 static isc_result_t
8241 getsize(dns_db_t *db, dns_dbversion_t *version, uint64_t *records,
8242 	uint64_t *xfrsize) {
8243 	dns_rbtdb_t *rbtdb;
8244 	isc_result_t result = ISC_R_SUCCESS;
8245 	rbtdb_version_t *rbtversion = version;
8246 
8247 	rbtdb = (dns_rbtdb_t *)db;
8248 
8249 	REQUIRE(VALID_RBTDB(rbtdb));
8250 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
8251 
8252 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
8253 	if (rbtversion == NULL) {
8254 		rbtversion = rbtdb->current_version;
8255 	}
8256 
8257 	RWLOCK(&rbtversion->rwlock, isc_rwlocktype_read);
8258 	if (records != NULL) {
8259 		*records = rbtversion->records;
8260 	}
8261 
8262 	if (xfrsize != NULL) {
8263 		*xfrsize = rbtversion->xfrsize;
8264 	}
8265 	RWUNLOCK(&rbtversion->rwlock, isc_rwlocktype_read);
8266 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
8267 
8268 	return (result);
8269 }
8270 
8271 static isc_result_t
8272 setsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, isc_stdtime_t resign) {
8273 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8274 	rdatasetheader_t *header, oldheader;
8275 
8276 	REQUIRE(VALID_RBTDB(rbtdb));
8277 	REQUIRE(!IS_CACHE(rbtdb));
8278 	REQUIRE(rdataset != NULL);
8279 
8280 	header = rdataset->private3;
8281 	header--;
8282 
8283 	NODE_LOCK(&rbtdb->node_locks[header->node->locknum].lock,
8284 		  isc_rwlocktype_write);
8285 
8286 	oldheader = *header;
8287 	/*
8288 	 * Only break the heap invariant (by adjusting resign and resign_lsb)
8289 	 * if we are going to be restoring it by calling isc_heap_increased
8290 	 * or isc_heap_decreased.
8291 	 */
8292 	if (resign != 0) {
8293 		header->resign = (isc_stdtime_t)(dns_time64_from32(resign) >>
8294 						 1);
8295 		header->resign_lsb = resign & 0x1;
8296 	}
8297 	if (header->heap_index != 0) {
8298 		INSIST(RESIGN(header));
8299 		if (resign == 0) {
8300 			isc_heap_delete(rbtdb->heaps[header->node->locknum],
8301 					header->heap_index);
8302 			header->heap_index = 0;
8303 		} else if (resign_sooner(header, &oldheader)) {
8304 			isc_heap_increased(rbtdb->heaps[header->node->locknum],
8305 					   header->heap_index);
8306 		} else if (resign_sooner(&oldheader, header)) {
8307 			isc_heap_decreased(rbtdb->heaps[header->node->locknum],
8308 					   header->heap_index);
8309 		}
8310 	} else if (resign != 0) {
8311 		RDATASET_ATTR_SET(header, RDATASET_ATTR_RESIGN);
8312 		resign_insert(rbtdb, header->node->locknum, header);
8313 	}
8314 	NODE_UNLOCK(&rbtdb->node_locks[header->node->locknum].lock,
8315 		    isc_rwlocktype_write);
8316 	return (ISC_R_SUCCESS);
8317 }
8318 
8319 static isc_result_t
8320 getsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, dns_name_t *foundname) {
8321 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8322 	rdatasetheader_t *header = NULL, *this;
8323 	unsigned int i;
8324 	isc_result_t result = ISC_R_NOTFOUND;
8325 	unsigned int locknum = 0;
8326 
8327 	REQUIRE(VALID_RBTDB(rbtdb));
8328 
8329 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8330 
8331 	for (i = 0; i < rbtdb->node_lock_count; i++) {
8332 		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_read);
8333 
8334 		/*
8335 		 * Find for the earliest signing time among all of the
8336 		 * heaps, each of which is covered by a different bucket
8337 		 * lock.
8338 		 */
8339 		this = isc_heap_element(rbtdb->heaps[i], 1);
8340 		if (this == NULL) {
8341 			/* Nothing found; unlock and try the next heap. */
8342 			NODE_UNLOCK(&rbtdb->node_locks[i].lock,
8343 				    isc_rwlocktype_read);
8344 			continue;
8345 		}
8346 
8347 		if (header == NULL) {
8348 			/*
8349 			 * Found a signing time: retain the bucket lock and
8350 			 * preserve the lock number so we can unlock it
8351 			 * later.
8352 			 */
8353 			header = this;
8354 			locknum = i;
8355 		} else if (resign_sooner(this, header)) {
8356 			/*
8357 			 * Found an earlier signing time; release the
8358 			 * previous bucket lock and retain this one instead.
8359 			 */
8360 			NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
8361 				    isc_rwlocktype_read);
8362 			header = this;
8363 			locknum = i;
8364 		} else {
8365 			/*
8366 			 * Earliest signing time in this heap isn't
8367 			 * an improvement; unlock and try the next heap.
8368 			 */
8369 			NODE_UNLOCK(&rbtdb->node_locks[i].lock,
8370 				    isc_rwlocktype_read);
8371 		}
8372 	}
8373 
8374 	if (header != NULL) {
8375 		/*
8376 		 * Found something; pass back the answer and unlock
8377 		 * the bucket.
8378 		 */
8379 		bind_rdataset(rbtdb, header->node, header, 0,
8380 			      isc_rwlocktype_read, rdataset);
8381 
8382 		if (foundname != NULL) {
8383 			dns_rbt_fullnamefromnode(header->node, foundname);
8384 		}
8385 
8386 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
8387 			    isc_rwlocktype_read);
8388 
8389 		result = ISC_R_SUCCESS;
8390 	}
8391 
8392 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8393 
8394 	return (result);
8395 }
8396 
8397 static void
8398 resigned(dns_db_t *db, dns_rdataset_t *rdataset, dns_dbversion_t *version) {
8399 	rbtdb_version_t *rbtversion = (rbtdb_version_t *)version;
8400 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8401 	dns_rbtnode_t *node;
8402 	rdatasetheader_t *header;
8403 
8404 	REQUIRE(VALID_RBTDB(rbtdb));
8405 	REQUIRE(rdataset != NULL);
8406 	REQUIRE(rdataset->methods == &rdataset_methods);
8407 	REQUIRE(rbtdb->future_version == rbtversion);
8408 	REQUIRE(rbtversion != NULL);
8409 	REQUIRE(rbtversion->writer);
8410 	REQUIRE(rbtversion->rbtdb == rbtdb);
8411 
8412 	node = rdataset->private2;
8413 	INSIST(node != NULL);
8414 	header = rdataset->private3;
8415 	INSIST(header != NULL);
8416 	header--;
8417 
8418 	if (header->heap_index == 0) {
8419 		return;
8420 	}
8421 
8422 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8423 	NODE_LOCK(&rbtdb->node_locks[node->locknum].lock, isc_rwlocktype_write);
8424 	/*
8425 	 * Delete from heap and save to re-signed list so that it can
8426 	 * be restored if we backout of this change.
8427 	 */
8428 	resign_delete(rbtdb, rbtversion, header);
8429 	NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
8430 		    isc_rwlocktype_write);
8431 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8432 }
8433 
8434 static isc_result_t
8435 setcachestats(dns_db_t *db, isc_stats_t *stats) {
8436 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8437 
8438 	REQUIRE(VALID_RBTDB(rbtdb));
8439 	REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
8440 	REQUIRE(stats != NULL);
8441 
8442 	isc_stats_attach(stats, &rbtdb->cachestats);
8443 	return (ISC_R_SUCCESS);
8444 }
8445 
8446 static isc_result_t
8447 setgluecachestats(dns_db_t *db, isc_stats_t *stats) {
8448 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8449 
8450 	REQUIRE(VALID_RBTDB(rbtdb));
8451 	REQUIRE(!IS_CACHE(rbtdb) && !IS_STUB(rbtdb));
8452 	REQUIRE(stats != NULL);
8453 
8454 	isc_stats_attach(stats, &rbtdb->gluecachestats);
8455 	return (ISC_R_SUCCESS);
8456 }
8457 
8458 static dns_stats_t *
8459 getrrsetstats(dns_db_t *db) {
8460 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8461 
8462 	REQUIRE(VALID_RBTDB(rbtdb));
8463 	REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
8464 
8465 	return (rbtdb->rrsetstats);
8466 }
8467 
8468 static isc_result_t
8469 nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name) {
8470 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8471 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
8472 	isc_result_t result;
8473 
8474 	REQUIRE(VALID_RBTDB(rbtdb));
8475 	REQUIRE(node != NULL);
8476 	REQUIRE(name != NULL);
8477 
8478 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8479 	result = dns_rbt_fullnamefromnode(rbtnode, name);
8480 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8481 
8482 	return (result);
8483 }
8484 
8485 static isc_result_t
8486 setservestalettl(dns_db_t *db, dns_ttl_t ttl) {
8487 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8488 
8489 	REQUIRE(VALID_RBTDB(rbtdb));
8490 	REQUIRE(IS_CACHE(rbtdb));
8491 
8492 	/* currently no bounds checking.  0 means disable. */
8493 	rbtdb->serve_stale_ttl = ttl;
8494 	return (ISC_R_SUCCESS);
8495 }
8496 
8497 static isc_result_t
8498 getservestalettl(dns_db_t *db, dns_ttl_t *ttl) {
8499 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8500 
8501 	REQUIRE(VALID_RBTDB(rbtdb));
8502 	REQUIRE(IS_CACHE(rbtdb));
8503 
8504 	*ttl = rbtdb->serve_stale_ttl;
8505 	return (ISC_R_SUCCESS);
8506 }
8507 
8508 static isc_result_t
8509 setservestalerefresh(dns_db_t *db, uint32_t interval) {
8510 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8511 
8512 	REQUIRE(VALID_RBTDB(rbtdb));
8513 	REQUIRE(IS_CACHE(rbtdb));
8514 
8515 	/* currently no bounds checking.  0 means disable. */
8516 	rbtdb->serve_stale_refresh = interval;
8517 	return (ISC_R_SUCCESS);
8518 }
8519 
8520 static isc_result_t
8521 getservestalerefresh(dns_db_t *db, uint32_t *interval) {
8522 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8523 
8524 	REQUIRE(VALID_RBTDB(rbtdb));
8525 	REQUIRE(IS_CACHE(rbtdb));
8526 
8527 	*interval = rbtdb->serve_stale_refresh;
8528 	return (ISC_R_SUCCESS);
8529 }
8530 
8531 static dns_dbmethods_t zone_methods = { attach,
8532 					detach,
8533 					beginload,
8534 					endload,
8535 					serialize,
8536 					dump,
8537 					currentversion,
8538 					newversion,
8539 					attachversion,
8540 					closeversion,
8541 					findnode,
8542 					zone_find,
8543 					zone_findzonecut,
8544 					attachnode,
8545 					detachnode,
8546 					expirenode,
8547 					printnode,
8548 					createiterator,
8549 					zone_findrdataset,
8550 					allrdatasets,
8551 					addrdataset,
8552 					subtractrdataset,
8553 					deleterdataset,
8554 					issecure,
8555 					nodecount,
8556 					ispersistent,
8557 					overmem,
8558 					settask,
8559 					getoriginnode,
8560 					NULL, /* transfernode */
8561 					getnsec3parameters,
8562 					findnsec3node,
8563 					setsigningtime,
8564 					getsigningtime,
8565 					resigned,
8566 					isdnssec,
8567 					NULL, /* getrrsetstats */
8568 					NULL, /* rpz_attach */
8569 					NULL, /* rpz_ready */
8570 					NULL, /* findnodeext */
8571 					NULL, /* findext */
8572 					NULL, /* setcachestats */
8573 					hashsize,
8574 					nodefullname,
8575 					getsize,
8576 					NULL, /* setservestalettl */
8577 					NULL, /* getservestalettl */
8578 					NULL, /* setservestalerefresh */
8579 					NULL, /* getservestalerefresh */
8580 					setgluecachestats,
8581 					adjusthashsize };
8582 
8583 static dns_dbmethods_t cache_methods = { attach,
8584 					 detach,
8585 					 beginload,
8586 					 endload,
8587 					 NULL, /* serialize */
8588 					 dump,
8589 					 currentversion,
8590 					 newversion,
8591 					 attachversion,
8592 					 closeversion,
8593 					 findnode,
8594 					 cache_find,
8595 					 cache_findzonecut,
8596 					 attachnode,
8597 					 detachnode,
8598 					 expirenode,
8599 					 printnode,
8600 					 createiterator,
8601 					 cache_findrdataset,
8602 					 allrdatasets,
8603 					 addrdataset,
8604 					 subtractrdataset,
8605 					 deleterdataset,
8606 					 issecure,
8607 					 nodecount,
8608 					 ispersistent,
8609 					 overmem,
8610 					 settask,
8611 					 getoriginnode,
8612 					 NULL, /* transfernode */
8613 					 NULL, /* getnsec3parameters */
8614 					 NULL, /* findnsec3node */
8615 					 NULL, /* setsigningtime */
8616 					 NULL, /* getsigningtime */
8617 					 NULL, /* resigned */
8618 					 isdnssec,
8619 					 getrrsetstats,
8620 					 NULL, /* rpz_attach */
8621 					 NULL, /* rpz_ready */
8622 					 NULL, /* findnodeext */
8623 					 NULL, /* findext */
8624 					 setcachestats,
8625 					 hashsize,
8626 					 nodefullname,
8627 					 NULL, /* getsize */
8628 					 setservestalettl,
8629 					 getservestalettl,
8630 					 setservestalerefresh,
8631 					 getservestalerefresh,
8632 					 NULL,
8633 					 adjusthashsize };
8634 
8635 isc_result_t
8636 dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
8637 		 dns_rdataclass_t rdclass, unsigned int argc, char *argv[],
8638 		 void *driverarg, dns_db_t **dbp) {
8639 	dns_rbtdb_t *rbtdb;
8640 	isc_result_t result;
8641 	int i;
8642 	dns_name_t name;
8643 	bool (*sooner)(void *, void *);
8644 	isc_mem_t *hmctx = mctx;
8645 
8646 	/* Keep the compiler happy. */
8647 	UNUSED(driverarg);
8648 
8649 	rbtdb = isc_mem_get(mctx, sizeof(*rbtdb));
8650 
8651 	/*
8652 	 * If argv[0] exists, it points to a memory context to use for heap
8653 	 */
8654 	if (argc != 0) {
8655 		hmctx = (isc_mem_t *)argv[0];
8656 	}
8657 
8658 	memset(rbtdb, '\0', sizeof(*rbtdb));
8659 	dns_name_init(&rbtdb->common.origin, NULL);
8660 	rbtdb->common.attributes = 0;
8661 	if (type == dns_dbtype_cache) {
8662 		rbtdb->common.methods = &cache_methods;
8663 		rbtdb->common.attributes |= DNS_DBATTR_CACHE;
8664 	} else if (type == dns_dbtype_stub) {
8665 		rbtdb->common.methods = &zone_methods;
8666 		rbtdb->common.attributes |= DNS_DBATTR_STUB;
8667 	} else {
8668 		rbtdb->common.methods = &zone_methods;
8669 	}
8670 	rbtdb->common.rdclass = rdclass;
8671 	rbtdb->common.mctx = NULL;
8672 
8673 	ISC_LIST_INIT(rbtdb->common.update_listeners);
8674 
8675 	RBTDB_INITLOCK(&rbtdb->lock);
8676 
8677 	isc_rwlock_init(&rbtdb->tree_lock, 0, 0);
8678 
8679 	/*
8680 	 * Initialize node_lock_count in a generic way to support future
8681 	 * extension which allows the user to specify this value on creation.
8682 	 * Note that when specified for a cache DB it must be larger than 1
8683 	 * as commented with the definition of DEFAULT_CACHE_NODE_LOCK_COUNT.
8684 	 */
8685 	if (rbtdb->node_lock_count == 0) {
8686 		if (IS_CACHE(rbtdb)) {
8687 			rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT;
8688 		} else {
8689 			rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT;
8690 		}
8691 	} else if (rbtdb->node_lock_count < 2 && IS_CACHE(rbtdb)) {
8692 		result = ISC_R_RANGE;
8693 		goto cleanup_tree_lock;
8694 	}
8695 	INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH));
8696 	rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count *
8697 						      sizeof(rbtdb_nodelock_t));
8698 
8699 	rbtdb->cachestats = NULL;
8700 	rbtdb->gluecachestats = NULL;
8701 
8702 	rbtdb->rrsetstats = NULL;
8703 	if (IS_CACHE(rbtdb)) {
8704 		result = dns_rdatasetstats_create(mctx, &rbtdb->rrsetstats);
8705 		if (result != ISC_R_SUCCESS) {
8706 			goto cleanup_node_locks;
8707 		}
8708 		rbtdb->rdatasets = isc_mem_get(
8709 			mctx,
8710 			rbtdb->node_lock_count * sizeof(rdatasetheaderlist_t));
8711 		for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8712 			ISC_LIST_INIT(rbtdb->rdatasets[i]);
8713 		}
8714 	} else {
8715 		rbtdb->rdatasets = NULL;
8716 	}
8717 
8718 	/*
8719 	 * Create the heaps.
8720 	 */
8721 	rbtdb->heaps = isc_mem_get(hmctx, rbtdb->node_lock_count *
8722 						  sizeof(isc_heap_t *));
8723 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8724 		rbtdb->heaps[i] = NULL;
8725 	}
8726 	sooner = IS_CACHE(rbtdb) ? ttl_sooner : resign_sooner;
8727 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8728 		isc_heap_create(hmctx, sooner, set_index, 0, &rbtdb->heaps[i]);
8729 	}
8730 
8731 	/*
8732 	 * Create deadnode lists.
8733 	 */
8734 	rbtdb->deadnodes = isc_mem_get(mctx, rbtdb->node_lock_count *
8735 						     sizeof(rbtnodelist_t));
8736 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8737 		ISC_LIST_INIT(rbtdb->deadnodes[i]);
8738 	}
8739 
8740 	rbtdb->active = rbtdb->node_lock_count;
8741 
8742 	for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
8743 		NODE_INITLOCK(&rbtdb->node_locks[i].lock);
8744 		isc_refcount_init(&rbtdb->node_locks[i].references, 0);
8745 		rbtdb->node_locks[i].exiting = false;
8746 	}
8747 
8748 	/*
8749 	 * Attach to the mctx.  The database will persist so long as there
8750 	 * are references to it, and attaching to the mctx ensures that our
8751 	 * mctx won't disappear out from under us.
8752 	 */
8753 	isc_mem_attach(mctx, &rbtdb->common.mctx);
8754 	isc_mem_attach(hmctx, &rbtdb->hmctx);
8755 
8756 	/*
8757 	 * Make a copy of the origin name.
8758 	 */
8759 	result = dns_name_dupwithoffsets(origin, mctx, &rbtdb->common.origin);
8760 	if (result != ISC_R_SUCCESS) {
8761 		free_rbtdb(rbtdb, false, NULL);
8762 		return (result);
8763 	}
8764 
8765 	/*
8766 	 * Make the Red-Black Trees.
8767 	 */
8768 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree);
8769 	if (result != ISC_R_SUCCESS) {
8770 		free_rbtdb(rbtdb, false, NULL);
8771 		return (result);
8772 	}
8773 
8774 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec);
8775 	if (result != ISC_R_SUCCESS) {
8776 		free_rbtdb(rbtdb, false, NULL);
8777 		return (result);
8778 	}
8779 
8780 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec3);
8781 	if (result != ISC_R_SUCCESS) {
8782 		free_rbtdb(rbtdb, false, NULL);
8783 		return (result);
8784 	}
8785 
8786 	/*
8787 	 * In order to set the node callback bit correctly in zone databases,
8788 	 * we need to know if the node has the origin name of the zone.
8789 	 * In loading_addrdataset() we could simply compare the new name
8790 	 * to the origin name, but this is expensive.  Also, we don't know the
8791 	 * node name in addrdataset(), so we need another way of knowing the
8792 	 * zone's top.
8793 	 *
8794 	 * We now explicitly create a node for the zone's origin, and then
8795 	 * we simply remember the node's address.  This is safe, because
8796 	 * the top-of-zone node can never be deleted, nor can its address
8797 	 * change.
8798 	 */
8799 	if (!IS_CACHE(rbtdb)) {
8800 		rbtdb->origin_node = NULL;
8801 		result = dns_rbt_addnode(rbtdb->tree, &rbtdb->common.origin,
8802 					 &rbtdb->origin_node);
8803 		if (result != ISC_R_SUCCESS) {
8804 			INSIST(result != ISC_R_EXISTS);
8805 			free_rbtdb(rbtdb, false, NULL);
8806 			return (result);
8807 		}
8808 		INSIST(rbtdb->origin_node != NULL);
8809 		rbtdb->origin_node->nsec = DNS_RBT_NSEC_NORMAL;
8810 		/*
8811 		 * We need to give the origin node the right locknum.
8812 		 */
8813 		dns_name_init(&name, NULL);
8814 		dns_rbt_namefromnode(rbtdb->origin_node, &name);
8815 		rbtdb->origin_node->locknum = rbtdb->origin_node->hashval %
8816 					      rbtdb->node_lock_count;
8817 		/*
8818 		 * Add an apex node to the NSEC3 tree so that NSEC3 searches
8819 		 * return partial matches when there is only a single NSEC3
8820 		 * record in the tree.
8821 		 */
8822 		rbtdb->nsec3_origin_node = NULL;
8823 		result = dns_rbt_addnode(rbtdb->nsec3, &rbtdb->common.origin,
8824 					 &rbtdb->nsec3_origin_node);
8825 		if (result != ISC_R_SUCCESS) {
8826 			INSIST(result != ISC_R_EXISTS);
8827 			free_rbtdb(rbtdb, false, NULL);
8828 			return (result);
8829 		}
8830 		rbtdb->nsec3_origin_node->nsec = DNS_RBT_NSEC_NSEC3;
8831 		/*
8832 		 * We need to give the nsec3 origin node the right locknum.
8833 		 */
8834 		dns_name_init(&name, NULL);
8835 		dns_rbt_namefromnode(rbtdb->nsec3_origin_node, &name);
8836 		rbtdb->nsec3_origin_node->locknum =
8837 			rbtdb->nsec3_origin_node->hashval %
8838 			rbtdb->node_lock_count;
8839 	}
8840 
8841 	/*
8842 	 * Misc. Initialization.
8843 	 */
8844 	isc_refcount_init(&rbtdb->references, 1);
8845 	rbtdb->attributes = 0;
8846 	rbtdb->task = NULL;
8847 	rbtdb->serve_stale_ttl = 0;
8848 
8849 	/*
8850 	 * Version Initialization.
8851 	 */
8852 	rbtdb->current_serial = 1;
8853 	rbtdb->least_serial = 1;
8854 	rbtdb->next_serial = 2;
8855 	rbtdb->current_version = allocate_version(mctx, 1, 1, false);
8856 	rbtdb->current_version->rbtdb = rbtdb;
8857 	rbtdb->current_version->secure = dns_db_insecure;
8858 	rbtdb->current_version->havensec3 = false;
8859 	rbtdb->current_version->flags = 0;
8860 	rbtdb->current_version->iterations = 0;
8861 	rbtdb->current_version->hash = 0;
8862 	rbtdb->current_version->salt_length = 0;
8863 	memset(rbtdb->current_version->salt, 0,
8864 	       sizeof(rbtdb->current_version->salt));
8865 	isc_rwlock_init(&rbtdb->current_version->rwlock, 0, 0);
8866 	rbtdb->current_version->records = 0;
8867 	rbtdb->current_version->xfrsize = 0;
8868 	rbtdb->future_version = NULL;
8869 	ISC_LIST_INIT(rbtdb->open_versions);
8870 	/*
8871 	 * Keep the current version in the open list so that list operation
8872 	 * won't happen in normal lookup operations.
8873 	 */
8874 	PREPEND(rbtdb->open_versions, rbtdb->current_version, link);
8875 
8876 	rbtdb->common.magic = DNS_DB_MAGIC;
8877 	rbtdb->common.impmagic = RBTDB_MAGIC;
8878 
8879 	*dbp = (dns_db_t *)rbtdb;
8880 
8881 	return (ISC_R_SUCCESS);
8882 
8883 cleanup_node_locks:
8884 	isc_mem_put(mctx, rbtdb->node_locks,
8885 		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
8886 
8887 cleanup_tree_lock:
8888 	isc_rwlock_destroy(&rbtdb->tree_lock);
8889 	RBTDB_DESTROYLOCK(&rbtdb->lock);
8890 	isc_mem_put(mctx, rbtdb, sizeof(*rbtdb));
8891 	return (result);
8892 }
8893 
8894 /*
8895  * Slabbed Rdataset Methods
8896  */
8897 
8898 static void
8899 rdataset_disassociate(dns_rdataset_t *rdataset) {
8900 	dns_db_t *db = rdataset->private1;
8901 	dns_dbnode_t *node = rdataset->private2;
8902 
8903 	detachnode(db, &node);
8904 }
8905 
8906 static isc_result_t
8907 rdataset_first(dns_rdataset_t *rdataset) {
8908 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
8909 	unsigned int count;
8910 
8911 	count = raw[0] * 256 + raw[1];
8912 	if (count == 0) {
8913 		rdataset->private5 = NULL;
8914 		return (ISC_R_NOMORE);
8915 	}
8916 
8917 	if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) {
8918 		raw += DNS_RDATASET_COUNT;
8919 	}
8920 
8921 	raw += DNS_RDATASET_LENGTH;
8922 
8923 	/*
8924 	 * The privateuint4 field is the number of rdata beyond the
8925 	 * cursor position, so we decrement the total count by one
8926 	 * before storing it.
8927 	 *
8928 	 * If DNS_RDATASETATTR_LOADORDER is not set 'raw' points to the
8929 	 * first record.  If DNS_RDATASETATTR_LOADORDER is set 'raw' points
8930 	 * to the first entry in the offset table.
8931 	 */
8932 	count--;
8933 	rdataset->privateuint4 = count;
8934 	rdataset->private5 = raw;
8935 
8936 	return (ISC_R_SUCCESS);
8937 }
8938 
8939 static isc_result_t
8940 rdataset_next(dns_rdataset_t *rdataset) {
8941 	unsigned int count;
8942 	unsigned int length;
8943 	unsigned char *raw; /* RDATASLAB */
8944 
8945 	count = rdataset->privateuint4;
8946 	if (count == 0) {
8947 		return (ISC_R_NOMORE);
8948 	}
8949 	count--;
8950 	rdataset->privateuint4 = count;
8951 
8952 	/*
8953 	 * Skip forward one record (length + 4) or one offset (4).
8954 	 */
8955 	raw = rdataset->private5;
8956 #if DNS_RDATASET_FIXED
8957 	if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0)
8958 #endif /* DNS_RDATASET_FIXED */
8959 	{
8960 		length = raw[0] * 256 + raw[1];
8961 		raw += length;
8962 	}
8963 
8964 	rdataset->private5 = raw + DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
8965 
8966 	return (ISC_R_SUCCESS);
8967 }
8968 
8969 static void
8970 rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) {
8971 	unsigned char *raw = rdataset->private5; /* RDATASLAB */
8972 	unsigned int length;
8973 	isc_region_t r;
8974 	unsigned int flags = 0;
8975 
8976 	REQUIRE(raw != NULL);
8977 
8978 	/*
8979 	 * Find the start of the record if not already in private5
8980 	 * then skip the length and order fields.
8981 	 */
8982 #if DNS_RDATASET_FIXED
8983 	if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) != 0) {
8984 		unsigned int offset;
8985 		offset = ((unsigned int)raw[0] << 24) +
8986 			 ((unsigned int)raw[1] << 16) +
8987 			 ((unsigned int)raw[2] << 8) + (unsigned int)raw[3];
8988 		raw = rdataset->private3;
8989 		raw += offset;
8990 	}
8991 #endif /* if DNS_RDATASET_FIXED */
8992 
8993 	length = raw[0] * 256 + raw[1];
8994 
8995 	raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
8996 
8997 	if (rdataset->type == dns_rdatatype_rrsig) {
8998 		if (*raw & DNS_RDATASLAB_OFFLINE) {
8999 			flags |= DNS_RDATA_OFFLINE;
9000 		}
9001 		length--;
9002 		raw++;
9003 	}
9004 	r.length = length;
9005 	r.base = raw;
9006 	dns_rdata_fromregion(rdata, rdataset->rdclass, rdataset->type, &r);
9007 	rdata->flags |= flags;
9008 }
9009 
9010 static void
9011 rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target) {
9012 	dns_db_t *db = source->private1;
9013 	dns_dbnode_t *node = source->private2;
9014 	dns_dbnode_t *cloned_node = NULL;
9015 
9016 	attachnode(db, node, &cloned_node);
9017 	INSIST(!ISC_LINK_LINKED(target, link));
9018 	*target = *source;
9019 	ISC_LINK_INIT(target, link);
9020 
9021 	/*
9022 	 * Reset iterator state.
9023 	 */
9024 	target->privateuint4 = 0;
9025 	target->private5 = NULL;
9026 }
9027 
9028 static unsigned int
9029 rdataset_count(dns_rdataset_t *rdataset) {
9030 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
9031 	unsigned int count;
9032 
9033 	count = raw[0] * 256 + raw[1];
9034 
9035 	return (count);
9036 }
9037 
9038 static isc_result_t
9039 rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
9040 		    dns_rdataset_t *nsec, dns_rdataset_t *nsecsig) {
9041 	dns_db_t *db = rdataset->private1;
9042 	dns_dbnode_t *node = rdataset->private2;
9043 	dns_dbnode_t *cloned_node;
9044 	const struct noqname *noqname = rdataset->private6;
9045 
9046 	cloned_node = NULL;
9047 	attachnode(db, node, &cloned_node);
9048 	nsec->methods = &slab_methods;
9049 	nsec->rdclass = db->rdclass;
9050 	nsec->type = noqname->type;
9051 	nsec->covers = 0;
9052 	nsec->ttl = rdataset->ttl;
9053 	nsec->trust = rdataset->trust;
9054 	nsec->private1 = rdataset->private1;
9055 	nsec->private2 = rdataset->private2;
9056 	nsec->private3 = noqname->neg;
9057 	nsec->privateuint4 = 0;
9058 	nsec->private5 = NULL;
9059 	nsec->private6 = NULL;
9060 	nsec->private7 = NULL;
9061 
9062 	cloned_node = NULL;
9063 	attachnode(db, node, &cloned_node);
9064 	nsecsig->methods = &slab_methods;
9065 	nsecsig->rdclass = db->rdclass;
9066 	nsecsig->type = dns_rdatatype_rrsig;
9067 	nsecsig->covers = noqname->type;
9068 	nsecsig->ttl = rdataset->ttl;
9069 	nsecsig->trust = rdataset->trust;
9070 	nsecsig->private1 = rdataset->private1;
9071 	nsecsig->private2 = rdataset->private2;
9072 	nsecsig->private3 = noqname->negsig;
9073 	nsecsig->privateuint4 = 0;
9074 	nsecsig->private5 = NULL;
9075 	nsec->private6 = NULL;
9076 	nsec->private7 = NULL;
9077 
9078 	dns_name_clone(&noqname->name, name);
9079 
9080 	return (ISC_R_SUCCESS);
9081 }
9082 
9083 static isc_result_t
9084 rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name,
9085 		    dns_rdataset_t *nsec, dns_rdataset_t *nsecsig) {
9086 	dns_db_t *db = rdataset->private1;
9087 	dns_dbnode_t *node = rdataset->private2;
9088 	dns_dbnode_t *cloned_node;
9089 	const struct noqname *closest = rdataset->private7;
9090 
9091 	cloned_node = NULL;
9092 	attachnode(db, node, &cloned_node);
9093 	nsec->methods = &slab_methods;
9094 	nsec->rdclass = db->rdclass;
9095 	nsec->type = closest->type;
9096 	nsec->covers = 0;
9097 	nsec->ttl = rdataset->ttl;
9098 	nsec->trust = rdataset->trust;
9099 	nsec->private1 = rdataset->private1;
9100 	nsec->private2 = rdataset->private2;
9101 	nsec->private3 = closest->neg;
9102 	nsec->privateuint4 = 0;
9103 	nsec->private5 = NULL;
9104 	nsec->private6 = NULL;
9105 	nsec->private7 = NULL;
9106 
9107 	cloned_node = NULL;
9108 	attachnode(db, node, &cloned_node);
9109 	nsecsig->methods = &slab_methods;
9110 	nsecsig->rdclass = db->rdclass;
9111 	nsecsig->type = dns_rdatatype_rrsig;
9112 	nsecsig->covers = closest->type;
9113 	nsecsig->ttl = rdataset->ttl;
9114 	nsecsig->trust = rdataset->trust;
9115 	nsecsig->private1 = rdataset->private1;
9116 	nsecsig->private2 = rdataset->private2;
9117 	nsecsig->private3 = closest->negsig;
9118 	nsecsig->privateuint4 = 0;
9119 	nsecsig->private5 = NULL;
9120 	nsec->private6 = NULL;
9121 	nsec->private7 = NULL;
9122 
9123 	dns_name_clone(&closest->name, name);
9124 
9125 	return (ISC_R_SUCCESS);
9126 }
9127 
9128 static void
9129 rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust) {
9130 	dns_rbtdb_t *rbtdb = rdataset->private1;
9131 	dns_rbtnode_t *rbtnode = rdataset->private2;
9132 	rdatasetheader_t *header = rdataset->private3;
9133 
9134 	header--;
9135 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9136 		  isc_rwlocktype_write);
9137 	header->trust = rdataset->trust = trust;
9138 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9139 		    isc_rwlocktype_write);
9140 }
9141 
9142 static void
9143 rdataset_expire(dns_rdataset_t *rdataset) {
9144 	dns_rbtdb_t *rbtdb = rdataset->private1;
9145 	dns_rbtnode_t *rbtnode = rdataset->private2;
9146 	rdatasetheader_t *header = rdataset->private3;
9147 
9148 	header--;
9149 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9150 		  isc_rwlocktype_write);
9151 	expire_header(rbtdb, header, false, expire_flush);
9152 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9153 		    isc_rwlocktype_write);
9154 }
9155 
9156 static void
9157 rdataset_clearprefetch(dns_rdataset_t *rdataset) {
9158 	dns_rbtdb_t *rbtdb = rdataset->private1;
9159 	dns_rbtnode_t *rbtnode = rdataset->private2;
9160 	rdatasetheader_t *header = rdataset->private3;
9161 
9162 	header--;
9163 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9164 		  isc_rwlocktype_write);
9165 	RDATASET_ATTR_CLR(header, RDATASET_ATTR_PREFETCH);
9166 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9167 		    isc_rwlocktype_write);
9168 }
9169 
9170 /*
9171  * Rdataset Iterator Methods
9172  */
9173 
9174 static void
9175 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp) {
9176 	rbtdb_rdatasetiter_t *rbtiterator;
9177 
9178 	rbtiterator = (rbtdb_rdatasetiter_t *)(*iteratorp);
9179 
9180 	if (rbtiterator->common.version != NULL) {
9181 		closeversion(rbtiterator->common.db,
9182 			     &rbtiterator->common.version, false);
9183 	}
9184 	detachnode(rbtiterator->common.db, &rbtiterator->common.node);
9185 	isc_mem_put(rbtiterator->common.db->mctx, rbtiterator,
9186 		    sizeof(*rbtiterator));
9187 
9188 	*iteratorp = NULL;
9189 }
9190 
9191 static bool
9192 iterator_active(dns_rbtdb_t *rbtdb, rbtdb_rdatasetiter_t *rbtiterator,
9193 		rdatasetheader_t *header) {
9194 	dns_ttl_t stale_ttl = header->rdh_ttl + rbtdb->serve_stale_ttl;
9195 
9196 	/*
9197 	 * Is this a "this rdataset doesn't exist" record?
9198 	 */
9199 	if (NONEXISTENT(header)) {
9200 		return (false);
9201 	}
9202 
9203 	/*
9204 	 * If this is a zone or this header still active then return it.
9205 	 */
9206 	if (!IS_CACHE(rbtdb) || ACTIVE(header, rbtiterator->common.now)) {
9207 		return (true);
9208 	}
9209 
9210 	/*
9211 	 * If we are not returning stale records or the rdataset is
9212 	 * too old don't return it.
9213 	 */
9214 	if (!STALEOK(rbtiterator) || (rbtiterator->common.now > stale_ttl)) {
9215 		return (false);
9216 	}
9217 	return (true);
9218 }
9219 
9220 static isc_result_t
9221 rdatasetiter_first(dns_rdatasetiter_t *iterator) {
9222 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
9223 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
9224 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
9225 	rbtdb_version_t *rbtversion = rbtiterator->common.version;
9226 	rdatasetheader_t *header, *top_next;
9227 	rbtdb_serial_t serial = IS_CACHE(rbtdb) ? 1 : rbtversion->serial;
9228 
9229 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9230 		  isc_rwlocktype_read);
9231 
9232 	for (header = rbtnode->data; header != NULL; header = top_next) {
9233 		top_next = header->next;
9234 		do {
9235 			if (EXPIREDOK(rbtiterator)) {
9236 				if (!NONEXISTENT(header)) {
9237 					break;
9238 				}
9239 				header = header->down;
9240 			} else if (header->serial <= serial && !IGNORE(header))
9241 			{
9242 				if (!iterator_active(rbtdb, rbtiterator,
9243 						     header))
9244 				{
9245 					header = NULL;
9246 				}
9247 				break;
9248 			} else {
9249 				header = header->down;
9250 			}
9251 		} while (header != NULL);
9252 		if (header != NULL) {
9253 			break;
9254 		}
9255 	}
9256 
9257 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9258 		    isc_rwlocktype_read);
9259 
9260 	rbtiterator->current = header;
9261 
9262 	if (header == NULL) {
9263 		return (ISC_R_NOMORE);
9264 	}
9265 
9266 	return (ISC_R_SUCCESS);
9267 }
9268 
/*
 * Advance the rdataset iterator from its current header to the next
 * eligible one on the same node.
 *
 * Returns ISC_R_NOMORE immediately when the iterator has no current
 * header.  Otherwise the remaining headers are scanned under the node
 * read lock, the header found (or NULL) is stored in
 * rbtiterator->current, and ISC_R_SUCCESS or ISC_R_NOMORE is returned
 * accordingly.
 */
static isc_result_t
rdatasetiter_next(dns_rdatasetiter_t *iterator) {
	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
	rbtdb_version_t *rbtversion = rbtiterator->common.version;
	rdatasetheader_t *header, *top_next;
	/* Cache databases use a fixed serial of 1; zones use the version's. */
	rbtdb_serial_t serial = IS_CACHE(rbtdb) ? 1 : rbtversion->serial;
	rbtdb_rdatatype_t type, negtype;
	dns_rdatatype_t rdtype, covers;
	bool expiredok = EXPIREDOK(rbtiterator);

	header = rbtiterator->current;
	if (header == NULL) {
		return (ISC_R_NOMORE);
	}

	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_read);

	/*
	 * Remember the type of the current header and compute the
	 * companion type value ('negtype') that is skipped together
	 * with it below.
	 */
	type = header->type;
	rdtype = RBTDB_RDATATYPE_BASE(header->type);
	if (NEGATIVE(header)) {
		covers = RBTDB_RDATATYPE_EXT(header->type);
		negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
	} else {
		negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
	}

	/*
	 * Find the start of the header chain for the next type
	 * by walking back up the list.
	 */
	top_next = header->next;
	while (top_next != NULL &&
	       (top_next->type == type || top_next->type == negtype))
	{
		top_next = top_next->next;
	}
	if (expiredok) {
		/*
		 * Keep walking down the list if possible or
		 * start the next type.
		 */
		header = header->down != NULL ? header->down : top_next;
	} else {
		header = top_next;
	}
	for (; header != NULL; header = top_next) {
		top_next = header->next;
		/*
		 * Search this header's 'down' chain.  With expiredok the
		 * first header not marked nonexistent is taken; otherwise
		 * the first header with serial <= 'serial' that is not
		 * marked IGNORE decides the chain: it is taken if
		 * iterator_active() accepts it, else the chain is skipped.
		 */
		do {
			if (expiredok) {
				if (!NONEXISTENT(header)) {
					break;
				}
				header = header->down;
			} else if (header->serial <= serial && !IGNORE(header))
			{
				if (!iterator_active(rbtdb, rbtiterator,
						     header))
				{
					header = NULL;
				}
				break;
			} else {
				header = header->down;
			}
		} while (header != NULL);
		if (header != NULL) {
			break;
		}
		/*
		 * Find the start of the header chain for the next type
		 * by walking back up the list.
		 */
		while (top_next != NULL &&
		       (top_next->type == type || top_next->type == negtype))
		{
			top_next = top_next->next;
		}
	}

	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_read);

	rbtiterator->current = header;

	if (header == NULL) {
		return (ISC_R_NOMORE);
	}

	return (ISC_R_SUCCESS);
}
9362 
9363 static void
9364 rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset) {
9365 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
9366 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
9367 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
9368 	rdatasetheader_t *header;
9369 
9370 	header = rbtiterator->current;
9371 	REQUIRE(header != NULL);
9372 
9373 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9374 		  isc_rwlocktype_read);
9375 
9376 	bind_rdataset(rbtdb, rbtnode, header, rbtiterator->common.now,
9377 		      isc_rwlocktype_read, rdataset);
9378 
9379 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9380 		    isc_rwlocktype_read);
9381 }
9382 
9383 /*
9384  * Database Iterator Methods
9385  */
9386 
9387 static void
9388 reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
9389 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9390 	dns_rbtnode_t *node = rbtdbiter->node;
9391 
9392 	if (node == NULL) {
9393 		return;
9394 	}
9395 
9396 	INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none);
9397 	reactivate_node(rbtdb, node, rbtdbiter->tree_locked);
9398 }
9399 
9400 static void
9401 dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
9402 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9403 	dns_rbtnode_t *node = rbtdbiter->node;
9404 	nodelock_t *lock;
9405 
9406 	if (node == NULL) {
9407 		return;
9408 	}
9409 
9410 	lock = &rbtdb->node_locks[node->locknum].lock;
9411 	NODE_LOCK(lock, isc_rwlocktype_read);
9412 	decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
9413 			    rbtdbiter->tree_locked, false);
9414 	NODE_UNLOCK(lock, isc_rwlocktype_read);
9415 
9416 	rbtdbiter->node = NULL;
9417 }
9418 
9419 static void
9420 flush_deletions(rbtdb_dbiterator_t *rbtdbiter) {
9421 	dns_rbtnode_t *node;
9422 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9423 	bool was_read_locked = false;
9424 	nodelock_t *lock;
9425 	int i;
9426 
9427 	if (rbtdbiter->delcnt != 0) {
9428 		/*
9429 		 * Note that "%d node of %d in tree" can report things like
9430 		 * "flush_deletions: 59 nodes of 41 in tree".  This means
9431 		 * That some nodes appear on the deletions list more than
9432 		 * once.  Only the last occurrence will actually be deleted.
9433 		 */
9434 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
9435 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
9436 			      "flush_deletions: %d nodes of %d in tree",
9437 			      rbtdbiter->delcnt,
9438 			      dns_rbt_nodecount(rbtdb->tree));
9439 
9440 		if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
9441 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9442 			was_read_locked = true;
9443 		}
9444 		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
9445 		rbtdbiter->tree_locked = isc_rwlocktype_write;
9446 
9447 		for (i = 0; i < rbtdbiter->delcnt; i++) {
9448 			node = rbtdbiter->deletions[i];
9449 			lock = &rbtdb->node_locks[node->locknum].lock;
9450 
9451 			NODE_LOCK(lock, isc_rwlocktype_read);
9452 			decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
9453 					    rbtdbiter->tree_locked, false);
9454 			NODE_UNLOCK(lock, isc_rwlocktype_read);
9455 		}
9456 
9457 		rbtdbiter->delcnt = 0;
9458 
9459 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
9460 		if (was_read_locked) {
9461 			RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9462 			rbtdbiter->tree_locked = isc_rwlocktype_read;
9463 		} else {
9464 			rbtdbiter->tree_locked = isc_rwlocktype_none;
9465 		}
9466 	}
9467 }
9468 
9469 static void
9470 resume_iteration(rbtdb_dbiterator_t *rbtdbiter) {
9471 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9472 
9473 	REQUIRE(rbtdbiter->paused);
9474 	REQUIRE(rbtdbiter->tree_locked == isc_rwlocktype_none);
9475 
9476 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9477 	rbtdbiter->tree_locked = isc_rwlocktype_read;
9478 
9479 	rbtdbiter->paused = false;
9480 }
9481 
9482 static void
9483 dbiterator_destroy(dns_dbiterator_t **iteratorp) {
9484 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)(*iteratorp);
9485 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9486 	dns_db_t *db = NULL;
9487 
9488 	if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
9489 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9490 		rbtdbiter->tree_locked = isc_rwlocktype_none;
9491 	} else {
9492 		INSIST(rbtdbiter->tree_locked == isc_rwlocktype_none);
9493 	}
9494 
9495 	dereference_iter_node(rbtdbiter);
9496 
9497 	flush_deletions(rbtdbiter);
9498 
9499 	dns_db_attach(rbtdbiter->common.db, &db);
9500 	dns_db_detach(&rbtdbiter->common.db);
9501 
9502 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9503 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9504 	isc_mem_put(db->mctx, rbtdbiter, sizeof(*rbtdbiter));
9505 	dns_db_detach(&db);
9506 
9507 	*iteratorp = NULL;
9508 }
9509 
9510 static isc_result_t
9511 dbiterator_first(dns_dbiterator_t *iterator) {
9512 	isc_result_t result;
9513 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9514 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9515 	dns_name_t *name, *origin;
9516 
9517 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9518 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9519 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9520 	    rbtdbiter->result != ISC_R_NOMORE)
9521 	{
9522 		return (rbtdbiter->result);
9523 	}
9524 
9525 	if (rbtdbiter->paused) {
9526 		resume_iteration(rbtdbiter);
9527 	}
9528 
9529 	dereference_iter_node(rbtdbiter);
9530 
9531 	name = dns_fixedname_name(&rbtdbiter->name);
9532 	origin = dns_fixedname_name(&rbtdbiter->origin);
9533 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9534 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9535 
9536 	if (rbtdbiter->nsec3only) {
9537 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9538 		result = dns_rbtnodechain_first(rbtdbiter->current,
9539 						rbtdb->nsec3, name, origin);
9540 	} else {
9541 		rbtdbiter->current = &rbtdbiter->chain;
9542 		result = dns_rbtnodechain_first(rbtdbiter->current, rbtdb->tree,
9543 						name, origin);
9544 		if (!rbtdbiter->nonsec3 && result == ISC_R_NOTFOUND) {
9545 			rbtdbiter->current = &rbtdbiter->nsec3chain;
9546 			result = dns_rbtnodechain_first(
9547 				rbtdbiter->current, rbtdb->nsec3, name, origin);
9548 		}
9549 	}
9550 	if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
9551 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9552 						  NULL, &rbtdbiter->node);
9553 		if (result == ISC_R_SUCCESS) {
9554 			rbtdbiter->new_origin = true;
9555 			reference_iter_node(rbtdbiter);
9556 		}
9557 	} else {
9558 		INSIST(result == ISC_R_NOTFOUND);
9559 		result = ISC_R_NOMORE; /* The tree is empty. */
9560 	}
9561 
9562 	rbtdbiter->result = result;
9563 
9564 	if (result != ISC_R_SUCCESS) {
9565 		ENSURE(!rbtdbiter->paused);
9566 	}
9567 
9568 	return (result);
9569 }
9570 
9571 static isc_result_t
9572 dbiterator_last(dns_dbiterator_t *iterator) {
9573 	isc_result_t result;
9574 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9575 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9576 	dns_name_t *name, *origin;
9577 
9578 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9579 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9580 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9581 	    rbtdbiter->result != ISC_R_NOMORE)
9582 	{
9583 		return (rbtdbiter->result);
9584 	}
9585 
9586 	if (rbtdbiter->paused) {
9587 		resume_iteration(rbtdbiter);
9588 	}
9589 
9590 	dereference_iter_node(rbtdbiter);
9591 
9592 	name = dns_fixedname_name(&rbtdbiter->name);
9593 	origin = dns_fixedname_name(&rbtdbiter->origin);
9594 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9595 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9596 
9597 	result = ISC_R_NOTFOUND;
9598 	if (rbtdbiter->nsec3only && !rbtdbiter->nonsec3) {
9599 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9600 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->nsec3,
9601 					       name, origin);
9602 	}
9603 	if (!rbtdbiter->nsec3only && result == ISC_R_NOTFOUND) {
9604 		rbtdbiter->current = &rbtdbiter->chain;
9605 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree,
9606 					       name, origin);
9607 	}
9608 	if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
9609 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9610 						  NULL, &rbtdbiter->node);
9611 		if (result == ISC_R_SUCCESS) {
9612 			rbtdbiter->new_origin = true;
9613 			reference_iter_node(rbtdbiter);
9614 		}
9615 	} else {
9616 		INSIST(result == ISC_R_NOTFOUND);
9617 		result = ISC_R_NOMORE; /* The tree is empty. */
9618 	}
9619 
9620 	rbtdbiter->result = result;
9621 
9622 	return (result);
9623 }
9624 
/*
 * Position the database iterator on the node for 'name'.
 *
 * If the iterator's stored result is anything other than
 * ISC_R_SUCCESS, ISC_R_NOTFOUND, DNS_R_PARTIALMATCH or ISC_R_NOMORE,
 * that result is returned unchanged.  Otherwise iteration is resumed
 * if paused, the current node is released and both node chains are
 * reset before searching.
 *
 * The value returned is the search result, which may be
 * DNS_R_PARTIALMATCH; the result stored in the iterator records a
 * partial match as ISC_R_SUCCESS.
 */
static isc_result_t
dbiterator_seek(dns_dbiterator_t *iterator, const dns_name_t *name) {
	isc_result_t result, tresult;
	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
	dns_name_t *iname, *origin;

	if (rbtdbiter->result != ISC_R_SUCCESS &&
	    rbtdbiter->result != ISC_R_NOTFOUND &&
	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
	    rbtdbiter->result != ISC_R_NOMORE)
	{
		return (rbtdbiter->result);
	}

	if (rbtdbiter->paused) {
		resume_iteration(rbtdbiter);
	}

	dereference_iter_node(rbtdbiter);

	iname = dns_fixedname_name(&rbtdbiter->name);
	origin = dns_fixedname_name(&rbtdbiter->origin);
	dns_rbtnodechain_reset(&rbtdbiter->chain);
	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);

	if (rbtdbiter->nsec3only) {
		/* NSEC3-only iteration: search the NSEC3 tree. */
		rbtdbiter->current = &rbtdbiter->nsec3chain;
		result = dns_rbt_findnode(rbtdb->nsec3, name, NULL,
					  &rbtdbiter->node, rbtdbiter->current,
					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
	} else if (rbtdbiter->nonsec3) {
		/* NSEC3 nodes excluded: search the main tree. */
		rbtdbiter->current = &rbtdbiter->chain;
		result = dns_rbt_findnode(rbtdb->tree, name, NULL,
					  &rbtdbiter->node, rbtdbiter->current,
					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
	} else {
		/*
		 * Stay on main chain if not found on either chain.
		 */
		rbtdbiter->current = &rbtdbiter->chain;
		result = dns_rbt_findnode(rbtdb->tree, name, NULL,
					  &rbtdbiter->node, rbtdbiter->current,
					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
		if (result == DNS_R_PARTIALMATCH) {
			/*
			 * Only a partial match in the main tree: switch
			 * to the NSEC3 tree if it has an exact match.
			 */
			dns_rbtnode_t *node = NULL;
			tresult = dns_rbt_findnode(
				rbtdb->nsec3, name, NULL, &node,
				&rbtdbiter->nsec3chain, DNS_RBTFIND_EMPTYDATA,
				NULL, NULL);
			if (tresult == ISC_R_SUCCESS) {
				rbtdbiter->node = node;
				rbtdbiter->current = &rbtdbiter->nsec3chain;
				result = tresult;
			}
		}
	}

	if (result == ISC_R_SUCCESS || result == DNS_R_PARTIALMATCH) {
		/*
		 * Load the iterator's name and origin from the chain; no
		 * node is requested here because rbtdbiter->node was
		 * already set by the find above.
		 */
		tresult = dns_rbtnodechain_current(rbtdbiter->current, iname,
						   origin, NULL);
		if (tresult == ISC_R_SUCCESS) {
			rbtdbiter->new_origin = true;
			reference_iter_node(rbtdbiter);
		} else {
			result = tresult;
			rbtdbiter->node = NULL;
		}
	} else {
		rbtdbiter->node = NULL;
	}

	/* A partial match is stored as success in the iterator state. */
	rbtdbiter->result = (result == DNS_R_PARTIALMATCH) ? ISC_R_SUCCESS
							   : result;

	return (result);
}
9702 
9703 static isc_result_t
9704 dbiterator_prev(dns_dbiterator_t *iterator) {
9705 	isc_result_t result;
9706 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9707 	dns_name_t *name, *origin;
9708 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9709 
9710 	REQUIRE(rbtdbiter->node != NULL);
9711 
9712 	if (rbtdbiter->result != ISC_R_SUCCESS) {
9713 		return (rbtdbiter->result);
9714 	}
9715 
9716 	if (rbtdbiter->paused) {
9717 		resume_iteration(rbtdbiter);
9718 	}
9719 
9720 	name = dns_fixedname_name(&rbtdbiter->name);
9721 	origin = dns_fixedname_name(&rbtdbiter->origin);
9722 	result = dns_rbtnodechain_prev(rbtdbiter->current, name, origin);
9723 	if (result == ISC_R_NOMORE && !rbtdbiter->nsec3only &&
9724 	    !rbtdbiter->nonsec3 && &rbtdbiter->nsec3chain == rbtdbiter->current)
9725 	{
9726 		rbtdbiter->current = &rbtdbiter->chain;
9727 		dns_rbtnodechain_reset(rbtdbiter->current);
9728 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree,
9729 					       name, origin);
9730 		if (result == ISC_R_NOTFOUND) {
9731 			result = ISC_R_NOMORE;
9732 		}
9733 	}
9734 
9735 	dereference_iter_node(rbtdbiter);
9736 
9737 	if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
9738 		rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN);
9739 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9740 						  NULL, &rbtdbiter->node);
9741 	}
9742 
9743 	if (result == ISC_R_SUCCESS) {
9744 		reference_iter_node(rbtdbiter);
9745 	}
9746 
9747 	rbtdbiter->result = result;
9748 
9749 	return (result);
9750 }
9751 
9752 static isc_result_t
9753 dbiterator_next(dns_dbiterator_t *iterator) {
9754 	isc_result_t result;
9755 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9756 	dns_name_t *name, *origin;
9757 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9758 
9759 	REQUIRE(rbtdbiter->node != NULL);
9760 
9761 	if (rbtdbiter->result != ISC_R_SUCCESS) {
9762 		return (rbtdbiter->result);
9763 	}
9764 
9765 	if (rbtdbiter->paused) {
9766 		resume_iteration(rbtdbiter);
9767 	}
9768 
9769 	name = dns_fixedname_name(&rbtdbiter->name);
9770 	origin = dns_fixedname_name(&rbtdbiter->origin);
9771 	result = dns_rbtnodechain_next(rbtdbiter->current, name, origin);
9772 	if (result == ISC_R_NOMORE && !rbtdbiter->nsec3only &&
9773 	    !rbtdbiter->nonsec3 && &rbtdbiter->chain == rbtdbiter->current)
9774 	{
9775 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9776 		dns_rbtnodechain_reset(rbtdbiter->current);
9777 		result = dns_rbtnodechain_first(rbtdbiter->current,
9778 						rbtdb->nsec3, name, origin);
9779 		if (result == ISC_R_NOTFOUND) {
9780 			result = ISC_R_NOMORE;
9781 		}
9782 	}
9783 
9784 	dereference_iter_node(rbtdbiter);
9785 
9786 	if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
9787 		rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN);
9788 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9789 						  NULL, &rbtdbiter->node);
9790 	}
9791 	if (result == ISC_R_SUCCESS) {
9792 		reference_iter_node(rbtdbiter);
9793 	}
9794 
9795 	rbtdbiter->result = result;
9796 
9797 	return (result);
9798 }
9799 
/*
 * Return the node the database iterator is positioned on in '*nodep'
 * and, when 'name' is not NULL, the node's name in 'name'.
 *
 * The iterator's stored result must be ISC_R_SUCCESS and it must have
 * a current node.  Iteration is resumed if it was paused.
 *
 * Returns ISC_R_SUCCESS; DNS_R_NEWORIGIN when a name was requested,
 * the iterator uses relative names and the origin has changed; or the
 * failure from dns_name_concatenate(), in which case '*nodep' is not
 * set and no reference is taken.
 */
static isc_result_t
dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
		   dns_name_t *name) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
	dns_rbtnode_t *node = rbtdbiter->node;
	isc_result_t result;
	dns_name_t *nodename = dns_fixedname_name(&rbtdbiter->name);
	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);

	REQUIRE(rbtdbiter->result == ISC_R_SUCCESS);
	REQUIRE(rbtdbiter->node != NULL);

	if (rbtdbiter->paused) {
		resume_iteration(rbtdbiter);
	}

	if (name != NULL) {
		/* With relative names the origin is not appended. */
		if (rbtdbiter->common.relative_names) {
			origin = NULL;
		}
		result = dns_name_concatenate(nodename, origin, name, NULL);
		if (result != ISC_R_SUCCESS) {
			return (result);
		}
		if (rbtdbiter->common.relative_names && rbtdbiter->new_origin) {
			result = DNS_R_NEWORIGIN;
		}
	} else {
		result = ISC_R_SUCCESS;
	}

	/* Reference taken for the node pointer handed back in *nodep. */
	new_reference(rbtdb, node, isc_rwlocktype_none);

	*nodep = rbtdbiter->node;

	if (iterator->cleaning && result == ISC_R_SUCCESS) {
		isc_result_t expire_result;

		/*
		 * If the deletion array is full, flush it before trying
		 * to expire the current node.  The current node can't be
		 * fully deleted while the iteration cursor is still on it.
		 */
		if (rbtdbiter->delcnt == DELETION_BATCH_MAX) {
			flush_deletions(rbtdbiter);
		}

		expire_result = expirenode(iterator->db, *nodep, 0);

		/*
		 * expirenode() currently always returns success.
		 */
		if (expire_result == ISC_R_SUCCESS && node->down == NULL) {
			/*
			 * Queue the node for flush_deletions(); the extra
			 * reference taken here is the one released there.
			 */
			rbtdbiter->deletions[rbtdbiter->delcnt++] = node;
			isc_refcount_increment(&node->references);
		}
	}

	return (result);
}
9861 
9862 static isc_result_t
9863 dbiterator_pause(dns_dbiterator_t *iterator) {
9864 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9865 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9866 
9867 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9868 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9869 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9870 	    rbtdbiter->result != ISC_R_NOMORE)
9871 	{
9872 		return (rbtdbiter->result);
9873 	}
9874 
9875 	if (rbtdbiter->paused) {
9876 		return (ISC_R_SUCCESS);
9877 	}
9878 
9879 	rbtdbiter->paused = true;
9880 
9881 	if (rbtdbiter->tree_locked != isc_rwlocktype_none) {
9882 		INSIST(rbtdbiter->tree_locked == isc_rwlocktype_read);
9883 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9884 		rbtdbiter->tree_locked = isc_rwlocktype_none;
9885 	}
9886 
9887 	flush_deletions(rbtdbiter);
9888 
9889 	return (ISC_R_SUCCESS);
9890 }
9891 
9892 static isc_result_t
9893 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name) {
9894 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9895 	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
9896 
9897 	if (rbtdbiter->result != ISC_R_SUCCESS) {
9898 		return (rbtdbiter->result);
9899 	}
9900 
9901 	dns_name_copynf(origin, name);
9902 	return (ISC_R_SUCCESS);
9903 }
9904 
9905 static void
9906 setownercase(rdatasetheader_t *header, const dns_name_t *name) {
9907 	unsigned int i;
9908 	bool fully_lower;
9909 
9910 	/*
9911 	 * We do not need to worry about label lengths as they are all
9912 	 * less than or equal to 63.
9913 	 */
9914 	memset(header->upper, 0, sizeof(header->upper));
9915 	fully_lower = true;
9916 	for (i = 0; i < name->length; i++) {
9917 		if (isupper(name->ndata[i])) {
9918 			header->upper[i / 8] |= 1 << (i % 8);
9919 			fully_lower = false;
9920 		}
9921 	}
9922 	RDATASET_ATTR_SET(header, RDATASET_ATTR_CASESET);
9923 	if (ISC_LIKELY(fully_lower)) {
9924 		RDATASET_ATTR_SET(header, RDATASET_ATTR_CASEFULLYLOWER);
9925 	}
9926 }
9927 
9928 static void
9929 rdataset_setownercase(dns_rdataset_t *rdataset, const dns_name_t *name) {
9930 	dns_rbtdb_t *rbtdb = rdataset->private1;
9931 	dns_rbtnode_t *rbtnode = rdataset->private2;
9932 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
9933 	rdatasetheader_t *header;
9934 
9935 	header = (struct rdatasetheader *)(raw - sizeof(*header));
9936 
9937 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9938 		  isc_rwlocktype_write);
9939 	setownercase(header, name);
9940 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9941 		    isc_rwlocktype_write);
9942 }
9943 
9944 static void
9945 rdataset_getownercase(const dns_rdataset_t *rdataset, dns_name_t *name) {
9946 	dns_rbtdb_t *rbtdb = rdataset->private1;
9947 	dns_rbtnode_t *rbtnode = rdataset->private2;
9948 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
9949 	rdatasetheader_t *header = NULL;
9950 	uint8_t mask = (1 << 7);
9951 	uint8_t bits = 0;
9952 
9953 	header = (struct rdatasetheader *)(raw - sizeof(*header));
9954 
9955 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9956 		  isc_rwlocktype_read);
9957 
9958 	if (!CASESET(header)) {
9959 		goto unlock;
9960 	}
9961 
9962 	if (ISC_LIKELY(CASEFULLYLOWER(header))) {
9963 		for (size_t i = 0; i < name->length; i++) {
9964 			name->ndata[i] = tolower(name->ndata[i]);
9965 		}
9966 	} else {
9967 		for (size_t i = 0; i < name->length; i++) {
9968 			if (mask == (1 << 7)) {
9969 				bits = header->upper[i / 8];
9970 				mask = 1;
9971 			} else {
9972 				mask <<= 1;
9973 			}
9974 
9975 			name->ndata[i] = ((bits & mask) != 0)
9976 						 ? toupper(name->ndata[i])
9977 						 : tolower(name->ndata[i]);
9978 		}
9979 	}
9980 
9981 unlock:
9982 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9983 		    isc_rwlocktype_read);
9984 }
9985 
/*
 * One entry of a singly linked glue list.  free_gluelist() walks the
 * list through 'next', disassociates and invalidates the four
 * rdatasets, and frees each entry.
 */
struct rbtdb_glue {
	struct rbtdb_glue *next; /* next entry, NULL at the end of the list */
	dns_fixedname_t fixedname; /* presumably the name the glue belongs to
				    * -- TODO confirm against the code that
				    * fills it in */
	/*
	 * Judging by the names, the A and AAAA rdatasets and their
	 * signatures -- confirm against the code that fills them in.
	 */
	dns_rdataset_t rdataset_a;
	dns_rdataset_t sigrdataset_a;
	dns_rdataset_t rdataset_aaaa;
	dns_rdataset_t sigrdataset_aaaa;
};
9994 
/*
 * Context bundling a glue list with the database and version it
 * relates to.  NOTE(review): judging by the type name this is the
 * argument handed to an additional-data callback while glue is being
 * collected; the users are outside this part of the file -- confirm.
 */
typedef struct {
	rbtdb_glue_t *glue_list;     /* head of the glue list */
	dns_rbtdb_t *rbtdb;	     /* database */
	rbtdb_version_t *rbtversion; /* database version */
} rbtdb_glue_additionaldata_ctx_t;
10000 
10001 static void
10002 free_gluelist(rbtdb_glue_t *glue_list, dns_rbtdb_t *rbtdb) {
10003 	rbtdb_glue_t *cur, *cur_next;
10004 
10005 	if (glue_list == (void *)-1) {
10006 		return;
10007 	}
10008 
10009 	cur = glue_list;
10010 	while (cur != NULL) {
10011 		cur_next = cur->next;
10012 
10013 		if (dns_rdataset_isassociated(&cur->rdataset_a)) {
10014 			dns_rdataset_disassociate(&cur->rdataset_a);
10015 		}
10016 		if (dns_rdataset_isassociated(&cur->sigrdataset_a)) {
10017 			dns_rdataset_disassociate(&cur->sigrdataset_a);
10018 		}
10019 
10020 		if (dns_rdataset_isassociated(&cur->rdataset_aaaa)) {
10021 			dns_rdataset_disassociate(&cur->rdataset_aaaa);
10022 		}
10023 		if (dns_rdataset_isassociated(&cur->sigrdataset_aaaa)) {
10024 			dns_rdataset_disassociate(&cur->sigrdataset_aaaa);
10025 		}
10026 
10027 		dns_rdataset_invalidate(&cur->rdataset_a);
10028 		dns_rdataset_invalidate(&cur->sigrdataset_a);
10029 		dns_rdataset_invalidate(&cur->rdataset_aaaa);
10030 		dns_rdataset_invalidate(&cur->sigrdataset_aaaa);
10031 
10032 		isc_mem_put(rbtdb->common.mctx, cur, sizeof(*cur));
10033 		cur = cur_next;
10034 	}
10035 }
10036 
10037 static void
10038 free_gluetable(rbtdb_version_t *version) {
10039 	dns_rbtdb_t *rbtdb;
10040 	size_t size, i;
10041 
10042 	RWLOCK(&version->glue_rwlock, isc_rwlocktype_write);
10043 
10044 	rbtdb = version->rbtdb;
10045 
10046 	for (i = 0; i < HASHSIZE(version->glue_table_bits); i++) {
10047 		rbtdb_glue_table_node_t *cur, *cur_next;
10048 
10049 		cur = version->glue_table[i];
10050 		while (cur != NULL) {
10051 			cur_next = cur->next;
10052 			/* isc_refcount_decrement(&cur->node->references); */
10053 			cur->node = NULL;
10054 			free_gluelist(cur->glue_list, rbtdb);
10055 			cur->glue_list = NULL;
10056 			isc_mem_put(rbtdb->common.mctx, cur, sizeof(*cur));
10057 			cur = cur_next;
10058 		}
10059 		version->glue_table[i] = NULL;
10060 	}
10061 
10062 	size = HASHSIZE(version->glue_table_bits) *
10063 	       sizeof(*version->glue_table);
10064 	isc_mem_put(rbtdb->common.mctx, version->glue_table, size);
10065 
10066 	RWUNLOCK(&version->glue_rwlock, isc_rwlocktype_write);
10067 }
10068 
10069 static uint32_t
10070 rehash_bits(rbtdb_version_t *version, size_t newcount) {
10071 	uint32_t oldbits = version->glue_table_bits;
10072 	uint32_t newbits = oldbits;
10073 
10074 	while (newcount >= HASHSIZE(newbits) &&
10075 	       newbits <= RBTDB_GLUE_TABLE_MAX_BITS)
10076 	{
10077 		newbits += 1;
10078 	}
10079 
10080 	return (newbits);
10081 }
10082 
10083 /*%
10084  * Write lock (version->glue_rwlock) must be held.
10085  */
10086 static void
10087 rehash_gluetable(rbtdb_version_t *version) {
10088 	uint32_t oldbits, newbits;
10089 	size_t newsize, oldcount, i;
10090 	rbtdb_glue_table_node_t **oldtable;
10091 
10092 	oldbits = version->glue_table_bits;
10093 	oldcount = HASHSIZE(oldbits);
10094 	oldtable = version->glue_table;
10095 
10096 	newbits = rehash_bits(version, version->glue_table_nodecount);
10097 	newsize = HASHSIZE(newbits) * sizeof(version->glue_table[0]);
10098 
10099 	version->glue_table = isc_mem_get(version->rbtdb->common.mctx, newsize);
10100 	version->glue_table_bits = newbits;
10101 	memset(version->glue_table, 0, newsize);
10102 
10103 	for (i = 0; i < oldcount; i++) {
10104 		rbtdb_glue_table_node_t *gluenode;
10105 		rbtdb_glue_table_node_t *nextgluenode;
10106 		for (gluenode = oldtable[i]; gluenode != NULL;
10107 		     gluenode = nextgluenode)
10108 		{
10109 			uint32_t hash = isc_hash32(
10110 				&gluenode->node, sizeof(gluenode->node), true);
10111 			uint32_t idx = hash_32(hash, newbits);
10112 			nextgluenode = gluenode->next;
10113 			gluenode->next = version->glue_table[idx];
10114 			version->glue_table[idx] = gluenode;
10115 		}
10116 	}
10117 
10118 	isc_mem_put(version->rbtdb->common.mctx, oldtable,
10119 		    oldcount * sizeof(*version->glue_table));
10120 
10121 	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_ZONE,
10122 		      ISC_LOG_DEBUG(3),
10123 		      "rehash_gluetable(): "
10124 		      "resized glue table from %zu to "
10125 		      "%zu",
10126 		      oldcount, newsize / sizeof(version->glue_table[0]));
10127 }
10128 
10129 static void
10130 maybe_rehash_gluetable(rbtdb_version_t *version) {
10131 	size_t overcommit = HASHSIZE(version->glue_table_bits) *
10132 			    RBTDB_GLUE_TABLE_OVERCOMMIT;
10133 	if (ISC_LIKELY(version->glue_table_nodecount < overcommit)) {
10134 		return;
10135 	}
10136 
10137 	rehash_gluetable(version);
10138 }
10139 
10140 static isc_result_t
10141 glue_nsdname_cb(void *arg, const dns_name_t *name, dns_rdatatype_t qtype) {
10142 	rbtdb_glue_additionaldata_ctx_t *ctx;
10143 	isc_result_t result;
10144 	dns_fixedname_t fixedname_a;
10145 	dns_name_t *name_a = NULL;
10146 	dns_rdataset_t rdataset_a, sigrdataset_a;
10147 	dns_rbtnode_t *node_a = NULL;
10148 	dns_fixedname_t fixedname_aaaa;
10149 	dns_name_t *name_aaaa = NULL;
10150 	dns_rdataset_t rdataset_aaaa, sigrdataset_aaaa;
10151 	dns_rbtnode_t *node_aaaa = NULL;
10152 	rbtdb_glue_t *glue = NULL;
10153 	dns_name_t *gluename = NULL;
10154 
10155 	/*
10156 	 * NS records want addresses in additional records.
10157 	 */
10158 	INSIST(qtype == dns_rdatatype_a);
10159 
10160 	ctx = (rbtdb_glue_additionaldata_ctx_t *)arg;
10161 
10162 	name_a = dns_fixedname_initname(&fixedname_a);
10163 	dns_rdataset_init(&rdataset_a);
10164 	dns_rdataset_init(&sigrdataset_a);
10165 
10166 	name_aaaa = dns_fixedname_initname(&fixedname_aaaa);
10167 	dns_rdataset_init(&rdataset_aaaa);
10168 	dns_rdataset_init(&sigrdataset_aaaa);
10169 
10170 	result = zone_find((dns_db_t *)ctx->rbtdb, name, ctx->rbtversion,
10171 			   dns_rdatatype_a, DNS_DBFIND_GLUEOK, 0,
10172 			   (dns_dbnode_t **)&node_a, name_a, &rdataset_a,
10173 			   &sigrdataset_a);
10174 	if (result == DNS_R_GLUE) {
10175 		glue = isc_mem_get(ctx->rbtdb->common.mctx, sizeof(*glue));
10176 
10177 		gluename = dns_fixedname_initname(&glue->fixedname);
10178 		dns_name_copynf(name_a, gluename);
10179 
10180 		dns_rdataset_init(&glue->rdataset_a);
10181 		dns_rdataset_init(&glue->sigrdataset_a);
10182 		dns_rdataset_init(&glue->rdataset_aaaa);
10183 		dns_rdataset_init(&glue->sigrdataset_aaaa);
10184 
10185 		dns_rdataset_clone(&rdataset_a, &glue->rdataset_a);
10186 		if (dns_rdataset_isassociated(&sigrdataset_a)) {
10187 			dns_rdataset_clone(&sigrdataset_a,
10188 					   &glue->sigrdataset_a);
10189 		}
10190 	}
10191 
10192 	result = zone_find((dns_db_t *)ctx->rbtdb, name, ctx->rbtversion,
10193 			   dns_rdatatype_aaaa, DNS_DBFIND_GLUEOK, 0,
10194 			   (dns_dbnode_t **)&node_aaaa, name_aaaa,
10195 			   &rdataset_aaaa, &sigrdataset_aaaa);
10196 	if (result == DNS_R_GLUE) {
10197 		if (glue == NULL) {
10198 			glue = isc_mem_get(ctx->rbtdb->common.mctx,
10199 					   sizeof(*glue));
10200 
10201 			gluename = dns_fixedname_initname(&glue->fixedname);
10202 			dns_name_copynf(name_aaaa, gluename);
10203 
10204 			dns_rdataset_init(&glue->rdataset_a);
10205 			dns_rdataset_init(&glue->sigrdataset_a);
10206 			dns_rdataset_init(&glue->rdataset_aaaa);
10207 			dns_rdataset_init(&glue->sigrdataset_aaaa);
10208 		} else {
10209 			INSIST(node_a == node_aaaa);
10210 			INSIST(dns_name_equal(name_a, name_aaaa));
10211 		}
10212 
10213 		dns_rdataset_clone(&rdataset_aaaa, &glue->rdataset_aaaa);
10214 		if (dns_rdataset_isassociated(&sigrdataset_aaaa)) {
10215 			dns_rdataset_clone(&sigrdataset_aaaa,
10216 					   &glue->sigrdataset_aaaa);
10217 		}
10218 	}
10219 
10220 	if (glue != NULL) {
10221 		glue->next = ctx->glue_list;
10222 		ctx->glue_list = glue;
10223 	}
10224 
10225 	result = ISC_R_SUCCESS;
10226 
10227 	if (dns_rdataset_isassociated(&rdataset_a)) {
10228 		rdataset_disassociate(&rdataset_a);
10229 	}
10230 	if (dns_rdataset_isassociated(&sigrdataset_a)) {
10231 		rdataset_disassociate(&sigrdataset_a);
10232 	}
10233 
10234 	if (dns_rdataset_isassociated(&rdataset_aaaa)) {
10235 		rdataset_disassociate(&rdataset_aaaa);
10236 	}
10237 	if (dns_rdataset_isassociated(&sigrdataset_aaaa)) {
10238 		rdataset_disassociate(&sigrdataset_aaaa);
10239 	}
10240 
10241 	if (node_a != NULL) {
10242 		detachnode((dns_db_t *)ctx->rbtdb, (dns_dbnode_t *)&node_a);
10243 	}
10244 	if (node_aaaa != NULL) {
10245 		detachnode((dns_db_t *)ctx->rbtdb, (dns_dbnode_t *)&node_aaaa);
10246 	}
10247 
10248 	return (result);
10249 }
10250 
10251 static isc_result_t
10252 rdataset_addglue(dns_rdataset_t *rdataset, dns_dbversion_t *version,
10253 		 dns_message_t *msg) {
10254 	dns_rbtdb_t *rbtdb = rdataset->private1;
10255 	dns_rbtnode_t *node = rdataset->private2;
10256 	rbtdb_version_t *rbtversion = version;
10257 	uint32_t idx;
10258 	rbtdb_glue_table_node_t *cur;
10259 	bool found = false;
10260 	bool restarted = false;
10261 	rbtdb_glue_t *ge;
10262 	rbtdb_glue_additionaldata_ctx_t ctx;
10263 	isc_result_t result;
10264 	uint64_t hash;
10265 
10266 	REQUIRE(rdataset->type == dns_rdatatype_ns);
10267 	REQUIRE(rbtdb == rbtversion->rbtdb);
10268 	REQUIRE(!IS_CACHE(rbtdb) && !IS_STUB(rbtdb));
10269 
10270 	/*
10271 	 * The glue table cache that forms a part of the DB version
10272 	 * structure is not explicitly bounded and there's no cache
10273 	 * cleaning. The zone data size itself is an implicit bound.
10274 	 *
10275 	 * The key into the glue hashtable is the node pointer. This is
10276 	 * because the glue hashtable is a property of the DB version,
10277 	 * and the glue is keyed for the ownername/NS tuple. We don't
10278 	 * bother with using an expensive dns_name_t comparison here as
10279 	 * the node pointer is a fixed value that won't change for a DB
10280 	 * version and can be compared directly.
10281 	 */
10282 	hash = isc_hash_function(&node, sizeof(node), true);
10283 
10284 restart:
10285 	/*
10286 	 * First, check if we have the additional entries already cached
10287 	 * in the glue table.
10288 	 */
10289 	RWLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_read);
10290 
10291 	idx = hash_32(hash, rbtversion->glue_table_bits);
10292 
10293 	for (cur = rbtversion->glue_table[idx]; cur != NULL; cur = cur->next) {
10294 		if (cur->node == node) {
10295 			break;
10296 		}
10297 	}
10298 
10299 	if (cur == NULL) {
10300 		goto no_glue;
10301 	}
10302 	/*
10303 	 * We found a cached result. Add it to the message and
10304 	 * return.
10305 	 */
10306 	found = true;
10307 	ge = cur->glue_list;
10308 
10309 	/*
10310 	 * (void *) -1 is a special value that means no glue is
10311 	 * present in the zone.
10312 	 */
10313 	if (ge == (void *)-1) {
10314 		if (!restarted && (rbtdb->gluecachestats != NULL)) {
10315 			isc_stats_increment(
10316 				rbtdb->gluecachestats,
10317 				dns_gluecachestatscounter_hits_absent);
10318 		}
10319 		goto no_glue;
10320 	} else {
10321 		if (!restarted && (rbtdb->gluecachestats != NULL)) {
10322 			isc_stats_increment(
10323 				rbtdb->gluecachestats,
10324 				dns_gluecachestatscounter_hits_present);
10325 		}
10326 	}
10327 
10328 	for (; ge != NULL; ge = ge->next) {
10329 		dns_name_t *name = NULL;
10330 		dns_rdataset_t *rdataset_a = NULL;
10331 		dns_rdataset_t *sigrdataset_a = NULL;
10332 		dns_rdataset_t *rdataset_aaaa = NULL;
10333 		dns_rdataset_t *sigrdataset_aaaa = NULL;
10334 		dns_name_t *gluename = dns_fixedname_name(&ge->fixedname);
10335 
10336 		result = dns_message_gettempname(msg, &name);
10337 		if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10338 			goto no_glue;
10339 		}
10340 
10341 		dns_name_copynf(gluename, name);
10342 
10343 		if (dns_rdataset_isassociated(&ge->rdataset_a)) {
10344 			result = dns_message_gettemprdataset(msg, &rdataset_a);
10345 			if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10346 				dns_message_puttempname(msg, &name);
10347 				goto no_glue;
10348 			}
10349 		}
10350 
10351 		if (dns_rdataset_isassociated(&ge->sigrdataset_a)) {
10352 			result = dns_message_gettemprdataset(msg,
10353 							     &sigrdataset_a);
10354 			if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10355 				if (rdataset_a != NULL) {
10356 					dns_message_puttemprdataset(
10357 						msg, &rdataset_a);
10358 				}
10359 				dns_message_puttempname(msg, &name);
10360 				goto no_glue;
10361 			}
10362 		}
10363 
10364 		if (dns_rdataset_isassociated(&ge->rdataset_aaaa)) {
10365 			result = dns_message_gettemprdataset(msg,
10366 							     &rdataset_aaaa);
10367 			if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10368 				dns_message_puttempname(msg, &name);
10369 				if (rdataset_a != NULL) {
10370 					dns_message_puttemprdataset(
10371 						msg, &rdataset_a);
10372 				}
10373 				if (sigrdataset_a != NULL) {
10374 					dns_message_puttemprdataset(
10375 						msg, &sigrdataset_a);
10376 				}
10377 				goto no_glue;
10378 			}
10379 		}
10380 
10381 		if (dns_rdataset_isassociated(&ge->sigrdataset_aaaa)) {
10382 			result = dns_message_gettemprdataset(msg,
10383 							     &sigrdataset_aaaa);
10384 			if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10385 				dns_message_puttempname(msg, &name);
10386 				if (rdataset_a != NULL) {
10387 					dns_message_puttemprdataset(
10388 						msg, &rdataset_a);
10389 				}
10390 				if (sigrdataset_a != NULL) {
10391 					dns_message_puttemprdataset(
10392 						msg, &sigrdataset_a);
10393 				}
10394 				if (rdataset_aaaa != NULL) {
10395 					dns_message_puttemprdataset(
10396 						msg, &rdataset_aaaa);
10397 				}
10398 				goto no_glue;
10399 			}
10400 		}
10401 
10402 		if (ISC_LIKELY(rdataset_a != NULL)) {
10403 			dns_rdataset_clone(&ge->rdataset_a, rdataset_a);
10404 			ISC_LIST_APPEND(name->list, rdataset_a, link);
10405 		}
10406 
10407 		if (sigrdataset_a != NULL) {
10408 			dns_rdataset_clone(&ge->sigrdataset_a, sigrdataset_a);
10409 			ISC_LIST_APPEND(name->list, sigrdataset_a, link);
10410 		}
10411 
10412 		if (rdataset_aaaa != NULL) {
10413 			dns_rdataset_clone(&ge->rdataset_aaaa, rdataset_aaaa);
10414 			ISC_LIST_APPEND(name->list, rdataset_aaaa, link);
10415 		}
10416 		if (sigrdataset_aaaa != NULL) {
10417 			dns_rdataset_clone(&ge->sigrdataset_aaaa,
10418 					   sigrdataset_aaaa);
10419 			ISC_LIST_APPEND(name->list, sigrdataset_aaaa, link);
10420 		}
10421 
10422 		dns_message_addname(msg, name, DNS_SECTION_ADDITIONAL);
10423 	}
10424 
10425 no_glue:
10426 	RWUNLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_read);
10427 
10428 	if (found) {
10429 		return (ISC_R_SUCCESS);
10430 	}
10431 
10432 	if (restarted) {
10433 		return (ISC_R_FAILURE);
10434 	}
10435 
10436 	/*
10437 	 * No cached glue was found in the table. Cache it and restart
10438 	 * this function.
10439 	 *
10440 	 * Due to the gap between the read lock and the write lock, it's
10441 	 * possible that we may cache a duplicate glue table entry, but
10442 	 * we don't care.
10443 	 */
10444 
10445 	ctx.glue_list = NULL;
10446 	ctx.rbtdb = rbtdb;
10447 	ctx.rbtversion = rbtversion;
10448 
10449 	RWLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_write);
10450 
10451 	maybe_rehash_gluetable(rbtversion);
10452 	idx = hash_32(hash, rbtversion->glue_table_bits);
10453 
10454 	(void)dns_rdataset_additionaldata(rdataset, glue_nsdname_cb, &ctx);
10455 
10456 	cur = isc_mem_get(rbtdb->common.mctx, sizeof(*cur));
10457 
10458 	/*
10459 	 * XXXMUKS: it looks like the dns_dbversion is not destroyed
10460 	 * when named is terminated by a keyboard break. This doesn't
10461 	 * cleanup the node reference and keeps the process dangling.
10462 	 */
10463 	/* isc_refcount_increment0(&node->references); */
10464 	cur->node = node;
10465 
10466 	if (ctx.glue_list == NULL) {
10467 		/*
10468 		 * No glue was found. Cache it so.
10469 		 */
10470 		cur->glue_list = (void *)-1;
10471 		if (rbtdb->gluecachestats != NULL) {
10472 			isc_stats_increment(
10473 				rbtdb->gluecachestats,
10474 				dns_gluecachestatscounter_inserts_absent);
10475 		}
10476 	} else {
10477 		cur->glue_list = ctx.glue_list;
10478 		if (rbtdb->gluecachestats != NULL) {
10479 			isc_stats_increment(
10480 				rbtdb->gluecachestats,
10481 				dns_gluecachestatscounter_inserts_present);
10482 		}
10483 	}
10484 
10485 	cur->next = rbtversion->glue_table[idx];
10486 	rbtversion->glue_table[idx] = cur;
10487 	rbtversion->glue_table_nodecount++;
10488 
10489 	RWUNLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_write);
10490 
10491 	restarted = true;
10492 	goto restart;
10493 
10494 	/* UNREACHABLE */
10495 }
10496 
10497 /*%
10498  * Routines for LRU-based cache management.
10499  */
10500 
10501 /*%
10502  * See if a given cache entry that is being reused needs to be updated
10503  * in the LRU-list.  From the LRU management point of view, this function is
10504  * expected to return true for almost all cases.  When used with threads,
10505  * however, this may cause a non-negligible performance penalty because a
10506  * writer lock will have to be acquired before updating the list.
10507  * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
10508  * function returns true if the entry has not been updated for some period of
10509  * time.  We differentiate the NS or glue address case and the others since
10510  * experiments have shown that the former tends to be accessed relatively
10511  * infrequently and the cost of cache miss is higher (e.g., a missing NS records
10512  * may cause external queries at a higher level zone, involving more
10513  * transactions).
10514  *
10515  * Caller must hold the node (read or write) lock.
10516  */
10517 static bool
10518 need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now) {
10519 	if (RDATASET_ATTR_GET(header, (RDATASET_ATTR_NONEXISTENT |
10520 				       RDATASET_ATTR_ANCIENT |
10521 				       RDATASET_ATTR_ZEROTTL)) != 0)
10522 	{
10523 		return (false);
10524 	}
10525 
10526 #if DNS_RBTDB_LIMITLRUUPDATE
10527 	if (header->type == dns_rdatatype_ns ||
10528 	    (header->trust == dns_trust_glue &&
10529 	     (header->type == dns_rdatatype_a ||
10530 	      header->type == dns_rdatatype_aaaa)))
10531 	{
10532 		/*
10533 		 * Glue records are updated if at least DNS_RBTDB_LRUUPDATE_GLUE
10534 		 * seconds have passed since the previous update time.
10535 		 */
10536 		return (header->last_used + DNS_RBTDB_LRUUPDATE_GLUE <= now);
10537 	}
10538 
10539 	/*
10540 	 * Other records are updated if DNS_RBTDB_LRUUPDATE_REGULAR seconds
10541 	 * have passed.
10542 	 */
10543 	return (header->last_used + DNS_RBTDB_LRUUPDATE_REGULAR <= now);
10544 #else
10545 	UNUSED(now);
10546 
10547 	return (true);
10548 #endif /* if DNS_RBTDB_LIMITLRUUPDATE */
10549 }
10550 
10551 /*%
10552  * Update the timestamp of a given cache entry and move it to the head
10553  * of the corresponding LRU list.
10554  *
10555  * Caller must hold the node (write) lock.
10556  *
10557  * Note that the we do NOT touch the heap here, as the TTL has not changed.
10558  */
10559 static void
10560 update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now) {
10561 	INSIST(IS_CACHE(rbtdb));
10562 
10563 	/* To be checked: can we really assume this? XXXMLG */
10564 	INSIST(ISC_LINK_LINKED(header, link));
10565 
10566 	ISC_LIST_UNLINK(rbtdb->rdatasets[header->node->locknum], header, link);
10567 	header->last_used = now;
10568 	ISC_LIST_PREPEND(rbtdb->rdatasets[header->node->locknum], header, link);
10569 }
10570 
10571 static size_t
10572 expire_lru_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, size_t purgesize,
10573 		   bool tree_locked) {
10574 	rdatasetheader_t *header, *header_prev;
10575 	size_t purged = 0;
10576 
10577 	for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
10578 	     header != NULL && purged <= purgesize; header = header_prev)
10579 	{
10580 		header_prev = ISC_LIST_PREV(header, link);
10581 		/*
10582 		 * Unlink the entry at this point to avoid checking it
10583 		 * again even if it's currently used someone else and
10584 		 * cannot be purged at this moment.  This entry won't be
10585 		 * referenced any more (so unlinking is safe) since the
10586 		 * TTL was reset to 0.
10587 		 */
10588 		ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header, link);
10589 		size_t header_size = rdataset_size(header);
10590 		expire_header(rbtdb, header, tree_locked, expire_lru);
10591 		purged += header_size;
10592 	}
10593 
10594 	return (purged);
10595 }
10596 
10597 /*%
10598  * Purge some stale (i.e. unused for some period - LRU based cleaning) cache
10599  * entries under the overmem condition.  To recover from this condition quickly,
10600  * we cleanup entries up to the size of newly added rdata (passed as purgesize).
10601  *
10602  * This process is triggered while adding a new entry, and we specifically avoid
10603  * purging entries in the same LRU bucket as the one to which the new entry will
10604  * belong.  Otherwise, we might purge entries of the same name of different RR
10605  * types while adding RRsets from a single response (consider the case where
10606  * we're adding A and AAAA glue records of the same NS name).
10607  */
10608 static void
10609 overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, size_t purgesize,
10610 	      bool tree_locked) {
10611 	unsigned int locknum;
10612 	size_t purged = 0;
10613 
10614 	for (locknum = (locknum_start + 1) % rbtdb->node_lock_count;
10615 	     locknum != locknum_start && purged <= purgesize;
10616 	     locknum = (locknum + 1) % rbtdb->node_lock_count)
10617 	{
10618 		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
10619 			  isc_rwlocktype_write);
10620 
10621 		purged += expire_lru_headers(rbtdb, locknum, purgesize - purged,
10622 					     tree_locked);
10623 
10624 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
10625 			    isc_rwlocktype_write);
10626 	}
10627 }
10628 
10629 static void
10630 expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked,
10631 	      expire_t reason) {
10632 	set_ttl(rbtdb, header, 0);
10633 	mark_header_ancient(rbtdb, header);
10634 
10635 	/*
10636 	 * Caller must hold the node (write) lock.
10637 	 */
10638 
10639 	if (isc_refcount_current(&header->node->references) == 0) {
10640 		/*
10641 		 * If no one else is using the node, we can clean it up now.
10642 		 * We first need to gain a new reference to the node to meet a
10643 		 * requirement of decrement_reference().
10644 		 */
10645 		new_reference(rbtdb, header->node, isc_rwlocktype_write);
10646 		decrement_reference(rbtdb, header->node, 0,
10647 				    isc_rwlocktype_write,
10648 				    tree_locked ? isc_rwlocktype_write
10649 						: isc_rwlocktype_none,
10650 				    false);
10651 
10652 		if (rbtdb->cachestats == NULL) {
10653 			return;
10654 		}
10655 
10656 		switch (reason) {
10657 		case expire_ttl:
10658 			isc_stats_increment(rbtdb->cachestats,
10659 					    dns_cachestatscounter_deletettl);
10660 			break;
10661 		case expire_lru:
10662 			isc_stats_increment(rbtdb->cachestats,
10663 					    dns_cachestatscounter_deletelru);
10664 			break;
10665 		default:
10666 			break;
10667 		}
10668 	}
10669 }
10670