xref: /netbsd-src/external/mpl/bind/dist/lib/dns/rbtdb.c (revision f8cf1a9151c7af1cb0bd8b09c13c66bca599c027)
1 /*	$NetBSD: rbtdb.c,v 1.20 2024/09/22 00:14:06 christos Exp $	*/
2 
3 /*
4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5  *
6  * SPDX-License-Identifier: MPL-2.0
7  *
8  * This Source Code Form is subject to the terms of the Mozilla Public
9  * License, v. 2.0. If a copy of the MPL was not distributed with this
10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11  *
12  * See the COPYRIGHT file distributed with this work for additional
13  * information regarding copyright ownership.
14  */
15 
16 /*! \file */
17 
18 #include <ctype.h>
19 #include <inttypes.h>
20 #include <stdbool.h>
21 #include <sys/mman.h>
22 
23 #include <isc/atomic.h>
24 #include <isc/crc64.h>
25 #include <isc/event.h>
26 #include <isc/file.h>
27 #include <isc/hash.h>
28 #include <isc/heap.h>
29 #include <isc/hex.h>
30 #include <isc/mem.h>
31 #include <isc/mutex.h>
32 #include <isc/once.h>
33 #include <isc/print.h>
34 #include <isc/random.h>
35 #include <isc/refcount.h>
36 #include <isc/result.h>
37 #include <isc/rwlock.h>
38 #include <isc/serial.h>
39 #include <isc/stdio.h>
40 #include <isc/string.h>
41 #include <isc/task.h>
42 #include <isc/time.h>
43 #include <isc/util.h>
44 
45 #include <dns/callbacks.h>
46 #include <dns/db.h>
47 #include <dns/dbiterator.h>
48 #include <dns/events.h>
49 #include <dns/fixedname.h>
50 #include <dns/log.h>
51 #include <dns/masterdump.h>
52 #include <dns/nsec.h>
53 #include <dns/nsec3.h>
54 #include <dns/rbt.h>
55 #include <dns/rdata.h>
56 #include <dns/rdataset.h>
57 #include <dns/rdatasetiter.h>
58 #include <dns/rdataslab.h>
59 #include <dns/rdatastruct.h>
60 #include <dns/stats.h>
61 #include <dns/time.h>
62 #include <dns/view.h>
63 #include <dns/zone.h>
64 #include <dns/zonekey.h>
65 
66 #include "rbtdb.h"
67 
#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4')

/*%
 * Evaluate 'op' exactly once, store its value in the enclosing function's
 * 'result' variable and jump to that function's 'failure' label unless the
 * value is ISC_R_SUCCESS.
 */
#define CHECK(op)                                       \
	do {                                            \
		if ((result = (op)) != ISC_R_SUCCESS) { \
			goto failure;                   \
		}                                       \
	} while (0)

/*%
 * "impmagic" is not the first four bytes of the struct, which is why
 * ISC_MAGIC_VALID cannot be used here.
 */
#define VALID_RBTDB(rbtdb) \
	(!((rbtdb) == NULL || (rbtdb)->common.impmagic != RBTDB_MAGIC))
83 
typedef uint32_t rbtdb_serial_t;
typedef uint32_t rbtdb_rdatatype_t;

/*
 * An rbtdb_rdatatype_t packs two 16-bit quantities into one 32-bit value:
 * 'base' in the low 16 bits and 'ext' in the high 16 bits.
 *
 * RBTDB_RDATATYPE_BASE() and RBTDB_RDATATYPE_EXT() extract the two halves;
 * RBTDB_RDATATYPE_VALUE() builds the packed value.  Every macro argument is
 * parenthesized before it is cast, so compound expressions passed as 'base'
 * or 'ext' are converted as a whole rather than on their first operand only,
 * and the outer cast applies to the complete result.
 */
#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)  ((dns_rdatatype_t)((type) >> 16))
#define RBTDB_RDATATYPE_VALUE(base, ext)                 \
	((rbtdb_rdatatype_t)((((uint32_t)(ext)) << 16) | \
			     (((uint32_t)(base)) & 0xffff)))
92 
/*
 * Pre-built packed type values.  Each RBTDB_RDATATYPE_SIG<X> constant has
 * dns_rdatatype_rrsig as its base and the type <X> as its ext half.
 */
#define RBTDB_RDATATYPE_SIGNSEC \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
#define RBTDB_RDATATYPE_SIGNSEC3 \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3)
#define RBTDB_RDATATYPE_SIGNS \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
#define RBTDB_RDATATYPE_SIGCNAME \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
#define RBTDB_RDATATYPE_SIGDNAME \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
#define RBTDB_RDATATYPE_SIGDS \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ds)
#define RBTDB_RDATATYPE_SIGSOA \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_soa)
/* Base 0 with dns_rdatatype_any as the ext half. */
#define RBTDB_RDATATYPE_NCACHEANY RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
108 
/* Thin wrappers over the isc_rwlock / RWLOCK primitives. */
#define RBTDB_INITLOCK(l)    isc_rwlock_init((l), 0, 0)
#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l)
#define RBTDB_LOCK(l, t)     RWLOCK((l), (t))
#define RBTDB_UNLOCK(l, t)   RWUNLOCK((l), (t))

/*
 * Since node locking is sensitive to both performance and memory footprint,
 * we need some trick here.  If we have both high-performance rwlock and
 * high performance and small-memory reference counters, we use rwlock for
 * node lock and isc_refcount for node references.  In this case, we don't have
 * to protect the access to the counters by locks.
 * Otherwise, we simply use ordinary mutex lock for node locking, and use
 * simple integers as reference counters which is protected by the lock.
 * In most cases, we can simply use wrapper macros such as NODE_LOCK and
 * NODE_UNLOCK.  In some other cases, however, we need to protect reference
 * counters first and then protect other parts of a node as read-only data.
 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
 * provided for these special cases.  When we can use the efficient backend
 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
 * section including the access to the reference counter.
 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
 * section is also protected by NODE_STRONGLOCK().
 *
 * NOTE(review): the definitions below unconditionally map the node lock
 * onto isc_rwlock_t, and no NODE_STRONGLOCK()/NODE_WEAKLOCK() macros are
 * defined alongside them.  The mutex alternative described above may be
 * historical -- confirm and trim this comment if so.
 */
typedef isc_rwlock_t nodelock_t;

/* Node-lock wrappers; same underlying primitives as the RBTDB_* set. */
#define NODE_INITLOCK(l)    isc_rwlock_init((l), 0, 0)
#define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l)
#define NODE_LOCK(l, t)	    RWLOCK((l), (t))
#define NODE_UNLOCK(l, t)   RWUNLOCK((l), (t))
#define NODE_TRYUPGRADE(l)  isc_rwlock_tryupgrade(l)
#define NODE_DOWNGRADE(l)   isc_rwlock_downgrade(l)
141 
142 /*%
143  * Whether to rate-limit updating the LRU to avoid possible thread contention.
144  * Updating LRU requires write locking, so we don't do it every time the
145  * record is touched - only after some time passes.
146  */
147 #ifndef DNS_RBTDB_LIMITLRUUPDATE
148 #define DNS_RBTDB_LIMITLRUUPDATE 1
149 #endif
150 
151 /*% Time after which we update LRU for glue records, 5 minutes */
152 #define DNS_RBTDB_LRUUPDATE_GLUE 300
153 /*% Time after which we update LRU for all other records, 10 minutes */
154 #define DNS_RBTDB_LRUUPDATE_REGULAR 600
155 
/*
 * Allow clients with a virtual time of up to 5 minutes in the past to see
 * records that would otherwise have expired.
 */
160 #define RBTDB_VIRTUAL 300
161 
162 struct noqname {
163 	dns_name_t name;
164 	void *neg;
165 	void *negsig;
166 	dns_rdatatype_t type;
167 };
168 
/*%
 * Header describing one version of one rdataset stored at a node.
 * Headers for different types are chained through 'next'; older versions
 * of the same rdataset are chained through 'down'.
 */
typedef struct rdatasetheader {
	/*%
	 * Locked by the owning node's lock.
	 */
	rbtdb_serial_t serial;
	/*
	 * Compared directly against 'now' by ACTIVE(), so presumably an
	 * absolute expiry time rather than a relative TTL -- confirm.
	 */
	dns_ttl_t rdh_ttl;
	/* Packed type value; see RBTDB_RDATATYPE_BASE()/_EXT(). */
	rbtdb_rdatatype_t type;
	/* RDATASET_ATTR_* bits, accessed with atomic loads/updates. */
	atomic_uint_least16_t attributes;
	dns_trust_t trust;
	atomic_uint_fast32_t last_refresh_fail_ts;
	struct noqname *noqname;
	struct noqname *closest;
	unsigned int resign_lsb : 1;
	/*%<
	 * We don't use the LIST macros, because the LIST structure has
	 * both head and tail pointers, and is doubly linked.
	 * (This remark applies to the 'next' and 'down' pointers below.)
	 */

	struct rdatasetheader *next;
	/*%<
	 * If this is the top header for an rdataset, 'next' points
	 * to the top header for the next rdataset (i.e., the next type).
	 * Otherwise, it points up to the header whose down pointer points
	 * at this header.
	 */

	struct rdatasetheader *down;
	/*%<
	 * Points to the header for the next older version of
	 * this rdataset.
	 */

	atomic_uint_fast32_t count;
	/*%<
	 * Monotonically increased every time this rdataset is bound so that
	 * it is used as the base of the starting point in DNS responses
	 * when the "cyclic" rrset-order is required.
	 */

	dns_rbtnode_t *node;
	isc_stdtime_t last_used;
	ISC_LINK(struct rdatasetheader) link;

	unsigned int heap_index;
	/*%<
	 * Used for TTL-based cache cleaning.  set_ttl() skips any heap
	 * adjustment when this is 0.
	 */
	isc_stdtime_t resign;
	/*
	 * The following comment describes 'upper' (below), not 'resign'.
	 *
	 * Case vector.  If the bit is set then the corresponding
	 * character in the owner name needs to be AND'd with 0x20,
	 * rendering that character upper case.
	 *
	 * NOTE(review): clearing bit 0x20 (i.e. AND with ~0x20) is what
	 * upper-cases an ASCII letter; the wording "AND'd with 0x20" looks
	 * imprecise -- confirm against the code that consumes 'upper'.
	 */
	unsigned char upper[32];
} rdatasetheader_t;

typedef ISC_LIST(rdatasetheader_t) rdatasetheaderlist_t;
typedef ISC_LIST(dns_rbtnode_t) rbtnodelist_t;
227 
/*
 * Bit flags kept in rdatasetheader_t.attributes.  All values fit in
 * 16 bits, matching the atomic_uint_least16_t type of that field.
 */
#define RDATASET_ATTR_NONEXISTENT 0x0001
/*%< May be potentially served as stale data. */
#define RDATASET_ATTR_STALE	     0x0002
#define RDATASET_ATTR_IGNORE	     0x0004
#define RDATASET_ATTR_RETAIN	     0x0008
#define RDATASET_ATTR_NXDOMAIN	     0x0010
#define RDATASET_ATTR_RESIGN	     0x0020
#define RDATASET_ATTR_STATCOUNT	     0x0040
#define RDATASET_ATTR_OPTOUT	     0x0080
#define RDATASET_ATTR_NEGATIVE	     0x0100
#define RDATASET_ATTR_PREFETCH	     0x0200
#define RDATASET_ATTR_CASESET	     0x0400
#define RDATASET_ATTR_ZEROTTL	     0x0800
#define RDATASET_ATTR_CASEFULLYLOWER 0x1000
/*%< Ancient - awaiting cleanup. */
#define RDATASET_ATTR_ANCIENT	   0x2000
#define RDATASET_ATTR_STALE_WINDOW 0x4000

/*
 * XXX
 * When the cache will pre-expire data (due to memory low or other
 * situations) before the rdataset's TTL has expired, it MUST
 * respect the RETAIN bit and not expire the data until its TTL is
 * expired.
 */
253 
/*
 * Single-bit predicates on a header's 'attributes' word.  Each one performs
 * an acquire load of the attributes (through RDATASET_ATTR_GET) and tests
 * one RDATASET_ATTR_* flag.  EXISTS() is the only predicate that is true
 * when its flag is clear.
 */
#define EXISTS(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_NONEXISTENT) == 0)
#define NONEXISTENT(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_NONEXISTENT) != 0)
#define IGNORE(header) (RDATASET_ATTR_GET(header, RDATASET_ATTR_IGNORE) != 0)
#define RETAIN(header) (RDATASET_ATTR_GET(header, RDATASET_ATTR_RETAIN) != 0)
#define NXDOMAIN(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_NXDOMAIN) != 0)
#define STALE(header) (RDATASET_ATTR_GET(header, RDATASET_ATTR_STALE) != 0)
#define STALE_WINDOW(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_STALE_WINDOW) != 0)
#define RESIGN(header) (RDATASET_ATTR_GET(header, RDATASET_ATTR_RESIGN) != 0)
#define OPTOUT(header) (RDATASET_ATTR_GET(header, RDATASET_ATTR_OPTOUT) != 0)
#define NEGATIVE(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_NEGATIVE) != 0)
#define PREFETCH(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_PREFETCH) != 0)
#define CASESET(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_CASESET) != 0)
#define ZEROTTL(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_ZEROTTL) != 0)
#define CASEFULLYLOWER(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_CASEFULLYLOWER) != 0)
#define ANCIENT(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_ANCIENT) != 0)
#define STATCOUNT(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_STATCOUNT) != 0)
/*
 * Stale TTL applicable to 'header': 0 when the header has the NXDOMAIN
 * attribute, otherwise the database's serve_stale_ttl.
 */
#define STALE_TTL(header, rbtdb) \
	(NXDOMAIN(header) ? 0 : (rbtdb)->serve_stale_ttl)

/*
 * Read, set and clear attribute bits of 'header'.  'attribute' may be any
 * expression, including an OR of several RDATASET_ATTR_* flags; it is
 * parenthesized so that operator precedence cannot split it (without the
 * parentheses, GET(h, A | B) would evaluate as (attrs & A) | B).
 *
 * GET uses an acquire load; SET and CLR use release read-modify-write
 * operations and yield whatever the underlying atomic primitive returns.
 */
#define RDATASET_ATTR_GET(header, attribute) \
	(atomic_load_acquire(&(header)->attributes) & (attribute))
#define RDATASET_ATTR_SET(header, attribute) \
	atomic_fetch_or_release(&(header)->attributes, (attribute))
#define RDATASET_ATTR_CLR(header, attribute) \
	atomic_fetch_and_release(&(header)->attributes, ~(attribute))
310 
/*
 * True when the header's rdh_ttl is later than 'now', or equal to 'now'
 * while the header carries the ZEROTTL attribute.
 */
#define ACTIVE(header, now)             \
	(((header)->rdh_ttl > (now)) || \
	 ((header)->rdh_ttl == (now) && ZEROTTL(header)))
314 
#define DEFAULT_NODE_LOCK_COUNT	    7 /*%< Should be prime. */
#define RBTDB_GLUE_TABLE_INIT_BITS  2U
#define RBTDB_GLUE_TABLE_MAX_BITS   32U
#define RBTDB_GLUE_TABLE_OVERCOMMIT 3

#define GOLDEN_RATIO_32 0x61C88647
#define HASHSIZE(bits)	(UINT64_C(1) << (bits))

/*%
 * Reduce 'val' to a 'bits'-bit hash by multiplying it with GOLDEN_RATIO_32
 * and keeping the top 'bits' bits of the 32-bit product.
 *
 * Requires:
 *\li	'bits' <= RBTDB_GLUE_TABLE_MAX_BITS.
 *
 * Returns:
 *\li	A value in the range [0, HASHSIZE(bits)).  For 'bits' == 0 the only
 *	valid value, 0, is returned explicitly: shifting a 32-bit operand by
 *	32 would be undefined behaviour.
 */
static uint32_t
hash_32(uint32_t val, unsigned int bits) {
	REQUIRE(bits <= RBTDB_GLUE_TABLE_MAX_BITS);

	if (bits == 0) {
		return (0);
	}

	/* High bits are more random. */
	return ((val * GOLDEN_RATIO_32) >> (32 - bits));
}
329 
/* True when the iterator was created with the DNS_DB_EXPIREDOK option. */
#define EXPIREDOK(rbtiterator) \
	(((rbtiterator)->common.options & DNS_DB_EXPIREDOK) != 0)

/* True when the iterator was created with the DNS_DB_STALEOK option. */
#define STALEOK(rbtiterator) \
	(((rbtiterator)->common.options & DNS_DB_STALEOK) != 0)

/*
 * True when 'iterator' is currently walking its nsec3chain and is
 * positioned on the database's nsec3_origin_node.
 */
#define RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, iterator)       \
	((iterator)->current == &(iterator)->nsec3chain && \
	 (iterator)->node == (rbtdb)->nsec3_origin_node)
339 
340 /*%
341  * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
342  * There is a tradeoff issue about configuring this value: if this is too
343  * small, it may cause heavier contention between threads; if this is too large,
344  * LRU purge algorithm won't work well (entries tend to be purged prematurely).
345  * The default value should work well for most environments, but this can
346  * also be configurable at compilation time via the
347  * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
348  * 1 due to the assumption of overmem_purge().
349  */
350 #ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
351 #if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
352 #error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
353 #else /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */
354 #define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
355 #endif /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */
356 #else  /* ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
357 #define DEFAULT_CACHE_NODE_LOCK_COUNT 17
358 #endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
359 
360 /*
361  * This defines the number of headers that we try to expire each time the
362  * expire_ttl_headers() is run.  The number should be small enough, so the
363  * TTL-based header expiration doesn't take too long, but it should be large
364  * enough, so we expire enough headers if their TTL is clustered.
365  */
366 #define DNS_RBTDB_EXPIRE_TTL_COUNT 10
367 
368 typedef struct {
369 	nodelock_t lock;
370 	/* Protected in the refcount routines. */
371 	isc_refcount_t references;
372 	/* Locked by lock. */
373 	bool exiting;
374 } rbtdb_nodelock_t;
375 
376 typedef struct rbtdb_changed {
377 	dns_rbtnode_t *node;
378 	bool dirty;
379 	ISC_LINK(struct rbtdb_changed) link;
380 } rbtdb_changed_t;
381 
382 typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t;
383 
typedef enum {
	dns_db_insecure,
	dns_db_partial,
	dns_db_secure
} dns_db_secure_t;

typedef struct dns_rbtdb dns_rbtdb_t;

/* Reason for expiring a record from cache */
typedef enum {
	expire_lru,
	expire_ttl,
	expire_flush
} expire_t;
390 
391 typedef struct rbtdb_glue rbtdb_glue_t;
392 
393 typedef struct rbtdb_glue_table_node {
394 	struct rbtdb_glue_table_node *next;
395 	dns_rbtnode_t *node;
396 	rbtdb_glue_t *glue_list;
397 } rbtdb_glue_table_node_t;
398 
/* TTL classification of an rdataset: fresh, stale or ancient. */
typedef enum {
	rdataset_ttl_fresh = 0,
	rdataset_ttl_stale = 1,
	rdataset_ttl_ancient = 2
} rdataset_ttl_t;
404 
/*%
 * One version of the database, identified by 'serial' and linked into the
 * owning database's list of open versions through 'link'.
 */
typedef struct rbtdb_version {
	/* Not locked */
	rbtdb_serial_t serial;
	dns_rbtdb_t *rbtdb;
	/*
	 * Protected in the refcount routines.
	 * XXXJT: should we change the lock policy based on the refcount
	 * performance?
	 */
	isc_refcount_t references;
	/* Locked by database lock. */
	bool writer;
	bool commit_ok;
	rbtdb_changedlist_t changed_list;
	rdatasetheaderlist_t resigned_list;
	ISC_LINK(struct rbtdb_version) link;
	dns_db_secure_t secure;
	bool havensec3;
	/* NSEC3 parameters */
	dns_hash_t hash;
	uint8_t flags;
	uint16_t iterations;
	uint8_t salt_length;
	unsigned char salt[DNS_NSEC3_SALTSIZE];

	/*
	 * records and xfrsize are covered by rwlock.
	 */
	isc_rwlock_t rwlock;
	uint64_t records;
	uint64_t xfrsize;

	/*
	 * Glue table state.  NOTE(review): presumably glue_rwlock protects
	 * the three glue_table* members below and glue_table_bits is the
	 * log2 of the table size -- confirm against the glue-table code.
	 */
	isc_rwlock_t glue_rwlock;
	size_t glue_table_bits;
	size_t glue_table_nodecount;
	rbtdb_glue_table_node_t **glue_table;
} rbtdb_version_t;

typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t;
444 
/*%
 * The database object itself.  'common' is the first member, and attach()
 * converts a dns_db_t pointer to a dns_rbtdb_t pointer with a plain cast;
 * VALID_RBTDB() then checks common.impmagic.
 */
struct dns_rbtdb {
	/* Unlocked. */
	dns_db_t common;
	/* Locks the data in this struct */
	isc_rwlock_t lock;
	/* Locks the tree structure (prevents nodes appearing/disappearing) */
	isc_rwlock_t tree_lock;
	/* Locks for individual tree nodes */
	unsigned int node_lock_count;
	rbtdb_nodelock_t *node_locks;
	dns_rbtnode_t *origin_node;
	dns_rbtnode_t *nsec3_origin_node;
	dns_stats_t *rrsetstats;     /* cache DB only */
	isc_stats_t *cachestats;     /* cache DB only */
	isc_stats_t *gluecachestats; /* zone DB only */
	/* Locked by lock. */
	unsigned int active;
	isc_refcount_t references;
	unsigned int attributes; /* RBTDB_ATTR_* bits */
	rbtdb_serial_t current_serial;
	rbtdb_serial_t least_serial;
	rbtdb_serial_t next_serial;
	uint32_t maxrrperset;
	uint32_t maxtypepername;
	rbtdb_version_t *current_version;
	rbtdb_version_t *future_version;
	rbtdb_versionlist_t open_versions;
	isc_task_t *task;
	isc_task_t *prunetask;
	dns_dbnode_t *soanode;
	dns_dbnode_t *nsnode;

	/*
	 * Maximum length of time to keep using a stale answer past its
	 * normal TTL expiry.  KEEPSTALE() is true when this is non-zero.
	 */
	dns_ttl_t serve_stale_ttl;

	/*
	 * The time after a failed lookup, where stale answers from cache
	 * may be used directly in a DNS response without attempting a
	 * new iterative lookup.
	 */
	uint32_t serve_stale_refresh;

	/*
	 * This is a linked list used to implement the LRU cache.  There will
	 * be node_lock_count linked lists here.  Nodes in bucket 1 will be
	 * placed on the linked list rdatasets[1].
	 */
	rdatasetheaderlist_t *rdatasets;

	/*
	 * Start point % node_lock_count for next LRU cleanup.
	 */
	atomic_uint lru_sweep;

	/*
	 * When performing LRU cleaning limit cleaning to headers that were
	 * last used at or before this.
	 */
	atomic_uint last_used;

	/*%
	 * Temporary storage for stale cache nodes and dynamically deleted
	 * nodes that await being cleaned up.
	 */
	rbtnodelist_t *deadnodes;

	/*
	 * Heaps.  These are used for TTL based expiry in a cache,
	 * or for zone resigning in a zone DB.  hmctx is the memory
	 * context to use for the heap (which differs from the main
	 * database memory context in the case of a cache).
	 * set_ttl() indexes 'heaps' by a header's node->locknum.
	 */
	isc_mem_t *hmctx;
	isc_heap_t **heaps;

	/* Locked by tree_lock. */
	dns_rbt_t *tree;
	dns_rbt_t *nsec;
	dns_rbt_t *nsec3;

	/* Unlocked */
	unsigned int quantum;
};
531 
/* Flag bits with the RBTDB_ATTR_ prefix (presumably for dns_rbtdb.attributes). */
#define RBTDB_ATTR_LOADED  0x01
#define RBTDB_ATTR_LOADING 0x02

/* True when the database has a non-zero serve_stale_ttl. */
#define KEEPSTALE(rbtdb) ((rbtdb)->serve_stale_ttl > 0)
536 
537 /*%
538  * Search Context
539  */
540 typedef struct {
541 	dns_rbtdb_t *rbtdb;
542 	rbtdb_version_t *rbtversion;
543 	rbtdb_serial_t serial;
544 	unsigned int options;
545 	dns_rbtnodechain_t chain;
546 	bool copy_name;
547 	bool need_cleanup;
548 	bool wild;
549 	dns_rbtnode_t *zonecut;
550 	rdatasetheader_t *zonecut_rdataset;
551 	rdatasetheader_t *zonecut_sigrdataset;
552 	dns_fixedname_t zonecut_name;
553 	isc_stdtime_t now;
554 } rbtdb_search_t;
555 
556 /*%
557  * Load Context
558  */
559 typedef struct {
560 	dns_rbtdb_t *rbtdb;
561 	isc_stdtime_t now;
562 } rbtdb_load_t;
563 
564 static void
565 delete_callback(void *data, void *arg);
566 static void
567 rdataset_disassociate(dns_rdataset_t *rdataset);
568 static isc_result_t
569 rdataset_first(dns_rdataset_t *rdataset);
570 static isc_result_t
571 rdataset_next(dns_rdataset_t *rdataset);
572 static void
573 rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
574 static void
575 rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
576 static unsigned int
577 rdataset_count(dns_rdataset_t *rdataset);
578 static isc_result_t
579 rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
580 		    dns_rdataset_t *neg, dns_rdataset_t *negsig);
581 static isc_result_t
582 rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name,
583 		    dns_rdataset_t *neg, dns_rdataset_t *negsig);
584 static bool
585 need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now);
586 static void
587 update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now);
588 static void
589 expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked,
590 	      expire_t reason);
591 static void
592 overmem_purge(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked);
593 static void
594 resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader);
595 static void
596 resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
597 	      rdatasetheader_t *header);
598 static void
599 prune_tree(isc_task_t *task, isc_event_t *event);
600 static void
601 rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust);
602 static void
603 rdataset_expire(dns_rdataset_t *rdataset);
604 static void
605 rdataset_clearprefetch(dns_rdataset_t *rdataset);
606 static void
607 rdataset_setownercase(dns_rdataset_t *rdataset, const dns_name_t *name);
608 static void
609 rdataset_getownercase(const dns_rdataset_t *rdataset, dns_name_t *name);
610 static isc_result_t
611 rdataset_addglue(dns_rdataset_t *rdataset, dns_dbversion_t *version,
612 		 dns_message_t *msg);
613 static void
614 free_gluetable(rbtdb_version_t *version);
615 static isc_result_t
616 nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name);
617 
618 static dns_rdatasetmethods_t rdataset_methods = { rdataset_disassociate,
619 						  rdataset_first,
620 						  rdataset_next,
621 						  rdataset_current,
622 						  rdataset_clone,
623 						  rdataset_count,
624 						  NULL, /* addnoqname */
625 						  rdataset_getnoqname,
626 						  NULL, /* addclosest */
627 						  rdataset_getclosest,
628 						  rdataset_settrust,
629 						  rdataset_expire,
630 						  rdataset_clearprefetch,
631 						  rdataset_setownercase,
632 						  rdataset_getownercase,
633 						  rdataset_addglue };
634 
/*
 * Reduced method table: only disassociate, iteration (first/next/current),
 * clone and count are provided; every other slot is NULL.
 */
static dns_rdatasetmethods_t slab_methods = {
	rdataset_disassociate,
	rdataset_first,
	rdataset_next,
	rdataset_current,
	rdataset_clone,
	rdataset_count,
	NULL, /* addnoqname */
	NULL, /* getnoqname */
	NULL, /* addclosest */
	NULL, /* getclosest */
	NULL, /* settrust */
	NULL, /* expire */
	NULL, /* clearprefetch */
	NULL, /* setownercase */
	NULL, /* getownercase */
	NULL  /* addglue */
};
653 
654 static void
655 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
656 static isc_result_t
657 rdatasetiter_first(dns_rdatasetiter_t *iterator);
658 static isc_result_t
659 rdatasetiter_next(dns_rdatasetiter_t *iterator);
660 static void
661 rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset);
662 
663 static dns_rdatasetitermethods_t rdatasetiter_methods = {
664 	rdatasetiter_destroy, rdatasetiter_first, rdatasetiter_next,
665 	rdatasetiter_current
666 };
667 
668 typedef struct rbtdb_rdatasetiter {
669 	dns_rdatasetiter_t common;
670 	rdatasetheader_t *current;
671 } rbtdb_rdatasetiter_t;
672 
673 /*
674  * Note that these iterators, unless created with either DNS_DB_NSEC3ONLY or
675  * DNS_DB_NONSEC3, will transparently move between the last node of the
676  * "regular" RBT ("chain" field) and the root node of the NSEC3 RBT
677  * ("nsec3chain" field) of the database in question, as if the latter was a
678  * successor to the former in lexical order.  The "current" field always holds
679  * the address of either "chain" or "nsec3chain", depending on which RBT is
680  * being traversed at given time.
681  */
682 static void
683 dbiterator_destroy(dns_dbiterator_t **iteratorp);
684 static isc_result_t
685 dbiterator_first(dns_dbiterator_t *iterator);
686 static isc_result_t
687 dbiterator_last(dns_dbiterator_t *iterator);
688 static isc_result_t
689 dbiterator_seek(dns_dbiterator_t *iterator, const dns_name_t *name);
690 static isc_result_t
691 dbiterator_prev(dns_dbiterator_t *iterator);
692 static isc_result_t
693 dbiterator_next(dns_dbiterator_t *iterator);
694 static isc_result_t
695 dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
696 		   dns_name_t *name);
697 static isc_result_t
698 dbiterator_pause(dns_dbiterator_t *iterator);
699 static isc_result_t
700 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name);
701 
702 static dns_dbiteratormethods_t dbiterator_methods = {
703 	dbiterator_destroy, dbiterator_first, dbiterator_last,
704 	dbiterator_seek,    dbiterator_prev,  dbiterator_next,
705 	dbiterator_current, dbiterator_pause, dbiterator_origin
706 };
707 
/* Capacity of the per-iterator 'deletions' array. */
#define DELETION_BATCH_MAX 64

/*
 * If 'paused' is true, then the tree lock is not being held.
 */
typedef struct rbtdb_dbiterator {
	dns_dbiterator_t common;
	bool paused;
	bool new_origin;
	isc_rwlocktype_t tree_locked;
	isc_result_t result;
	dns_fixedname_t name;
	dns_fixedname_t origin;
	/* Chain over the "regular" RBT. */
	dns_rbtnodechain_t chain;
	/* Chain over the NSEC3 RBT. */
	dns_rbtnodechain_t nsec3chain;
	/* Always the address of either 'chain' or 'nsec3chain'. */
	dns_rbtnodechain_t *current;
	dns_rbtnode_t *node;
	/* 'delcnt' is presumably the number of used 'deletions' slots. */
	dns_rbtnode_t *deletions[DELETION_BATCH_MAX];
	int delcnt;
	enum { full, nonsec3, nsec3only } nsec3mode;
} rbtdb_dbiterator_t;
729 
/* Test the DNS_DBATTR_STUB / DNS_DBATTR_CACHE bit of common.attributes. */
#define IS_STUB(rbtdb)	(((rbtdb)->common.attributes & DNS_DBATTR_STUB) != 0)
#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
732 
733 static void
734 free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event);
735 static void
736 overmem(dns_db_t *db, bool over);
737 static void
738 setnsec3parameters(dns_db_t *db, rbtdb_version_t *version);
739 static void
740 setownercase(rdatasetheader_t *header, const dns_name_t *name);
741 
/*%
 * 'init_count' is used to initialize 'newheader->count', which in turn
 * is used to determine where in the cycle rrset-order cyclic starts.
 * We don't lock this as we don't care about simultaneous updates.
 *
 * Note:
 *      Both init_count and header->count can be UINT32_MAX.
 *      The count on the returned rdataset however can't be as
 *      that indicates that the database does not implement cyclic
 *      processing.
 */
753 static atomic_uint_fast32_t init_count = 0;
754 
755 /*
756  * Locking
757  *
758  * If a routine is going to lock more than one lock in this module, then
759  * the locking must be done in the following order:
760  *
761  *      Tree Lock
762  *
763  *      Node Lock       (Only one from the set may be locked at one time by
764  *                       any caller)
765  *
766  *      Database Lock
767  *
768  * Failure to follow this hierarchy can result in deadlock.
769  */
770 
771 /*
772  * Deleting Nodes
773  *
774  * For zone databases the node for the origin of the zone MUST NOT be deleted.
775  */
776 
/* Fixed RRSet helper macros */

/*
 * Object-like constants must not end in a semicolon: with one, an
 * expression such as "a + DNS_RDATASET_LENGTH + b" is silently cut short
 * after the macro.
 */
#define DNS_RDATASET_LENGTH 2

/*
 * DNS_RDATASET_COUNT expands to an expression that uses a variable named
 * 'count', which must be in scope wherever the macro is used while
 * DNS_RDATASET_FIXED is enabled.
 */
#if DNS_RDATASET_FIXED
#define DNS_RDATASET_ORDER 2
#define DNS_RDATASET_COUNT (count * 4)
#else /* !DNS_RDATASET_FIXED */
#define DNS_RDATASET_ORDER 0
#define DNS_RDATASET_COUNT 0
#endif /* DNS_RDATASET_FIXED */
788 
789 /*
790  * DB Routines
791  */
792 
793 static void
794 attach(dns_db_t *source, dns_db_t **targetp) {
795 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
796 
797 	REQUIRE(VALID_RBTDB(rbtdb));
798 
799 	isc_refcount_increment(&rbtdb->references);
800 
801 	*targetp = source;
802 }
803 
804 static void
805 free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
806 	dns_rbtdb_t *rbtdb = event->ev_arg;
807 
808 	UNUSED(task);
809 
810 	free_rbtdb(rbtdb, true, event);
811 }
812 
813 static void
814 update_cachestats(dns_rbtdb_t *rbtdb, isc_result_t result) {
815 	INSIST(IS_CACHE(rbtdb));
816 
817 	if (rbtdb->cachestats == NULL) {
818 		return;
819 	}
820 
821 	switch (result) {
822 	case DNS_R_COVERINGNSEC:
823 		isc_stats_increment(rbtdb->cachestats,
824 				    dns_cachestatscounter_coveringnsec);
825 		FALLTHROUGH;
826 	case ISC_R_SUCCESS:
827 	case DNS_R_CNAME:
828 	case DNS_R_DNAME:
829 	case DNS_R_DELEGATION:
830 	case DNS_R_NCACHENXDOMAIN:
831 	case DNS_R_NCACHENXRRSET:
832 		isc_stats_increment(rbtdb->cachestats,
833 				    dns_cachestatscounter_hits);
834 		break;
835 	default:
836 		isc_stats_increment(rbtdb->cachestats,
837 				    dns_cachestatscounter_misses);
838 	}
839 }
840 
841 static bool
842 do_stats(rdatasetheader_t *header) {
843 	return (EXISTS(header) && STATCOUNT(header));
844 }
845 
846 static void
847 update_rrsetstats(dns_rbtdb_t *rbtdb, const rbtdb_rdatatype_t htype,
848 		  const uint_least16_t hattributes, const bool increment) {
849 	dns_rdatastatstype_t statattributes = 0;
850 	dns_rdatastatstype_t base = 0;
851 	dns_rdatastatstype_t type;
852 	rdatasetheader_t *header = &(rdatasetheader_t){
853 		.type = htype,
854 		.attributes = hattributes,
855 	};
856 
857 	if (!do_stats(header)) {
858 		return;
859 	}
860 
861 	/* At the moment we count statistics only for cache DB */
862 	INSIST(IS_CACHE(rbtdb));
863 
864 	if (NEGATIVE(header)) {
865 		if (NXDOMAIN(header)) {
866 			statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
867 		} else {
868 			statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
869 			base = RBTDB_RDATATYPE_EXT(header->type);
870 		}
871 	} else {
872 		base = RBTDB_RDATATYPE_BASE(header->type);
873 	}
874 
875 	if (STALE(header)) {
876 		statattributes |= DNS_RDATASTATSTYPE_ATTR_STALE;
877 	}
878 	if (ANCIENT(header)) {
879 		statattributes |= DNS_RDATASTATSTYPE_ATTR_ANCIENT;
880 	}
881 
882 	type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
883 	if (increment) {
884 		dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
885 	} else {
886 		dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
887 	}
888 }
889 
890 static void
891 set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
892 	int idx;
893 	isc_heap_t *heap;
894 	dns_ttl_t oldttl;
895 
896 	if (!IS_CACHE(rbtdb)) {
897 		header->rdh_ttl = newttl;
898 		return;
899 	}
900 
901 	oldttl = header->rdh_ttl;
902 	header->rdh_ttl = newttl;
903 
904 	/*
905 	 * It's possible the rbtdb is not a cache.  If this is the case,
906 	 * we will not have a heap, and we move on.  If we do, though,
907 	 * we might need to adjust things.
908 	 */
909 	if (header->heap_index == 0 || newttl == oldttl) {
910 		return;
911 	}
912 	idx = header->node->locknum;
913 	if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL) {
914 		return;
915 	}
916 	heap = rbtdb->heaps[idx];
917 
918 	if (newttl < oldttl) {
919 		isc_heap_increased(heap, header->heap_index);
920 	} else {
921 		isc_heap_decreased(heap, header->heap_index);
922 	}
923 
924 	if (newttl == 0) {
925 		isc_heap_delete(heap, header->heap_index);
926 	}
927 }
928 
929 static bool
930 prio_type(rbtdb_rdatatype_t type) {
931 	switch (type) {
932 	case dns_rdatatype_soa:
933 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_soa):
934 	case dns_rdatatype_a:
935 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_a):
936 	case dns_rdatatype_mx:
937 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_mx):
938 	case dns_rdatatype_aaaa:
939 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_aaaa):
940 	case dns_rdatatype_nsec:
941 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec):
942 	case dns_rdatatype_nsec3:
943 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3):
944 	case dns_rdatatype_ns:
945 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns):
946 	case dns_rdatatype_ds:
947 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ds):
948 	case dns_rdatatype_cname:
949 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname):
950 	case dns_rdatatype_dname:
951 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname):
952 	case dns_rdatatype_svcb:
953 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_svcb):
954 	case dns_rdatatype_https:
955 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_https):
956 	case dns_rdatatype_dnskey:
957 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dnskey):
958 	case dns_rdatatype_srv:
959 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_srv):
960 	case dns_rdatatype_txt:
961 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_txt):
962 	case dns_rdatatype_ptr:
963 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ptr):
964 	case dns_rdatatype_naptr:
965 	case RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_naptr):
966 		return (true);
967 	}
968 	return (false);
969 }
970 
971 /*%
972  * These functions allow the heap code to rank the priority of each
973  * element.  It returns true if v1 happens "sooner" than v2.
974  */
975 static bool
976 ttl_sooner(void *v1, void *v2) {
977 	rdatasetheader_t *h1 = v1;
978 	rdatasetheader_t *h2 = v2;
979 
980 	return (h1->rdh_ttl < h2->rdh_ttl);
981 }
982 
983 /*%
984  * Return which RRset should be resigned sooner.  If the RRsets have the
985  * same signing time, prefer the other RRset over the SOA RRset.
986  */
987 static bool
988 resign_sooner(void *v1, void *v2) {
989 	rdatasetheader_t *h1 = v1;
990 	rdatasetheader_t *h2 = v2;
991 
992 	return (h1->resign < h2->resign ||
993 		(h1->resign == h2->resign && h1->resign_lsb < h2->resign_lsb) ||
994 		(h1->resign == h2->resign && h1->resign_lsb == h2->resign_lsb &&
995 		 h2->type == RBTDB_RDATATYPE_SIGSOA));
996 }
997 
998 /*%
999  * This function sets the heap index into the header.
1000  */
1001 static void
1002 set_index(void *what, unsigned int idx) {
1003 	rdatasetheader_t *h = what;
1004 
1005 	h->heap_index = idx;
1006 }
1007 
1008 /*%
1009  * Work out how many nodes can be deleted in the time between two
1010  * requests to the nameserver.  Smooth the resulting number and use it
1011  * as a estimate for the number of nodes to be deleted in the next
1012  * iteration.
1013  */
1014 static unsigned int
1015 adjust_quantum(unsigned int old, isc_time_t *start) {
1016 	unsigned int pps = dns_pps; /* packets per second */
1017 	unsigned int interval;
1018 	uint64_t usecs;
1019 	isc_time_t end;
1020 	unsigned int nodes;
1021 
1022 	if (pps < 100) {
1023 		pps = 100;
1024 	}
1025 	isc_time_now(&end);
1026 
1027 	interval = 1000000 / pps; /* interval in usec */
1028 	if (interval == 0) {
1029 		interval = 1;
1030 	}
1031 	usecs = isc_time_microdiff(&end, start);
1032 	if (usecs == 0) {
1033 		/*
1034 		 * We were unable to measure the amount of time taken.
1035 		 * Double the nodes deleted next time.
1036 		 */
1037 		old *= 2;
1038 		if (old > 1000) {
1039 			old = 1000;
1040 		}
1041 		return (old);
1042 	}
1043 	nodes = old * interval;
1044 	nodes /= (unsigned int)usecs;
1045 	if (nodes == 0) {
1046 		nodes = 1;
1047 	} else if (nodes > 1000) {
1048 		nodes = 1000;
1049 	}
1050 
1051 	/* Smooth */
1052 	nodes = (nodes + old * 3) / 4;
1053 
1054 	if (nodes != old) {
1055 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1056 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1057 			      "adjust_quantum: old=%d, new=%d", old, nodes);
1058 	}
1059 
1060 	return (nodes);
1061 }
1062 
1063 static void
1064 free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
1065 	unsigned int i;
1066 	isc_result_t result;
1067 	char buf[DNS_NAME_FORMATSIZE];
1068 	dns_rbt_t **treep;
1069 	isc_time_t start;
1070 
1071 	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
1072 		overmem((dns_db_t *)rbtdb, (bool)-1);
1073 	}
1074 
1075 	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
1076 	REQUIRE(rbtdb->future_version == NULL);
1077 
1078 	if (rbtdb->current_version != NULL) {
1079 		isc_refcount_decrementz(&rbtdb->current_version->references);
1080 		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
1081 		isc_rwlock_destroy(&rbtdb->current_version->glue_rwlock);
1082 		isc_refcount_destroy(&rbtdb->current_version->references);
1083 		isc_rwlock_destroy(&rbtdb->current_version->rwlock);
1084 		isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
1085 			    sizeof(rbtdb_version_t));
1086 	}
1087 
1088 	/*
1089 	 * We assume the number of remaining dead nodes is reasonably small;
1090 	 * the overhead of unlinking all nodes here should be negligible.
1091 	 */
1092 	for (i = 0; i < rbtdb->node_lock_count; i++) {
1093 		dns_rbtnode_t *node;
1094 
1095 		node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
1096 		while (node != NULL) {
1097 			ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
1098 			node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
1099 		}
1100 	}
1101 
1102 	if (event == NULL) {
1103 		rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
1104 	}
1105 
1106 	for (;;) {
1107 		/*
1108 		 * pick the next tree to (start to) destroy
1109 		 */
1110 		treep = &rbtdb->tree;
1111 		if (*treep == NULL) {
1112 			treep = &rbtdb->nsec;
1113 			if (*treep == NULL) {
1114 				treep = &rbtdb->nsec3;
1115 				/*
1116 				 * we're finished after clear cutting
1117 				 */
1118 				if (*treep == NULL) {
1119 					break;
1120 				}
1121 			}
1122 		}
1123 
1124 		isc_time_now(&start);
1125 		result = dns_rbt_destroy2(treep, rbtdb->quantum);
1126 		if (result == ISC_R_QUOTA) {
1127 			INSIST(rbtdb->task != NULL);
1128 			if (rbtdb->quantum != 0) {
1129 				rbtdb->quantum = adjust_quantum(rbtdb->quantum,
1130 								&start);
1131 			}
1132 			if (event == NULL) {
1133 				event = isc_event_allocate(
1134 					rbtdb->common.mctx, NULL,
1135 					DNS_EVENT_FREESTORAGE,
1136 					free_rbtdb_callback, rbtdb,
1137 					sizeof(isc_event_t));
1138 			}
1139 			isc_task_send(rbtdb->task, &event);
1140 			return;
1141 		}
1142 		INSIST(result == ISC_R_SUCCESS && *treep == NULL);
1143 	}
1144 
1145 	if (event != NULL) {
1146 		isc_event_free(&event);
1147 	}
1148 	if (log) {
1149 		if (dns_name_dynamic(&rbtdb->common.origin)) {
1150 			dns_name_format(&rbtdb->common.origin, buf,
1151 					sizeof(buf));
1152 		} else {
1153 			strlcpy(buf, "<UNKNOWN>", sizeof(buf));
1154 		}
1155 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1156 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1157 			      "done free_rbtdb(%s)", buf);
1158 	}
1159 	if (dns_name_dynamic(&rbtdb->common.origin)) {
1160 		dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
1161 	}
1162 	for (i = 0; i < rbtdb->node_lock_count; i++) {
1163 		isc_refcount_destroy(&rbtdb->node_locks[i].references);
1164 		NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
1165 	}
1166 
1167 	/*
1168 	 * Clean up LRU / re-signing order lists.
1169 	 */
1170 	if (rbtdb->rdatasets != NULL) {
1171 		for (i = 0; i < rbtdb->node_lock_count; i++) {
1172 			INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
1173 		}
1174 		isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
1175 			    rbtdb->node_lock_count *
1176 				    sizeof(rdatasetheaderlist_t));
1177 	}
1178 	/*
1179 	 * Clean up dead node buckets.
1180 	 */
1181 	if (rbtdb->deadnodes != NULL) {
1182 		for (i = 0; i < rbtdb->node_lock_count; i++) {
1183 			INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
1184 		}
1185 		isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
1186 			    rbtdb->node_lock_count * sizeof(rbtnodelist_t));
1187 	}
1188 	/*
1189 	 * Clean up heap objects.
1190 	 */
1191 	if (rbtdb->heaps != NULL) {
1192 		for (i = 0; i < rbtdb->node_lock_count; i++) {
1193 			isc_heap_destroy(&rbtdb->heaps[i]);
1194 		}
1195 		isc_mem_put(rbtdb->hmctx, rbtdb->heaps,
1196 			    rbtdb->node_lock_count * sizeof(isc_heap_t *));
1197 	}
1198 
1199 	if (rbtdb->rrsetstats != NULL) {
1200 		dns_stats_detach(&rbtdb->rrsetstats);
1201 	}
1202 	if (rbtdb->cachestats != NULL) {
1203 		isc_stats_detach(&rbtdb->cachestats);
1204 	}
1205 	if (rbtdb->gluecachestats != NULL) {
1206 		isc_stats_detach(&rbtdb->gluecachestats);
1207 	}
1208 
1209 	isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
1210 		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
1211 	isc_rwlock_destroy(&rbtdb->tree_lock);
1212 	isc_refcount_destroy(&rbtdb->references);
1213 	if (rbtdb->task != NULL) {
1214 		isc_task_detach(&rbtdb->task);
1215 	}
1216 	if (rbtdb->prunetask != NULL) {
1217 		isc_task_detach(&rbtdb->prunetask);
1218 	}
1219 
1220 	RBTDB_DESTROYLOCK(&rbtdb->lock);
1221 	rbtdb->common.magic = 0;
1222 	rbtdb->common.impmagic = 0;
1223 	isc_mem_detach(&rbtdb->hmctx);
1224 
1225 	INSIST(ISC_LIST_EMPTY(rbtdb->common.update_listeners));
1226 
1227 	isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
1228 }
1229 
1230 static void
1231 maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
1232 	bool want_free = false;
1233 	unsigned int i;
1234 	unsigned int inactive = 0;
1235 
1236 	/* XXX check for open versions here */
1237 
1238 	if (rbtdb->soanode != NULL) {
1239 		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
1240 	}
1241 	if (rbtdb->nsnode != NULL) {
1242 		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
1243 	}
1244 
1245 	/*
1246 	 * The current version's glue table needs to be freed early
1247 	 * so the nodes are dereferenced before we check the active
1248 	 * node count below.
1249 	 */
1250 	if (rbtdb->current_version != NULL) {
1251 		free_gluetable(rbtdb->current_version);
1252 	}
1253 
1254 	/*
1255 	 * Even though there are no external direct references, there still
1256 	 * may be nodes in use.
1257 	 */
1258 	for (i = 0; i < rbtdb->node_lock_count; i++) {
1259 		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
1260 		rbtdb->node_locks[i].exiting = true;
1261 		if (isc_refcount_current(&rbtdb->node_locks[i].references) == 0)
1262 		{
1263 			inactive++;
1264 		}
1265 		NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
1266 	}
1267 
1268 	if (inactive != 0) {
1269 		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1270 		rbtdb->active -= inactive;
1271 		if (rbtdb->active == 0) {
1272 			want_free = true;
1273 		}
1274 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1275 		if (want_free) {
1276 			char buf[DNS_NAME_FORMATSIZE];
1277 			if (dns_name_dynamic(&rbtdb->common.origin)) {
1278 				dns_name_format(&rbtdb->common.origin, buf,
1279 						sizeof(buf));
1280 			} else {
1281 				strlcpy(buf, "<UNKNOWN>", sizeof(buf));
1282 			}
1283 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1284 				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1285 				      "calling free_rbtdb(%s)", buf);
1286 			free_rbtdb(rbtdb, true, NULL);
1287 		}
1288 	}
1289 }
1290 
1291 static void
1292 detach(dns_db_t **dbp) {
1293 	REQUIRE(dbp != NULL && VALID_RBTDB((dns_rbtdb_t *)(*dbp)));
1294 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
1295 	*dbp = NULL;
1296 
1297 	if (isc_refcount_decrement(&rbtdb->references) == 1) {
1298 		maybe_free_rbtdb(rbtdb);
1299 	}
1300 }
1301 
1302 static void
1303 currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
1304 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1305 	rbtdb_version_t *version;
1306 
1307 	REQUIRE(VALID_RBTDB(rbtdb));
1308 
1309 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1310 	version = rbtdb->current_version;
1311 	isc_refcount_increment(&version->references);
1312 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1313 
1314 	*versionp = (dns_dbversion_t *)version;
1315 }
1316 
1317 static rbtdb_version_t *
1318 allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
1319 		 unsigned int references, bool writer) {
1320 	rbtdb_version_t *version;
1321 	size_t size;
1322 
1323 	version = isc_mem_get(mctx, sizeof(*version));
1324 	version->serial = serial;
1325 
1326 	isc_refcount_init(&version->references, references);
1327 	isc_rwlock_init(&version->glue_rwlock, 0, 0);
1328 
1329 	version->glue_table_bits = RBTDB_GLUE_TABLE_INIT_BITS;
1330 	version->glue_table_nodecount = 0U;
1331 
1332 	size = HASHSIZE(version->glue_table_bits) *
1333 	       sizeof(version->glue_table[0]);
1334 	version->glue_table = isc_mem_get(mctx, size);
1335 	memset(version->glue_table, 0, size);
1336 
1337 	version->writer = writer;
1338 	version->commit_ok = false;
1339 	ISC_LIST_INIT(version->changed_list);
1340 	ISC_LIST_INIT(version->resigned_list);
1341 	ISC_LINK_INIT(version, link);
1342 
1343 	return (version);
1344 }
1345 
1346 static isc_result_t
1347 newversion(dns_db_t *db, dns_dbversion_t **versionp) {
1348 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1349 	rbtdb_version_t *version;
1350 
1351 	REQUIRE(VALID_RBTDB(rbtdb));
1352 	REQUIRE(versionp != NULL && *versionp == NULL);
1353 	REQUIRE(rbtdb->future_version == NULL);
1354 
1355 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1356 	RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? */
1357 	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
1358 				   true);
1359 	version->rbtdb = rbtdb;
1360 	version->commit_ok = true;
1361 	version->secure = rbtdb->current_version->secure;
1362 	version->havensec3 = rbtdb->current_version->havensec3;
1363 	if (version->havensec3) {
1364 		version->flags = rbtdb->current_version->flags;
1365 		version->iterations = rbtdb->current_version->iterations;
1366 		version->hash = rbtdb->current_version->hash;
1367 		version->salt_length = rbtdb->current_version->salt_length;
1368 		memmove(version->salt, rbtdb->current_version->salt,
1369 			version->salt_length);
1370 	} else {
1371 		version->flags = 0;
1372 		version->iterations = 0;
1373 		version->hash = 0;
1374 		version->salt_length = 0;
1375 		memset(version->salt, 0, sizeof(version->salt));
1376 	}
1377 	isc_rwlock_init(&version->rwlock, 0, 0);
1378 	RWLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
1379 	version->records = rbtdb->current_version->records;
1380 	version->xfrsize = rbtdb->current_version->xfrsize;
1381 	RWUNLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
1382 	rbtdb->next_serial++;
1383 	rbtdb->future_version = version;
1384 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1385 
1386 	*versionp = version;
1387 
1388 	return (ISC_R_SUCCESS);
1389 }
1390 
1391 static void
1392 attachversion(dns_db_t *db, dns_dbversion_t *source,
1393 	      dns_dbversion_t **targetp) {
1394 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1395 	rbtdb_version_t *rbtversion = source;
1396 
1397 	REQUIRE(VALID_RBTDB(rbtdb));
1398 	INSIST(rbtversion != NULL && rbtversion->rbtdb == rbtdb);
1399 
1400 	isc_refcount_increment(&rbtversion->references);
1401 
1402 	*targetp = rbtversion;
1403 }
1404 
1405 static rbtdb_changed_t *
1406 add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version, dns_rbtnode_t *node) {
1407 	rbtdb_changed_t *changed;
1408 
1409 	/*
1410 	 * Caller must be holding the node lock if its reference must be
1411 	 * protected by the lock.
1412 	 */
1413 
1414 	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
1415 
1416 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1417 
1418 	REQUIRE(version->writer);
1419 
1420 	if (changed != NULL) {
1421 		isc_refcount_increment(&node->references);
1422 		changed->node = node;
1423 		changed->dirty = false;
1424 		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
1425 	} else {
1426 		version->commit_ok = false;
1427 	}
1428 
1429 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1430 
1431 	return (changed);
1432 }
1433 
1434 static void
1435 free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
1436 	if (dns_name_dynamic(&(*noqname)->name)) {
1437 		dns_name_free(&(*noqname)->name, mctx);
1438 	}
1439 	if ((*noqname)->neg != NULL) {
1440 		isc_mem_put(mctx, (*noqname)->neg,
1441 			    dns_rdataslab_size((*noqname)->neg, 0));
1442 	}
1443 	if ((*noqname)->negsig != NULL) {
1444 		isc_mem_put(mctx, (*noqname)->negsig,
1445 			    dns_rdataslab_size((*noqname)->negsig, 0));
1446 	}
1447 	isc_mem_put(mctx, *noqname, sizeof(**noqname));
1448 	*noqname = NULL;
1449 }
1450 
1451 static void
1452 init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h) {
1453 	ISC_LINK_INIT(h, link);
1454 	h->heap_index = 0;
1455 	atomic_init(&h->attributes, 0);
1456 	atomic_init(&h->last_refresh_fail_ts, 0);
1457 
1458 	STATIC_ASSERT((sizeof(h->attributes) == 2),
1459 		      "The .attributes field of rdatasetheader_t needs to be "
1460 		      "16-bit int type exactly.");
1461 
1462 #if TRACE_HEADER
1463 	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
1464 		fprintf(stderr, "initialized header: %p\n", h);
1465 	}
1466 #else  /* if TRACE_HEADER */
1467 	UNUSED(rbtdb);
1468 #endif /* if TRACE_HEADER */
1469 }
1470 
1471 static void
1472 update_newheader(rdatasetheader_t *newh, rdatasetheader_t *old) {
1473 	if (CASESET(old)) {
1474 		uint_least16_t attr = RDATASET_ATTR_GET(
1475 			old,
1476 			(RDATASET_ATTR_CASESET | RDATASET_ATTR_CASEFULLYLOWER));
1477 		RDATASET_ATTR_SET(newh, attr);
1478 		memmove(newh->upper, old->upper, sizeof(old->upper));
1479 	}
1480 }
1481 
1482 static rdatasetheader_t *
1483 new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx) {
1484 	rdatasetheader_t *h;
1485 
1486 	h = isc_mem_get(mctx, sizeof(*h));
1487 
1488 #if TRACE_HEADER
1489 	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
1490 		fprintf(stderr, "allocated header: %p\n", h);
1491 	}
1492 #endif /* if TRACE_HEADER */
1493 	memset(h->upper, 0xeb, sizeof(h->upper));
1494 	init_rdataset(rbtdb, h);
1495 	h->rdh_ttl = 0;
1496 	return (h);
1497 }
1498 
1499 static void
1500 free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset) {
1501 	unsigned int size;
1502 	int idx;
1503 
1504 	update_rrsetstats(rbtdb, rdataset->type,
1505 			  atomic_load_acquire(&rdataset->attributes), false);
1506 
1507 	idx = rdataset->node->locknum;
1508 	if (ISC_LINK_LINKED(rdataset, link)) {
1509 		INSIST(IS_CACHE(rbtdb));
1510 		ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, link);
1511 	}
1512 
1513 	if (rdataset->heap_index != 0) {
1514 		isc_heap_delete(rbtdb->heaps[idx], rdataset->heap_index);
1515 	}
1516 	rdataset->heap_index = 0;
1517 
1518 	if (rdataset->noqname != NULL) {
1519 		free_noqname(mctx, &rdataset->noqname);
1520 	}
1521 	if (rdataset->closest != NULL) {
1522 		free_noqname(mctx, &rdataset->closest);
1523 	}
1524 
1525 	if (NONEXISTENT(rdataset)) {
1526 		size = sizeof(*rdataset);
1527 	} else {
1528 		size = dns_rdataslab_size((unsigned char *)rdataset,
1529 					  sizeof(*rdataset));
1530 	}
1531 
1532 	isc_mem_put(mctx, rdataset, size);
1533 }
1534 
1535 static void
1536 rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
1537 	rdatasetheader_t *header, *dcurrent;
1538 	bool make_dirty = false;
1539 
1540 	/*
1541 	 * Caller must hold the node lock.
1542 	 */
1543 
1544 	/*
1545 	 * We set the IGNORE attribute on rdatasets with serial number
1546 	 * 'serial'.  When the reference count goes to zero, these rdatasets
1547 	 * will be cleaned up; until that time, they will be ignored.
1548 	 */
1549 	for (header = node->data; header != NULL; header = header->next) {
1550 		if (header->serial == serial) {
1551 			RDATASET_ATTR_SET(header, RDATASET_ATTR_IGNORE);
1552 			make_dirty = true;
1553 		}
1554 		for (dcurrent = header->down; dcurrent != NULL;
1555 		     dcurrent = dcurrent->down)
1556 		{
1557 			if (dcurrent->serial == serial) {
1558 				RDATASET_ATTR_SET(dcurrent,
1559 						  RDATASET_ATTR_IGNORE);
1560 				make_dirty = true;
1561 			}
1562 		}
1563 	}
1564 	if (make_dirty) {
1565 		node->dirty = 1;
1566 	}
1567 }
1568 
1569 static void
1570 mark_header_ancient(dns_rbtdb_t *rbtdb, rdatasetheader_t *header) {
1571 	uint_least16_t attributes = atomic_load_acquire(&header->attributes);
1572 	uint_least16_t newattributes = 0;
1573 
1574 	/*
1575 	 * If we are already ancient there is nothing to do.
1576 	 */
1577 	do {
1578 		if ((attributes & RDATASET_ATTR_ANCIENT) != 0) {
1579 			return;
1580 		}
1581 		newattributes = attributes | RDATASET_ATTR_ANCIENT;
1582 	} while (!atomic_compare_exchange_weak_acq_rel(
1583 		&header->attributes, &attributes, newattributes));
1584 
1585 	/*
1586 	 * Decrement the stats counter for the appropriate RRtype.
1587 	 * If the STALE attribute is set, this will decrement the
1588 	 * stale type counter, otherwise it decrements the active
1589 	 * stats type counter.
1590 	 */
1591 	update_rrsetstats(rbtdb, header->type, attributes, false);
1592 	header->node->dirty = 1;
1593 
1594 	/* Increment the stats counter for the ancient RRtype. */
1595 	update_rrsetstats(rbtdb, header->type, newattributes, true);
1596 }
1597 
1598 static void
1599 mark_header_stale(dns_rbtdb_t *rbtdb, rdatasetheader_t *header) {
1600 	uint_least16_t attributes = atomic_load_acquire(&header->attributes);
1601 	uint_least16_t newattributes = 0;
1602 
1603 	INSIST((attributes & RDATASET_ATTR_ZEROTTL) == 0);
1604 
1605 	/*
1606 	 * If we are already stale there is nothing to do.
1607 	 */
1608 	do {
1609 		if ((attributes & RDATASET_ATTR_STALE) != 0) {
1610 			return;
1611 		}
1612 		newattributes = attributes | RDATASET_ATTR_STALE;
1613 	} while (!atomic_compare_exchange_weak_acq_rel(
1614 		&header->attributes, &attributes, newattributes));
1615 
1616 	/* Decrement the stats counter for the appropriate RRtype.
1617 	 * If the ANCIENT attribute is set (although it is very
1618 	 * unlikely that an RRset goes from ANCIENT to STALE), this
1619 	 * will decrement the ancient stale type counter, otherwise it
1620 	 * decrements the active stats type counter.
1621 	 */
1622 
1623 	update_rrsetstats(rbtdb, header->type, attributes, false);
1624 	update_rrsetstats(rbtdb, header->type, newattributes, true);
1625 }
1626 
1627 static void
1628 clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx,
1629 		    rdatasetheader_t *top) {
1630 	rdatasetheader_t *d, *down_next;
1631 
1632 	for (d = top->down; d != NULL; d = down_next) {
1633 		down_next = d->down;
1634 		free_rdataset(rbtdb, mctx, d);
1635 	}
1636 	top->down = NULL;
1637 }
1638 
1639 static void
1640 clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1641 	rdatasetheader_t *current, *top_prev, *top_next;
1642 	isc_mem_t *mctx = rbtdb->common.mctx;
1643 
1644 	/*
1645 	 * Caller must be holding the node lock.
1646 	 */
1647 
1648 	top_prev = NULL;
1649 	for (current = node->data; current != NULL; current = top_next) {
1650 		top_next = current->next;
1651 		clean_stale_headers(rbtdb, mctx, current);
1652 		/*
1653 		 * If current is nonexistent, ancient, or stale and
1654 		 * we are not keeping stale, we can clean it up.
1655 		 */
1656 		if (NONEXISTENT(current) || ANCIENT(current) ||
1657 		    (STALE(current) && !KEEPSTALE(rbtdb)))
1658 		{
1659 			if (top_prev != NULL) {
1660 				top_prev->next = current->next;
1661 			} else {
1662 				node->data = current->next;
1663 			}
1664 			free_rdataset(rbtdb, mctx, current);
1665 		} else {
1666 			top_prev = current;
1667 		}
1668 	}
1669 	node->dirty = 0;
1670 }
1671 
1672 static void
1673 clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1674 		rbtdb_serial_t least_serial) {
1675 	rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
1676 	rdatasetheader_t *top_prev, *top_next;
1677 	isc_mem_t *mctx = rbtdb->common.mctx;
1678 	bool still_dirty = false;
1679 
1680 	/*
1681 	 * Caller must be holding the node lock.
1682 	 */
1683 	REQUIRE(least_serial != 0);
1684 
1685 	top_prev = NULL;
1686 	for (current = node->data; current != NULL; current = top_next) {
1687 		top_next = current->next;
1688 
1689 		/*
1690 		 * First, we clean up any instances of multiple rdatasets
1691 		 * with the same serial number, or that have the IGNORE
1692 		 * attribute.
1693 		 */
1694 		dparent = current;
1695 		for (dcurrent = current->down; dcurrent != NULL;
1696 		     dcurrent = down_next)
1697 		{
1698 			down_next = dcurrent->down;
1699 			INSIST(dcurrent->serial <= dparent->serial);
1700 			if (dcurrent->serial == dparent->serial ||
1701 			    IGNORE(dcurrent))
1702 			{
1703 				if (down_next != NULL) {
1704 					down_next->next = dparent;
1705 				}
1706 				dparent->down = down_next;
1707 				free_rdataset(rbtdb, mctx, dcurrent);
1708 			} else {
1709 				dparent = dcurrent;
1710 			}
1711 		}
1712 
1713 		/*
1714 		 * We've now eliminated all IGNORE datasets with the possible
1715 		 * exception of current, which we now check.
1716 		 */
1717 		if (IGNORE(current)) {
1718 			down_next = current->down;
1719 			if (down_next == NULL) {
1720 				if (top_prev != NULL) {
1721 					top_prev->next = current->next;
1722 				} else {
1723 					node->data = current->next;
1724 				}
1725 				free_rdataset(rbtdb, mctx, current);
1726 				/*
1727 				 * current no longer exists, so we can
1728 				 * just continue with the loop.
1729 				 */
1730 				continue;
1731 			} else {
1732 				/*
1733 				 * Pull up current->down, making it the new
1734 				 * current.
1735 				 */
1736 				if (top_prev != NULL) {
1737 					top_prev->next = down_next;
1738 				} else {
1739 					node->data = down_next;
1740 				}
1741 				down_next->next = top_next;
1742 				free_rdataset(rbtdb, mctx, current);
1743 				current = down_next;
1744 			}
1745 		}
1746 
1747 		/*
1748 		 * We now try to find the first down node less than the
1749 		 * least serial.
1750 		 */
1751 		dparent = current;
1752 		for (dcurrent = current->down; dcurrent != NULL;
1753 		     dcurrent = down_next)
1754 		{
1755 			down_next = dcurrent->down;
1756 			if (dcurrent->serial < least_serial) {
1757 				break;
1758 			}
1759 			dparent = dcurrent;
1760 		}
1761 
1762 		/*
1763 		 * If there is a such an rdataset, delete it and any older
1764 		 * versions.
1765 		 */
1766 		if (dcurrent != NULL) {
1767 			do {
1768 				down_next = dcurrent->down;
1769 				INSIST(dcurrent->serial <= least_serial);
1770 				free_rdataset(rbtdb, mctx, dcurrent);
1771 				dcurrent = down_next;
1772 			} while (dcurrent != NULL);
1773 			dparent->down = NULL;
1774 		}
1775 
1776 		/*
1777 		 * Note.  The serial number of 'current' might be less than
1778 		 * least_serial too, but we cannot delete it because it is
1779 		 * the most recent version, unless it is a NONEXISTENT
1780 		 * rdataset.
1781 		 */
1782 		if (current->down != NULL) {
1783 			still_dirty = true;
1784 			top_prev = current;
1785 		} else {
1786 			/*
1787 			 * If this is a NONEXISTENT rdataset, we can delete it.
1788 			 */
1789 			if (NONEXISTENT(current)) {
1790 				if (top_prev != NULL) {
1791 					top_prev->next = current->next;
1792 				} else {
1793 					node->data = current->next;
1794 				}
1795 				free_rdataset(rbtdb, mctx, current);
1796 			} else {
1797 				top_prev = current;
1798 			}
1799 		}
1800 	}
1801 	if (!still_dirty) {
1802 		node->dirty = 0;
1803 	}
1804 }
1805 
1806 /*
1807  * tree_lock(write) must be held.
1808  */
1809 static void
1810 delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1811 	dns_rbtnode_t *nsecnode;
1812 	dns_fixedname_t fname;
1813 	dns_name_t *name;
1814 	isc_result_t result = ISC_R_UNEXPECTED;
1815 
1816 	INSIST(!ISC_LINK_LINKED(node, deadlink));
1817 
1818 	if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
1819 		char printname[DNS_NAME_FORMATSIZE];
1820 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1821 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1822 			      "delete_node(): %p %s (bucket %d)", node,
1823 			      dns_rbt_formatnodename(node, printname,
1824 						     sizeof(printname)),
1825 			      node->locknum);
1826 	}
1827 
1828 	switch (node->nsec) {
1829 	case DNS_RBT_NSEC_NORMAL:
1830 		result = dns_rbt_deletenode(rbtdb->tree, node, false);
1831 		break;
1832 	case DNS_RBT_NSEC_HAS_NSEC:
1833 		/*
1834 		 * Though this may be wasteful, it has to be done before
1835 		 * node is deleted.
1836 		 */
1837 		name = dns_fixedname_initname(&fname);
1838 		dns_rbt_fullnamefromnode(node, name);
1839 		/*
1840 		 * Delete the corresponding node from the auxiliary NSEC
1841 		 * tree before deleting from the main tree.
1842 		 */
1843 		nsecnode = NULL;
1844 		result = dns_rbt_findnode(rbtdb->nsec, name, NULL, &nsecnode,
1845 					  NULL, DNS_RBTFIND_EMPTYDATA, NULL,
1846 					  NULL);
1847 		if (result != ISC_R_SUCCESS) {
1848 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1849 				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1850 				      "delete_node: "
1851 				      "dns_rbt_findnode(nsec): %s",
1852 				      isc_result_totext(result));
1853 		} else {
1854 			result = dns_rbt_deletenode(rbtdb->nsec, nsecnode,
1855 						    false);
1856 			if (result != ISC_R_SUCCESS) {
1857 				isc_log_write(
1858 					dns_lctx, DNS_LOGCATEGORY_DATABASE,
1859 					DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1860 					"delete_node(): "
1861 					"dns_rbt_deletenode(nsecnode): %s",
1862 					isc_result_totext(result));
1863 			}
1864 		}
1865 		result = dns_rbt_deletenode(rbtdb->tree, node, false);
1866 		break;
1867 	case DNS_RBT_NSEC_NSEC:
1868 		result = dns_rbt_deletenode(rbtdb->nsec, node, false);
1869 		break;
1870 	case DNS_RBT_NSEC_NSEC3:
1871 		result = dns_rbt_deletenode(rbtdb->nsec3, node, false);
1872 		break;
1873 	}
1874 	if (result != ISC_R_SUCCESS) {
1875 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1876 			      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1877 			      "delete_node(): "
1878 			      "dns_rbt_deletenode: %s",
1879 			      isc_result_totext(result));
1880 	}
1881 }
1882 
1883 /*
1884  * Caller must be holding the node lock.
1885  */
1886 static void
1887 new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1888 	      isc_rwlocktype_t nlocktype) {
1889 	if (nlocktype == isc_rwlocktype_write &&
1890 	    ISC_LINK_LINKED(node, deadlink))
1891 	{
1892 		ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
1893 				deadlink);
1894 	}
1895 	if (isc_refcount_increment0(&node->references) == 0) {
1896 		/* this is the first reference to the node */
1897 		isc_refcount_increment0(
1898 			&rbtdb->node_locks[node->locknum].references);
1899 	}
1900 }
1901 
1902 /*%
1903  * The tree lock must be held for the result to be valid.
1904  */
1905 static bool
1906 is_last_node_on_its_level(dns_rbtnode_t *node) {
1907 	return (node->parent != NULL && node->parent->down == node &&
1908 		node->left == NULL && node->right == NULL);
1909 }
1910 
1911 static void
1912 send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1913 		   isc_rwlocktype_t nlocktype) {
1914 	isc_event_t *ev;
1915 	dns_db_t *db;
1916 
1917 	ev = isc_event_allocate(rbtdb->common.mctx, NULL, DNS_EVENT_RBTPRUNE,
1918 				prune_tree, node, sizeof(isc_event_t));
1919 	new_reference(rbtdb, node, nlocktype);
1920 	db = NULL;
1921 	attach((dns_db_t *)rbtdb, &db);
1922 	ev->ev_sender = db;
1923 	isc_task_send(rbtdb->prunetask, &ev);
1924 }
1925 
1926 /*%
1927  * Clean up dead nodes.  These are nodes which have no references, and
1928  * have no data.  They are dead but we could not or chose not to delete
1929  * them when we deleted all the data at that node because we did not want
1930  * to wait for the tree write lock.
1931  *
1932  * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1933  */
1934 static void
1935 cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
1936 	dns_rbtnode_t *node;
1937 	int count = 10; /* XXXJT: should be adjustable */
1938 
1939 	node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1940 	while (node != NULL && count > 0) {
1941 		ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1942 
1943 		/*
1944 		 * We might have reactivated this node without a tree write
1945 		 * lock, so we couldn't remove this node from deadnodes then
1946 		 * and we have to do it now.
1947 		 */
1948 		if (isc_refcount_current(&node->references) != 0 ||
1949 		    node->data != NULL)
1950 		{
1951 			node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1952 			count--;
1953 			continue;
1954 		}
1955 
1956 		if (is_last_node_on_its_level(node) && rbtdb->task != NULL) {
1957 			send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
1958 		} else if (node->down == NULL && node->data == NULL) {
1959 			/*
1960 			 * Not a interior node and not needing to be
1961 			 * reactivated.
1962 			 */
1963 			delete_node(rbtdb, node);
1964 		} else if (node->data == NULL) {
1965 			/*
1966 			 * A interior node without data. Leave linked to
1967 			 * to be cleaned up when node->down becomes NULL.
1968 			 */
1969 			ISC_LIST_APPEND(rbtdb->deadnodes[bucketnum], node,
1970 					deadlink);
1971 		}
1972 		node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1973 		count--;
1974 	}
1975 }
1976 
1977 /*
1978  * This function is assumed to be called when a node is newly referenced
1979  * and can be in the deadnode list.  In that case the node must be retrieved
1980  * from the list because it is going to be used.  In addition, if the caller
1981  * happens to hold a write lock on the tree, it's a good chance to purge dead
1982  * nodes.
1983  * Note: while a new reference is gained in multiple places, there are only very
1984  * few cases where the node can be in the deadnode list (only empty nodes can
1985  * have been added to the list).
1986  */
1987 static void
1988 reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1989 		isc_rwlocktype_t treelocktype) {
1990 	isc_rwlocktype_t locktype = isc_rwlocktype_read;
1991 	nodelock_t *nodelock = &rbtdb->node_locks[node->locknum].lock;
1992 	bool maybe_cleanup = false;
1993 
1994 	POST(locktype);
1995 
1996 	NODE_LOCK(nodelock, locktype);
1997 
1998 	/*
1999 	 * Check if we can possibly cleanup the dead node.  If so, upgrade
2000 	 * the node lock below to perform the cleanup.
2001 	 */
2002 	if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
2003 	    treelocktype == isc_rwlocktype_write)
2004 	{
2005 		maybe_cleanup = true;
2006 	}
2007 
2008 	if (ISC_LINK_LINKED(node, deadlink) || maybe_cleanup) {
2009 		/*
2010 		 * Upgrade the lock and test if we still need to unlink.
2011 		 */
2012 		NODE_UNLOCK(nodelock, locktype);
2013 		locktype = isc_rwlocktype_write;
2014 		POST(locktype);
2015 		NODE_LOCK(nodelock, locktype);
2016 		if (ISC_LINK_LINKED(node, deadlink)) {
2017 			ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
2018 					deadlink);
2019 		}
2020 		if (maybe_cleanup) {
2021 			cleanup_dead_nodes(rbtdb, node->locknum);
2022 		}
2023 	}
2024 
2025 	new_reference(rbtdb, node, locktype);
2026 
2027 	NODE_UNLOCK(nodelock, locktype);
2028 }
2029 
2030 /*
2031  * Caller must be holding the node lock; either the "strong", read or write
2032  * lock.  Note that the lock must be held even when node references are
2033  * atomically modified; in that case the decrement operation itself does not
2034  * have to be protected, but we must avoid a race condition where multiple
2035  * threads are decreasing the reference to zero simultaneously and at least
2036  * one of them is going to free the node.
2037  *
2038  * This function returns true if and only if the node reference decreases
2039  * to zero.
2040  *
2041  * NOTE: Decrementing the reference count of a node to zero does not mean it
2042  * will be immediately freed.
2043  */
2044 static bool
2045 decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2046 		    rbtdb_serial_t least_serial, isc_rwlocktype_t nlock,
2047 		    isc_rwlocktype_t tlock, bool pruning) {
2048 	isc_result_t result;
2049 	bool write_locked;
2050 	bool locked = tlock != isc_rwlocktype_none;
2051 	rbtdb_nodelock_t *nodelock;
2052 	int bucket = node->locknum;
2053 	bool no_reference = true;
2054 	uint_fast32_t refs;
2055 
2056 	nodelock = &rbtdb->node_locks[bucket];
2057 
2058 #define KEEP_NODE(n, r, l)                                  \
2059 	((n)->data != NULL || ((l) && (n)->down != NULL) || \
2060 	 (n) == (r)->origin_node || (n) == (r)->nsec3_origin_node)
2061 
2062 	/* Handle easy and typical case first. */
2063 	if (!node->dirty && KEEP_NODE(node, rbtdb, locked)) {
2064 		if (isc_refcount_decrement(&node->references) == 1) {
2065 			refs = isc_refcount_decrement(&nodelock->references);
2066 			INSIST(refs > 0);
2067 			return (true);
2068 		} else {
2069 			return (false);
2070 		}
2071 	}
2072 
2073 	/* Upgrade the lock? */
2074 	if (nlock == isc_rwlocktype_read) {
2075 		NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
2076 		NODE_LOCK(&nodelock->lock, isc_rwlocktype_write);
2077 	}
2078 
2079 	if (isc_refcount_decrement(&node->references) > 1) {
2080 		/* Restore the lock? */
2081 		if (nlock == isc_rwlocktype_read) {
2082 			NODE_DOWNGRADE(&nodelock->lock);
2083 		}
2084 		return (false);
2085 	}
2086 
2087 	if (node->dirty) {
2088 		if (IS_CACHE(rbtdb)) {
2089 			clean_cache_node(rbtdb, node);
2090 		} else {
2091 			if (least_serial == 0) {
2092 				/*
2093 				 * Caller doesn't know the least serial.
2094 				 * Get it.
2095 				 */
2096 				RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
2097 				least_serial = rbtdb->least_serial;
2098 				RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
2099 			}
2100 			clean_zone_node(rbtdb, node, least_serial);
2101 		}
2102 	}
2103 
2104 	/*
2105 	 * Attempt to switch to a write lock on the tree.  If this fails,
2106 	 * we will add this node to a linked list of nodes in this locking
2107 	 * bucket which we will free later.
2108 	 */
2109 	if (tlock != isc_rwlocktype_write) {
2110 		/*
2111 		 * Locking hierarchy notwithstanding, we don't need to free
2112 		 * the node lock before acquiring the tree write lock because
2113 		 * we only do a trylock.
2114 		 */
2115 		if (tlock == isc_rwlocktype_read) {
2116 			result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
2117 		} else {
2118 			result = isc_rwlock_trylock(&rbtdb->tree_lock,
2119 						    isc_rwlocktype_write);
2120 		}
2121 		RUNTIME_CHECK(result == ISC_R_SUCCESS ||
2122 			      result == ISC_R_LOCKBUSY);
2123 
2124 		write_locked = (result == ISC_R_SUCCESS);
2125 	} else {
2126 		write_locked = true;
2127 	}
2128 
2129 	refs = isc_refcount_decrement(&nodelock->references);
2130 	INSIST(refs > 0);
2131 
2132 	if (KEEP_NODE(node, rbtdb, locked || write_locked)) {
2133 		goto restore_locks;
2134 	}
2135 
2136 #undef KEEP_NODE
2137 
2138 	if (write_locked) {
2139 		/*
2140 		 * If this node is the only one left on its RBTDB level,
2141 		 * attempt pruning the RBTDB (i.e. deleting empty nodes that
2142 		 * are ancestors of 'node' and are not interior nodes) starting
2143 		 * from this node (see prune_tree()).  The main reason this is
2144 		 * not done immediately, but asynchronously, is that the
2145 		 * ancestors of 'node' are almost guaranteed to belong to
2146 		 * different node buckets and we don't want to do juggle locks
2147 		 * right now.
2148 		 *
2149 		 * Since prune_tree() also calls decrement_reference(), check
2150 		 * the value of the 'pruning' parameter (which is only set to
2151 		 * 'true' in the decrement_reference() call present in
2152 		 * prune_tree()) to prevent an infinite loop and to allow a
2153 		 * node sent to prune_tree() to be deleted by the delete_node()
2154 		 * call in the code branch below.
2155 		 */
2156 		if (!pruning && is_last_node_on_its_level(node) &&
2157 		    rbtdb->task != NULL)
2158 		{
2159 			send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
2160 			no_reference = false;
2161 		} else {
2162 			/*
2163 			 * The node can now be deleted.
2164 			 */
2165 			delete_node(rbtdb, node);
2166 		}
2167 	} else {
2168 		INSIST(node->data == NULL);
2169 		if (!ISC_LINK_LINKED(node, deadlink)) {
2170 			ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
2171 					deadlink);
2172 		}
2173 	}
2174 
2175 restore_locks:
2176 	/* Restore the lock? */
2177 	if (nlock == isc_rwlocktype_read) {
2178 		NODE_DOWNGRADE(&nodelock->lock);
2179 	}
2180 
2181 	/*
2182 	 * Relock a read lock, or unlock the write lock if no lock was held.
2183 	 */
2184 	if (tlock == isc_rwlocktype_none) {
2185 		if (write_locked) {
2186 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2187 		}
2188 	}
2189 
2190 	if (tlock == isc_rwlocktype_read) {
2191 		if (write_locked) {
2192 			isc_rwlock_downgrade(&rbtdb->tree_lock);
2193 		}
2194 	}
2195 
2196 	return (no_reference);
2197 }
2198 
2199 /*
2200  * Prune the RBTDB tree of trees.  Start by attempting to delete a node that is
2201  * the only one left on its RBTDB level (see the send_to_prune_tree() call in
2202  * decrement_reference()).  Then, if the node has a parent (which can either
2203  * exist on the same RBTDB level or on an upper RBTDB level), check whether the
2204  * latter is an interior node (i.e. a node with a non-NULL 'down' pointer).  If
2205  * the parent node is not an interior node, attempt deleting the parent node as
2206  * well and then move on to examining the parent node's parent, etc.  Continue
2207  * traversing the RBTDB tree until a node is encountered that is still an
2208  * interior node after the previously-processed node gets deleted.
2209  *
2210  * It is acceptable for a node sent to this function to NOT be deleted in the
2211  * process (e.g. if it gets reactivated in the meantime).  Furthermore, node
2212  * deletion is not a prerequisite for continuing RBTDB traversal.
2213  *
2214  * This function gets called once for every "starting node" and it continues
2215  * traversing the RBTDB until the stop condition is met.  In the worst case,
2216  * the number of nodes processed by a single execution of this function is the
2217  * number of tree levels, which is at most the maximum number of domain name
2218  * labels (127); however, it should be much smaller in practice and deleting
2219  * empty RBTDB nodes is critical to keeping the amount of memory used by the
2220  * cache memory context within the configured limit anyway.
2221  */
2222 static void
2223 prune_tree(isc_task_t *task, isc_event_t *event) {
2224 	dns_rbtdb_t *rbtdb = event->ev_sender;
2225 	dns_rbtnode_t *node = event->ev_arg;
2226 	dns_rbtnode_t *parent = NULL;
2227 	unsigned int locknum = node->locknum;
2228 
2229 	UNUSED(task);
2230 
2231 	isc_event_free(&event);
2232 
2233 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2234 	NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
2235 	do {
2236 		parent = node->parent;
2237 		decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
2238 				    isc_rwlocktype_write, true);
2239 
2240 		/*
2241 		 * Check whether the parent is an interior node.  Note that it
2242 		 * might have been one before the decrement_reference() call on
2243 		 * the previous line, but decrementing the reference count for
2244 		 * 'node' could have caused 'node->parent->down' to become
2245 		 * NULL.
2246 		 */
2247 		if (parent != NULL && parent->down == NULL) {
2248 			/*
2249 			 * Keep the node lock if possible; otherwise, release
2250 			 * the old lock and acquire one for the parent.
2251 			 */
2252 			if (parent->locknum != locknum) {
2253 				NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
2254 					    isc_rwlocktype_write);
2255 				locknum = parent->locknum;
2256 				NODE_LOCK(&rbtdb->node_locks[locknum].lock,
2257 					  isc_rwlocktype_write);
2258 			}
2259 
2260 			/*
2261 			 * We need to gain a reference to the parent node
2262 			 * before decrementing it in the next iteration.
2263 			 */
2264 			new_reference(rbtdb, parent, isc_rwlocktype_write);
2265 		} else {
2266 			parent = NULL;
2267 		}
2268 
2269 		node = parent;
2270 	} while (node != NULL);
2271 	NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
2272 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2273 
2274 	detach((dns_db_t **)(void *)&rbtdb);
2275 }
2276 
2277 static void
2278 make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
2279 		   rbtdb_changedlist_t *cleanup_list) {
2280 	/*
2281 	 * Caller must be holding the database lock.
2282 	 */
2283 
2284 	rbtdb->least_serial = version->serial;
2285 	*cleanup_list = version->changed_list;
2286 	ISC_LIST_INIT(version->changed_list);
2287 }
2288 
2289 static void
2290 cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
2291 	rbtdb_changed_t *changed, *next_changed;
2292 
2293 	/*
2294 	 * If the changed record is dirty, then
2295 	 * an update created multiple versions of
2296 	 * a given rdataset.  We keep this list
2297 	 * until we're the least open version, at
2298 	 * which point it's safe to get rid of any
2299 	 * older versions.
2300 	 *
2301 	 * If the changed record isn't dirty, then
2302 	 * we don't need it anymore since we're
2303 	 * committing and not rolling back.
2304 	 *
2305 	 * The caller must be holding the database lock.
2306 	 */
2307 	for (changed = HEAD(version->changed_list); changed != NULL;
2308 	     changed = next_changed)
2309 	{
2310 		next_changed = NEXT(changed, link);
2311 		if (!changed->dirty) {
2312 			UNLINK(version->changed_list, changed, link);
2313 			APPEND(*cleanup_list, changed, link);
2314 		}
2315 	}
2316 }
2317 
2318 static void
2319 iszonesecure(dns_db_t *db, rbtdb_version_t *version, dns_dbnode_t *origin) {
2320 	dns_rdataset_t keyset;
2321 	dns_rdataset_t nsecset, signsecset;
2322 	bool haszonekey = false;
2323 	bool hasnsec = false;
2324 	isc_result_t result;
2325 
2326 	REQUIRE(version != NULL);
2327 
2328 	dns_rdataset_init(&keyset);
2329 	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_dnskey,
2330 				     0, 0, &keyset, NULL);
2331 	if (result == ISC_R_SUCCESS) {
2332 		result = dns_rdataset_first(&keyset);
2333 		while (result == ISC_R_SUCCESS) {
2334 			dns_rdata_t keyrdata = DNS_RDATA_INIT;
2335 			dns_rdataset_current(&keyset, &keyrdata);
2336 			if (dns_zonekey_iszonekey(&keyrdata)) {
2337 				haszonekey = true;
2338 				break;
2339 			}
2340 			result = dns_rdataset_next(&keyset);
2341 		}
2342 		dns_rdataset_disassociate(&keyset);
2343 	}
2344 	if (!haszonekey) {
2345 		version->secure = dns_db_insecure;
2346 		version->havensec3 = false;
2347 		return;
2348 	}
2349 
2350 	dns_rdataset_init(&nsecset);
2351 	dns_rdataset_init(&signsecset);
2352 	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_nsec, 0,
2353 				     0, &nsecset, &signsecset);
2354 	if (result == ISC_R_SUCCESS) {
2355 		if (dns_rdataset_isassociated(&signsecset)) {
2356 			hasnsec = true;
2357 			dns_rdataset_disassociate(&signsecset);
2358 		}
2359 		dns_rdataset_disassociate(&nsecset);
2360 	}
2361 
2362 	setnsec3parameters(db, version);
2363 
2364 	/*
2365 	 * Do we have a valid NSEC/NSEC3 chain?
2366 	 */
2367 	if (version->havensec3 || hasnsec) {
2368 		version->secure = dns_db_secure;
2369 	} else {
2370 		version->secure = dns_db_insecure;
2371 	}
2372 }
2373 
2374 /*%<
2375  * Walk the origin node looking for NSEC3PARAM records.
2376  * Cache the nsec3 parameters.
2377  */
2378 static void
2379 setnsec3parameters(dns_db_t *db, rbtdb_version_t *version) {
2380 	dns_rbtnode_t *node;
2381 	dns_rdata_nsec3param_t nsec3param;
2382 	dns_rdata_t rdata = DNS_RDATA_INIT;
2383 	isc_region_t region;
2384 	isc_result_t result;
2385 	rdatasetheader_t *header, *header_next;
2386 	unsigned char *raw; /* RDATASLAB */
2387 	unsigned int count, length;
2388 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2389 
2390 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
2391 	version->havensec3 = false;
2392 	node = rbtdb->origin_node;
2393 	NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2394 		  isc_rwlocktype_read);
2395 	for (header = node->data; header != NULL; header = header_next) {
2396 		header_next = header->next;
2397 		do {
2398 			if (header->serial <= version->serial &&
2399 			    !IGNORE(header))
2400 			{
2401 				if (NONEXISTENT(header)) {
2402 					header = NULL;
2403 				}
2404 				break;
2405 			} else {
2406 				header = header->down;
2407 			}
2408 		} while (header != NULL);
2409 
2410 		if (header != NULL &&
2411 		    (header->type == dns_rdatatype_nsec3param))
2412 		{
2413 			/*
2414 			 * Find A NSEC3PARAM with a supported algorithm.
2415 			 */
2416 			raw = (unsigned char *)header + sizeof(*header);
2417 			count = raw[0] * 256 + raw[1]; /* count */
2418 			raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
2419 			while (count-- > 0U) {
2420 				length = raw[0] * 256 + raw[1];
2421 				raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
2422 				region.base = raw;
2423 				region.length = length;
2424 				raw += length;
2425 				dns_rdata_fromregion(
2426 					&rdata, rbtdb->common.rdclass,
2427 					dns_rdatatype_nsec3param, &region);
2428 				result = dns_rdata_tostruct(&rdata, &nsec3param,
2429 							    NULL);
2430 				INSIST(result == ISC_R_SUCCESS);
2431 				dns_rdata_reset(&rdata);
2432 
2433 				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG &&
2434 				    !dns_nsec3_supportedhash(nsec3param.hash))
2435 				{
2436 					continue;
2437 				}
2438 
2439 				if (nsec3param.flags != 0) {
2440 					continue;
2441 				}
2442 
2443 				memmove(version->salt, nsec3param.salt,
2444 					nsec3param.salt_length);
2445 				version->hash = nsec3param.hash;
2446 				version->salt_length = nsec3param.salt_length;
2447 				version->iterations = nsec3param.iterations;
2448 				version->flags = nsec3param.flags;
2449 				version->havensec3 = true;
2450 				/*
2451 				 * Look for a better algorithm than the
2452 				 * unknown test algorithm.
2453 				 */
2454 				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG) {
2455 					goto unlock;
2456 				}
2457 			}
2458 		}
2459 	}
2460 unlock:
2461 	NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2462 		    isc_rwlocktype_read);
2463 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
2464 }
2465 
2466 static void
2467 cleanup_dead_nodes_callback(isc_task_t *task, isc_event_t *event) {
2468 	dns_rbtdb_t *rbtdb = event->ev_arg;
2469 	bool again = false;
2470 	unsigned int locknum;
2471 
2472 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2473 	for (locknum = 0; locknum < rbtdb->node_lock_count; locknum++) {
2474 		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
2475 			  isc_rwlocktype_write);
2476 		cleanup_dead_nodes(rbtdb, locknum);
2477 		if (ISC_LIST_HEAD(rbtdb->deadnodes[locknum]) != NULL) {
2478 			again = true;
2479 		}
2480 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
2481 			    isc_rwlocktype_write);
2482 	}
2483 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2484 	if (again) {
2485 		isc_task_send(task, &event);
2486 	} else {
2487 		isc_event_free(&event);
2488 		if (isc_refcount_decrement(&rbtdb->references) == 1) {
2489 			(void)isc_refcount_current(&rbtdb->references);
2490 			maybe_free_rbtdb(rbtdb);
2491 		}
2492 	}
2493 }
2494 
2495 static void
2496 closeversion(dns_db_t *db, dns_dbversion_t **versionp, bool commit) {
2497 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2498 	rbtdb_version_t *version, *cleanup_version, *least_greater;
2499 	bool rollback = false;
2500 	rbtdb_changedlist_t cleanup_list;
2501 	rdatasetheaderlist_t resigned_list;
2502 	rbtdb_changed_t *changed, *next_changed;
2503 	rbtdb_serial_t serial, least_serial;
2504 	dns_rbtnode_t *rbtnode;
2505 	rdatasetheader_t *header;
2506 
2507 	REQUIRE(VALID_RBTDB(rbtdb));
2508 	version = (rbtdb_version_t *)*versionp;
2509 	INSIST(version->rbtdb == rbtdb);
2510 
2511 	cleanup_version = NULL;
2512 	ISC_LIST_INIT(cleanup_list);
2513 	ISC_LIST_INIT(resigned_list);
2514 
2515 	if (isc_refcount_decrement(&version->references) > 1) {
2516 		/* typical and easy case first */
2517 		if (commit) {
2518 			RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
2519 			INSIST(!version->writer);
2520 			RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
2521 		}
2522 		goto end;
2523 	}
2524 
2525 	/*
2526 	 * Update the zone's secure status in version before making
2527 	 * it the current version.
2528 	 */
2529 	if (version->writer && commit && !IS_CACHE(rbtdb)) {
2530 		iszonesecure(db, version, rbtdb->origin_node);
2531 	}
2532 
2533 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
2534 	serial = version->serial;
2535 	if (version->writer) {
2536 		if (commit) {
2537 			unsigned cur_ref;
2538 			rbtdb_version_t *cur_version;
2539 
2540 			INSIST(version->commit_ok);
2541 			INSIST(version == rbtdb->future_version);
2542 			/*
2543 			 * The current version is going to be replaced.
2544 			 * Release the (likely last) reference to it from the
2545 			 * DB itself and unlink it from the open list.
2546 			 */
2547 			cur_version = rbtdb->current_version;
2548 			cur_ref = isc_refcount_decrement(
2549 				&cur_version->references);
2550 			if (cur_ref == 1) {
2551 				(void)isc_refcount_current(
2552 					&cur_version->references);
2553 				if (cur_version->serial == rbtdb->least_serial)
2554 				{
2555 					INSIST(EMPTY(
2556 						cur_version->changed_list));
2557 				}
2558 				UNLINK(rbtdb->open_versions, cur_version, link);
2559 			}
2560 			if (EMPTY(rbtdb->open_versions)) {
2561 				/*
2562 				 * We're going to become the least open
2563 				 * version.
2564 				 */
2565 				make_least_version(rbtdb, version,
2566 						   &cleanup_list);
2567 			} else {
2568 				/*
2569 				 * Some other open version is the
2570 				 * least version.  We can't cleanup
2571 				 * records that were changed in this
2572 				 * version because the older versions
2573 				 * may still be in use by an open
2574 				 * version.
2575 				 *
2576 				 * We can, however, discard the
2577 				 * changed records for things that
2578 				 * we've added that didn't exist in
2579 				 * prior versions.
2580 				 */
2581 				cleanup_nondirty(version, &cleanup_list);
2582 			}
2583 			/*
2584 			 * If the (soon to be former) current version
2585 			 * isn't being used by anyone, we can clean
2586 			 * it up.
2587 			 */
2588 			if (cur_ref == 1) {
2589 				cleanup_version = cur_version;
2590 				APPENDLIST(version->changed_list,
2591 					   cleanup_version->changed_list, link);
2592 			}
2593 			/*
2594 			 * Become the current version.
2595 			 */
2596 			version->writer = false;
2597 			rbtdb->current_version = version;
2598 			rbtdb->current_serial = version->serial;
2599 			rbtdb->future_version = NULL;
2600 
2601 			/*
2602 			 * Keep the current version in the open list, and
2603 			 * gain a reference for the DB itself (see the DB
2604 			 * creation function below).  This must be the only
2605 			 * case where we need to increment the counter from
2606 			 * zero and need to use isc_refcount_increment0().
2607 			 */
2608 			INSIST(isc_refcount_increment0(&version->references) ==
2609 			       0);
2610 			PREPEND(rbtdb->open_versions, rbtdb->current_version,
2611 				link);
2612 			resigned_list = version->resigned_list;
2613 			ISC_LIST_INIT(version->resigned_list);
2614 		} else {
2615 			/*
2616 			 * We're rolling back this transaction.
2617 			 */
2618 			cleanup_list = version->changed_list;
2619 			ISC_LIST_INIT(version->changed_list);
2620 			resigned_list = version->resigned_list;
2621 			ISC_LIST_INIT(version->resigned_list);
2622 			rollback = true;
2623 			cleanup_version = version;
2624 			rbtdb->future_version = NULL;
2625 		}
2626 	} else {
2627 		if (version != rbtdb->current_version) {
2628 			/*
2629 			 * There are no external or internal references
2630 			 * to this version and it can be cleaned up.
2631 			 */
2632 			cleanup_version = version;
2633 
2634 			/*
2635 			 * Find the version with the least serial
2636 			 * number greater than ours.
2637 			 */
2638 			least_greater = PREV(version, link);
2639 			if (least_greater == NULL) {
2640 				least_greater = rbtdb->current_version;
2641 			}
2642 
2643 			INSIST(version->serial < least_greater->serial);
2644 			/*
2645 			 * Is this the least open version?
2646 			 */
2647 			if (version->serial == rbtdb->least_serial) {
2648 				/*
2649 				 * Yes.  Install the new least open
2650 				 * version.
2651 				 */
2652 				make_least_version(rbtdb, least_greater,
2653 						   &cleanup_list);
2654 			} else {
2655 				/*
2656 				 * Add any unexecuted cleanups to
2657 				 * those of the least greater version.
2658 				 */
2659 				APPENDLIST(least_greater->changed_list,
2660 					   version->changed_list, link);
2661 			}
2662 		} else if (version->serial == rbtdb->least_serial) {
2663 			INSIST(EMPTY(version->changed_list));
2664 		}
2665 		UNLINK(rbtdb->open_versions, version, link);
2666 	}
2667 	least_serial = rbtdb->least_serial;
2668 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
2669 
2670 	if (cleanup_version != NULL) {
2671 		INSIST(EMPTY(cleanup_version->changed_list));
2672 		free_gluetable(cleanup_version);
2673 		isc_rwlock_destroy(&cleanup_version->glue_rwlock);
2674 		isc_rwlock_destroy(&cleanup_version->rwlock);
2675 		isc_mem_put(rbtdb->common.mctx, cleanup_version,
2676 			    sizeof(*cleanup_version));
2677 	}
2678 
2679 	/*
2680 	 * Commit/rollback re-signed headers.
2681 	 */
2682 	for (header = HEAD(resigned_list); header != NULL;
2683 	     header = HEAD(resigned_list))
2684 	{
2685 		nodelock_t *lock;
2686 
2687 		ISC_LIST_UNLINK(resigned_list, header, link);
2688 
2689 		lock = &rbtdb->node_locks[header->node->locknum].lock;
2690 		NODE_LOCK(lock, isc_rwlocktype_write);
2691 		if (rollback && !IGNORE(header)) {
2692 			resign_insert(rbtdb, header->node->locknum, header);
2693 		}
2694 		decrement_reference(rbtdb, header->node, least_serial,
2695 				    isc_rwlocktype_write, isc_rwlocktype_none,
2696 				    false);
2697 		NODE_UNLOCK(lock, isc_rwlocktype_write);
2698 	}
2699 
2700 	if (!EMPTY(cleanup_list)) {
2701 		isc_event_t *event = NULL;
2702 		isc_rwlocktype_t tlock = isc_rwlocktype_none;
2703 
2704 		if (rbtdb->task != NULL) {
2705 			event = isc_event_allocate(rbtdb->common.mctx, NULL,
2706 						   DNS_EVENT_RBTDEADNODES,
2707 						   cleanup_dead_nodes_callback,
2708 						   rbtdb, sizeof(isc_event_t));
2709 		}
2710 		if (event == NULL) {
2711 			/*
2712 			 * We acquire a tree write lock here in order to make
2713 			 * sure that stale nodes will be removed in
2714 			 * decrement_reference().  If we didn't have the lock,
2715 			 * those nodes could miss the chance to be removed
2716 			 * until the server stops.  The write lock is
2717 			 * expensive, but this event should be rare enough
2718 			 * to justify the cost.
2719 			 */
2720 			RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2721 			tlock = isc_rwlocktype_write;
2722 		}
2723 
2724 		for (changed = HEAD(cleanup_list); changed != NULL;
2725 		     changed = next_changed)
2726 		{
2727 			nodelock_t *lock;
2728 
2729 			next_changed = NEXT(changed, link);
2730 			rbtnode = changed->node;
2731 			lock = &rbtdb->node_locks[rbtnode->locknum].lock;
2732 
2733 			NODE_LOCK(lock, isc_rwlocktype_write);
2734 			/*
2735 			 * This is a good opportunity to purge any dead nodes,
2736 			 * so use it.
2737 			 */
2738 			if (event == NULL) {
2739 				cleanup_dead_nodes(rbtdb, rbtnode->locknum);
2740 			}
2741 
2742 			if (rollback) {
2743 				rollback_node(rbtnode, serial);
2744 			}
2745 			decrement_reference(rbtdb, rbtnode, least_serial,
2746 					    isc_rwlocktype_write, tlock, false);
2747 
2748 			NODE_UNLOCK(lock, isc_rwlocktype_write);
2749 
2750 			isc_mem_put(rbtdb->common.mctx, changed,
2751 				    sizeof(*changed));
2752 		}
2753 		if (event != NULL) {
2754 			isc_refcount_increment(&rbtdb->references);
2755 			isc_task_send(rbtdb->task, &event);
2756 		} else {
2757 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2758 		}
2759 	}
2760 
2761 end:
2762 	*versionp = NULL;
2763 }
2764 
2765 /*
2766  * Add the necessary magic for the wildcard name 'name'
2767  * to be found in 'rbtdb'.
2768  *
2769  * In order for wildcard matching to work correctly in
2770  * zone_find(), we must ensure that a node for the wildcarding
2771  * level exists in the database, and has its 'find_callback'
2772  * and 'wild' bits set.
2773  *
2774  * E.g. if the wildcard name is "*.sub.example." then we
2775  * must ensure that "sub.example." exists and is marked as
2776  * a wildcard level.
2777  *
2778  * tree_lock(write) must be held.
2779  */
2780 static isc_result_t
2781 add_wildcard_magic(dns_rbtdb_t *rbtdb, const dns_name_t *name, bool lock) {
2782 	isc_result_t result;
2783 	dns_name_t foundname;
2784 	dns_offsets_t offsets;
2785 	unsigned int n;
2786 	dns_rbtnode_t *node = NULL;
2787 
2788 	dns_name_init(&foundname, offsets);
2789 	n = dns_name_countlabels(name);
2790 	INSIST(n >= 2);
2791 	n--;
2792 	dns_name_getlabelsequence(name, 1, n, &foundname);
2793 	result = dns_rbt_addnode(rbtdb->tree, &foundname, &node);
2794 	if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
2795 		return (result);
2796 	}
2797 	if (result == ISC_R_SUCCESS) {
2798 		node->nsec = DNS_RBT_NSEC_NORMAL;
2799 	}
2800 	node->find_callback = 1;
2801 	if (lock) {
2802 		NODE_LOCK(&rbtdb->node_locks[node->locknum].lock,
2803 			  isc_rwlocktype_write);
2804 	}
2805 	node->wild = 1;
2806 	if (lock) {
2807 		NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
2808 			    isc_rwlocktype_write);
2809 	}
2810 	return (ISC_R_SUCCESS);
2811 }
2812 
2813 /*
2814  * tree_lock(write) must be held.
2815  */
2816 static isc_result_t
2817 add_empty_wildcards(dns_rbtdb_t *rbtdb, const dns_name_t *name, bool lock) {
2818 	isc_result_t result;
2819 	dns_name_t foundname;
2820 	dns_offsets_t offsets;
2821 	unsigned int n, l, i;
2822 
2823 	dns_name_init(&foundname, offsets);
2824 	n = dns_name_countlabels(name);
2825 	l = dns_name_countlabels(&rbtdb->common.origin);
2826 	i = l + 1;
2827 	while (i < n) {
2828 		dns_rbtnode_t *node = NULL; /* dummy */
2829 		dns_name_getlabelsequence(name, n - i, i, &foundname);
2830 		if (dns_name_iswildcard(&foundname)) {
2831 			result = add_wildcard_magic(rbtdb, &foundname, lock);
2832 			if (result != ISC_R_SUCCESS) {
2833 				return (result);
2834 			}
2835 			result = dns_rbt_addnode(rbtdb->tree, &foundname,
2836 						 &node);
2837 			if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
2838 				return (result);
2839 			}
2840 			if (result == ISC_R_SUCCESS) {
2841 				node->nsec = DNS_RBT_NSEC_NORMAL;
2842 			}
2843 		}
2844 		i++;
2845 	}
2846 	return (ISC_R_SUCCESS);
2847 }
2848 
2849 static isc_result_t
2850 findnodeintree(dns_rbtdb_t *rbtdb, dns_rbt_t *tree, const dns_name_t *name,
2851 	       bool create, dns_dbnode_t **nodep) {
2852 	dns_rbtnode_t *node = NULL;
2853 	dns_name_t nodename;
2854 	isc_result_t result;
2855 	isc_rwlocktype_t locktype = isc_rwlocktype_read;
2856 
2857 	INSIST(tree == rbtdb->tree || tree == rbtdb->nsec3);
2858 
2859 	dns_name_init(&nodename, NULL);
2860 	RWLOCK(&rbtdb->tree_lock, locktype);
2861 	result = dns_rbt_findnode(tree, name, NULL, &node, NULL,
2862 				  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
2863 	if (result != ISC_R_SUCCESS) {
2864 		RWUNLOCK(&rbtdb->tree_lock, locktype);
2865 		if (!create) {
2866 			if (result == DNS_R_PARTIALMATCH) {
2867 				result = ISC_R_NOTFOUND;
2868 			}
2869 			return (result);
2870 		}
2871 		/*
2872 		 * It would be nice to try to upgrade the lock instead of
2873 		 * unlocking then relocking.
2874 		 */
2875 		locktype = isc_rwlocktype_write;
2876 		RWLOCK(&rbtdb->tree_lock, locktype);
2877 		node = NULL;
2878 		result = dns_rbt_addnode(tree, name, &node);
2879 		if (result == ISC_R_SUCCESS) {
2880 			dns_rbt_namefromnode(node, &nodename);
2881 			node->locknum = node->hashval % rbtdb->node_lock_count;
2882 			if (tree == rbtdb->tree) {
2883 				add_empty_wildcards(rbtdb, name, true);
2884 
2885 				if (dns_name_iswildcard(name)) {
2886 					result = add_wildcard_magic(rbtdb, name,
2887 								    true);
2888 					if (result != ISC_R_SUCCESS) {
2889 						RWUNLOCK(&rbtdb->tree_lock,
2890 							 locktype);
2891 						return (result);
2892 					}
2893 				}
2894 			}
2895 			if (tree == rbtdb->nsec3) {
2896 				node->nsec = DNS_RBT_NSEC_NSEC3;
2897 			}
2898 		} else if (result != ISC_R_EXISTS) {
2899 			RWUNLOCK(&rbtdb->tree_lock, locktype);
2900 			return (result);
2901 		}
2902 	}
2903 
2904 	if (tree == rbtdb->nsec3) {
2905 		INSIST(node->nsec == DNS_RBT_NSEC_NSEC3);
2906 	}
2907 
2908 	reactivate_node(rbtdb, node, locktype);
2909 
2910 	RWUNLOCK(&rbtdb->tree_lock, locktype);
2911 
2912 	*nodep = (dns_dbnode_t *)node;
2913 
2914 	return (ISC_R_SUCCESS);
2915 }
2916 
2917 static isc_result_t
2918 findnode(dns_db_t *db, const dns_name_t *name, bool create,
2919 	 dns_dbnode_t **nodep) {
2920 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2921 
2922 	REQUIRE(VALID_RBTDB(rbtdb));
2923 
2924 	return (findnodeintree(rbtdb, rbtdb->tree, name, create, nodep));
2925 }
2926 
2927 static isc_result_t
2928 findnsec3node(dns_db_t *db, const dns_name_t *name, bool create,
2929 	      dns_dbnode_t **nodep) {
2930 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2931 
2932 	REQUIRE(VALID_RBTDB(rbtdb));
2933 
2934 	return (findnodeintree(rbtdb, rbtdb->nsec3, name, create, nodep));
2935 }
2936 
2937 static isc_result_t
2938 zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
2939 	rbtdb_search_t *search = arg;
2940 	rdatasetheader_t *header, *header_next;
2941 	rdatasetheader_t *dname_header, *sigdname_header, *ns_header;
2942 	rdatasetheader_t *found;
2943 	isc_result_t result;
2944 	dns_rbtnode_t *onode;
2945 
2946 	/*
2947 	 * We only want to remember the topmost zone cut, since it's the one
2948 	 * that counts, so we'll just continue if we've already found a
2949 	 * zonecut.
2950 	 */
2951 	if (search->zonecut != NULL) {
2952 		return (DNS_R_CONTINUE);
2953 	}
2954 
2955 	found = NULL;
2956 	result = DNS_R_CONTINUE;
2957 	onode = search->rbtdb->origin_node;
2958 
2959 	NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2960 		  isc_rwlocktype_read);
2961 
2962 	/*
2963 	 * Look for an NS or DNAME rdataset active in our version.
2964 	 */
2965 	ns_header = NULL;
2966 	dname_header = NULL;
2967 	sigdname_header = NULL;
2968 	for (header = node->data; header != NULL; header = header_next) {
2969 		header_next = header->next;
2970 		if (header->type == dns_rdatatype_ns ||
2971 		    header->type == dns_rdatatype_dname ||
2972 		    header->type == RBTDB_RDATATYPE_SIGDNAME)
2973 		{
2974 			do {
2975 				if (header->serial <= search->serial &&
2976 				    !IGNORE(header))
2977 				{
2978 					/*
2979 					 * Is this a "this rdataset doesn't
2980 					 * exist" record?
2981 					 */
2982 					if (NONEXISTENT(header)) {
2983 						header = NULL;
2984 					}
2985 					break;
2986 				} else {
2987 					header = header->down;
2988 				}
2989 			} while (header != NULL);
2990 			if (header != NULL) {
2991 				if (header->type == dns_rdatatype_dname) {
2992 					dname_header = header;
2993 				} else if (header->type ==
2994 					   RBTDB_RDATATYPE_SIGDNAME)
2995 				{
2996 					sigdname_header = header;
2997 				} else if (node != onode ||
2998 					   IS_STUB(search->rbtdb))
2999 				{
3000 					/*
3001 					 * We've found an NS rdataset that
3002 					 * isn't at the origin node.  We check
3003 					 * that they're not at the origin node,
3004 					 * because otherwise we'd erroneously
3005 					 * treat the zone top as if it were
3006 					 * a delegation.
3007 					 */
3008 					ns_header = header;
3009 				}
3010 			}
3011 		}
3012 	}
3013 
3014 	/*
3015 	 * Did we find anything?
3016 	 */
3017 	if (!IS_CACHE(search->rbtdb) && !IS_STUB(search->rbtdb) &&
3018 	    ns_header != NULL)
3019 	{
3020 		/*
3021 		 * Note that NS has precedence over DNAME if both exist
3022 		 * in a zone.  Otherwise DNAME take precedence over NS.
3023 		 */
3024 		found = ns_header;
3025 		search->zonecut_sigrdataset = NULL;
3026 	} else if (dname_header != NULL) {
3027 		found = dname_header;
3028 		search->zonecut_sigrdataset = sigdname_header;
3029 	} else if (ns_header != NULL) {
3030 		found = ns_header;
3031 		search->zonecut_sigrdataset = NULL;
3032 	}
3033 
3034 	if (found != NULL) {
3035 		/*
3036 		 * We increment the reference count on node to ensure that
3037 		 * search->zonecut_rdataset will still be valid later.
3038 		 */
3039 		new_reference(search->rbtdb, node, isc_rwlocktype_read);
3040 		search->zonecut = node;
3041 		search->zonecut_rdataset = found;
3042 		search->need_cleanup = true;
3043 		/*
3044 		 * Since we've found a zonecut, anything beneath it is
3045 		 * glue and is not subject to wildcard matching, so we
3046 		 * may clear search->wild.
3047 		 */
3048 		search->wild = false;
3049 		if ((search->options & DNS_DBFIND_GLUEOK) == 0) {
3050 			/*
3051 			 * If the caller does not want to find glue, then
3052 			 * this is the best answer and the search should
3053 			 * stop now.
3054 			 */
3055 			result = DNS_R_PARTIALMATCH;
3056 		} else {
3057 			dns_name_t *zcname;
3058 
3059 			/*
3060 			 * The search will continue beneath the zone cut.
3061 			 * This may or may not be the best match.  In case it
3062 			 * is, we need to remember the node name.
3063 			 */
3064 			zcname = dns_fixedname_name(&search->zonecut_name);
3065 			dns_name_copy(name, zcname);
3066 			search->copy_name = true;
3067 		}
3068 	} else {
3069 		/*
3070 		 * There is no zonecut at this node which is active in this
3071 		 * version.
3072 		 *
3073 		 * If this is a "wild" node and the caller hasn't disabled
3074 		 * wildcard matching, remember that we've seen a wild node
3075 		 * in case we need to go searching for wildcard matches
3076 		 * later on.
3077 		 */
3078 		if (node->wild && (search->options & DNS_DBFIND_NOWILD) == 0) {
3079 			search->wild = true;
3080 		}
3081 	}
3082 
3083 	NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3084 		    isc_rwlocktype_read);
3085 
3086 	return (result);
3087 }
3088 
3089 static void
3090 bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rdatasetheader_t *header,
3091 	      isc_stdtime_t now, isc_rwlocktype_t locktype,
3092 	      dns_rdataset_t *rdataset) {
3093 	unsigned char *raw; /* RDATASLAB */
3094 	bool stale = STALE(header);
3095 	bool ancient = ANCIENT(header);
3096 
3097 	/*
3098 	 * Caller must be holding the node reader lock.
3099 	 * XXXJT: technically, we need a writer lock, since we'll increment
3100 	 * the header count below.  However, since the actual counter value
3101 	 * doesn't matter, we prioritize performance here.  (We may want to
3102 	 * use atomic increment when available).
3103 	 */
3104 
3105 	if (rdataset == NULL) {
3106 		return;
3107 	}
3108 
3109 	new_reference(rbtdb, node, locktype);
3110 
3111 	INSIST(rdataset->methods == NULL); /* We must be disassociated. */
3112 
3113 	/*
3114 	 * Mark header stale or ancient if the RRset is no longer active.
3115 	 */
3116 	if (!ACTIVE(header, now)) {
3117 		dns_ttl_t stale_ttl = header->rdh_ttl +
3118 				      STALE_TTL(header, rbtdb);
3119 		/*
3120 		 * If this data is in the stale window keep it and if
3121 		 * DNS_DBFIND_STALEOK is not set we tell the caller to
3122 		 * skip this record.  We skip the records with ZEROTTL
3123 		 * (these records should not be cached anyway).
3124 		 */
3125 
3126 		if (KEEPSTALE(rbtdb) && stale_ttl > now) {
3127 			stale = true;
3128 		} else {
3129 			/*
3130 			 * We are not keeping stale, or it is outside the
3131 			 * stale window. Mark ancient, i.e. ready for cleanup.
3132 			 */
3133 			ancient = true;
3134 		}
3135 	}
3136 
3137 	rdataset->methods = &rdataset_methods;
3138 	rdataset->rdclass = rbtdb->common.rdclass;
3139 	rdataset->type = RBTDB_RDATATYPE_BASE(header->type);
3140 	rdataset->covers = RBTDB_RDATATYPE_EXT(header->type);
3141 	rdataset->ttl = header->rdh_ttl - now;
3142 	rdataset->trust = header->trust;
3143 
3144 	if (NEGATIVE(header)) {
3145 		rdataset->attributes |= DNS_RDATASETATTR_NEGATIVE;
3146 	}
3147 	if (NXDOMAIN(header)) {
3148 		rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
3149 	}
3150 	if (OPTOUT(header)) {
3151 		rdataset->attributes |= DNS_RDATASETATTR_OPTOUT;
3152 	}
3153 	if (PREFETCH(header)) {
3154 		rdataset->attributes |= DNS_RDATASETATTR_PREFETCH;
3155 	}
3156 
3157 	if (stale && !ancient) {
3158 		dns_ttl_t stale_ttl = header->rdh_ttl +
3159 				      STALE_TTL(header, rbtdb);
3160 		if (stale_ttl > now) {
3161 			rdataset->ttl = stale_ttl - now;
3162 		} else {
3163 			rdataset->ttl = 0;
3164 		}
3165 		if (STALE_WINDOW(header)) {
3166 			rdataset->attributes |= DNS_RDATASETATTR_STALE_WINDOW;
3167 		}
3168 		rdataset->attributes |= DNS_RDATASETATTR_STALE;
3169 	} else if (IS_CACHE(rbtdb) && !ACTIVE(header, now)) {
3170 		rdataset->attributes |= DNS_RDATASETATTR_ANCIENT;
3171 		rdataset->ttl = header->rdh_ttl;
3172 	}
3173 
3174 	rdataset->private1 = rbtdb;
3175 	rdataset->private2 = node;
3176 	raw = (unsigned char *)header + sizeof(*header);
3177 	rdataset->private3 = raw;
3178 	rdataset->count = atomic_fetch_add_relaxed(&header->count, 1);
3179 	if (rdataset->count == UINT32_MAX) {
3180 		rdataset->count = 0;
3181 	}
3182 
3183 	/*
3184 	 * Reset iterator state.
3185 	 */
3186 	rdataset->privateuint4 = 0;
3187 	rdataset->private5 = NULL;
3188 
3189 	/*
3190 	 * Add noqname proof.
3191 	 */
3192 	rdataset->private6 = header->noqname;
3193 	if (rdataset->private6 != NULL) {
3194 		rdataset->attributes |= DNS_RDATASETATTR_NOQNAME;
3195 	}
3196 	rdataset->private7 = header->closest;
3197 	if (rdataset->private7 != NULL) {
3198 		rdataset->attributes |= DNS_RDATASETATTR_CLOSEST;
3199 	}
3200 
3201 	/*
3202 	 * Copy out re-signing information.
3203 	 */
3204 	if (RESIGN(header)) {
3205 		rdataset->attributes |= DNS_RDATASETATTR_RESIGN;
3206 		rdataset->resign = (header->resign << 1) | header->resign_lsb;
3207 	} else {
3208 		rdataset->resign = 0;
3209 	}
3210 }
3211 
3212 static isc_result_t
3213 setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
3214 		 dns_name_t *foundname, dns_rdataset_t *rdataset,
3215 		 dns_rdataset_t *sigrdataset) {
3216 	dns_name_t *zcname;
3217 	rbtdb_rdatatype_t type;
3218 	dns_rbtnode_t *node;
3219 
3220 	REQUIRE(search != NULL);
3221 	REQUIRE(search->zonecut != NULL);
3222 	REQUIRE(search->zonecut_rdataset != NULL);
3223 
3224 	/*
3225 	 * The caller MUST NOT be holding any node locks.
3226 	 */
3227 
3228 	node = search->zonecut;
3229 	type = search->zonecut_rdataset->type;
3230 
3231 	/*
3232 	 * If we have to set foundname, we do it before anything else.
3233 	 * If we were to set foundname after we had set nodep or bound the
3234 	 * rdataset, then we'd have to undo that work if dns_name_copy()
3235 	 * failed.  By setting foundname first, there's nothing to undo if
3236 	 * we have trouble.
3237 	 */
3238 	if (foundname != NULL && search->copy_name) {
3239 		zcname = dns_fixedname_name(&search->zonecut_name);
3240 		dns_name_copy(zcname, foundname);
3241 	}
3242 	if (nodep != NULL) {
3243 		/*
3244 		 * Note that we don't have to increment the node's reference
3245 		 * count here because we're going to use the reference we
3246 		 * already have in the search block.
3247 		 */
3248 		*nodep = node;
3249 		search->need_cleanup = false;
3250 	}
3251 	if (rdataset != NULL) {
3252 		NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3253 			  isc_rwlocktype_read);
3254 		bind_rdataset(search->rbtdb, node, search->zonecut_rdataset,
3255 			      search->now, isc_rwlocktype_read, rdataset);
3256 		if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL)
3257 		{
3258 			bind_rdataset(search->rbtdb, node,
3259 				      search->zonecut_sigrdataset, search->now,
3260 				      isc_rwlocktype_read, sigrdataset);
3261 		}
3262 		NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3263 			    isc_rwlocktype_read);
3264 	}
3265 
3266 	if (type == dns_rdatatype_dname) {
3267 		return (DNS_R_DNAME);
3268 	}
3269 	return (DNS_R_DELEGATION);
3270 }
3271 
3272 static bool
3273 valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type,
3274 	   dns_rbtnode_t *node) {
3275 	unsigned char *raw; /* RDATASLAB */
3276 	unsigned int count, size;
3277 	dns_name_t ns_name;
3278 	bool valid = false;
3279 	dns_offsets_t offsets;
3280 	isc_region_t region;
3281 	rdatasetheader_t *header;
3282 
3283 	/*
3284 	 * No additional locking is required.
3285 	 */
3286 
3287 	/*
3288 	 * Valid glue types are A, AAAA, A6.  NS is also a valid glue type
3289 	 * if it occurs at a zone cut, but is not valid below it.
3290 	 */
3291 	if (type == dns_rdatatype_ns) {
3292 		if (node != search->zonecut) {
3293 			return (false);
3294 		}
3295 	} else if (type != dns_rdatatype_a && type != dns_rdatatype_aaaa &&
3296 		   type != dns_rdatatype_a6)
3297 	{
3298 		return (false);
3299 	}
3300 
3301 	header = search->zonecut_rdataset;
3302 	raw = (unsigned char *)header + sizeof(*header);
3303 	count = raw[0] * 256 + raw[1];
3304 	raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
3305 
3306 	while (count > 0) {
3307 		count--;
3308 		size = raw[0] * 256 + raw[1];
3309 		raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
3310 		region.base = raw;
3311 		region.length = size;
3312 		raw += size;
3313 		/*
3314 		 * XXX Until we have rdata structures, we have no choice but
3315 		 * to directly access the rdata format.
3316 		 */
3317 		dns_name_init(&ns_name, offsets);
3318 		dns_name_fromregion(&ns_name, &region);
3319 		if (dns_name_compare(&ns_name, name) == 0) {
3320 			valid = true;
3321 			break;
3322 		}
3323 	}
3324 
3325 	return (valid);
3326 }
3327 
3328 static bool
3329 activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain,
3330 	    const dns_name_t *name) {
3331 	dns_fixedname_t fnext;
3332 	dns_fixedname_t forigin;
3333 	dns_name_t *next;
3334 	dns_name_t *origin;
3335 	dns_name_t prefix;
3336 	dns_rbtdb_t *rbtdb;
3337 	dns_rbtnode_t *node;
3338 	isc_result_t result;
3339 	bool answer = false;
3340 	rdatasetheader_t *header;
3341 
3342 	rbtdb = search->rbtdb;
3343 
3344 	dns_name_init(&prefix, NULL);
3345 	next = dns_fixedname_initname(&fnext);
3346 	origin = dns_fixedname_initname(&forigin);
3347 
3348 	result = dns_rbtnodechain_next(chain, NULL, NULL);
3349 	while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
3350 		node = NULL;
3351 		result = dns_rbtnodechain_current(chain, &prefix, origin,
3352 						  &node);
3353 		if (result != ISC_R_SUCCESS) {
3354 			break;
3355 		}
3356 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3357 			  isc_rwlocktype_read);
3358 		for (header = node->data; header != NULL; header = header->next)
3359 		{
3360 			if (header->serial <= search->serial &&
3361 			    !IGNORE(header) && EXISTS(header))
3362 			{
3363 				break;
3364 			}
3365 		}
3366 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3367 			    isc_rwlocktype_read);
3368 		if (header != NULL) {
3369 			break;
3370 		}
3371 		result = dns_rbtnodechain_next(chain, NULL, NULL);
3372 	}
3373 	if (result == ISC_R_SUCCESS) {
3374 		result = dns_name_concatenate(&prefix, origin, next, NULL);
3375 	}
3376 	if (result == ISC_R_SUCCESS && dns_name_issubdomain(next, name)) {
3377 		answer = true;
3378 	}
3379 	return (answer);
3380 }
3381 
3382 static bool
3383 activeemptynode(rbtdb_search_t *search, const dns_name_t *qname,
3384 		dns_name_t *wname) {
3385 	dns_fixedname_t fnext;
3386 	dns_fixedname_t forigin;
3387 	dns_fixedname_t fprev;
3388 	dns_name_t *next;
3389 	dns_name_t *origin;
3390 	dns_name_t *prev;
3391 	dns_name_t name;
3392 	dns_name_t rname;
3393 	dns_name_t tname;
3394 	dns_rbtdb_t *rbtdb;
3395 	dns_rbtnode_t *node;
3396 	dns_rbtnodechain_t chain;
3397 	bool check_next = true;
3398 	bool check_prev = true;
3399 	bool answer = false;
3400 	isc_result_t result;
3401 	rdatasetheader_t *header;
3402 	unsigned int n;
3403 
3404 	rbtdb = search->rbtdb;
3405 
3406 	dns_name_init(&name, NULL);
3407 	dns_name_init(&tname, NULL);
3408 	dns_name_init(&rname, NULL);
3409 	next = dns_fixedname_initname(&fnext);
3410 	prev = dns_fixedname_initname(&fprev);
3411 	origin = dns_fixedname_initname(&forigin);
3412 
3413 	/*
3414 	 * Find if qname is at or below a empty node.
3415 	 * Use our own copy of the chain.
3416 	 */
3417 
3418 	chain = search->chain;
3419 	do {
3420 		node = NULL;
3421 		result = dns_rbtnodechain_current(&chain, &name, origin, &node);
3422 		if (result != ISC_R_SUCCESS) {
3423 			break;
3424 		}
3425 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3426 			  isc_rwlocktype_read);
3427 		for (header = node->data; header != NULL; header = header->next)
3428 		{
3429 			if (header->serial <= search->serial &&
3430 			    !IGNORE(header) && EXISTS(header))
3431 			{
3432 				break;
3433 			}
3434 		}
3435 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3436 			    isc_rwlocktype_read);
3437 		if (header != NULL) {
3438 			break;
3439 		}
3440 		result = dns_rbtnodechain_prev(&chain, NULL, NULL);
3441 	} while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN);
3442 	if (result == ISC_R_SUCCESS) {
3443 		result = dns_name_concatenate(&name, origin, prev, NULL);
3444 	}
3445 	if (result != ISC_R_SUCCESS) {
3446 		check_prev = false;
3447 	}
3448 
3449 	result = dns_rbtnodechain_next(&chain, NULL, NULL);
3450 	while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
3451 		node = NULL;
3452 		result = dns_rbtnodechain_current(&chain, &name, origin, &node);
3453 		if (result != ISC_R_SUCCESS) {
3454 			break;
3455 		}
3456 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3457 			  isc_rwlocktype_read);
3458 		for (header = node->data; header != NULL; header = header->next)
3459 		{
3460 			if (header->serial <= search->serial &&
3461 			    !IGNORE(header) && EXISTS(header))
3462 			{
3463 				break;
3464 			}
3465 		}
3466 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3467 			    isc_rwlocktype_read);
3468 		if (header != NULL) {
3469 			break;
3470 		}
3471 		result = dns_rbtnodechain_next(&chain, NULL, NULL);
3472 	}
3473 	if (result == ISC_R_SUCCESS) {
3474 		result = dns_name_concatenate(&name, origin, next, NULL);
3475 	}
3476 	if (result != ISC_R_SUCCESS) {
3477 		check_next = false;
3478 	}
3479 
3480 	dns_name_clone(qname, &rname);
3481 
3482 	/*
3483 	 * Remove the wildcard label to find the terminal name.
3484 	 */
3485 	n = dns_name_countlabels(wname);
3486 	dns_name_getlabelsequence(wname, 1, n - 1, &tname);
3487 
3488 	do {
3489 		if ((check_prev && dns_name_issubdomain(prev, &rname)) ||
3490 		    (check_next && dns_name_issubdomain(next, &rname)))
3491 		{
3492 			answer = true;
3493 			break;
3494 		}
3495 		/*
3496 		 * Remove the left hand label.
3497 		 */
3498 		n = dns_name_countlabels(&rname);
3499 		dns_name_getlabelsequence(&rname, 1, n - 1, &rname);
3500 	} while (!dns_name_equal(&rname, &tname));
3501 	return (answer);
3502 }
3503 
3504 static isc_result_t
3505 find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep,
3506 	      const dns_name_t *qname) {
3507 	unsigned int i, j;
3508 	dns_rbtnode_t *node, *level_node, *wnode;
3509 	rdatasetheader_t *header;
3510 	isc_result_t result = ISC_R_NOTFOUND;
3511 	dns_name_t name;
3512 	dns_name_t *wname;
3513 	dns_fixedname_t fwname;
3514 	dns_rbtdb_t *rbtdb;
3515 	bool done, wild, active;
3516 	dns_rbtnodechain_t wchain;
3517 
3518 	/*
3519 	 * Caller must be holding the tree lock and MUST NOT be holding
3520 	 * any node locks.
3521 	 */
3522 
3523 	/*
3524 	 * Examine each ancestor level.  If the level's wild bit
3525 	 * is set, then construct the corresponding wildcard name and
3526 	 * search for it.  If the wildcard node exists, and is active in
3527 	 * this version, we're done.  If not, then we next check to see
3528 	 * if the ancestor is active in this version.  If so, then there
3529 	 * can be no possible wildcard match and again we're done.  If not,
3530 	 * continue the search.
3531 	 */
3532 
3533 	rbtdb = search->rbtdb;
3534 	i = search->chain.level_matches;
3535 	done = false;
3536 	node = *nodep;
3537 	do {
3538 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3539 			  isc_rwlocktype_read);
3540 
3541 		/*
3542 		 * First we try to figure out if this node is active in
3543 		 * the search's version.  We do this now, even though we
3544 		 * may not need the information, because it simplifies the
3545 		 * locking and code flow.
3546 		 */
3547 		for (header = node->data; header != NULL; header = header->next)
3548 		{
3549 			if (header->serial <= search->serial &&
3550 			    !IGNORE(header) && EXISTS(header) &&
3551 			    !ANCIENT(header))
3552 			{
3553 				break;
3554 			}
3555 		}
3556 		if (header != NULL) {
3557 			active = true;
3558 		} else {
3559 			active = false;
3560 		}
3561 
3562 		if (node->wild) {
3563 			wild = true;
3564 		} else {
3565 			wild = false;
3566 		}
3567 
3568 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3569 			    isc_rwlocktype_read);
3570 
3571 		if (wild) {
3572 			/*
3573 			 * Construct the wildcard name for this level.
3574 			 */
3575 			dns_name_init(&name, NULL);
3576 			dns_rbt_namefromnode(node, &name);
3577 			wname = dns_fixedname_initname(&fwname);
3578 			result = dns_name_concatenate(dns_wildcardname, &name,
3579 						      wname, NULL);
3580 			j = i;
3581 			while (result == ISC_R_SUCCESS && j != 0) {
3582 				j--;
3583 				level_node = search->chain.levels[j];
3584 				dns_name_init(&name, NULL);
3585 				dns_rbt_namefromnode(level_node, &name);
3586 				result = dns_name_concatenate(wname, &name,
3587 							      wname, NULL);
3588 			}
3589 			if (result != ISC_R_SUCCESS) {
3590 				break;
3591 			}
3592 
3593 			wnode = NULL;
3594 			dns_rbtnodechain_init(&wchain);
3595 			result = dns_rbt_findnode(
3596 				rbtdb->tree, wname, NULL, &wnode, &wchain,
3597 				DNS_RBTFIND_EMPTYDATA, NULL, NULL);
3598 			if (result == ISC_R_SUCCESS) {
3599 				nodelock_t *lock;
3600 
3601 				/*
3602 				 * We have found the wildcard node.  If it
3603 				 * is active in the search's version, we're
3604 				 * done.
3605 				 */
3606 				lock = &rbtdb->node_locks[wnode->locknum].lock;
3607 				NODE_LOCK(lock, isc_rwlocktype_read);
3608 				for (header = wnode->data; header != NULL;
3609 				     header = header->next)
3610 				{
3611 					if (header->serial <= search->serial &&
3612 					    !IGNORE(header) && EXISTS(header) &&
3613 					    !ANCIENT(header))
3614 					{
3615 						break;
3616 					}
3617 				}
3618 				NODE_UNLOCK(lock, isc_rwlocktype_read);
3619 				if (header != NULL ||
3620 				    activeempty(search, &wchain, wname))
3621 				{
3622 					if (activeemptynode(search, qname,
3623 							    wname))
3624 					{
3625 						return (ISC_R_NOTFOUND);
3626 					}
3627 					/*
3628 					 * The wildcard node is active!
3629 					 *
3630 					 * Note: result is still ISC_R_SUCCESS
3631 					 * so we don't have to set it.
3632 					 */
3633 					*nodep = wnode;
3634 					break;
3635 				}
3636 			} else if (result != ISC_R_NOTFOUND &&
3637 				   result != DNS_R_PARTIALMATCH)
3638 			{
3639 				/*
3640 				 * An error has occurred.  Bail out.
3641 				 */
3642 				break;
3643 			}
3644 		}
3645 
3646 		if (active) {
3647 			/*
3648 			 * The level node is active.  Any wildcarding
3649 			 * present at higher levels has no
3650 			 * effect and we're done.
3651 			 */
3652 			result = ISC_R_NOTFOUND;
3653 			break;
3654 		}
3655 
3656 		if (i > 0) {
3657 			i--;
3658 			node = search->chain.levels[i];
3659 		} else {
3660 			done = true;
3661 		}
3662 	} while (!done);
3663 
3664 	return (result);
3665 }
3666 
3667 static bool
3668 matchparams(rdatasetheader_t *header, rbtdb_search_t *search) {
3669 	dns_rdata_t rdata = DNS_RDATA_INIT;
3670 	dns_rdata_nsec3_t nsec3;
3671 	unsigned char *raw; /* RDATASLAB */
3672 	unsigned int rdlen, count;
3673 	isc_region_t region;
3674 	isc_result_t result;
3675 
3676 	REQUIRE(header->type == dns_rdatatype_nsec3);
3677 
3678 	raw = (unsigned char *)header + sizeof(*header);
3679 	count = raw[0] * 256 + raw[1]; /* count */
3680 	raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
3681 
3682 	while (count-- > 0) {
3683 		rdlen = raw[0] * 256 + raw[1];
3684 		raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
3685 		region.base = raw;
3686 		region.length = rdlen;
3687 		dns_rdata_fromregion(&rdata, search->rbtdb->common.rdclass,
3688 				     dns_rdatatype_nsec3, &region);
3689 		raw += rdlen;
3690 		result = dns_rdata_tostruct(&rdata, &nsec3, NULL);
3691 		INSIST(result == ISC_R_SUCCESS);
3692 		if (nsec3.hash == search->rbtversion->hash &&
3693 		    nsec3.iterations == search->rbtversion->iterations &&
3694 		    nsec3.salt_length == search->rbtversion->salt_length &&
3695 		    memcmp(nsec3.salt, search->rbtversion->salt,
3696 			   nsec3.salt_length) == 0)
3697 		{
3698 			return (true);
3699 		}
3700 		dns_rdata_reset(&rdata);
3701 	}
3702 	return (false);
3703 }
3704 
3705 /*
3706  * Find node of the NSEC/NSEC3 record that is 'name'.
3707  */
3708 static isc_result_t
3709 previous_closest_nsec(dns_rdatatype_t type, rbtdb_search_t *search,
3710 		      dns_name_t *name, dns_name_t *origin,
3711 		      dns_rbtnode_t **nodep, dns_rbtnodechain_t *nsecchain,
3712 		      bool *firstp) {
3713 	dns_fixedname_t ftarget;
3714 	dns_name_t *target;
3715 	dns_rbtnode_t *nsecnode;
3716 	isc_result_t result;
3717 
3718 	REQUIRE(nodep != NULL && *nodep == NULL);
3719 	REQUIRE(type == dns_rdatatype_nsec3 || firstp != NULL);
3720 
3721 	if (type == dns_rdatatype_nsec3) {
3722 		result = dns_rbtnodechain_prev(&search->chain, NULL, NULL);
3723 		if (result != ISC_R_SUCCESS && result != DNS_R_NEWORIGIN) {
3724 			return (result);
3725 		}
3726 		result = dns_rbtnodechain_current(&search->chain, name, origin,
3727 						  nodep);
3728 		return (result);
3729 	}
3730 
3731 	target = dns_fixedname_initname(&ftarget);
3732 
3733 	for (;;) {
3734 		if (*firstp) {
3735 			/*
3736 			 * Construct the name of the second node to check.
3737 			 * It is the first node sought in the NSEC tree.
3738 			 */
3739 			*firstp = false;
3740 			dns_rbtnodechain_init(nsecchain);
3741 			result = dns_name_concatenate(name, origin, target,
3742 						      NULL);
3743 			if (result != ISC_R_SUCCESS) {
3744 				return (result);
3745 			}
3746 			nsecnode = NULL;
3747 			result = dns_rbt_findnode(
3748 				search->rbtdb->nsec, target, NULL, &nsecnode,
3749 				nsecchain, DNS_RBTFIND_EMPTYDATA, NULL, NULL);
3750 			if (result == ISC_R_SUCCESS) {
3751 				/*
3752 				 * Since this was the first loop, finding the
3753 				 * name in the NSEC tree implies that the first
3754 				 * node checked in the main tree had an
3755 				 * unacceptable NSEC record.
3756 				 * Try the previous node in the NSEC tree.
3757 				 */
3758 				result = dns_rbtnodechain_prev(nsecchain, name,
3759 							       origin);
3760 				if (result == DNS_R_NEWORIGIN) {
3761 					result = ISC_R_SUCCESS;
3762 				}
3763 			} else if (result == ISC_R_NOTFOUND ||
3764 				   result == DNS_R_PARTIALMATCH)
3765 			{
3766 				result = dns_rbtnodechain_current(
3767 					nsecchain, name, origin, NULL);
3768 				if (result == ISC_R_NOTFOUND) {
3769 					result = ISC_R_NOMORE;
3770 				}
3771 			}
3772 		} else {
3773 			/*
3774 			 * This is a second or later trip through the auxiliary
3775 			 * tree for the name of a third or earlier NSEC node in
3776 			 * the main tree.  Previous trips through the NSEC tree
3777 			 * must have found nodes in the main tree with NSEC
3778 			 * records.  Perhaps they lacked signature records.
3779 			 */
3780 			result = dns_rbtnodechain_prev(nsecchain, name, origin);
3781 			if (result == DNS_R_NEWORIGIN) {
3782 				result = ISC_R_SUCCESS;
3783 			}
3784 		}
3785 		if (result != ISC_R_SUCCESS) {
3786 			return (result);
3787 		}
3788 
3789 		/*
3790 		 * Construct the name to seek in the main tree.
3791 		 */
3792 		result = dns_name_concatenate(name, origin, target, NULL);
3793 		if (result != ISC_R_SUCCESS) {
3794 			return (result);
3795 		}
3796 
3797 		*nodep = NULL;
3798 		result = dns_rbt_findnode(search->rbtdb->tree, target, NULL,
3799 					  nodep, &search->chain,
3800 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
3801 		if (result == ISC_R_SUCCESS) {
3802 			return (result);
3803 		}
3804 
3805 		/*
3806 		 * There should always be a node in the main tree with the
3807 		 * same name as the node in the auxiliary NSEC tree, except for
3808 		 * nodes in the auxiliary tree that are awaiting deletion.
3809 		 */
3810 		if (result != DNS_R_PARTIALMATCH && result != ISC_R_NOTFOUND) {
3811 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
3812 				      DNS_LOGMODULE_CACHE, ISC_LOG_ERROR,
3813 				      "previous_closest_nsec(): %s",
3814 				      isc_result_totext(result));
3815 			return (DNS_R_BADDB);
3816 		}
3817 	}
3818 }
3819 
/*
 * Find the NSEC/NSEC3 record which is at or before the current point on
 * the search chain.  For NSEC3, only NSEC3 records that match the
 * current NSEC3PARAM record are considered.
 */
3825 static isc_result_t
3826 find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
3827 		  dns_name_t *foundname, dns_rdataset_t *rdataset,
3828 		  dns_rdataset_t *sigrdataset, dns_rbt_t *tree,
3829 		  dns_db_secure_t secure) {
3830 	dns_rbtnode_t *node, *prevnode;
3831 	rdatasetheader_t *header, *header_next, *found, *foundsig;
3832 	dns_rbtnodechain_t nsecchain;
3833 	bool empty_node;
3834 	isc_result_t result;
3835 	dns_fixedname_t fname, forigin;
3836 	dns_name_t *name, *origin;
3837 	dns_rdatatype_t type;
3838 	rbtdb_rdatatype_t sigtype;
3839 	bool wraps;
3840 	bool first = true;
3841 	bool need_sig = (secure == dns_db_secure);
3842 
3843 	if (tree == search->rbtdb->nsec3) {
3844 		type = dns_rdatatype_nsec3;
3845 		sigtype = RBTDB_RDATATYPE_SIGNSEC3;
3846 		wraps = true;
3847 	} else {
3848 		type = dns_rdatatype_nsec;
3849 		sigtype = RBTDB_RDATATYPE_SIGNSEC;
3850 		wraps = false;
3851 	}
3852 
3853 	/*
3854 	 * Use the auxiliary tree only starting with the second node in the
3855 	 * hope that the original node will be right much of the time.
3856 	 */
3857 	name = dns_fixedname_initname(&fname);
3858 	origin = dns_fixedname_initname(&forigin);
3859 again:
3860 	node = NULL;
3861 	prevnode = NULL;
3862 	result = dns_rbtnodechain_current(&search->chain, name, origin, &node);
3863 	if (result != ISC_R_SUCCESS) {
3864 		return (result);
3865 	}
3866 	do {
3867 		NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3868 			  isc_rwlocktype_read);
3869 		found = NULL;
3870 		foundsig = NULL;
3871 		empty_node = true;
3872 		for (header = node->data; header != NULL; header = header_next)
3873 		{
3874 			header_next = header->next;
3875 			/*
3876 			 * Look for an active, extant NSEC or RRSIG NSEC.
3877 			 */
3878 			do {
3879 				if (header->serial <= search->serial &&
3880 				    !IGNORE(header))
3881 				{
3882 					/*
3883 					 * Is this a "this rdataset doesn't
3884 					 * exist" record?
3885 					 */
3886 					if (NONEXISTENT(header)) {
3887 						header = NULL;
3888 					}
3889 					break;
3890 				} else {
3891 					header = header->down;
3892 				}
3893 			} while (header != NULL);
3894 			if (header != NULL) {
3895 				/*
3896 				 * We now know that there is at least one
3897 				 * active rdataset at this node.
3898 				 */
3899 				empty_node = false;
3900 				if (header->type == type) {
3901 					found = header;
3902 					if (foundsig != NULL) {
3903 						break;
3904 					}
3905 				} else if (header->type == sigtype) {
3906 					foundsig = header;
3907 					if (found != NULL) {
3908 						break;
3909 					}
3910 				}
3911 			}
3912 		}
3913 		if (!empty_node) {
3914 			if (found != NULL && search->rbtversion->havensec3 &&
3915 			    found->type == dns_rdatatype_nsec3 &&
3916 			    !matchparams(found, search))
3917 			{
3918 				empty_node = true;
3919 				found = NULL;
3920 				foundsig = NULL;
3921 				result = previous_closest_nsec(
3922 					type, search, name, origin, &prevnode,
3923 					NULL, NULL);
3924 			} else if (found != NULL &&
3925 				   (foundsig != NULL || !need_sig))
3926 			{
3927 				/*
3928 				 * We've found the right NSEC/NSEC3 record.
3929 				 *
3930 				 * Note: for this to really be the right
3931 				 * NSEC record, it's essential that the NSEC
3932 				 * records of any nodes obscured by a zone
3933 				 * cut have been removed; we assume this is
3934 				 * the case.
3935 				 */
3936 				result = dns_name_concatenate(name, origin,
3937 							      foundname, NULL);
3938 				if (result == ISC_R_SUCCESS) {
3939 					if (nodep != NULL) {
3940 						new_reference(
3941 							search->rbtdb, node,
3942 							isc_rwlocktype_read);
3943 						*nodep = node;
3944 					}
3945 					bind_rdataset(search->rbtdb, node,
3946 						      found, search->now,
3947 						      isc_rwlocktype_read,
3948 						      rdataset);
3949 					if (foundsig != NULL) {
3950 						bind_rdataset(
3951 							search->rbtdb, node,
3952 							foundsig, search->now,
3953 							isc_rwlocktype_read,
3954 							sigrdataset);
3955 					}
3956 				}
3957 			} else if (found == NULL && foundsig == NULL) {
3958 				/*
3959 				 * This node is active, but has no NSEC or
3960 				 * RRSIG NSEC.  That means it's glue or
3961 				 * other obscured zone data that isn't
3962 				 * relevant for our search.  Treat the
3963 				 * node as if it were empty and keep looking.
3964 				 */
3965 				empty_node = true;
3966 				result = previous_closest_nsec(
3967 					type, search, name, origin, &prevnode,
3968 					&nsecchain, &first);
3969 			} else {
3970 				/*
3971 				 * We found an active node, but either the
3972 				 * NSEC or the RRSIG NSEC is missing.  This
3973 				 * shouldn't happen.
3974 				 */
3975 				result = DNS_R_BADDB;
3976 			}
3977 		} else {
3978 			/*
3979 			 * This node isn't active.  We've got to keep
3980 			 * looking.
3981 			 */
3982 			result = previous_closest_nsec(type, search, name,
3983 						       origin, &prevnode,
3984 						       &nsecchain, &first);
3985 		}
3986 		NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3987 			    isc_rwlocktype_read);
3988 		node = prevnode;
3989 		prevnode = NULL;
3990 	} while (empty_node && result == ISC_R_SUCCESS);
3991 
3992 	if (!first) {
3993 		dns_rbtnodechain_invalidate(&nsecchain);
3994 	}
3995 
3996 	if (result == ISC_R_NOMORE && wraps) {
3997 		result = dns_rbtnodechain_last(&search->chain, tree, NULL,
3998 					       NULL);
3999 		if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
4000 			wraps = false;
4001 			goto again;
4002 		}
4003 	}
4004 
4005 	/*
4006 	 * If the result is ISC_R_NOMORE, then we got to the beginning of
4007 	 * the database and didn't find a NSEC record.  This shouldn't
4008 	 * happen.
4009 	 */
4010 	if (result == ISC_R_NOMORE) {
4011 		result = DNS_R_BADDB;
4012 	}
4013 
4014 	return (result);
4015 }
4016 
4017 static isc_result_t
4018 zone_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
4019 	  dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
4020 	  dns_dbnode_t **nodep, dns_name_t *foundname, dns_rdataset_t *rdataset,
4021 	  dns_rdataset_t *sigrdataset) {
4022 	dns_rbtnode_t *node = NULL;
4023 	isc_result_t result;
4024 	rbtdb_search_t search;
4025 	bool cname_ok = true;
4026 	bool close_version = false;
4027 	bool maybe_zonecut = false;
4028 	bool at_zonecut = false;
4029 	bool wild;
4030 	bool empty_node;
4031 	rdatasetheader_t *header, *header_next, *found, *nsecheader;
4032 	rdatasetheader_t *foundsig, *cnamesig, *nsecsig;
4033 	rbtdb_rdatatype_t sigtype;
4034 	bool active;
4035 	nodelock_t *lock;
4036 	dns_rbt_t *tree;
4037 
4038 	search.rbtdb = (dns_rbtdb_t *)db;
4039 
4040 	REQUIRE(VALID_RBTDB(search.rbtdb));
4041 	INSIST(version == NULL ||
4042 	       ((rbtdb_version_t *)version)->rbtdb == (dns_rbtdb_t *)db);
4043 
4044 	/*
4045 	 * We don't care about 'now'.
4046 	 */
4047 	UNUSED(now);
4048 
4049 	/*
4050 	 * If the caller didn't supply a version, attach to the current
4051 	 * version.
4052 	 */
4053 	if (version == NULL) {
4054 		currentversion(db, &version);
4055 		close_version = true;
4056 	}
4057 
4058 	search.rbtversion = version;
4059 	search.serial = search.rbtversion->serial;
4060 	search.options = options;
4061 	search.copy_name = false;
4062 	search.need_cleanup = false;
4063 	search.wild = false;
4064 	search.zonecut = NULL;
4065 	dns_fixedname_init(&search.zonecut_name);
4066 	dns_rbtnodechain_init(&search.chain);
4067 	search.now = 0;
4068 
4069 	/*
4070 	 * 'wild' will be true iff. we've matched a wildcard.
4071 	 */
4072 	wild = false;
4073 
4074 	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4075 
4076 	/*
4077 	 * Search down from the root of the tree.  If, while going down, we
4078 	 * encounter a callback node, zone_zonecut_callback() will search the
4079 	 * rdatasets at the zone cut for active DNAME or NS rdatasets.
4080 	 */
4081 	tree = (options & DNS_DBFIND_FORCENSEC3) != 0 ? search.rbtdb->nsec3
4082 						      : search.rbtdb->tree;
4083 	result = dns_rbt_findnode(tree, name, foundname, &node, &search.chain,
4084 				  DNS_RBTFIND_EMPTYDATA, zone_zonecut_callback,
4085 				  &search);
4086 
4087 	if (result == DNS_R_PARTIALMATCH) {
4088 	partial_match:
4089 		if (search.zonecut != NULL) {
4090 			result = setup_delegation(&search, nodep, foundname,
4091 						  rdataset, sigrdataset);
4092 			goto tree_exit;
4093 		}
4094 
4095 		if (search.wild) {
4096 			/*
4097 			 * At least one of the levels in the search chain
4098 			 * potentially has a wildcard.  For each such level,
4099 			 * we must see if there's a matching wildcard active
4100 			 * in the current version.
4101 			 */
4102 			result = find_wildcard(&search, &node, name);
4103 			if (result == ISC_R_SUCCESS) {
4104 				dns_name_copy(name, foundname);
4105 				wild = true;
4106 				goto found;
4107 			} else if (result != ISC_R_NOTFOUND) {
4108 				goto tree_exit;
4109 			}
4110 		}
4111 
4112 		active = false;
4113 		if ((options & DNS_DBFIND_FORCENSEC3) == 0) {
4114 			/*
4115 			 * The NSEC3 tree won't have empty nodes,
4116 			 * so it isn't necessary to check for them.
4117 			 */
4118 			dns_rbtnodechain_t chain = search.chain;
4119 			active = activeempty(&search, &chain, name);
4120 		}
4121 
4122 		/*
4123 		 * If we're here, then the name does not exist, is not
4124 		 * beneath a zonecut, and there's no matching wildcard.
4125 		 */
4126 		if ((search.rbtversion->secure == dns_db_secure &&
4127 		     !search.rbtversion->havensec3) ||
4128 		    (search.options & DNS_DBFIND_FORCENSEC) != 0 ||
4129 		    (search.options & DNS_DBFIND_FORCENSEC3) != 0)
4130 		{
4131 			result = find_closest_nsec(&search, nodep, foundname,
4132 						   rdataset, sigrdataset, tree,
4133 						   search.rbtversion->secure);
4134 			if (result == ISC_R_SUCCESS) {
4135 				result = active ? DNS_R_EMPTYNAME
4136 						: DNS_R_NXDOMAIN;
4137 			}
4138 		} else {
4139 			bool wantpartial = (options & DNS_DBFIND_WANTPARTIAL) !=
4140 					   0;
4141 			result = active	       ? DNS_R_EMPTYNAME
4142 				 : wantpartial ? DNS_R_PARTIALMATCH
4143 					       : DNS_R_NXDOMAIN;
4144 		}
4145 		goto tree_exit;
4146 	} else if (result != ISC_R_SUCCESS) {
4147 		goto tree_exit;
4148 	}
4149 
4150 found:
4151 	/*
4152 	 * We have found a node whose name is the desired name, or we
4153 	 * have matched a wildcard.
4154 	 */
4155 
4156 	if (search.zonecut != NULL) {
4157 		/*
4158 		 * If we're beneath a zone cut, we don't want to look for
4159 		 * CNAMEs because they're not legitimate zone glue.
4160 		 */
4161 		cname_ok = false;
4162 	} else {
4163 		/*
4164 		 * The node may be a zone cut itself.  If it might be one,
4165 		 * make sure we check for it later.
4166 		 *
4167 		 * DS records live above the zone cut in ordinary zone so
4168 		 * we want to ignore any referral.
4169 		 *
4170 		 * Stub zones don't have anything "above" the delegation so
4171 		 * we always return a referral.
4172 		 */
4173 		if (node->find_callback &&
4174 		    ((node != search.rbtdb->origin_node &&
4175 		      !dns_rdatatype_atparent(type)) ||
4176 		     IS_STUB(search.rbtdb)))
4177 		{
4178 			maybe_zonecut = true;
4179 		}
4180 	}
4181 
4182 	/*
4183 	 * Certain DNSSEC types are not subject to CNAME matching
4184 	 * (RFC4035, section 2.5 and RFC3007).
4185 	 *
4186 	 * We don't check for RRSIG, because we don't store RRSIG records
4187 	 * directly.
4188 	 */
4189 	if (type == dns_rdatatype_key || type == dns_rdatatype_nsec) {
4190 		cname_ok = false;
4191 	}
4192 
4193 	/*
4194 	 * We now go looking for rdata...
4195 	 */
4196 
4197 	lock = &search.rbtdb->node_locks[node->locknum].lock;
4198 	NODE_LOCK(lock, isc_rwlocktype_read);
4199 
4200 	found = NULL;
4201 	foundsig = NULL;
4202 	sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
4203 	nsecheader = NULL;
4204 	nsecsig = NULL;
4205 	cnamesig = NULL;
4206 	empty_node = true;
4207 	for (header = node->data; header != NULL; header = header_next) {
4208 		header_next = header->next;
4209 		/*
4210 		 * Look for an active, extant rdataset.
4211 		 */
4212 		do {
4213 			if (header->serial <= search.serial && !IGNORE(header))
4214 			{
4215 				/*
4216 				 * Is this a "this rdataset doesn't
4217 				 * exist" record?
4218 				 */
4219 				if (NONEXISTENT(header)) {
4220 					header = NULL;
4221 				}
4222 				break;
4223 			} else {
4224 				header = header->down;
4225 			}
4226 		} while (header != NULL);
4227 		if (header != NULL) {
4228 			/*
4229 			 * We now know that there is at least one active
4230 			 * rdataset at this node.
4231 			 */
4232 			empty_node = false;
4233 
4234 			/*
4235 			 * Do special zone cut handling, if requested.
4236 			 */
4237 			if (maybe_zonecut && header->type == dns_rdatatype_ns) {
4238 				/*
4239 				 * We increment the reference count on node to
4240 				 * ensure that search->zonecut_rdataset will
4241 				 * still be valid later.
4242 				 */
4243 				new_reference(search.rbtdb, node,
4244 					      isc_rwlocktype_read);
4245 				search.zonecut = node;
4246 				search.zonecut_rdataset = header;
4247 				search.zonecut_sigrdataset = NULL;
4248 				search.need_cleanup = true;
4249 				maybe_zonecut = false;
4250 				at_zonecut = true;
4251 				/*
4252 				 * It is not clear if KEY should still be
4253 				 * allowed at the parent side of the zone
4254 				 * cut or not.  It is needed for RFC3007
4255 				 * validated updates.
4256 				 */
4257 				if ((search.options & DNS_DBFIND_GLUEOK) == 0 &&
4258 				    type != dns_rdatatype_nsec &&
4259 				    type != dns_rdatatype_key)
4260 				{
4261 					/*
4262 					 * Glue is not OK, but any answer we
4263 					 * could return would be glue.  Return
4264 					 * the delegation.
4265 					 */
4266 					found = NULL;
4267 					break;
4268 				}
4269 				if (found != NULL && foundsig != NULL) {
4270 					break;
4271 				}
4272 			}
4273 
4274 			/*
4275 			 * If the NSEC3 record doesn't match the chain
4276 			 * we are using behave as if it isn't here.
4277 			 */
4278 			if (header->type == dns_rdatatype_nsec3 &&
4279 			    !matchparams(header, &search))
4280 			{
4281 				NODE_UNLOCK(lock, isc_rwlocktype_read);
4282 				goto partial_match;
4283 			}
4284 			/*
4285 			 * If we found a type we were looking for,
4286 			 * remember it.
4287 			 */
4288 			if (header->type == type || type == dns_rdatatype_any ||
4289 			    (header->type == dns_rdatatype_cname && cname_ok))
4290 			{
4291 				/*
4292 				 * We've found the answer!
4293 				 */
4294 				found = header;
4295 				if (header->type == dns_rdatatype_cname &&
4296 				    cname_ok)
4297 				{
4298 					/*
4299 					 * We may be finding a CNAME instead
4300 					 * of the desired type.
4301 					 *
4302 					 * If we've already got the CNAME RRSIG,
4303 					 * use it, otherwise change sigtype
4304 					 * so that we find it.
4305 					 */
4306 					if (cnamesig != NULL) {
4307 						foundsig = cnamesig;
4308 					} else {
4309 						sigtype =
4310 							RBTDB_RDATATYPE_SIGCNAME;
4311 					}
4312 				}
4313 				/*
4314 				 * If we've got all we need, end the search.
4315 				 */
4316 				if (!maybe_zonecut && foundsig != NULL) {
4317 					break;
4318 				}
4319 			} else if (header->type == sigtype) {
4320 				/*
4321 				 * We've found the RRSIG rdataset for our
4322 				 * target type.  Remember it.
4323 				 */
4324 				foundsig = header;
4325 				/*
4326 				 * If we've got all we need, end the search.
4327 				 */
4328 				if (!maybe_zonecut && found != NULL) {
4329 					break;
4330 				}
4331 			} else if (header->type == dns_rdatatype_nsec &&
4332 				   !search.rbtversion->havensec3)
4333 			{
4334 				/*
4335 				 * Remember a NSEC rdataset even if we're
4336 				 * not specifically looking for it, because
4337 				 * we might need it later.
4338 				 */
4339 				nsecheader = header;
4340 			} else if (header->type == RBTDB_RDATATYPE_SIGNSEC &&
4341 				   !search.rbtversion->havensec3)
4342 			{
4343 				/*
4344 				 * If we need the NSEC rdataset, we'll also
4345 				 * need its signature.
4346 				 */
4347 				nsecsig = header;
4348 			} else if (cname_ok &&
4349 				   header->type == RBTDB_RDATATYPE_SIGCNAME)
4350 			{
4351 				/*
4352 				 * If we get a CNAME match, we'll also need
4353 				 * its signature.
4354 				 */
4355 				cnamesig = header;
4356 			}
4357 		}
4358 	}
4359 
4360 	if (empty_node) {
4361 		/*
4362 		 * We have an exact match for the name, but there are no
4363 		 * active rdatasets in the desired version.  That means that
4364 		 * this node doesn't exist in the desired version, and that
4365 		 * we really have a partial match.
4366 		 */
4367 		if (!wild) {
4368 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4369 			goto partial_match;
4370 		}
4371 	}
4372 
4373 	/*
4374 	 * If we didn't find what we were looking for...
4375 	 */
4376 	if (found == NULL) {
4377 		if (search.zonecut != NULL) {
4378 			/*
4379 			 * We were trying to find glue at a node beneath a
4380 			 * zone cut, but didn't.
4381 			 *
4382 			 * Return the delegation.
4383 			 */
4384 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4385 			result = setup_delegation(&search, nodep, foundname,
4386 						  rdataset, sigrdataset);
4387 			goto tree_exit;
4388 		}
4389 		/*
4390 		 * The desired type doesn't exist.
4391 		 */
4392 		result = DNS_R_NXRRSET;
4393 		if (search.rbtversion->secure == dns_db_secure &&
4394 		    !search.rbtversion->havensec3 &&
4395 		    (nsecheader == NULL || nsecsig == NULL))
4396 		{
4397 			/*
4398 			 * The zone is secure but there's no NSEC,
4399 			 * or the NSEC has no signature!
4400 			 */
4401 			if (!wild) {
4402 				result = DNS_R_BADDB;
4403 				goto node_exit;
4404 			}
4405 
4406 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4407 			result = find_closest_nsec(&search, nodep, foundname,
4408 						   rdataset, sigrdataset,
4409 						   search.rbtdb->tree,
4410 						   search.rbtversion->secure);
4411 			if (result == ISC_R_SUCCESS) {
4412 				result = DNS_R_EMPTYWILD;
4413 			}
4414 			goto tree_exit;
4415 		}
4416 		if ((search.options & DNS_DBFIND_FORCENSEC) != 0 &&
4417 		    nsecheader == NULL)
4418 		{
4419 			/*
4420 			 * There's no NSEC record, and we were told
4421 			 * to find one.
4422 			 */
4423 			result = DNS_R_BADDB;
4424 			goto node_exit;
4425 		}
4426 		if (nodep != NULL) {
4427 			new_reference(search.rbtdb, node, isc_rwlocktype_read);
4428 			*nodep = node;
4429 		}
4430 		if ((search.rbtversion->secure == dns_db_secure &&
4431 		     !search.rbtversion->havensec3) ||
4432 		    (search.options & DNS_DBFIND_FORCENSEC) != 0)
4433 		{
4434 			bind_rdataset(search.rbtdb, node, nsecheader, 0,
4435 				      isc_rwlocktype_read, rdataset);
4436 			if (nsecsig != NULL) {
4437 				bind_rdataset(search.rbtdb, node, nsecsig, 0,
4438 					      isc_rwlocktype_read, sigrdataset);
4439 			}
4440 		}
4441 		if (wild) {
4442 			foundname->attributes |= DNS_NAMEATTR_WILDCARD;
4443 		}
4444 		goto node_exit;
4445 	}
4446 
4447 	/*
4448 	 * We found what we were looking for, or we found a CNAME.
4449 	 */
4450 
4451 	if (type != found->type && type != dns_rdatatype_any &&
4452 	    found->type == dns_rdatatype_cname)
4453 	{
4454 		/*
4455 		 * We weren't doing an ANY query and we found a CNAME instead
4456 		 * of the type we were looking for, so we need to indicate
4457 		 * that result to the caller.
4458 		 */
4459 		result = DNS_R_CNAME;
4460 	} else if (search.zonecut != NULL) {
4461 		/*
4462 		 * If we're beneath a zone cut, we must indicate that the
4463 		 * result is glue, unless we're actually at the zone cut
4464 		 * and the type is NSEC or KEY.
4465 		 */
4466 		if (search.zonecut == node) {
4467 			/*
4468 			 * It is not clear if KEY should still be
4469 			 * allowed at the parent side of the zone
4470 			 * cut or not.  It is needed for RFC3007
4471 			 * validated updates.
4472 			 */
4473 			if (type == dns_rdatatype_nsec ||
4474 			    type == dns_rdatatype_nsec3 ||
4475 			    type == dns_rdatatype_key)
4476 			{
4477 				result = ISC_R_SUCCESS;
4478 			} else if (type == dns_rdatatype_any) {
4479 				result = DNS_R_ZONECUT;
4480 			} else {
4481 				result = DNS_R_GLUE;
4482 			}
4483 		} else {
4484 			result = DNS_R_GLUE;
4485 		}
4486 		/*
4487 		 * We might have found data that isn't glue, but was occluded
4488 		 * by a dynamic update.  If the caller cares about this, they
4489 		 * will have told us to validate glue.
4490 		 *
4491 		 * XXX We should cache the glue validity state!
4492 		 */
4493 		if (result == DNS_R_GLUE &&
4494 		    (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 &&
4495 		    !valid_glue(&search, foundname, type, node))
4496 		{
4497 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4498 			result = setup_delegation(&search, nodep, foundname,
4499 						  rdataset, sigrdataset);
4500 			goto tree_exit;
4501 		}
4502 	} else {
4503 		/*
4504 		 * An ordinary successful query!
4505 		 */
4506 		result = ISC_R_SUCCESS;
4507 	}
4508 
4509 	if (nodep != NULL) {
4510 		if (!at_zonecut) {
4511 			new_reference(search.rbtdb, node, isc_rwlocktype_read);
4512 		} else {
4513 			search.need_cleanup = false;
4514 		}
4515 		*nodep = node;
4516 	}
4517 
4518 	if (type != dns_rdatatype_any) {
4519 		bind_rdataset(search.rbtdb, node, found, 0, isc_rwlocktype_read,
4520 			      rdataset);
4521 		if (foundsig != NULL) {
4522 			bind_rdataset(search.rbtdb, node, foundsig, 0,
4523 				      isc_rwlocktype_read, sigrdataset);
4524 		}
4525 	}
4526 
4527 	if (wild) {
4528 		foundname->attributes |= DNS_NAMEATTR_WILDCARD;
4529 	}
4530 
4531 node_exit:
4532 	NODE_UNLOCK(lock, isc_rwlocktype_read);
4533 
4534 tree_exit:
4535 	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4536 
4537 	/*
4538 	 * If we found a zonecut but aren't going to use it, we have to
4539 	 * let go of it.
4540 	 */
4541 	if (search.need_cleanup) {
4542 		node = search.zonecut;
4543 		INSIST(node != NULL);
4544 		lock = &(search.rbtdb->node_locks[node->locknum].lock);
4545 
4546 		NODE_LOCK(lock, isc_rwlocktype_read);
4547 		decrement_reference(search.rbtdb, node, 0, isc_rwlocktype_read,
4548 				    isc_rwlocktype_none, false);
4549 		NODE_UNLOCK(lock, isc_rwlocktype_read);
4550 	}
4551 
4552 	if (close_version) {
4553 		closeversion(db, &version, false);
4554 	}
4555 
4556 	dns_rbtnodechain_reset(&search.chain);
4557 
4558 	return (result);
4559 }
4560 
4561 static isc_result_t
4562 zone_findzonecut(dns_db_t *db, const dns_name_t *name, unsigned int options,
4563 		 isc_stdtime_t now, dns_dbnode_t **nodep, dns_name_t *foundname,
4564 		 dns_name_t *dcname, dns_rdataset_t *rdataset,
4565 		 dns_rdataset_t *sigrdataset) {
4566 	UNUSED(db);
4567 	UNUSED(name);
4568 	UNUSED(options);
4569 	UNUSED(now);
4570 	UNUSED(nodep);
4571 	UNUSED(foundname);
4572 	UNUSED(dcname);
4573 	UNUSED(rdataset);
4574 	UNUSED(sigrdataset);
4575 
4576 	FATAL_ERROR("zone_findzonecut() called!");
4577 
4578 	UNREACHABLE();
4579 	return (ISC_R_NOTIMPLEMENTED);
4580 }
4581 
4582 static bool
4583 check_stale_header(dns_rbtnode_t *node, rdatasetheader_t *header,
4584 		   isc_rwlocktype_t *locktype, nodelock_t *lock,
4585 		   rbtdb_search_t *search, rdatasetheader_t **header_prev) {
4586 	if (!ACTIVE(header, search->now)) {
4587 		dns_ttl_t stale = header->rdh_ttl +
4588 				  STALE_TTL(header, search->rbtdb);
4589 		/*
4590 		 * If this data is in the stale window keep it and if
4591 		 * DNS_DBFIND_STALEOK is not set we tell the caller to
4592 		 * skip this record.  We skip the records with ZEROTTL
4593 		 * (these records should not be cached anyway).
4594 		 */
4595 
4596 		RDATASET_ATTR_CLR(header, RDATASET_ATTR_STALE_WINDOW);
4597 		if (!ZEROTTL(header) && KEEPSTALE(search->rbtdb) &&
4598 		    stale > search->now)
4599 		{
4600 			mark_header_stale(search->rbtdb, header);
4601 			*header_prev = header;
4602 			/*
4603 			 * If DNS_DBFIND_STALESTART is set then it means we
4604 			 * failed to resolve the name during recursion, in
4605 			 * this case we mark the time in which the refresh
4606 			 * failed.
4607 			 */
4608 			if ((search->options & DNS_DBFIND_STALESTART) != 0) {
4609 				atomic_store_release(
4610 					&header->last_refresh_fail_ts,
4611 					search->now);
4612 			} else if ((search->options &
4613 				    DNS_DBFIND_STALEENABLED) != 0 &&
4614 				   search->now <
4615 					   (atomic_load_acquire(
4616 						    &header->last_refresh_fail_ts) +
4617 					    search->rbtdb->serve_stale_refresh))
4618 			{
4619 				/*
4620 				 * If we are within interval between last
4621 				 * refresh failure time + 'stale-refresh-time',
4622 				 * then don't skip this stale entry but use it
4623 				 * instead.
4624 				 */
4625 				RDATASET_ATTR_SET(header,
4626 						  RDATASET_ATTR_STALE_WINDOW);
4627 				return (false);
4628 			} else if ((search->options &
4629 				    DNS_DBFIND_STALETIMEOUT) != 0)
4630 			{
4631 				/*
4632 				 * We want stale RRset due to timeout, so we
4633 				 * don't skip it.
4634 				 */
4635 				return (false);
4636 			}
4637 			return ((search->options & DNS_DBFIND_STALEOK) == 0);
4638 		}
4639 
4640 		/*
4641 		 * This rdataset is stale.  If no one else is using the
4642 		 * node, we can clean it up right now, otherwise we mark
4643 		 * it as ancient, and the node as dirty, so it will get
4644 		 * cleaned up later.
4645 		 */
4646 		if ((header->rdh_ttl < search->now - RBTDB_VIRTUAL) &&
4647 		    (*locktype == isc_rwlocktype_write ||
4648 		     NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS))
4649 		{
4650 			/*
4651 			 * We update the node's status only when we can
4652 			 * get write access; otherwise, we leave others
4653 			 * to this work.  Periodical cleaning will
4654 			 * eventually take the job as the last resort.
4655 			 * We won't downgrade the lock, since other
4656 			 * rdatasets are probably stale, too.
4657 			 */
4658 			*locktype = isc_rwlocktype_write;
4659 
4660 			if (isc_refcount_current(&node->references) == 0) {
4661 				isc_mem_t *mctx;
4662 
4663 				/*
4664 				 * header->down can be non-NULL if the
4665 				 * refcount has just decremented to 0
4666 				 * but decrement_reference() has not
4667 				 * performed clean_cache_node(), in
4668 				 * which case we need to purge the stale
4669 				 * headers first.
4670 				 */
4671 				mctx = search->rbtdb->common.mctx;
4672 				clean_stale_headers(search->rbtdb, mctx,
4673 						    header);
4674 				if (*header_prev != NULL) {
4675 					(*header_prev)->next = header->next;
4676 				} else {
4677 					node->data = header->next;
4678 				}
4679 				free_rdataset(search->rbtdb, mctx, header);
4680 			} else {
4681 				mark_header_ancient(search->rbtdb, header);
4682 				*header_prev = header;
4683 			}
4684 		} else {
4685 			*header_prev = header;
4686 		}
4687 		return (true);
4688 	}
4689 	return (false);
4690 }
4691 
4692 static isc_result_t
4693 cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
4694 	rbtdb_search_t *search = arg;
4695 	rdatasetheader_t *header, *header_prev, *header_next;
4696 	rdatasetheader_t *dname_header, *sigdname_header;
4697 	isc_result_t result;
4698 	nodelock_t *lock;
4699 	isc_rwlocktype_t locktype;
4700 
4701 	/* XXX comment */
4702 
4703 	REQUIRE(search->zonecut == NULL);
4704 
4705 	/*
4706 	 * Keep compiler silent.
4707 	 */
4708 	UNUSED(name);
4709 
4710 	lock = &(search->rbtdb->node_locks[node->locknum].lock);
4711 	locktype = isc_rwlocktype_read;
4712 	NODE_LOCK(lock, locktype);
4713 
4714 	/*
4715 	 * Look for a DNAME or RRSIG DNAME rdataset.
4716 	 */
4717 	dname_header = NULL;
4718 	sigdname_header = NULL;
4719 	header_prev = NULL;
4720 	for (header = node->data; header != NULL; header = header_next) {
4721 		header_next = header->next;
4722 		if (check_stale_header(node, header, &locktype, lock, search,
4723 				       &header_prev))
4724 		{
4725 			/* Do nothing. */
4726 		} else if (header->type == dns_rdatatype_dname &&
4727 			   EXISTS(header) && !ANCIENT(header))
4728 		{
4729 			dname_header = header;
4730 			header_prev = header;
4731 		} else if (header->type == RBTDB_RDATATYPE_SIGDNAME &&
4732 			   EXISTS(header) && !ANCIENT(header))
4733 		{
4734 			sigdname_header = header;
4735 			header_prev = header;
4736 		} else {
4737 			header_prev = header;
4738 		}
4739 	}
4740 
4741 	if (dname_header != NULL &&
4742 	    (!DNS_TRUST_PENDING(dname_header->trust) ||
4743 	     (search->options & DNS_DBFIND_PENDINGOK) != 0))
4744 	{
4745 		/*
4746 		 * We increment the reference count on node to ensure that
4747 		 * search->zonecut_rdataset will still be valid later.
4748 		 */
4749 		new_reference(search->rbtdb, node, locktype);
4750 		search->zonecut = node;
4751 		search->zonecut_rdataset = dname_header;
4752 		search->zonecut_sigrdataset = sigdname_header;
4753 		search->need_cleanup = true;
4754 		result = DNS_R_PARTIALMATCH;
4755 	} else {
4756 		result = DNS_R_CONTINUE;
4757 	}
4758 
4759 	NODE_UNLOCK(lock, locktype);
4760 
4761 	return (result);
4762 }
4763 
4764 static isc_result_t
4765 find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node,
4766 		     dns_dbnode_t **nodep, dns_name_t *foundname,
4767 		     dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
4768 	unsigned int i;
4769 	dns_rbtnode_t *level_node;
4770 	rdatasetheader_t *header, *header_prev, *header_next;
4771 	rdatasetheader_t *found, *foundsig;
4772 	isc_result_t result = ISC_R_NOTFOUND;
4773 	dns_name_t name;
4774 	dns_rbtdb_t *rbtdb;
4775 	bool done;
4776 	nodelock_t *lock;
4777 	isc_rwlocktype_t locktype;
4778 
4779 	/*
4780 	 * Caller must be holding the tree lock.
4781 	 */
4782 
4783 	rbtdb = search->rbtdb;
4784 	i = search->chain.level_matches;
4785 	done = false;
4786 	do {
4787 		locktype = isc_rwlocktype_read;
4788 		lock = &rbtdb->node_locks[node->locknum].lock;
4789 		NODE_LOCK(lock, locktype);
4790 
4791 		/*
4792 		 * Look for NS and RRSIG NS rdatasets.
4793 		 */
4794 		found = NULL;
4795 		foundsig = NULL;
4796 		header_prev = NULL;
4797 		for (header = node->data; header != NULL; header = header_next)
4798 		{
4799 			header_next = header->next;
4800 			if (check_stale_header(node, header, &locktype, lock,
4801 					       search, &header_prev))
4802 			{
4803 				/* Do nothing. */
4804 			} else if (EXISTS(header) && !ANCIENT(header)) {
4805 				/*
4806 				 * We've found an extant rdataset.  See if
4807 				 * we're interested in it.
4808 				 */
4809 				if (header->type == dns_rdatatype_ns) {
4810 					found = header;
4811 					if (foundsig != NULL) {
4812 						break;
4813 					}
4814 				} else if (header->type ==
4815 					   RBTDB_RDATATYPE_SIGNS)
4816 				{
4817 					foundsig = header;
4818 					if (found != NULL) {
4819 						break;
4820 					}
4821 				}
4822 				header_prev = header;
4823 			} else {
4824 				header_prev = header;
4825 			}
4826 		}
4827 
4828 		if (found != NULL) {
4829 			/*
4830 			 * If we have to set foundname, we do it before
4831 			 * anything else.  If we were to set foundname after
4832 			 * we had set nodep or bound the rdataset, then we'd
4833 			 * have to undo that work if dns_name_concatenate()
4834 			 * failed.  By setting foundname first, there's
4835 			 * nothing to undo if we have trouble.
4836 			 */
4837 			if (foundname != NULL) {
4838 				dns_name_init(&name, NULL);
4839 				dns_rbt_namefromnode(node, &name);
4840 				dns_name_copy(&name, foundname);
4841 				while (i > 0) {
4842 					i--;
4843 					level_node = search->chain.levels[i];
4844 					dns_name_init(&name, NULL);
4845 					dns_rbt_namefromnode(level_node, &name);
4846 					result = dns_name_concatenate(
4847 						foundname, &name, foundname,
4848 						NULL);
4849 					if (result != ISC_R_SUCCESS) {
4850 						if (nodep != NULL) {
4851 							*nodep = NULL;
4852 						}
4853 						goto node_exit;
4854 					}
4855 				}
4856 			}
4857 			result = DNS_R_DELEGATION;
4858 			if (nodep != NULL) {
4859 				new_reference(search->rbtdb, node, locktype);
4860 				*nodep = node;
4861 			}
4862 			bind_rdataset(search->rbtdb, node, found, search->now,
4863 				      locktype, rdataset);
4864 			if (foundsig != NULL) {
4865 				bind_rdataset(search->rbtdb, node, foundsig,
4866 					      search->now, locktype,
4867 					      sigrdataset);
4868 			}
4869 			if (need_headerupdate(found, search->now) ||
4870 			    (foundsig != NULL &&
4871 			     need_headerupdate(foundsig, search->now)))
4872 			{
4873 				if (locktype != isc_rwlocktype_write) {
4874 					NODE_UNLOCK(lock, locktype);
4875 					NODE_LOCK(lock, isc_rwlocktype_write);
4876 					locktype = isc_rwlocktype_write;
4877 					POST(locktype);
4878 				}
4879 				if (need_headerupdate(found, search->now)) {
4880 					update_header(search->rbtdb, found,
4881 						      search->now);
4882 				}
4883 				if (foundsig != NULL &&
4884 				    need_headerupdate(foundsig, search->now))
4885 				{
4886 					update_header(search->rbtdb, foundsig,
4887 						      search->now);
4888 				}
4889 			}
4890 		}
4891 
4892 	node_exit:
4893 		NODE_UNLOCK(lock, locktype);
4894 
4895 		if (found == NULL && i > 0) {
4896 			i--;
4897 			node = search->chain.levels[i];
4898 		} else {
4899 			done = true;
4900 		}
4901 	} while (!done);
4902 
4903 	return (result);
4904 }
4905 
4906 /*
4907  * Look for a potentially covering NSEC in the cache where `name`
4908  * is known not to exist.  This uses the auxiliary NSEC tree to find
4909  * the potential NSEC owner. If found, we update 'foundname', 'nodep',
4910  * 'rdataset' and 'sigrdataset', and return DNS_R_COVERINGNSEC.
4911  * Otherwise, return ISC_R_NOTFOUND.
4912  */
4913 static isc_result_t
4914 find_coveringnsec(rbtdb_search_t *search, const dns_name_t *name,
4915 		  dns_dbnode_t **nodep, isc_stdtime_t now,
4916 		  dns_name_t *foundname, dns_rdataset_t *rdataset,
4917 		  dns_rdataset_t *sigrdataset) {
4918 	dns_fixedname_t fprefix, forigin, ftarget, fixed;
4919 	dns_name_t *prefix = NULL, *origin = NULL;
4920 	dns_name_t *target = NULL, *fname = NULL;
4921 	dns_rbtnode_t *node = NULL;
4922 	dns_rbtnodechain_t chain;
4923 	isc_result_t result;
4924 	isc_rwlocktype_t locktype;
4925 	nodelock_t *lock = NULL;
4926 	rbtdb_rdatatype_t matchtype, sigmatchtype;
4927 	rdatasetheader_t *found = NULL, *foundsig = NULL;
4928 	rdatasetheader_t *header = NULL;
4929 	rdatasetheader_t *header_next = NULL, *header_prev = NULL;
4930 
4931 	/*
4932 	 * Look for the node in the auxiliary tree.
4933 	 */
4934 	dns_rbtnodechain_init(&chain);
4935 	target = dns_fixedname_initname(&ftarget);
4936 	result = dns_rbt_findnode(search->rbtdb->nsec, name, target, &node,
4937 				  &chain, DNS_RBTFIND_EMPTYDATA, NULL, NULL);
4938 	if (result != DNS_R_PARTIALMATCH) {
4939 		dns_rbtnodechain_reset(&chain);
4940 		return (ISC_R_NOTFOUND);
4941 	}
4942 
4943 	prefix = dns_fixedname_initname(&fprefix);
4944 	origin = dns_fixedname_initname(&forigin);
4945 	target = dns_fixedname_initname(&ftarget);
4946 	fname = dns_fixedname_initname(&fixed);
4947 
4948 	locktype = isc_rwlocktype_read;
4949 	matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0);
4950 	sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig,
4951 					     dns_rdatatype_nsec);
4952 
4953 	/*
4954 	 * Extract predecessor from chain.
4955 	 */
4956 	result = dns_rbtnodechain_current(&chain, prefix, origin, NULL);
4957 	dns_rbtnodechain_reset(&chain);
4958 	if (result != ISC_R_SUCCESS && result != DNS_R_NEWORIGIN) {
4959 		return (ISC_R_NOTFOUND);
4960 	}
4961 
4962 	result = dns_name_concatenate(prefix, origin, target, NULL);
4963 	if (result != ISC_R_SUCCESS) {
4964 		return (ISC_R_NOTFOUND);
4965 	}
4966 
4967 	/*
4968 	 * Lookup the predecessor in the main tree.
4969 	 */
4970 	node = NULL;
4971 	result = dns_rbt_findnode(search->rbtdb->tree, target, fname, &node,
4972 				  NULL, DNS_RBTFIND_EMPTYDATA, NULL, NULL);
4973 	if (result != ISC_R_SUCCESS) {
4974 		return (ISC_R_NOTFOUND);
4975 	}
4976 
4977 	lock = &(search->rbtdb->node_locks[node->locknum].lock);
4978 	NODE_LOCK(lock, locktype);
4979 	for (header = node->data; header != NULL; header = header_next) {
4980 		header_next = header->next;
4981 		if (check_stale_header(node, header, &locktype, lock, search,
4982 				       &header_prev))
4983 		{
4984 			continue;
4985 		}
4986 		if (NONEXISTENT(header) ||
4987 		    RBTDB_RDATATYPE_BASE(header->type) == 0)
4988 		{
4989 			header_prev = header;
4990 			continue;
4991 		}
4992 		if (header->type == matchtype) {
4993 			found = header;
4994 			if (foundsig != NULL) {
4995 				break;
4996 			}
4997 		} else if (header->type == sigmatchtype) {
4998 			foundsig = header;
4999 			if (found != NULL) {
5000 				break;
5001 			}
5002 		}
5003 		header_prev = header;
5004 	}
5005 	if (found != NULL) {
5006 		bind_rdataset(search->rbtdb, node, found, now, locktype,
5007 			      rdataset);
5008 		if (foundsig != NULL) {
5009 			bind_rdataset(search->rbtdb, node, foundsig, now,
5010 				      locktype, sigrdataset);
5011 		}
5012 		new_reference(search->rbtdb, node, locktype);
5013 
5014 		dns_name_copy(fname, foundname);
5015 
5016 		*nodep = node;
5017 		result = DNS_R_COVERINGNSEC;
5018 	} else {
5019 		result = ISC_R_NOTFOUND;
5020 	}
5021 	NODE_UNLOCK(lock, locktype);
5022 	return (result);
5023 }
5024 
/*
 * Look up 'name'/'type' in the cache database 'db'.
 *
 * 'version' must be NULL.  If 'now' is 0 the current time is used.
 * 'nodep' may be NULL; when it is not, a referenced node is stored in
 * it on the paths below that return data from the matched node.
 *
 * The tree lock is held for reading for the whole search.  The node
 * lock is taken for reading; 'locktype' is passed by address to
 * check_stale_header() and is switched to write at 'node_exit' when a
 * header needs updating.
 *
 * Results produced directly by this function:
 *	ISC_R_SUCCESS		the requested rdataset was found
 *	DNS_R_CNAME		a CNAME was found instead of 'type'
 *	DNS_R_NCACHENXDOMAIN	negative entry flagged NXDOMAIN
 *	DNS_R_NCACHENXRRSET	any other negative entry
 *	DNS_R_COVERINGNSEC	an NSEC was returned
 *				(DNS_DBFIND_COVERINGNSEC set)
 *	DNS_R_DELEGATION	an NS rdataset at the node was returned
 * Any other value comes from dns_rbt_findnode(), setup_delegation()
 * or find_deepest_zonecut().
 */
static isc_result_t
cache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
	   dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
	   dns_dbnode_t **nodep, dns_name_t *foundname,
	   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
	dns_rbtnode_t *node = NULL;
	isc_result_t result;
	rbtdb_search_t search;
	bool cname_ok = true;
	bool found_noqname = false;
	bool all_negative = true;
	bool empty_node;
	nodelock_t *lock;
	isc_rwlocktype_t locktype;
	rdatasetheader_t *header, *header_prev, *header_next;
	rdatasetheader_t *found, *nsheader;
	rdatasetheader_t *foundsig, *nssig, *cnamesig;
	rdatasetheader_t *update, *updatesig;
	rdatasetheader_t *nsecheader, *nsecsig;
	rbtdb_rdatatype_t sigtype, negtype;

	UNUSED(version);

	search.rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(search.rbtdb));
	REQUIRE(version == NULL);

	if (now == 0) {
		isc_stdtime_get(&now);
	}

	/*
	 * Set up the search state that is shared with the callback and
	 * the helper functions.
	 */
	search.rbtversion = NULL;
	search.serial = 1;
	search.options = options;
	search.copy_name = false;
	search.need_cleanup = false;
	search.wild = false;
	search.zonecut = NULL;
	search.zonecut_rdataset = NULL;
	search.zonecut_sigrdataset = NULL;
	dns_fixedname_init(&search.zonecut_name);
	dns_rbtnodechain_init(&search.chain);
	search.now = now;
	update = NULL;
	updatesig = NULL;

	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);

	/*
	 * Search down from the root of the tree.  If, while going down, we
	 * encounter a callback node, cache_zonecut_callback() will search the
	 * rdatasets at the zone cut for a DNAME rdataset.
	 */
	result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
				  &search.chain, DNS_RBTFIND_EMPTYDATA,
				  cache_zonecut_callback, &search);

	if (result == DNS_R_PARTIALMATCH) {
		/*
		 * If dns_rbt_findnode discovered a covering DNAME skip
		 * looking for a covering NSEC.
		 */
		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0 &&
		    (search.zonecut_rdataset == NULL ||
		     search.zonecut_rdataset->type != dns_rdatatype_dname))
		{
			result = find_coveringnsec(&search, name, nodep, now,
						   foundname, rdataset,
						   sigrdataset);
			if (result == DNS_R_COVERINGNSEC) {
				goto tree_exit;
			}
		}
		if (search.zonecut != NULL) {
			result = setup_delegation(&search, nodep, foundname,
						  rdataset, sigrdataset);
			goto tree_exit;
		} else {
		/*
		 * This label is also the target of the 'goto find_ns'
		 * statements further down; each of those releases the
		 * node lock before jumping here.
		 */
		find_ns:
			result = find_deepest_zonecut(&search, node, nodep,
						      foundname, rdataset,
						      sigrdataset);
			goto tree_exit;
		}
	} else if (result != ISC_R_SUCCESS) {
		goto tree_exit;
	}

	/*
	 * Certain DNSSEC types are not subject to CNAME matching
	 * (RFC4035, section 2.5 and RFC3007).
	 *
	 * We don't check for RRSIG, because we don't store RRSIG records
	 * directly.
	 */
	if (type == dns_rdatatype_key || type == dns_rdatatype_nsec) {
		cname_ok = false;
	}

	/*
	 * We now go looking for rdata...
	 */

	lock = &(search.rbtdb->node_locks[node->locknum].lock);
	locktype = isc_rwlocktype_read;
	NODE_LOCK(lock, locktype);

	found = NULL;
	foundsig = NULL;
	sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
	negtype = RBTDB_RDATATYPE_VALUE(0, type);
	nsheader = NULL;
	nsecheader = NULL;
	nssig = NULL;
	nsecsig = NULL;
	cnamesig = NULL;
	empty_node = true;
	header_prev = NULL;
	for (header = node->data; header != NULL; header = header_next) {
		/*
		 * Save the successor before handing the header to
		 * check_stale_header().
		 */
		header_next = header->next;
		if (check_stale_header(node, header, &locktype, lock, &search,
				       &header_prev))
		{
			/* Do nothing. */
		} else if (EXISTS(header) && !ANCIENT(header)) {
			/*
			 * We now know that there is at least one active
			 * non-stale rdataset at this node.
			 */
			empty_node = false;
			if (header->noqname != NULL &&
			    header->trust == dns_trust_secure)
			{
				found_noqname = true;
			}
			if (!NEGATIVE(header)) {
				all_negative = false;
			}

			/*
			 * If we found a type we were looking for, remember
			 * it.
			 */
			if (header->type == type ||
			    (type == dns_rdatatype_any &&
			     RBTDB_RDATATYPE_BASE(header->type) != 0) ||
			    (cname_ok && header->type == dns_rdatatype_cname))
			{
				/*
				 * We've found the answer.
				 */
				found = header;
				if (header->type == dns_rdatatype_cname &&
				    cname_ok)
				{
					/*
					 * If we've already got the
					 * CNAME RRSIG, use it.
					 * Otherwise retarget 'sigtype'
					 * so that a later header can
					 * match as the CNAME signature.
					 */
					if (cnamesig != NULL) {
						foundsig = cnamesig;
					} else {
						sigtype =
							RBTDB_RDATATYPE_SIGCNAME;
					}
				}
			} else if (header->type == sigtype) {
				/*
				 * We've found the RRSIG rdataset for our
				 * target type.  Remember it.
				 */
				foundsig = header;
			} else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
				   header->type == negtype)
			{
				/*
				 * We've found a negative cache entry.
				 */
				found = header;
			} else if (header->type == dns_rdatatype_ns) {
				/*
				 * Remember a NS rdataset even if we're
				 * not specifically looking for it, because
				 * we might need it later.
				 */
				nsheader = header;
			} else if (header->type == RBTDB_RDATATYPE_SIGNS) {
				/*
				 * If we need the NS rdataset, we'll also
				 * need its signature.
				 */
				nssig = header;
			} else if (header->type == dns_rdatatype_nsec) {
				/*
				 * Remember the NSEC rdataset; it is returned
				 * below when DNS_DBFIND_COVERINGNSEC is set
				 * and no usable answer was found.
				 */
				nsecheader = header;
			} else if (header->type == RBTDB_RDATATYPE_SIGNSEC) {
				/* ... and the signature that goes with it. */
				nsecsig = header;
			} else if (cname_ok &&
				   header->type == RBTDB_RDATATYPE_SIGCNAME)
			{
				/*
				 * If we get a CNAME match, we'll also need
				 * its signature.
				 */
				cnamesig = header;
			}
			header_prev = header;
		} else {
			header_prev = header;
		}
	}

	if (empty_node) {
		/*
		 * We have an exact match for the name, but there are no
		 * extant rdatasets.  That means that this node doesn't
		 * meaningfully exist, and that we really have a partial match.
		 */
		NODE_UNLOCK(lock, locktype);
		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0) {
			result = find_coveringnsec(&search, name, nodep, now,
						   foundname, rdataset,
						   sigrdataset);
			if (result == DNS_R_COVERINGNSEC) {
				goto tree_exit;
			}
		}
		goto find_ns;
	}

	/*
	 * If we didn't find what we were looking for...
	 * (nothing found, or the trust level of what was found is not
	 * acceptable under the caller's ADDITIONALOK / GLUEOK / PENDINGOK
	 * options)
	 */
	if (found == NULL ||
	    (DNS_TRUST_ADDITIONAL(found->trust) &&
	     ((options & DNS_DBFIND_ADDITIONALOK) == 0)) ||
	    (found->trust == dns_trust_glue &&
	     ((options & DNS_DBFIND_GLUEOK) == 0)) ||
	    (DNS_TRUST_PENDING(found->trust) &&
	     ((options & DNS_DBFIND_PENDINGOK) == 0)))
	{
		/*
		 * Return covering NODATA NSEC record.
		 */
		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0 &&
		    nsecheader != NULL)
		{
			if (nodep != NULL) {
				new_reference(search.rbtdb, node, locktype);
				*nodep = node;
			}
			bind_rdataset(search.rbtdb, node, nsecheader,
				      search.now, locktype, rdataset);
			if (need_headerupdate(nsecheader, search.now)) {
				update = nsecheader;
			}
			if (nsecsig != NULL) {
				bind_rdataset(search.rbtdb, node, nsecsig,
					      search.now, locktype,
					      sigrdataset);
				if (need_headerupdate(nsecsig, search.now)) {
					updatesig = nsecsig;
				}
			}
			result = DNS_R_COVERINGNSEC;
			goto node_exit;
		}

		/*
		 * This name was from a wild card.  Look for a covering NSEC.
		 */
		if (found == NULL && (found_noqname || all_negative) &&
		    (search.options & DNS_DBFIND_COVERINGNSEC) != 0)
		{
			NODE_UNLOCK(lock, locktype);
			result = find_coveringnsec(&search, name, nodep, now,
						   foundname, rdataset,
						   sigrdataset);
			if (result == DNS_R_COVERINGNSEC) {
				goto tree_exit;
			}
			goto find_ns;
		}

		/*
		 * If there is an NS rdataset at this node, then this is the
		 * deepest zone cut.
		 */
		if (nsheader != NULL) {
			if (nodep != NULL) {
				new_reference(search.rbtdb, node, locktype);
				*nodep = node;
			}
			bind_rdataset(search.rbtdb, node, nsheader, search.now,
				      locktype, rdataset);
			if (need_headerupdate(nsheader, search.now)) {
				update = nsheader;
			}
			if (nssig != NULL) {
				bind_rdataset(search.rbtdb, node, nssig,
					      search.now, locktype,
					      sigrdataset);
				if (need_headerupdate(nssig, search.now)) {
					updatesig = nssig;
				}
			}
			result = DNS_R_DELEGATION;
			goto node_exit;
		}

		/*
		 * Go find the deepest zone cut.
		 */
		NODE_UNLOCK(lock, locktype);
		goto find_ns;
	}

	/*
	 * We found what we were looking for, or we found a CNAME.
	 */

	if (nodep != NULL) {
		new_reference(search.rbtdb, node, locktype);
		*nodep = node;
	}

	if (NEGATIVE(found)) {
		/*
		 * We found a negative cache entry.
		 */
		if (NXDOMAIN(found)) {
			result = DNS_R_NCACHENXDOMAIN;
		} else {
			result = DNS_R_NCACHENXRRSET;
		}
	} else if (type != found->type && type != dns_rdatatype_any &&
		   found->type == dns_rdatatype_cname)
	{
		/*
		 * We weren't doing an ANY query and we found a CNAME instead
		 * of the type we were looking for, so we need to indicate
		 * that result to the caller.
		 */
		result = DNS_R_CNAME;
	} else {
		/*
		 * An ordinary successful query!
		 */
		result = ISC_R_SUCCESS;
	}

	/*
	 * A successful ANY lookup binds no rdataset here; every other
	 * outcome, including a negative answer to an ANY query, binds
	 * 'found'.
	 */
	if (type != dns_rdatatype_any || result == DNS_R_NCACHENXDOMAIN ||
	    result == DNS_R_NCACHENXRRSET)
	{
		bind_rdataset(search.rbtdb, node, found, search.now, locktype,
			      rdataset);
		if (need_headerupdate(found, search.now)) {
			update = found;
		}
		if (!NEGATIVE(found) && foundsig != NULL) {
			bind_rdataset(search.rbtdb, node, foundsig, search.now,
				      locktype, sigrdataset);
			if (need_headerupdate(foundsig, search.now)) {
				updatesig = foundsig;
			}
		}
	}

node_exit:
	/*
	 * update_header() is only called with the node lock held for
	 * writing.  The lock is released and re-acquired rather than
	 * upgraded, and need_headerupdate() is evaluated again afterwards
	 * (presumably because the header may have changed in between).
	 */
	if ((update != NULL || updatesig != NULL) &&
	    locktype != isc_rwlocktype_write)
	{
		NODE_UNLOCK(lock, locktype);
		NODE_LOCK(lock, isc_rwlocktype_write);
		locktype = isc_rwlocktype_write;
		POST(locktype);
	}
	if (update != NULL && need_headerupdate(update, search.now)) {
		update_header(search.rbtdb, update, search.now);
	}
	if (updatesig != NULL && need_headerupdate(updatesig, search.now)) {
		update_header(search.rbtdb, updatesig, search.now);
	}

	NODE_UNLOCK(lock, locktype);

tree_exit:
	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);

	/*
	 * If we found a zonecut but aren't going to use it, we have to
	 * let go of it.
	 */
	if (search.need_cleanup) {
		node = search.zonecut;
		INSIST(node != NULL);
		lock = &(search.rbtdb->node_locks[node->locknum].lock);

		NODE_LOCK(lock, isc_rwlocktype_read);
		decrement_reference(search.rbtdb, node, 0, isc_rwlocktype_read,
				    isc_rwlocktype_none, false);
		NODE_UNLOCK(lock, isc_rwlocktype_read);
	}

	dns_rbtnodechain_reset(&search.chain);

	/* Feed the final result code to the cache statistics. */
	update_cachestats(search.rbtdb, result);
	return (result);
}
5434 
5435 static isc_result_t
5436 cache_findzonecut(dns_db_t *db, const dns_name_t *name, unsigned int options,
5437 		  isc_stdtime_t now, dns_dbnode_t **nodep,
5438 		  dns_name_t *foundname, dns_name_t *dcname,
5439 		  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
5440 	dns_rbtnode_t *node = NULL;
5441 	nodelock_t *lock;
5442 	isc_result_t result;
5443 	rbtdb_search_t search;
5444 	rdatasetheader_t *header, *header_prev, *header_next;
5445 	rdatasetheader_t *found, *foundsig;
5446 	unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA;
5447 	isc_rwlocktype_t locktype;
5448 	bool dcnull = (dcname == NULL);
5449 
5450 	search.rbtdb = (dns_rbtdb_t *)db;
5451 
5452 	REQUIRE(VALID_RBTDB(search.rbtdb));
5453 
5454 	if (now == 0) {
5455 		isc_stdtime_get(&now);
5456 	}
5457 
5458 	search.rbtversion = NULL;
5459 	search.serial = 1;
5460 	search.options = options;
5461 	search.copy_name = false;
5462 	search.need_cleanup = false;
5463 	search.wild = false;
5464 	search.zonecut = NULL;
5465 	dns_fixedname_init(&search.zonecut_name);
5466 	dns_rbtnodechain_init(&search.chain);
5467 	search.now = now;
5468 
5469 	if (dcnull) {
5470 		dcname = foundname;
5471 	}
5472 
5473 	if ((options & DNS_DBFIND_NOEXACT) != 0) {
5474 		rbtoptions |= DNS_RBTFIND_NOEXACT;
5475 	}
5476 
5477 	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5478 
5479 	/*
5480 	 * Search down from the root of the tree.
5481 	 */
5482 	result = dns_rbt_findnode(search.rbtdb->tree, name, dcname, &node,
5483 				  &search.chain, rbtoptions, NULL, &search);
5484 
5485 	if (result == DNS_R_PARTIALMATCH) {
5486 		result = find_deepest_zonecut(&search, node, nodep, foundname,
5487 					      rdataset, sigrdataset);
5488 		goto tree_exit;
5489 	} else if (result != ISC_R_SUCCESS) {
5490 		goto tree_exit;
5491 	} else if (!dcnull) {
5492 		dns_name_copy(dcname, foundname);
5493 	}
5494 
5495 	/*
5496 	 * We now go looking for an NS rdataset at the node.
5497 	 */
5498 
5499 	lock = &(search.rbtdb->node_locks[node->locknum].lock);
5500 	locktype = isc_rwlocktype_read;
5501 	NODE_LOCK(lock, locktype);
5502 
5503 	found = NULL;
5504 	foundsig = NULL;
5505 	header_prev = NULL;
5506 	for (header = node->data; header != NULL; header = header_next) {
5507 		header_next = header->next;
5508 		if (check_stale_header(node, header, &locktype, lock, &search,
5509 				       &header_prev))
5510 		{
5511 			/*
5512 			 * The function dns_rbt_findnode found us the a matching
5513 			 * node for 'name' and stored the result in 'dcname'.
5514 			 * This is the deepest known zonecut in our database.
5515 			 * However, this node may be stale and if serve-stale
5516 			 * is not enabled (in other words 'stale-answer-enable'
5517 			 * is set to no), this node may not be used as a
5518 			 * zonecut we know about. If so, find the deepest
5519 			 * zonecut from this node up and return that instead.
5520 			 */
5521 			NODE_UNLOCK(lock, locktype);
5522 			result = find_deepest_zonecut(&search, node, nodep,
5523 						      foundname, rdataset,
5524 						      sigrdataset);
5525 			dns_name_copy(foundname, dcname);
5526 			goto tree_exit;
5527 		} else if (EXISTS(header) && !ANCIENT(header)) {
5528 			/*
5529 			 * If we found a type we were looking for, remember
5530 			 * it.
5531 			 */
5532 			if (header->type == dns_rdatatype_ns) {
5533 				/*
5534 				 * Remember a NS rdataset even if we're
5535 				 * not specifically looking for it, because
5536 				 * we might need it later.
5537 				 */
5538 				found = header;
5539 			} else if (header->type == RBTDB_RDATATYPE_SIGNS) {
5540 				/*
5541 				 * If we need the NS rdataset, we'll also
5542 				 * need its signature.
5543 				 */
5544 				foundsig = header;
5545 			}
5546 			header_prev = header;
5547 		} else {
5548 			header_prev = header;
5549 		}
5550 	}
5551 
5552 	if (found == NULL) {
5553 		/*
5554 		 * No NS records here.
5555 		 */
5556 		NODE_UNLOCK(lock, locktype);
5557 		result = find_deepest_zonecut(&search, node, nodep, foundname,
5558 					      rdataset, sigrdataset);
5559 		goto tree_exit;
5560 	}
5561 
5562 	if (nodep != NULL) {
5563 		new_reference(search.rbtdb, node, locktype);
5564 		*nodep = node;
5565 	}
5566 
5567 	bind_rdataset(search.rbtdb, node, found, search.now, locktype,
5568 		      rdataset);
5569 	if (foundsig != NULL) {
5570 		bind_rdataset(search.rbtdb, node, foundsig, search.now,
5571 			      locktype, sigrdataset);
5572 	}
5573 
5574 	if (need_headerupdate(found, search.now) ||
5575 	    (foundsig != NULL && need_headerupdate(foundsig, search.now)))
5576 	{
5577 		if (locktype != isc_rwlocktype_write) {
5578 			NODE_UNLOCK(lock, locktype);
5579 			NODE_LOCK(lock, isc_rwlocktype_write);
5580 			locktype = isc_rwlocktype_write;
5581 			POST(locktype);
5582 		}
5583 		if (need_headerupdate(found, search.now)) {
5584 			update_header(search.rbtdb, found, search.now);
5585 		}
5586 		if (foundsig != NULL && need_headerupdate(foundsig, search.now))
5587 		{
5588 			update_header(search.rbtdb, foundsig, search.now);
5589 		}
5590 	}
5591 
5592 	NODE_UNLOCK(lock, locktype);
5593 
5594 tree_exit:
5595 	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5596 
5597 	INSIST(!search.need_cleanup);
5598 
5599 	dns_rbtnodechain_reset(&search.chain);
5600 
5601 	if (result == DNS_R_DELEGATION) {
5602 		result = ISC_R_SUCCESS;
5603 	}
5604 
5605 	return (result);
5606 }
5607 
5608 static void
5609 attachnode(dns_db_t *db, dns_dbnode_t *source, dns_dbnode_t **targetp) {
5610 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5611 	dns_rbtnode_t *node = (dns_rbtnode_t *)source;
5612 
5613 	REQUIRE(VALID_RBTDB(rbtdb));
5614 	REQUIRE(targetp != NULL && *targetp == NULL);
5615 
5616 	isc_refcount_increment(&node->references);
5617 
5618 	*targetp = source;
5619 }
5620 
5621 static void
5622 detachnode(dns_db_t *db, dns_dbnode_t **targetp) {
5623 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5624 	dns_rbtnode_t *node;
5625 	bool want_free = false;
5626 	bool inactive = false;
5627 	rbtdb_nodelock_t *nodelock;
5628 
5629 	REQUIRE(VALID_RBTDB(rbtdb));
5630 	REQUIRE(targetp != NULL && *targetp != NULL);
5631 
5632 	node = (dns_rbtnode_t *)(*targetp);
5633 	nodelock = &rbtdb->node_locks[node->locknum];
5634 
5635 	NODE_LOCK(&nodelock->lock, isc_rwlocktype_read);
5636 
5637 	if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
5638 				isc_rwlocktype_none, false))
5639 	{
5640 		if (isc_refcount_current(&nodelock->references) == 0 &&
5641 		    nodelock->exiting)
5642 		{
5643 			inactive = true;
5644 		}
5645 	}
5646 
5647 	NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
5648 
5649 	*targetp = NULL;
5650 
5651 	if (inactive) {
5652 		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5653 		rbtdb->active--;
5654 		if (rbtdb->active == 0) {
5655 			want_free = true;
5656 		}
5657 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5658 		if (want_free) {
5659 			char buf[DNS_NAME_FORMATSIZE];
5660 			if (dns_name_dynamic(&rbtdb->common.origin)) {
5661 				dns_name_format(&rbtdb->common.origin, buf,
5662 						sizeof(buf));
5663 			} else {
5664 				strlcpy(buf, "<UNKNOWN>", sizeof(buf));
5665 			}
5666 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
5667 				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
5668 				      "calling free_rbtdb(%s)", buf);
5669 			free_rbtdb(rbtdb, true, NULL);
5670 		}
5671 	}
5672 }
5673 
5674 static isc_result_t
5675 expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) {
5676 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5677 	dns_rbtnode_t *rbtnode = node;
5678 	rdatasetheader_t *header;
5679 	bool force_expire = false;
5680 	/*
5681 	 * These are the category and module used by the cache cleaner.
5682 	 */
5683 	bool log = false;
5684 	isc_logcategory_t *category = DNS_LOGCATEGORY_DATABASE;
5685 	isc_logmodule_t *module = DNS_LOGMODULE_CACHE;
5686 	int level = ISC_LOG_DEBUG(2);
5687 	char printname[DNS_NAME_FORMATSIZE];
5688 
5689 	REQUIRE(VALID_RBTDB(rbtdb));
5690 
5691 	/*
5692 	 * Caller must hold a tree lock.
5693 	 */
5694 
5695 	if (now == 0) {
5696 		isc_stdtime_get(&now);
5697 	}
5698 
5699 	if (isc_mem_isovermem(rbtdb->common.mctx)) {
5700 		/*
5701 		 * Force expire with 25% probability.
5702 		 * XXXDCL Could stand to have a better policy, like LRU.
5703 		 */
5704 		force_expire = (rbtnode->down == NULL &&
5705 				(isc_random32() % 4) == 0);
5706 
5707 		/*
5708 		 * Note that 'log' can be true IFF overmem is also true.
5709 		 * overmem can currently only be true for cache
5710 		 * databases -- hence all of the "overmem cache" log strings.
5711 		 */
5712 		log = isc_log_wouldlog(dns_lctx, level);
5713 		if (log) {
5714 			isc_log_write(
5715 				dns_lctx, category, module, level,
5716 				"overmem cache: %s %s",
5717 				force_expire ? "FORCE" : "check",
5718 				dns_rbt_formatnodename(rbtnode, printname,
5719 						       sizeof(printname)));
5720 		}
5721 	}
5722 
5723 	/*
5724 	 * We may not need write access, but this code path is not performance
5725 	 * sensitive, so it should be okay to always lock as a writer.
5726 	 */
5727 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5728 		  isc_rwlocktype_write);
5729 
5730 	for (header = rbtnode->data; header != NULL; header = header->next) {
5731 		if (header->rdh_ttl + STALE_TTL(header, rbtdb) <=
5732 		    now - RBTDB_VIRTUAL)
5733 		{
5734 			/*
5735 			 * We don't check if refcurrent(rbtnode) == 0 and try
5736 			 * to free like we do in cache_find(), because
5737 			 * refcurrent(rbtnode) must be non-zero.  This is so
5738 			 * because 'node' is an argument to the function.
5739 			 */
5740 			mark_header_ancient(rbtdb, header);
5741 			if (log) {
5742 				isc_log_write(dns_lctx, category, module, level,
5743 					      "overmem cache: ancient %s",
5744 					      printname);
5745 			}
5746 		} else if (force_expire) {
5747 			if (!RETAIN(header)) {
5748 				set_ttl(rbtdb, header, 0);
5749 				mark_header_ancient(rbtdb, header);
5750 			} else if (log) {
5751 				isc_log_write(dns_lctx, category, module, level,
5752 					      "overmem cache: "
5753 					      "reprieve by RETAIN() %s",
5754 					      printname);
5755 			}
5756 		} else if (isc_mem_isovermem(rbtdb->common.mctx) && log) {
5757 			isc_log_write(dns_lctx, category, module, level,
5758 				      "overmem cache: saved %s", printname);
5759 		}
5760 	}
5761 
5762 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5763 		    isc_rwlocktype_write);
5764 
5765 	return (ISC_R_SUCCESS);
5766 }
5767 
5768 static void
5769 overmem(dns_db_t *db, bool over) {
5770 	/* This is an empty callback.  See adb.c:water() */
5771 
5772 	UNUSED(db);
5773 	UNUSED(over);
5774 
5775 	return;
5776 }
5777 
5778 static void
5779 printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) {
5780 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5781 	dns_rbtnode_t *rbtnode = node;
5782 	bool first;
5783 	uint32_t refs;
5784 
5785 	REQUIRE(VALID_RBTDB(rbtdb));
5786 
5787 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5788 		  isc_rwlocktype_read);
5789 
5790 	refs = isc_refcount_current(&rbtnode->references);
5791 	fprintf(out, "node %p, %" PRIu32 " references, locknum = %u\n", rbtnode,
5792 		refs, rbtnode->locknum);
5793 	if (rbtnode->data != NULL) {
5794 		rdatasetheader_t *current, *top_next;
5795 
5796 		for (current = rbtnode->data; current != NULL;
5797 		     current = top_next)
5798 		{
5799 			top_next = current->next;
5800 			first = true;
5801 			fprintf(out, "\ttype %u", current->type);
5802 			do {
5803 				uint_least16_t attributes = atomic_load_acquire(
5804 					&current->attributes);
5805 				if (!first) {
5806 					fprintf(out, "\t");
5807 				}
5808 				first = false;
5809 				fprintf(out,
5810 					"\tserial = %lu, ttl = %u, "
5811 					"trust = %u, attributes = %" PRIuLEAST16
5812 					", "
5813 					"resign = %u\n",
5814 					(unsigned long)current->serial,
5815 					current->rdh_ttl, current->trust,
5816 					attributes,
5817 					(current->resign << 1) |
5818 						current->resign_lsb);
5819 				current = current->down;
5820 			} while (current != NULL);
5821 		}
5822 	} else {
5823 		fprintf(out, "(empty)\n");
5824 	}
5825 
5826 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5827 		    isc_rwlocktype_read);
5828 }
5829 
5830 static isc_result_t
5831 createiterator(dns_db_t *db, unsigned int options,
5832 	       dns_dbiterator_t **iteratorp) {
5833 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5834 	rbtdb_dbiterator_t *rbtdbiter;
5835 
5836 	REQUIRE(VALID_RBTDB(rbtdb));
5837 	REQUIRE((options & (DNS_DB_NSEC3ONLY | DNS_DB_NONSEC3)) !=
5838 		(DNS_DB_NSEC3ONLY | DNS_DB_NONSEC3));
5839 
5840 	rbtdbiter = isc_mem_get(rbtdb->common.mctx, sizeof(*rbtdbiter));
5841 
5842 	rbtdbiter->common.methods = &dbiterator_methods;
5843 	rbtdbiter->common.db = NULL;
5844 	dns_db_attach(db, &rbtdbiter->common.db);
5845 	rbtdbiter->common.relative_names = ((options & DNS_DB_RELATIVENAMES) !=
5846 					    0);
5847 	rbtdbiter->common.magic = DNS_DBITERATOR_MAGIC;
5848 	rbtdbiter->common.cleaning = false;
5849 	rbtdbiter->paused = true;
5850 	rbtdbiter->tree_locked = isc_rwlocktype_none;
5851 	rbtdbiter->result = ISC_R_SUCCESS;
5852 	dns_fixedname_init(&rbtdbiter->name);
5853 	dns_fixedname_init(&rbtdbiter->origin);
5854 	rbtdbiter->node = NULL;
5855 	rbtdbiter->delcnt = 0;
5856 	if ((options & DNS_DB_NSEC3ONLY) != 0) {
5857 		rbtdbiter->nsec3mode = nsec3only;
5858 	} else if ((options & DNS_DB_NONSEC3) != 0) {
5859 		rbtdbiter->nsec3mode = nonsec3;
5860 	} else {
5861 		rbtdbiter->nsec3mode = full;
5862 	}
5863 
5864 	memset(rbtdbiter->deletions, 0, sizeof(rbtdbiter->deletions));
5865 	dns_rbtnodechain_init(&rbtdbiter->chain);
5866 	dns_rbtnodechain_init(&rbtdbiter->nsec3chain);
5867 	if (rbtdbiter->nsec3mode == nsec3only) {
5868 		rbtdbiter->current = &rbtdbiter->nsec3chain;
5869 	} else {
5870 		rbtdbiter->current = &rbtdbiter->chain;
5871 	}
5872 
5873 	*iteratorp = (dns_dbiterator_t *)rbtdbiter;
5874 
5875 	return (ISC_R_SUCCESS);
5876 }
5877 
5878 static isc_result_t
5879 zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5880 		  dns_rdatatype_t type, dns_rdatatype_t covers,
5881 		  isc_stdtime_t now, dns_rdataset_t *rdataset,
5882 		  dns_rdataset_t *sigrdataset) {
5883 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5884 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5885 	rdatasetheader_t *header, *header_next, *found, *foundsig;
5886 	rbtdb_serial_t serial;
5887 	rbtdb_version_t *rbtversion = version;
5888 	bool close_version = false;
5889 	rbtdb_rdatatype_t matchtype, sigmatchtype;
5890 
5891 	REQUIRE(VALID_RBTDB(rbtdb));
5892 	REQUIRE(type != dns_rdatatype_any);
5893 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
5894 
5895 	if (rbtversion == NULL) {
5896 		currentversion(db, (dns_dbversion_t **)(void *)(&rbtversion));
5897 		close_version = true;
5898 	}
5899 	serial = rbtversion->serial;
5900 	now = 0;
5901 
5902 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5903 		  isc_rwlocktype_read);
5904 
5905 	found = NULL;
5906 	foundsig = NULL;
5907 	matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
5908 	if (covers == 0) {
5909 		sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
5910 	} else {
5911 		sigmatchtype = 0;
5912 	}
5913 
5914 	for (header = rbtnode->data; header != NULL; header = header_next) {
5915 		header_next = header->next;
5916 		do {
5917 			if (header->serial <= serial && !IGNORE(header)) {
5918 				/*
5919 				 * Is this a "this rdataset doesn't
5920 				 * exist" record?
5921 				 */
5922 				if (NONEXISTENT(header)) {
5923 					header = NULL;
5924 				}
5925 				break;
5926 			} else {
5927 				header = header->down;
5928 			}
5929 		} while (header != NULL);
5930 		if (header != NULL) {
5931 			/*
5932 			 * We have an active, extant rdataset.  If it's a
5933 			 * type we're looking for, remember it.
5934 			 */
5935 			if (header->type == matchtype) {
5936 				found = header;
5937 				if (foundsig != NULL) {
5938 					break;
5939 				}
5940 			} else if (header->type == sigmatchtype) {
5941 				foundsig = header;
5942 				if (found != NULL) {
5943 					break;
5944 				}
5945 			}
5946 		}
5947 	}
5948 	if (found != NULL) {
5949 		bind_rdataset(rbtdb, rbtnode, found, now, isc_rwlocktype_read,
5950 			      rdataset);
5951 		if (foundsig != NULL) {
5952 			bind_rdataset(rbtdb, rbtnode, foundsig, now,
5953 				      isc_rwlocktype_read, sigrdataset);
5954 		}
5955 	}
5956 
5957 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5958 		    isc_rwlocktype_read);
5959 
5960 	if (close_version) {
5961 		closeversion(db, (dns_dbversion_t **)(void *)(&rbtversion),
5962 			     false);
5963 	}
5964 
5965 	if (found == NULL) {
5966 		return (ISC_R_NOTFOUND);
5967 	}
5968 
5969 	return (ISC_R_SUCCESS);
5970 }
5971 
5972 static isc_result_t
5973 cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5974 		   dns_rdatatype_t type, dns_rdatatype_t covers,
5975 		   isc_stdtime_t now, dns_rdataset_t *rdataset,
5976 		   dns_rdataset_t *sigrdataset) {
5977 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5978 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5979 	rdatasetheader_t *header, *header_next, *found, *foundsig;
5980 	rbtdb_rdatatype_t matchtype, sigmatchtype, negtype;
5981 	isc_result_t result;
5982 	nodelock_t *lock;
5983 	isc_rwlocktype_t locktype;
5984 
5985 	REQUIRE(VALID_RBTDB(rbtdb));
5986 	REQUIRE(type != dns_rdatatype_any);
5987 
5988 	UNUSED(version);
5989 
5990 	result = ISC_R_SUCCESS;
5991 
5992 	if (now == 0) {
5993 		isc_stdtime_get(&now);
5994 	}
5995 
5996 	lock = &rbtdb->node_locks[rbtnode->locknum].lock;
5997 	locktype = isc_rwlocktype_read;
5998 	NODE_LOCK(lock, locktype);
5999 
6000 	found = NULL;
6001 	foundsig = NULL;
6002 	matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
6003 	negtype = RBTDB_RDATATYPE_VALUE(0, type);
6004 	if (covers == 0) {
6005 		sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
6006 	} else {
6007 		sigmatchtype = 0;
6008 	}
6009 
6010 	for (header = rbtnode->data; header != NULL; header = header_next) {
6011 		header_next = header->next;
6012 		if (!ACTIVE(header, now)) {
6013 			if ((header->rdh_ttl + STALE_TTL(header, rbtdb) <
6014 			     now - RBTDB_VIRTUAL) &&
6015 			    (locktype == isc_rwlocktype_write ||
6016 			     NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS))
6017 			{
6018 				/*
6019 				 * We update the node's status only when we
6020 				 * can get write access.
6021 				 */
6022 				locktype = isc_rwlocktype_write;
6023 
6024 				/*
6025 				 * We don't check if refcurrent(rbtnode) == 0
6026 				 * and try to free like we do in cache_find(),
6027 				 * because refcurrent(rbtnode) must be
6028 				 * non-zero.  This is so because 'node' is an
6029 				 * argument to the function.
6030 				 */
6031 				mark_header_ancient(rbtdb, header);
6032 			}
6033 		} else if (EXISTS(header) && !ANCIENT(header)) {
6034 			if (header->type == matchtype) {
6035 				found = header;
6036 			} else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
6037 				   header->type == negtype)
6038 			{
6039 				found = header;
6040 			} else if (header->type == sigmatchtype) {
6041 				foundsig = header;
6042 			}
6043 		}
6044 	}
6045 	if (found != NULL) {
6046 		bind_rdataset(rbtdb, rbtnode, found, now, locktype, rdataset);
6047 		if (!NEGATIVE(found) && foundsig != NULL) {
6048 			bind_rdataset(rbtdb, rbtnode, foundsig, now, locktype,
6049 				      sigrdataset);
6050 		}
6051 	}
6052 
6053 	NODE_UNLOCK(lock, locktype);
6054 
6055 	if (found == NULL) {
6056 		return (ISC_R_NOTFOUND);
6057 	}
6058 
6059 	if (NEGATIVE(found)) {
6060 		/*
6061 		 * We found a negative cache entry.
6062 		 */
6063 		if (NXDOMAIN(found)) {
6064 			result = DNS_R_NCACHENXDOMAIN;
6065 		} else {
6066 			result = DNS_R_NCACHENXRRSET;
6067 		}
6068 	}
6069 
6070 	update_cachestats(rbtdb, result);
6071 
6072 	return (result);
6073 }
6074 
6075 static isc_result_t
6076 allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
6077 	     unsigned int options, isc_stdtime_t now,
6078 	     dns_rdatasetiter_t **iteratorp) {
6079 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
6080 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
6081 	rbtdb_version_t *rbtversion = version;
6082 	rbtdb_rdatasetiter_t *iterator;
6083 
6084 	REQUIRE(VALID_RBTDB(rbtdb));
6085 
6086 	iterator = isc_mem_get(rbtdb->common.mctx, sizeof(*iterator));
6087 
6088 	if ((db->attributes & DNS_DBATTR_CACHE) == 0) {
6089 		now = 0;
6090 		if (rbtversion == NULL) {
6091 			currentversion(
6092 				db, (dns_dbversion_t **)(void *)(&rbtversion));
6093 		} else {
6094 			INSIST(rbtversion->rbtdb == rbtdb);
6095 
6096 			(void)isc_refcount_increment(&rbtversion->references);
6097 		}
6098 	} else {
6099 		if (now == 0) {
6100 			isc_stdtime_get(&now);
6101 		}
6102 		rbtversion = NULL;
6103 	}
6104 
6105 	iterator->common.magic = DNS_RDATASETITER_MAGIC;
6106 	iterator->common.methods = &rdatasetiter_methods;
6107 	iterator->common.db = db;
6108 	iterator->common.node = node;
6109 	iterator->common.version = (dns_dbversion_t *)rbtversion;
6110 	iterator->common.options = options;
6111 	iterator->common.now = now;
6112 
6113 	isc_refcount_increment(&rbtnode->references);
6114 
6115 	iterator->current = NULL;
6116 
6117 	*iteratorp = (dns_rdatasetiter_t *)iterator;
6118 
6119 	return (ISC_R_SUCCESS);
6120 }
6121 
6122 static bool
6123 cname_and_other_data(dns_rbtnode_t *node, rbtdb_serial_t serial) {
6124 	rdatasetheader_t *header, *header_next;
6125 	bool cname = false, other_data = false;
6126 	dns_rdatatype_t rdtype;
6127 
6128 	/*
6129 	 * The caller must hold the node lock.
6130 	 */
6131 
6132 	/*
6133 	 * Look for CNAME and "other data" rdatasets active in our version.
6134 	 */
6135 	for (header = node->data; header != NULL; header = header_next) {
6136 		header_next = header->next;
6137 		if (!prio_type(header->type)) {
6138 			/*
6139 			 * CNAME is in the priority list, so if we are done
6140 			 * with the priority list, we know there will not be
6141 			 * CNAME, so we are safe to skip the rest of the types.
6142 			 */
6143 			return (false);
6144 		}
6145 		if (header->type == dns_rdatatype_cname) {
6146 			/*
6147 			 * Look for an active extant CNAME.
6148 			 */
6149 			do {
6150 				if (header->serial <= serial && !IGNORE(header))
6151 				{
6152 					/*
6153 					 * Is this a "this rdataset doesn't
6154 					 * exist" record?
6155 					 */
6156 					if (NONEXISTENT(header)) {
6157 						header = NULL;
6158 					}
6159 					break;
6160 				} else {
6161 					header = header->down;
6162 				}
6163 			} while (header != NULL);
6164 			if (header != NULL) {
6165 				cname = true;
6166 			}
6167 		} else {
6168 			/*
6169 			 * Look for active extant "other data".
6170 			 *
6171 			 * "Other data" is any rdataset whose type is not
6172 			 * KEY, NSEC, SIG or RRSIG.
6173 			 */
6174 			rdtype = RBTDB_RDATATYPE_BASE(header->type);
6175 			if (rdtype != dns_rdatatype_key &&
6176 			    rdtype != dns_rdatatype_sig &&
6177 			    rdtype != dns_rdatatype_nsec &&
6178 			    rdtype != dns_rdatatype_rrsig)
6179 			{
6180 				/*
6181 				 * Is it active and extant?
6182 				 */
6183 				do {
6184 					if (header->serial <= serial &&
6185 					    !IGNORE(header))
6186 					{
6187 						/*
6188 						 * Is this a "this rdataset
6189 						 * doesn't exist" record?
6190 						 */
6191 						if (NONEXISTENT(header)) {
6192 							header = NULL;
6193 						}
6194 						break;
6195 					} else {
6196 						header = header->down;
6197 					}
6198 				} while (header != NULL);
6199 				if (header != NULL) {
6200 					other_data = true;
6201 				}
6202 			}
6203 		}
6204 		if (cname && other_data) {
6205 			return (true);
6206 		}
6207 	}
6208 
6209 	return (false);
6210 }
6211 
6212 static void
6213 resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader) {
6214 	INSIST(!IS_CACHE(rbtdb));
6215 	INSIST(newheader->heap_index == 0);
6216 	INSIST(!ISC_LINK_LINKED(newheader, link));
6217 
6218 	isc_heap_insert(rbtdb->heaps[idx], newheader);
6219 }
6220 
6221 /*
6222  * node write lock must be held.
6223  */
6224 static void
6225 resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
6226 	      rdatasetheader_t *header) {
6227 	/*
6228 	 * Remove the old header from the heap
6229 	 */
6230 	if (header != NULL && header->heap_index != 0) {
6231 		isc_heap_delete(rbtdb->heaps[header->node->locknum],
6232 				header->heap_index);
6233 		header->heap_index = 0;
6234 		if (version != NULL) {
6235 			new_reference(rbtdb, header->node,
6236 				      isc_rwlocktype_write);
6237 			ISC_LIST_APPEND(version->resigned_list, header, link);
6238 		}
6239 	}
6240 }
6241 
6242 static uint64_t
6243 recordsize(rdatasetheader_t *header, unsigned int namelen) {
6244 	return (dns_rdataslab_rdatasize((unsigned char *)header,
6245 					sizeof(*header)) +
6246 		sizeof(dns_ttl_t) + sizeof(dns_rdatatype_t) +
6247 		sizeof(dns_rdataclass_t) + namelen);
6248 }
6249 
6250 static void
6251 update_recordsandxfrsize(bool add, rbtdb_version_t *rbtversion,
6252 			 rdatasetheader_t *header, unsigned int namelen) {
6253 	unsigned char *hdr = (unsigned char *)header;
6254 	size_t hdrsize = sizeof(*header);
6255 
6256 	RWLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
6257 	if (add) {
6258 		rbtversion->records += dns_rdataslab_count(hdr, hdrsize);
6259 		rbtversion->xfrsize += recordsize(header, namelen);
6260 	} else {
6261 		rbtversion->records -= dns_rdataslab_count(hdr, hdrsize);
6262 		rbtversion->xfrsize -= recordsize(header, namelen);
6263 	}
6264 	RWUNLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
6265 }
6266 
6267 static bool
6268 overmaxtype(dns_rbtdb_t *rbtdb, uint32_t ntypes) {
6269 	if (rbtdb->maxtypepername == 0) {
6270 		return (false);
6271 	}
6272 
6273 	return (ntypes >= rbtdb->maxtypepername);
6274 }
6275 
6276 static bool
6277 prio_header(rdatasetheader_t *header) {
6278 	if (NEGATIVE(header) && prio_type(RBTDB_RDATATYPE_EXT(header->type))) {
6279 		return (true);
6280 	}
6281 
6282 	return (prio_type(header->type));
6283 }
6284 
/*
 * Add the rdataset header 'newheader' to 'rbtnode'.
 *
 * 'rbtversion' is NULL for the cache-style code paths below and non-NULL
 * for versioned updates; 'loading' selects the branch that replaces the
 * existing header outright instead of chaining it via 'down'.  'nodename'
 * is only used for its length when updating the version's record and
 * transfer-size counters.  'options' is a set of DNS_DBADD_* flags (MERGE,
 * FORCE, EXACT, EXACTTTL and PREFETCH are examined here).  If
 * 'addedrdataset' is non-NULL it is bound to the header that ends up
 * representing the data (the new header, or the existing one when the new
 * data is not used).
 *
 * Ownership: on every early-return path visible below 'newheader' is
 * released with free_rdataset(); otherwise it is linked into the node.
 *
 * Returns:
 *	ISC_R_SUCCESS		the data was added, or equivalent existing
 *				data was kept (and possibly refreshed)
 *	ISC_R_NOMEMORY		add_changed() returned NULL
 *	DNS_R_UNCHANGED		the add had no effect (deleting something
 *				already nonexistent, or less-trusted data)
 *	DNS_R_TOOMANYRECORDS	non-cache database over the per-name type
 *				limit
 *	DNS_R_CNAMEANDOTHER	after linking 'newheader', the node holds
 *				CNAME and other data
 *	any failure from the merge step (e.g. DNS_R_NOTEXACT)
 *
 * write lock on rbtnode must be held.
 */
static isc_result_t
add32(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, const dns_name_t *nodename,
      rbtdb_version_t *rbtversion, rdatasetheader_t *newheader,
      unsigned int options, bool loading, dns_rdataset_t *addedrdataset,
      isc_stdtime_t now) {
	rbtdb_changed_t *changed = NULL;
	rdatasetheader_t *topheader = NULL, *topheader_prev = NULL;
	rdatasetheader_t *header = NULL, *sigheader = NULL;
	rdatasetheader_t *prioheader = NULL, *expireheader = NULL;
	unsigned char *merged = NULL;
	isc_result_t result;
	bool header_nx;
	bool newheader_nx;
	bool merge;
	dns_rdatatype_t rdtype, covers;
	rbtdb_rdatatype_t negtype, sigtype;
	dns_trust_t trust;
	int idx;
	uint32_t ntypes = 0;

	/*
	 * Add an rdatasetheader_t to a node.
	 */

	/*
	 * Caller must be holding the node lock.
	 */

	if ((options & DNS_DBADD_MERGE) != 0) {
		REQUIRE(rbtversion != NULL);
		merge = true;
	} else {
		merge = false;
	}

	/*
	 * With DNS_DBADD_FORCE the new data is compared as if it had
	 * ultimate trust in the trust checks below.
	 */
	if ((options & DNS_DBADD_FORCE) != 0) {
		trust = dns_trust_ultimate;
	} else {
		trust = newheader->trust;
	}

	if (rbtversion != NULL && !loading) {
		/*
		 * We always add a changed record, even if no changes end up
		 * being made to this node, because it's harmless and
		 * simplifies the code.
		 */
		changed = add_changed(rbtdb, rbtversion, rbtnode);
		if (changed == NULL) {
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			return (ISC_R_NOMEMORY);
		}
	}

	newheader_nx = NONEXISTENT(newheader) ? true : false;
	topheader_prev = NULL;
	sigheader = NULL;
	negtype = 0;
	if (rbtversion == NULL && !newheader_nx) {
		rdtype = RBTDB_RDATATYPE_BASE(newheader->type);
		covers = RBTDB_RDATATYPE_EXT(newheader->type);
		sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, covers);
		if (NEGATIVE(newheader)) {
			/*
			 * We're adding a negative cache entry.
			 */
			if (covers == dns_rdatatype_any) {
				/*
				 * If we're adding a negative cache entry
				 * which covers all types (NXDOMAIN,
				 * NODATA(QTYPE=ANY)),
				 *
				 * We make all other data ancient so that the
				 * only rdataset that can be found at this
				 * node is the negative cache entry.
				 */
				for (topheader = rbtnode->data;
				     topheader != NULL;
				     topheader = topheader->next)
				{
					set_ttl(rbtdb, topheader, 0);
					mark_header_ancient(rbtdb, topheader);
				}
				goto find_header;
			}
			/*
			 * Otherwise look for any RRSIGs of the given
			 * type so they can be marked ancient later.
			 */
			for (topheader = rbtnode->data; topheader != NULL;
			     topheader = topheader->next)
			{
				if (topheader->type == sigtype) {
					sigheader = topheader;
					break;
				}
			}
			negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
		} else {
			/*
			 * We're adding something that isn't a
			 * negative cache entry.  Look for an extant
			 * non-ancient NXDOMAIN/NODATA(QTYPE=ANY) negative
			 * cache entry.  If we're adding an RRSIG, also
			 * check for an extant non-ancient NODATA ncache
			 * entry which covers the same type as the RRSIG.
			 */
			for (topheader = rbtnode->data; topheader != NULL;
			     topheader = topheader->next)
			{
				if ((topheader->type ==
				     RBTDB_RDATATYPE_NCACHEANY) ||
				    (newheader->type == sigtype &&
				     topheader->type ==
					     RBTDB_RDATATYPE_VALUE(0, covers)))
				{
					break;
				}
			}
			if (topheader != NULL && EXISTS(topheader) &&
			    ACTIVE(topheader, now))
			{
				/*
				 * Found one.
				 */
				if (trust < topheader->trust) {
					/*
					 * The NXDOMAIN/NODATA(QTYPE=ANY)
					 * is more trusted.
					 */
					free_rdataset(rbtdb, rbtdb->common.mctx,
						      newheader);
					if (addedrdataset != NULL) {
						bind_rdataset(
							rbtdb, rbtnode,
							topheader, now,
							isc_rwlocktype_write,
							addedrdataset);
					}
					return (DNS_R_UNCHANGED);
				}
				/*
				 * The new rdataset is better.  Expire the
				 * ncache entry.
				 */
				set_ttl(rbtdb, topheader, 0);
				mark_header_ancient(rbtdb, topheader);
				topheader = NULL;
				goto find_header;
			}
			negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
		}
	}

	/*
	 * Walk the node's top-level headers looking for the one whose type
	 * matches the new header (or 'negtype').  Along the way: count the
	 * types seen (only active ones in a cache), remember the last
	 * active header seen in a cache as a candidate for expiry, and
	 * remember the last priority header so that a new non-priority
	 * type can be inserted after it.
	 */
	for (topheader = rbtnode->data; topheader != NULL;
	     topheader = topheader->next)
	{
		if (IS_CACHE(rbtdb) && ACTIVE(topheader, now)) {
			++ntypes;
			expireheader = topheader;
		} else if (!IS_CACHE(rbtdb)) {
			++ntypes;
		}
		if (prio_header(topheader)) {
			prioheader = topheader;
		}
		if (topheader->type == newheader->type ||
		    topheader->type == negtype)
		{
			break;
		}
		topheader_prev = topheader;
	}

find_header:
	/*
	 * If header isn't NULL, we've found the right type.  There may be
	 * IGNORE rdatasets between the top of the chain and the first real
	 * data.  We skip over them.
	 */
	header = topheader;
	while (header != NULL && IGNORE(header)) {
		header = header->down;
	}
	if (header != NULL) {
		header_nx = NONEXISTENT(header) ? true : false;

		/*
		 * Deleting an already non-existent rdataset has no effect.
		 */
		if (header_nx && newheader_nx) {
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			return (DNS_R_UNCHANGED);
		}

		/*
		 * Trying to add an rdataset with lower trust to a cache
		 * DB has no effect, provided that the cache data isn't
		 * stale. If the cache data is stale, new lower trust
		 * data will supersede it below. Unclear what the best
		 * policy is here.
		 */
		if (rbtversion == NULL && trust < header->trust &&
		    (ACTIVE(header, now) || header_nx))
		{
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			if (addedrdataset != NULL) {
				bind_rdataset(rbtdb, rbtnode, header, now,
					      isc_rwlocktype_write,
					      addedrdataset);
			}
			return (DNS_R_UNCHANGED);
		}

		/*
		 * Don't merge if a nonexistent rdataset is involved.
		 */
		if (merge && (header_nx || newheader_nx)) {
			merge = false;
		}

		/*
		 * If 'merge' is true, we'll try to create a new rdataset
		 * that is the union of 'newheader' and 'header'.
		 */
		if (merge) {
			unsigned int flags = 0;
			INSIST(rbtversion->serial >= header->serial);
			merged = NULL;
			result = ISC_R_SUCCESS;

			if ((options & DNS_DBADD_EXACT) != 0) {
				flags |= DNS_RDATASLAB_EXACT;
			}
			/*
			 * TTL use here is irrelevant to the cache;
			 * merge is only done with zonedbs.
			 */
			if ((options & DNS_DBADD_EXACTTTL) != 0 &&
			    newheader->rdh_ttl != header->rdh_ttl)
			{
				result = DNS_R_NOTEXACT;
			} else if (newheader->rdh_ttl != header->rdh_ttl) {
				flags |= DNS_RDATASLAB_FORCE;
			}
			if (result == ISC_R_SUCCESS) {
				result = dns_rdataslab_merge(
					(unsigned char *)header,
					(unsigned char *)newheader,
					(unsigned int)(sizeof(*newheader)),
					rbtdb->common.mctx,
					rbtdb->common.rdclass,
					(dns_rdatatype_t)header->type, flags,
					rbtdb->maxrrperset, &merged);
			}
			if (result == ISC_R_SUCCESS) {
				/*
				 * If 'header' has the same serial number as
				 * we do, we could clean it up now if we knew
				 * that our caller had no references to it.
				 * We don't know this, however, so we leave it
				 * alone.  It will get cleaned up when
				 * clean_zone_node() runs.
				 */
				free_rdataset(rbtdb, rbtdb->common.mctx,
					      newheader);
				/*
				 * From here on 'newheader' is the merged
				 * slab, not the caller's original header.
				 */
				newheader = (rdatasetheader_t *)merged;
				init_rdataset(rbtdb, newheader);
				update_newheader(newheader, header);
				if (loading && RESIGN(newheader) &&
				    RESIGN(header) &&
				    resign_sooner(header, newheader))
				{
					newheader->resign = header->resign;
					newheader->resign_lsb =
						header->resign_lsb;
				}
			} else {
				free_rdataset(rbtdb, rbtdb->common.mctx,
					      newheader);
				return (result);
			}
		}
		/*
		 * Don't replace existing NS, A and AAAA RRsets in the
		 * cache if they already exist. This prevents named
		 * being locked to old servers. Don't lower trust of
		 * existing record if the update is forced. Nothing
		 * special to be done w.r.t stale data; it gets replaced
		 * normally further down.
		 */
		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
		    header->type == dns_rdatatype_ns && !header_nx &&
		    !newheader_nx && header->trust >= newheader->trust &&
		    dns_rdataslab_equalx((unsigned char *)header,
					 (unsigned char *)newheader,
					 (unsigned int)(sizeof(*newheader)),
					 rbtdb->common.rdclass,
					 (dns_rdatatype_t)header->type))
		{
			/*
			 * Honour the new ttl if it is less than the
			 * older one.
			 */
			if (header->rdh_ttl > newheader->rdh_ttl) {
				set_ttl(rbtdb, header, newheader->rdh_ttl);
			}
			if (header->last_used != now) {
				update_header(rbtdb, header, now);
			}
			/*
			 * Move any noqname/closest proofs the existing
			 * header lacks over from the new header, so they
			 * are not freed along with it below.
			 */
			if (header->noqname == NULL &&
			    newheader->noqname != NULL)
			{
				header->noqname = newheader->noqname;
				newheader->noqname = NULL;
			}
			if (header->closest == NULL &&
			    newheader->closest != NULL)
			{
				header->closest = newheader->closest;
				newheader->closest = NULL;
			}
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			if (addedrdataset != NULL) {
				bind_rdataset(rbtdb, rbtnode, header, now,
					      isc_rwlocktype_write,
					      addedrdataset);
			}
			return (ISC_R_SUCCESS);
		}
		/*
		 * If we will be replacing a NS RRset force its TTL
		 * to be no more than the current NS RRset's TTL.  This
		 * ensures the delegations that are withdrawn are honoured.
		 */
		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
		    header->type == dns_rdatatype_ns && !header_nx &&
		    !newheader_nx && header->trust <= newheader->trust)
		{
			if (newheader->rdh_ttl > header->rdh_ttl) {
				newheader->rdh_ttl = header->rdh_ttl;
			}
		}
		/*
		 * Same "keep the existing identical data" treatment as for
		 * NS above, applied to A, AAAA, DS and RRSIG(DS) unless
		 * this add is a prefetch.
		 */
		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
		    (options & DNS_DBADD_PREFETCH) == 0 &&
		    (header->type == dns_rdatatype_a ||
		     header->type == dns_rdatatype_aaaa ||
		     header->type == dns_rdatatype_ds ||
		     header->type == RBTDB_RDATATYPE_SIGDS) &&
		    !header_nx && !newheader_nx &&
		    header->trust >= newheader->trust &&
		    dns_rdataslab_equal((unsigned char *)header,
					(unsigned char *)newheader,
					(unsigned int)(sizeof(*newheader))))
		{
			/*
			 * Honour the new ttl if it is less than the
			 * older one.
			 */
			if (header->rdh_ttl > newheader->rdh_ttl) {
				set_ttl(rbtdb, header, newheader->rdh_ttl);
			}
			if (header->last_used != now) {
				update_header(rbtdb, header, now);
			}
			if (header->noqname == NULL &&
			    newheader->noqname != NULL)
			{
				header->noqname = newheader->noqname;
				newheader->noqname = NULL;
			}
			if (header->closest == NULL &&
			    newheader->closest != NULL)
			{
				header->closest = newheader->closest;
				newheader->closest = NULL;
			}
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			if (addedrdataset != NULL) {
				bind_rdataset(rbtdb, rbtnode, header, now,
					      isc_rwlocktype_write,
					      addedrdataset);
			}
			return (ISC_R_SUCCESS);
		}
		INSIST(rbtversion == NULL ||
		       rbtversion->serial >= topheader->serial);
		if (loading) {
			newheader->down = NULL;
			idx = newheader->node->locknum;
			if (IS_CACHE(rbtdb)) {
				if (ZEROTTL(newheader)) {
					newheader->last_used =
						atomic_load(&rbtdb->last_used) +
						1;
					ISC_LIST_APPEND(rbtdb->rdatasets[idx],
							newheader, link);
				} else {
					ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
							 newheader, link);
				}
				INSIST(rbtdb->heaps != NULL);
				isc_heap_insert(rbtdb->heaps[idx], newheader);
			} else if (RESIGN(newheader)) {
				resign_insert(rbtdb, idx, newheader);
				/*
				 * Don't call resign_delete as we don't need
				 * to reverse the delete.  The free_rdataset
				 * call below will clean up the heap entry.
				 */
			}

			/*
			 * There are no other references to 'header' when
			 * loading, so we MAY clean up 'header' now.
			 * Since we don't generate changed records when
			 * loading, we MUST clean up 'header' now.
			 */
			if (topheader_prev != NULL) {
				topheader_prev->next = newheader;
			} else {
				rbtnode->data = newheader;
			}
			newheader->next = topheader->next;
			if (rbtversion != NULL && !header_nx) {
				update_recordsandxfrsize(false, rbtversion,
							 header,
							 nodename->length);
			}
			free_rdataset(rbtdb, rbtdb->common.mctx, header);
		} else {
			idx = newheader->node->locknum;
			if (IS_CACHE(rbtdb)) {
				INSIST(rbtdb->heaps != NULL);
				isc_heap_insert(rbtdb->heaps[idx], newheader);
				if (ZEROTTL(newheader)) {
					newheader->last_used =
						atomic_load(&rbtdb->last_used) +
						1;
					ISC_LIST_APPEND(rbtdb->rdatasets[idx],
							newheader, link);
				} else {
					ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
							 newheader, link);
				}
			} else if (RESIGN(newheader)) {
				resign_insert(rbtdb, idx, newheader);
				resign_delete(rbtdb, rbtversion, header);
			}
			/*
			 * Put 'newheader' in topheader's place in the
			 * top-level list and hang the old chain below it.
			 */
			if (topheader_prev != NULL) {
				topheader_prev->next = newheader;
			} else {
				rbtnode->data = newheader;
			}
			newheader->next = topheader->next;
			newheader->down = topheader;
			/*
			 * NOTE(review): topheader is no longer on the
			 * top-level list, yet its 'next' is pointed at
			 * newheader -- presumably so that anything still
			 * positioned on topheader can rejoin the list;
			 * confirm against the iterator code.
			 */
			topheader->next = newheader;
			rbtnode->dirty = 1;
			if (changed != NULL) {
				changed->dirty = true;
			}
			if (rbtversion == NULL) {
				set_ttl(rbtdb, header, 0);
				mark_header_ancient(rbtdb, header);
				if (sigheader != NULL) {
					set_ttl(rbtdb, sigheader, 0);
					mark_header_ancient(rbtdb, sigheader);
				}
			}
			if (rbtversion != NULL && !header_nx) {
				update_recordsandxfrsize(false, rbtversion,
							 header,
							 nodename->length);
			}
		}
	} else {
		/*
		 * No non-IGNORED rdatasets of the given type exist at
		 * this node.
		 */

		/*
		 * If we're trying to delete the type, don't bother.
		 */
		if (newheader_nx) {
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			return (DNS_R_UNCHANGED);
		}

		idx = newheader->node->locknum;
		if (IS_CACHE(rbtdb)) {
			isc_heap_insert(rbtdb->heaps[idx], newheader);
			if (ZEROTTL(newheader)) {
				ISC_LIST_APPEND(rbtdb->rdatasets[idx],
						newheader, link);
			} else {
				ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
						 newheader, link);
			}
		} else if (RESIGN(newheader)) {
			resign_insert(rbtdb, idx, newheader);
			/* 'header' is NULL here, so this is a no-op. */
			resign_delete(rbtdb, rbtversion, header);
		}

		if (topheader != NULL) {
			/*
			 * We have a list of rdatasets of the given type,
			 * but they're all marked IGNORE.  We simply insert
			 * the new rdataset at the head of the list.
			 *
			 * Ignored rdatasets cannot occur during loading, so
			 * we INSIST on it.
			 */
			INSIST(!loading);
			INSIST(rbtversion == NULL ||
			       rbtversion->serial >= topheader->serial);
			if (topheader_prev != NULL) {
				topheader_prev->next = newheader;
			} else {
				rbtnode->data = newheader;
			}
			newheader->next = topheader->next;
			newheader->down = topheader;
			topheader->next = newheader;
			rbtnode->dirty = 1;
			if (changed != NULL) {
				changed->dirty = true;
			}
		} else {
			/*
			 * No rdatasets of the given type exist at the node.
			 */
			if (!IS_CACHE(rbtdb) && overmaxtype(rbtdb, ntypes)) {
				free_rdataset(rbtdb, rbtdb->common.mctx,
					      newheader);
				return (DNS_R_TOOMANYRECORDS);
			}

			newheader->down = NULL;

			if (prio_header(newheader)) {
				/* This is a priority type, prepend it */
				newheader->next = rbtnode->data;
				rbtnode->data = newheader;
			} else if (prioheader != NULL) {
				/* Append after the priority headers */
				newheader->next = prioheader->next;
				prioheader->next = newheader;
			} else {
				/* There were no priority headers */
				newheader->next = rbtnode->data;
				rbtnode->data = newheader;
			}

			/*
			 * A cache over the per-name type limit does not
			 * refuse the add; it expires one header instead.
			 */
			if (IS_CACHE(rbtdb) && overmaxtype(rbtdb, ntypes)) {
				if (expireheader == NULL) {
					expireheader = newheader;
				}
				if (NEGATIVE(newheader) &&
				    !prio_header(newheader))
				{
					/*
					 * Add the new non-priority negative
					 * header to the database only
					 * temporarily.
					 */
					expireheader = newheader;
				}

				set_ttl(rbtdb, expireheader, 0);
				mark_header_ancient(rbtdb, expireheader);
				/*
				 * FIXME: In theory, we should mark the RRSIG
				 * and the header at the same time, but there is
				 * no direct link between those two headers, so
				 * we would have to check the whole list again.
				 */
			}
		}
	}

	if (rbtversion != NULL && !newheader_nx) {
		update_recordsandxfrsize(true, rbtversion, newheader,
					 nodename->length);
	}

	/*
	 * Check if the node now contains CNAME and other data.
	 */
	if (rbtversion != NULL &&
	    cname_and_other_data(rbtnode, rbtversion->serial))
	{
		return (DNS_R_CNAMEANDOTHER);
	}

	if (addedrdataset != NULL) {
		bind_rdataset(rbtdb, rbtnode, newheader, now,
			      isc_rwlocktype_write, addedrdataset);
	}

	return (ISC_R_SUCCESS);
}
6890 
6891 static bool
6892 delegating_type(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
6893 		rbtdb_rdatatype_t type) {
6894 	if (IS_CACHE(rbtdb)) {
6895 		if (type == dns_rdatatype_dname) {
6896 			return (true);
6897 		} else {
6898 			return (false);
6899 		}
6900 	} else if (type == dns_rdatatype_dname ||
6901 		   (type == dns_rdatatype_ns &&
6902 		    (node != rbtdb->origin_node || IS_STUB(rbtdb))))
6903 	{
6904 		return (true);
6905 	}
6906 	return (false);
6907 }
6908 
6909 static isc_result_t
6910 addnoqname(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
6911 	   uint32_t maxrrperset, dns_rdataset_t *rdataset) {
6912 	struct noqname *noqname;
6913 	isc_mem_t *mctx = rbtdb->common.mctx;
6914 	dns_name_t name;
6915 	dns_rdataset_t neg, negsig;
6916 	isc_result_t result;
6917 	isc_region_t r;
6918 
6919 	dns_name_init(&name, NULL);
6920 	dns_rdataset_init(&neg);
6921 	dns_rdataset_init(&negsig);
6922 
6923 	result = dns_rdataset_getnoqname(rdataset, &name, &neg, &negsig);
6924 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
6925 
6926 	noqname = isc_mem_get(mctx, sizeof(*noqname));
6927 	dns_name_init(&noqname->name, NULL);
6928 	noqname->neg = NULL;
6929 	noqname->negsig = NULL;
6930 	noqname->type = neg.type;
6931 	dns_name_dup(&name, mctx, &noqname->name);
6932 	result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0, maxrrperset);
6933 	if (result != ISC_R_SUCCESS) {
6934 		goto cleanup;
6935 	}
6936 	noqname->neg = r.base;
6937 	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0, maxrrperset);
6938 	if (result != ISC_R_SUCCESS) {
6939 		goto cleanup;
6940 	}
6941 	noqname->negsig = r.base;
6942 	dns_rdataset_disassociate(&neg);
6943 	dns_rdataset_disassociate(&negsig);
6944 	newheader->noqname = noqname;
6945 	return (ISC_R_SUCCESS);
6946 
6947 cleanup:
6948 	dns_rdataset_disassociate(&neg);
6949 	dns_rdataset_disassociate(&negsig);
6950 	free_noqname(mctx, &noqname);
6951 	return (result);
6952 }
6953 
6954 static isc_result_t
6955 addclosest(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
6956 	   uint32_t maxrrperset, dns_rdataset_t *rdataset) {
6957 	struct noqname *closest;
6958 	isc_mem_t *mctx = rbtdb->common.mctx;
6959 	dns_name_t name;
6960 	dns_rdataset_t neg, negsig;
6961 	isc_result_t result;
6962 	isc_region_t r;
6963 
6964 	dns_name_init(&name, NULL);
6965 	dns_rdataset_init(&neg);
6966 	dns_rdataset_init(&negsig);
6967 
6968 	result = dns_rdataset_getclosest(rdataset, &name, &neg, &negsig);
6969 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
6970 
6971 	closest = isc_mem_get(mctx, sizeof(*closest));
6972 	dns_name_init(&closest->name, NULL);
6973 	closest->neg = NULL;
6974 	closest->negsig = NULL;
6975 	closest->type = neg.type;
6976 	dns_name_dup(&name, mctx, &closest->name);
6977 	result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0, maxrrperset);
6978 	if (result != ISC_R_SUCCESS) {
6979 		goto cleanup;
6980 	}
6981 	closest->neg = r.base;
6982 	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0, maxrrperset);
6983 	if (result != ISC_R_SUCCESS) {
6984 		goto cleanup;
6985 	}
6986 	closest->negsig = r.base;
6987 	dns_rdataset_disassociate(&neg);
6988 	dns_rdataset_disassociate(&negsig);
6989 	newheader->closest = closest;
6990 	return (ISC_R_SUCCESS);
6991 
6992 cleanup:
6993 	dns_rdataset_disassociate(&neg);
6994 	dns_rdataset_disassociate(&negsig);
6995 	free_noqname(mctx, &closest);
6996 	return (result);
6997 }
6998 
6999 static dns_dbmethods_t zone_methods;
7000 
7001 static size_t
7002 rdataset_size(rdatasetheader_t *header) {
7003 	if (!NONEXISTENT(header)) {
7004 		return (dns_rdataslab_size((unsigned char *)header,
7005 					   sizeof(*header)));
7006 	}
7007 
7008 	return (sizeof(*header));
7009 }
7010 
7011 static void
7012 expire_ttl_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, bool tree_locked,
7013 		   isc_stdtime_t now);
7014 
/*
 * Add 'rdataset' to 'node'.
 *
 * The rdataset is converted into a slab that has room for a
 * rdatasetheader_t in front of it, the header is filled in from the
 * rdataset's TTL, type, trust and attributes, and the result is passed to
 * add32() while holding the node's write lock.
 *
 * When 'version' is non-NULL the header takes the version's serial and the
 * RESIGN information; 'now' is forced to 0.  When 'version' is NULL the
 * header gets serial 1, 'now' defaults to the current time if it was passed
 * as 0, and the PREFETCH/NEGATIVE/NXDOMAIN/OPTOUT/NOQNAME/CLOSEST attributes
 * are carried over.
 *
 * For a database using zone_methods, an SOA rdataset on any node other than
 * origin_node is rejected with DNS_R_NOTZONETOP.
 *
 * Returns the first failing result of dns_rdataslab_fromrdataset(),
 * addnoqname(), addclosest() or dns_rbt_addnode() (other than
 * ISC_R_EXISTS), and otherwise the result of add32().
 */
static isc_result_t
addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
	    isc_stdtime_t now, dns_rdataset_t *rdataset, unsigned int options,
	    dns_rdataset_t *addedrdataset) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
	rbtdb_version_t *rbtversion = version;
	isc_region_t region;
	rdatasetheader_t *newheader;
	isc_result_t result;
	bool delegating;
	bool newnsec;
	bool tree_locked = false;
	bool cache_is_overmem = false;
	dns_fixedname_t fixed;
	dns_name_t *name;

	REQUIRE(VALID_RBTDB(rbtdb));
	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);

	if (rbtdb->common.methods == &zone_methods) {
		/*
		 * SOA records are only allowed at top of zone.
		 */
		if (rdataset->type == dns_rdatatype_soa &&
		    node != rbtdb->origin_node)
		{
			return (DNS_R_NOTZONETOP);
		}
		/*
		 * NSEC3 data (type or covered type) must go to a node marked
		 * DNS_RBT_NSEC_NSEC3, and nothing else may.  rbtnode->nsec is
		 * read under the tree read lock.
		 */
		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
		REQUIRE(((rbtnode->nsec == DNS_RBT_NSEC_NSEC3 &&
			  (rdataset->type == dns_rdatatype_nsec3 ||
			   rdataset->covers == dns_rdatatype_nsec3)) ||
			 (rbtnode->nsec != DNS_RBT_NSEC_NSEC3 &&
			  rdataset->type != dns_rdatatype_nsec3 &&
			  rdataset->covers != dns_rdatatype_nsec3)));
		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	}

	/*
	 * Without a version, 'now' must be a real timestamp (fetched here if
	 * the caller passed 0).  With a version, 'now' is zeroed so that the
	 * "ttl + now" below is just the rdataset's TTL.
	 */
	if (rbtversion == NULL) {
		if (now == 0) {
			isc_stdtime_get(&now);
		}
	} else {
		now = 0;
	}

	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
					    &region, sizeof(rdatasetheader_t),
					    rbtdb->maxrrperset);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/*
	 * Start from the node's full name, then let the rdataset supply its
	 * owner case via dns_rdataset_getownercase().
	 */
	name = dns_fixedname_initname(&fixed);
	nodefullname(db, node, name);
	dns_rdataset_getownercase(rdataset, name);

	/*
	 * The slab was requested with sizeof(rdatasetheader_t) reserved
	 * bytes; region.base is used as the header.
	 */
	newheader = (rdatasetheader_t *)region.base;
	init_rdataset(rbtdb, newheader);
	setownercase(newheader, name);
	set_ttl(rbtdb, newheader, rdataset->ttl + now);
	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
						rdataset->covers);
	atomic_init(&newheader->attributes, 0);
	if (rdataset->ttl == 0U) {
		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_ZEROTTL);
	}
	newheader->noqname = NULL;
	newheader->closest = NULL;
	atomic_init(&newheader->count,
		    atomic_fetch_add_relaxed(&init_count, 1));
	newheader->trust = rdataset->trust;
	newheader->last_used = now;
	newheader->node = rbtnode;
	if (rbtversion != NULL) {
		newheader->serial = rbtversion->serial;
		/* 'now' is already 0 on this path (see above). */
		now = 0;

		if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
			/*
			 * The resign time is stored shifted right by one bit;
			 * the bit that is dropped is kept in resign_lsb.
			 */
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
			newheader->resign =
				(isc_stdtime_t)(dns_time64_from32(
							rdataset->resign) >>
						1);
			newheader->resign_lsb = rdataset->resign & 0x1;
		} else {
			newheader->resign = 0;
			newheader->resign_lsb = 0;
		}
	} else {
		newheader->serial = 1;
		newheader->resign = 0;
		newheader->resign_lsb = 0;
		/* Mirror the relevant rdataset attributes onto the header. */
		if ((rdataset->attributes & DNS_RDATASETATTR_PREFETCH) != 0) {
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_PREFETCH);
		}
		if ((rdataset->attributes & DNS_RDATASETATTR_NEGATIVE) != 0) {
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_NEGATIVE);
		}
		if ((rdataset->attributes & DNS_RDATASETATTR_NXDOMAIN) != 0) {
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_NXDOMAIN);
		}
		if ((rdataset->attributes & DNS_RDATASETATTR_OPTOUT) != 0) {
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_OPTOUT);
		}
		/*
		 * Attach the noqname/closest side data; on failure the new
		 * header is released before returning.
		 */
		if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) {
			result = addnoqname(rbtdb, newheader,
					    rbtdb->maxrrperset, rdataset);
			if (result != ISC_R_SUCCESS) {
				free_rdataset(rbtdb, rbtdb->common.mctx,
					      newheader);
				return (result);
			}
		}
		if ((rdataset->attributes & DNS_RDATASETATTR_CLOSEST) != 0) {
			result = addclosest(rbtdb, newheader,
					    rbtdb->maxrrperset, rdataset);
			if (result != ISC_R_SUCCESS) {
				free_rdataset(rbtdb, rbtdb->common.mctx,
					      newheader);
				return (result);
			}
		}
	}

	/*
	 * If we're adding a delegation type (e.g. NS or DNAME for a zone,
	 * just DNAME for the cache), then we need to set the callback bit
	 * on the node.
	 */
	if (delegating_type(rbtdb, rbtnode, rdataset->type)) {
		delegating = true;
	} else {
		delegating = false;
	}

	/*
	 * Add to the auxiliary NSEC tree if we're adding an NSEC record.
	 */
	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	if (rbtnode->nsec != DNS_RBT_NSEC_HAS_NSEC &&
	    rdataset->type == dns_rdatatype_nsec)
	{
		newnsec = true;
	} else {
		newnsec = false;
	}
	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);

	/*
	 * If we're adding a delegation type, adding to the auxiliary NSEC
	 * tree, or the DB is a cache in an overmem state, hold an
	 * exclusive lock on the tree.  In the latter case the lock does
	 * not necessarily have to be acquired but it will help purge
	 * ancient entries more effectively.
	 */
	if (IS_CACHE(rbtdb) && isc_mem_isovermem(rbtdb->common.mctx)) {
		cache_is_overmem = true;
	}
	if (delegating || newnsec || cache_is_overmem) {
		tree_locked = true;
		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
	}

	/* Runs with the tree write lock held but before the node lock. */
	if (cache_is_overmem) {
		overmem_purge(rbtdb, newheader, tree_locked);
	}

	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_write);

	/*
	 * When rrset statistics are enabled, flag the header with STATCOUNT
	 * and report it through update_rrsetstats().
	 */
	if (rbtdb->rrsetstats != NULL) {
		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_STATCOUNT);
		update_rrsetstats(rbtdb, newheader->type,
				  atomic_load_acquire(&newheader->attributes),
				  true);
	}

	if (IS_CACHE(rbtdb)) {
		/* Dead-node cleanup is only done under the tree write lock. */
		if (tree_locked) {
			cleanup_dead_nodes(rbtdb, rbtnode->locknum);
		}

		expire_ttl_headers(rbtdb, rbtnode->locknum, tree_locked, now);

		/*
		 * If we've been holding a write lock on the tree just for
		 * cleaning, we can release it now.  However, we still need the
		 * node lock.
		 */
		if (tree_locked && !delegating && !newnsec) {
			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
			tree_locked = false;
		}
	}

	result = ISC_R_SUCCESS;
	if (newnsec) {
		dns_rbtnode_t *nsecnode;

		/*
		 * Register the name in the auxiliary NSEC tree and mark the
		 * main node as having an NSEC.  An already existing auxiliary
		 * node is treated as success.
		 *
		 * NOTE(review): for any other dns_rbt_addnode() failure,
		 * add32() below is skipped and nothing on this path releases
		 * 'newheader' -- confirm whether that failure can occur here.
		 */
		nsecnode = NULL;
		result = dns_rbt_addnode(rbtdb->nsec, name, &nsecnode);
		if (result == ISC_R_SUCCESS) {
			nsecnode->nsec = DNS_RBT_NSEC_NSEC;
			rbtnode->nsec = DNS_RBT_NSEC_HAS_NSEC;
		} else if (result == ISC_R_EXISTS) {
			rbtnode->nsec = DNS_RBT_NSEC_HAS_NSEC;
			result = ISC_R_SUCCESS;
		}
	}

	if (result == ISC_R_SUCCESS) {
		result = add32(rbtdb, rbtnode, name, rbtversion, newheader,
			       options, false, addedrdataset, now);
	}
	/* Only flag the node for find callbacks once the data is in place. */
	if (result == ISC_R_SUCCESS && delegating) {
		rbtnode->find_callback = 1;
	}

	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_write);

	if (tree_locked) {
		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
	}

	return (result);
}
7244 
/*
 * Remove the rdata in 'rdataset' from the matching rdataset at 'node' in
 * 'version' (which must not be NULL).
 *
 * 'rdataset' is first converted to a slab.  If the node has an existing,
 * non-ignored header of the same type, dns_rdataslab_subtract() computes
 * the remainder and a new header carrying the result is linked in front of
 * the old chain.  If nothing would remain, a NONEXISTENT header is linked in
 * instead and DNS_R_NXRRSET is returned.
 *
 * With DNS_DBSUB_EXACT in 'options', a TTL mismatch (or no existing
 * rdataset) yields DNS_R_NOTEXACT.  Without it, a missing rdataset yields
 * DNS_R_UNCHANGED.
 *
 * If 'newrdataset' is non-NULL it is bound to the new header on
 * ISC_R_SUCCESS, or to the old header when the result is DNS_R_NXRRSET and
 * DNS_DBSUB_WANTOLD is set.
 */
static isc_result_t
subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
		 dns_rdataset_t *rdataset, unsigned int options,
		 dns_rdataset_t *newrdataset) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
	rbtdb_version_t *rbtversion = version;
	dns_fixedname_t fname;
	dns_name_t *nodename = dns_fixedname_initname(&fname);
	rdatasetheader_t *topheader, *topheader_prev, *header, *newheader;
	unsigned char *subresult;
	isc_region_t region;
	isc_result_t result;
	rbtdb_changed_t *changed;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(rbtversion != NULL && rbtversion->rbtdb == rbtdb);

	if (rbtdb->common.methods == &zone_methods) {
		/*
		 * NSEC3 data (type or covered type) may only be subtracted
		 * from a node marked DNS_RBT_NSEC_NSEC3, and other data only
		 * from other nodes.
		 */
		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
		REQUIRE(((rbtnode->nsec == DNS_RBT_NSEC_NSEC3 &&
			  (rdataset->type == dns_rdatatype_nsec3 ||
			   rdataset->covers == dns_rdatatype_nsec3)) ||
			 (rbtnode->nsec != DNS_RBT_NSEC_NSEC3 &&
			  rdataset->type != dns_rdatatype_nsec3 &&
			  rdataset->covers != dns_rdatatype_nsec3)));
		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	}

	nodefullname(db, node, nodename);

	/*
	 * Build a temporary header+slab describing what is to be removed.
	 * The last argument (0) is the position that other callers fill
	 * with rbtdb->maxrrperset.
	 */
	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
					    &region, sizeof(rdatasetheader_t),
					    0);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}
	newheader = (rdatasetheader_t *)region.base;
	init_rdataset(rbtdb, newheader);
	set_ttl(rbtdb, newheader, rdataset->ttl);
	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
						rdataset->covers);
	atomic_init(&newheader->attributes, 0);
	newheader->serial = rbtversion->serial;
	newheader->trust = 0;
	newheader->noqname = NULL;
	newheader->closest = NULL;
	atomic_init(&newheader->count,
		    atomic_fetch_add_relaxed(&init_count, 1));
	newheader->last_used = 0;
	newheader->node = rbtnode;
	if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
		/*
		 * The resign time is stored shifted right by one bit; the
		 * bit that is dropped is kept in resign_lsb.
		 */
		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
		newheader->resign =
			(isc_stdtime_t)(dns_time64_from32(rdataset->resign) >>
					1);
		newheader->resign_lsb = rdataset->resign & 0x1;
	} else {
		newheader->resign = 0;
		newheader->resign_lsb = 0;
	}

	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_write);

	/* Record that this version touches the node. */
	changed = add_changed(rbtdb, rbtversion, rbtnode);
	if (changed == NULL) {
		free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
		NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
			    isc_rwlocktype_write);
		return (ISC_R_NOMEMORY);
	}

	/*
	 * Walk the node's top-level header list looking for our type,
	 * remembering the predecessor so the list can be relinked later.
	 */
	topheader_prev = NULL;
	for (topheader = rbtnode->data; topheader != NULL;
	     topheader = topheader->next)
	{
		if (topheader->type == newheader->type) {
			break;
		}
		topheader_prev = topheader;
	}
	/*
	 * If header isn't NULL, we've found the right type.  There may be
	 * IGNORE rdatasets between the top of the chain and the first real
	 * data.  We skip over them.
	 */
	header = topheader;
	while (header != NULL && IGNORE(header)) {
		header = header->down;
	}
	if (header != NULL && EXISTS(header)) {
		unsigned int flags = 0;
		subresult = NULL;
		result = ISC_R_SUCCESS;
		/* Exact mode additionally demands identical TTLs. */
		if ((options & DNS_DBSUB_EXACT) != 0) {
			flags |= DNS_RDATASLAB_EXACT;
			if (newheader->rdh_ttl != header->rdh_ttl) {
				result = DNS_R_NOTEXACT;
			}
		}
		if (result == ISC_R_SUCCESS) {
			result = dns_rdataslab_subtract(
				(unsigned char *)header,
				(unsigned char *)newheader,
				(unsigned int)(sizeof(*newheader)),
				rbtdb->common.mctx, rbtdb->common.rdclass,
				(dns_rdatatype_t)header->type, flags,
				&subresult);
		}
		if (result == ISC_R_SUCCESS) {
			/*
			 * Something is left: the temporary header is
			 * discarded and 'subresult' becomes the new header.
			 */
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			newheader = (rdatasetheader_t *)subresult;
			init_rdataset(rbtdb, newheader);
			update_newheader(newheader, header);
			if (RESIGN(header)) {
				RDATASET_ATTR_SET(newheader,
						  RDATASET_ATTR_RESIGN);
				newheader->resign = header->resign;
				newheader->resign_lsb = header->resign_lsb;
				resign_insert(rbtdb, rbtnode->locknum,
					      newheader);
			}
			/*
			 * We have to set the serial since the rdataslab
			 * subtraction routine copies the reserved portion of
			 * header, not newheader.
			 */
			newheader->serial = rbtversion->serial;
			/*
			 * XXXJT: dns_rdataslab_subtract() copied the pointers
			 * to additional info.  We need to clear these fields
			 * to avoid having duplicated references.
			 *
			 * NOTE(review): no fields are cleared at this point
			 * in the code; confirm whether update_newheader()
			 * above covers this or the comment is stale.
			 */
			update_recordsandxfrsize(true, rbtversion, newheader,
						 nodename->length);
		} else if (result == DNS_R_NXRRSET) {
			/*
			 * This subtraction would remove all of the rdata;
			 * add a nonexistent header instead.
			 */
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
			if (newheader == NULL) {
				result = ISC_R_NOMEMORY;
				goto unlock;
			}
			init_rdataset(rbtdb, newheader);
			set_ttl(rbtdb, newheader, 0);
			newheader->type = topheader->type;
			atomic_init(&newheader->attributes,
				    RDATASET_ATTR_NONEXISTENT);
			newheader->trust = 0;
			newheader->serial = rbtversion->serial;
			newheader->noqname = NULL;
			newheader->closest = NULL;
			atomic_init(&newheader->count, 0);
			newheader->node = rbtnode;
			newheader->resign = 0;
			newheader->resign_lsb = 0;
			newheader->last_used = 0;
		} else {
			/* Any other failure: nothing is linked in. */
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			goto unlock;
		}

		/*
		 * If we're here, we want to link newheader in front of
		 * topheader.
		 */
		INSIST(rbtversion->serial >= topheader->serial);
		/* The old header's records no longer count for this version. */
		update_recordsandxfrsize(false, rbtversion, header,
					 nodename->length);
		if (topheader_prev != NULL) {
			topheader_prev->next = newheader;
		} else {
			rbtnode->data = newheader;
		}
		newheader->next = topheader->next;
		newheader->down = topheader;
		/*
		 * NOTE(review): the old top header's 'next' is pointed at the
		 * header that replaced it; the reason is not visible here.
		 */
		topheader->next = newheader;
		rbtnode->dirty = 1;
		changed->dirty = true;
		resign_delete(rbtdb, rbtversion, header);
	} else {
		/*
		 * The rdataset doesn't exist, so we don't need to do anything
		 * to satisfy the deletion request.
		 */
		free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
		if ((options & DNS_DBSUB_EXACT) != 0) {
			result = DNS_R_NOTEXACT;
		} else {
			result = DNS_R_UNCHANGED;
		}
	}

	/* Hand the resulting rdataset back to the caller if requested. */
	if (result == ISC_R_SUCCESS && newrdataset != NULL) {
		bind_rdataset(rbtdb, rbtnode, newheader, 0,
			      isc_rwlocktype_write, newrdataset);
	}

	/*
	 * Everything was removed: with DNS_DBSUB_WANTOLD the caller gets the
	 * previous contents instead.
	 */
	if (result == DNS_R_NXRRSET && newrdataset != NULL &&
	    (options & DNS_DBSUB_WANTOLD) != 0)
	{
		bind_rdataset(rbtdb, rbtnode, header, 0, isc_rwlocktype_write,
			      newrdataset);
	}

unlock:
	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_write);

	return (result);
}
7460 
7461 static isc_result_t
7462 deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
7463 	       dns_rdatatype_t type, dns_rdatatype_t covers) {
7464 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
7465 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
7466 	rbtdb_version_t *rbtversion = version;
7467 	dns_fixedname_t fname;
7468 	dns_name_t *nodename = dns_fixedname_initname(&fname);
7469 	isc_result_t result;
7470 	rdatasetheader_t *newheader;
7471 
7472 	REQUIRE(VALID_RBTDB(rbtdb));
7473 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
7474 
7475 	if (type == dns_rdatatype_any) {
7476 		return (ISC_R_NOTIMPLEMENTED);
7477 	}
7478 	if (type == dns_rdatatype_rrsig && covers == 0) {
7479 		return (ISC_R_NOTIMPLEMENTED);
7480 	}
7481 
7482 	newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
7483 	if (newheader == NULL) {
7484 		return (ISC_R_NOMEMORY);
7485 	}
7486 	init_rdataset(rbtdb, newheader);
7487 	set_ttl(rbtdb, newheader, 0);
7488 	newheader->type = RBTDB_RDATATYPE_VALUE(type, covers);
7489 	atomic_init(&newheader->attributes, RDATASET_ATTR_NONEXISTENT);
7490 	newheader->trust = 0;
7491 	newheader->noqname = NULL;
7492 	newheader->closest = NULL;
7493 	if (rbtversion != NULL) {
7494 		newheader->serial = rbtversion->serial;
7495 	} else {
7496 		newheader->serial = 0;
7497 	}
7498 	atomic_init(&newheader->count, 0);
7499 	newheader->last_used = 0;
7500 	newheader->node = rbtnode;
7501 
7502 	nodefullname(db, node, nodename);
7503 
7504 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7505 		  isc_rwlocktype_write);
7506 	result = add32(rbtdb, rbtnode, nodename, rbtversion, newheader,
7507 		       DNS_DBADD_FORCE, false, NULL, 0);
7508 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7509 		    isc_rwlocktype_write);
7510 
7511 	return (result);
7512 }
7513 
7514 /*
7515  * load a non-NSEC3 node in the main tree and optionally to the auxiliary NSEC
7516  */
7517 static isc_result_t
7518 loadnode(dns_rbtdb_t *rbtdb, const dns_name_t *name, dns_rbtnode_t **nodep,
7519 	 bool hasnsec) {
7520 	isc_result_t noderesult, nsecresult, tmpresult;
7521 	dns_rbtnode_t *nsecnode = NULL, *node = NULL;
7522 
7523 	noderesult = dns_rbt_addnode(rbtdb->tree, name, &node);
7524 	if (!hasnsec) {
7525 		goto done;
7526 	}
7527 	if (noderesult == ISC_R_EXISTS) {
7528 		/*
7529 		 * Add a node to the auxiliary NSEC tree for an old node
7530 		 * just now getting an NSEC record.
7531 		 */
7532 		if (node->nsec == DNS_RBT_NSEC_HAS_NSEC) {
7533 			goto done;
7534 		}
7535 	} else if (noderesult != ISC_R_SUCCESS) {
7536 		goto done;
7537 	}
7538 
7539 	/*
7540 	 * Build the auxiliary tree for NSECs as we go.
7541 	 * This tree speeds searches for closest NSECs that would otherwise
7542 	 * need to examine many irrelevant nodes in large TLDs.
7543 	 *
7544 	 * Add nodes to the auxiliary tree after corresponding nodes have
7545 	 * been added to the main tree.
7546 	 */
7547 	nsecresult = dns_rbt_addnode(rbtdb->nsec, name, &nsecnode);
7548 	if (nsecresult == ISC_R_SUCCESS) {
7549 		nsecnode->nsec = DNS_RBT_NSEC_NSEC;
7550 		node->nsec = DNS_RBT_NSEC_HAS_NSEC;
7551 		goto done;
7552 	}
7553 
7554 	if (nsecresult == ISC_R_EXISTS) {
7555 #if 1 /* 0 */
7556 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
7557 			      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
7558 			      "addnode: NSEC node already exists");
7559 #endif /* if 1 */
7560 		node->nsec = DNS_RBT_NSEC_HAS_NSEC;
7561 		goto done;
7562 	}
7563 
7564 	if (noderesult == ISC_R_SUCCESS) {
7565 		/*
7566 		 * Remove the node we just added above.
7567 		 */
7568 		tmpresult = dns_rbt_deletenode(rbtdb->tree, node, false);
7569 		if (tmpresult != ISC_R_SUCCESS) {
7570 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
7571 				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
7572 				      "loading_addrdataset: "
7573 				      "dns_rbt_deletenode: %s after "
7574 				      "dns_rbt_addnode(NSEC): %s",
7575 				      isc_result_totext(tmpresult),
7576 				      isc_result_totext(noderesult));
7577 		}
7578 	}
7579 
7580 	/*
7581 	 * Set the error condition to be returned.
7582 	 */
7583 	noderesult = nsecresult;
7584 
7585 done:
7586 	if (noderesult == ISC_R_SUCCESS || noderesult == ISC_R_EXISTS) {
7587 		*nodep = node;
7588 	}
7589 
7590 	return (noderesult);
7591 }
7592 
/*
 * Load callback installed by beginload(): add 'rdataset' at 'name' to the
 * database referenced by the rbtdb_load_t passed in 'arg'.
 *
 * The owner node is created (or found) in the NSEC3 tree for NSEC3 data and
 * in the main tree otherwise, with an auxiliary NSEC tree entry for NSEC
 * rdatasets.  The rdataset is then converted to a slab and merged into the
 * node with add32() using DNS_DBADD_MERGE against rbtdb->current_version.
 *
 * Returns DNS_R_NOTZONETOP for an SOA below the origin of a non-cache
 * database, DNS_R_INVALIDNS / DNS_R_INVALIDNSEC3 for NS / NSEC3 at a
 * wildcard name, or the failing result of add_wildcard_magic(), node
 * creation, slab conversion or add32().  DNS_R_UNCHANGED from add32() is
 * reported as ISC_R_SUCCESS.
 */
static isc_result_t
loading_addrdataset(void *arg, const dns_name_t *name,
		    dns_rdataset_t *rdataset) {
	rbtdb_load_t *loadctx = arg;
	dns_rbtdb_t *rbtdb = loadctx->rbtdb;
	dns_rbtnode_t *node;
	isc_result_t result;
	isc_region_t region;
	rdatasetheader_t *newheader;

	REQUIRE(rdataset->rdclass == rbtdb->common.rdclass);

	/*
	 * SOA records are only allowed at top of zone.
	 */
	if (rdataset->type == dns_rdatatype_soa && !IS_CACHE(rbtdb) &&
	    !dns_name_equal(name, &rbtdb->common.origin))
	{
		return (DNS_R_NOTZONETOP);
	}

	/*
	 * Skipped for NSEC3 data (type or covered type); the return value,
	 * if any, is not examined.
	 */
	if (rdataset->type != dns_rdatatype_nsec3 &&
	    rdataset->covers != dns_rdatatype_nsec3)
	{
		add_empty_wildcards(rbtdb, name, false);
	}

	if (dns_name_iswildcard(name)) {
		/*
		 * NS record owners cannot legally be wild cards.
		 */
		if (rdataset->type == dns_rdatatype_ns) {
			return (DNS_R_INVALIDNS);
		}
		/*
		 * NSEC3 record owners cannot legally be wild cards.
		 */
		if (rdataset->type == dns_rdatatype_nsec3) {
			return (DNS_R_INVALIDNSEC3);
		}
		result = add_wildcard_magic(rbtdb, name, false);
		if (result != ISC_R_SUCCESS) {
			return (result);
		}
	}

	/*
	 * Pick the tree: NSEC3 data goes to the nsec3 tree; everything else
	 * goes through loadnode(), which also maintains the auxiliary NSEC
	 * tree when the rdataset is an NSEC.
	 */
	node = NULL;
	if (rdataset->type == dns_rdatatype_nsec3 ||
	    rdataset->covers == dns_rdatatype_nsec3)
	{
		result = dns_rbt_addnode(rbtdb->nsec3, name, &node);
		if (result == ISC_R_SUCCESS) {
			node->nsec = DNS_RBT_NSEC_NSEC3;
		}
	} else if (rdataset->type == dns_rdatatype_nsec) {
		result = loadnode(rbtdb, name, &node, true);
	} else {
		result = loadnode(rbtdb, name, &node, false);
	}
	if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
		return (result);
	}
	/* A newly created node still needs its lock bucket assigned. */
	if (result == ISC_R_SUCCESS) {
		node->locknum = node->hashval % rbtdb->node_lock_count;
	}

	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
					    &region, sizeof(rdatasetheader_t),
					    rbtdb->maxrrperset);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}
	/*
	 * The slab was requested with sizeof(rdatasetheader_t) reserved
	 * bytes; region.base is used as the header.
	 */
	newheader = (rdatasetheader_t *)region.base;
	init_rdataset(rbtdb, newheader);
	/* XXX: the addition below is not checked for overflow. */
	set_ttl(rbtdb, newheader, rdataset->ttl + loadctx->now);
	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
						rdataset->covers);
	atomic_init(&newheader->attributes, 0);
	newheader->trust = rdataset->trust;
	newheader->serial = 1;
	newheader->noqname = NULL;
	newheader->closest = NULL;
	atomic_init(&newheader->count,
		    atomic_fetch_add_relaxed(&init_count, 1));
	newheader->last_used = 0;
	newheader->node = node;
	setownercase(newheader, name);

	if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
		/*
		 * The resign time is stored shifted right by one bit; the
		 * bit that is dropped is kept in resign_lsb.
		 */
		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
		newheader->resign =
			(isc_stdtime_t)(dns_time64_from32(rdataset->resign) >>
					1);
		newheader->resign_lsb = rdataset->resign & 0x1;
	} else {
		newheader->resign = 0;
		newheader->resign_lsb = 0;
	}

	NODE_LOCK(&rbtdb->node_locks[node->locknum].lock, isc_rwlocktype_write);
	result = add32(rbtdb, node, name, rbtdb->current_version, newheader,
		       DNS_DBADD_MERGE, true, NULL, 0);
	NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
		    isc_rwlocktype_write);

	/*
	 * Flag delegation points for find callbacks; an unchanged merge is
	 * not an error for the loader.
	 */
	if (result == ISC_R_SUCCESS &&
	    delegating_type(rbtdb, node, rdataset->type))
	{
		node->find_callback = 1;
	} else if (result == DNS_R_UNCHANGED) {
		result = ISC_R_SUCCESS;
	}

	return (result);
}
7709 
7710 static isc_result_t
7711 beginload(dns_db_t *db, dns_rdatacallbacks_t *callbacks) {
7712 	rbtdb_load_t *loadctx;
7713 	dns_rbtdb_t *rbtdb;
7714 	rbtdb = (dns_rbtdb_t *)db;
7715 
7716 	REQUIRE(DNS_CALLBACK_VALID(callbacks));
7717 	REQUIRE(VALID_RBTDB(rbtdb));
7718 
7719 	loadctx = isc_mem_get(rbtdb->common.mctx, sizeof(*loadctx));
7720 
7721 	loadctx->rbtdb = rbtdb;
7722 	if (IS_CACHE(rbtdb)) {
7723 		isc_stdtime_get(&loadctx->now);
7724 	} else {
7725 		loadctx->now = 0;
7726 	}
7727 
7728 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
7729 
7730 	REQUIRE((rbtdb->attributes &
7731 		 (RBTDB_ATTR_LOADED | RBTDB_ATTR_LOADING)) == 0);
7732 	rbtdb->attributes |= RBTDB_ATTR_LOADING;
7733 
7734 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7735 
7736 	callbacks->add = loading_addrdataset;
7737 	callbacks->add_private = loadctx;
7738 
7739 	return (ISC_R_SUCCESS);
7740 }
7741 
7742 static isc_result_t
7743 endload(dns_db_t *db, dns_rdatacallbacks_t *callbacks) {
7744 	rbtdb_load_t *loadctx;
7745 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
7746 
7747 	REQUIRE(VALID_RBTDB(rbtdb));
7748 	REQUIRE(DNS_CALLBACK_VALID(callbacks));
7749 	loadctx = callbacks->add_private;
7750 	REQUIRE(loadctx != NULL);
7751 	REQUIRE(loadctx->rbtdb == rbtdb);
7752 
7753 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
7754 
7755 	REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADING) != 0);
7756 	REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADED) == 0);
7757 
7758 	rbtdb->attributes &= ~RBTDB_ATTR_LOADING;
7759 	rbtdb->attributes |= RBTDB_ATTR_LOADED;
7760 
7761 	/*
7762 	 * If there's a KEY rdataset at the zone origin containing a
7763 	 * zone key, we consider the zone secure.
7764 	 */
7765 	if (!IS_CACHE(rbtdb) && rbtdb->origin_node != NULL) {
7766 		dns_dbversion_t *version = rbtdb->current_version;
7767 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7768 		iszonesecure(db, version, rbtdb->origin_node);
7769 	} else {
7770 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7771 	}
7772 
7773 	callbacks->add = NULL;
7774 	callbacks->add_private = NULL;
7775 
7776 	isc_mem_put(rbtdb->common.mctx, loadctx, sizeof(*loadctx));
7777 
7778 	return (ISC_R_SUCCESS);
7779 }
7780 
7781 static isc_result_t
7782 dump(dns_db_t *db, dns_dbversion_t *version, const char *filename,
7783      dns_masterformat_t masterformat) {
7784 	dns_rbtdb_t *rbtdb;
7785 	rbtdb_version_t *rbtversion = version;
7786 
7787 	rbtdb = (dns_rbtdb_t *)db;
7788 
7789 	REQUIRE(VALID_RBTDB(rbtdb));
7790 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
7791 
7792 	return (dns_master_dump(rbtdb->common.mctx, db, version,
7793 				&dns_master_style_default, filename,
7794 				masterformat, NULL));
7795 }
7796 
7797 static void
7798 delete_callback(void *data, void *arg) {
7799 	dns_rbtdb_t *rbtdb = arg;
7800 	rdatasetheader_t *current, *next;
7801 	unsigned int locknum;
7802 
7803 	current = data;
7804 	locknum = current->node->locknum;
7805 	NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
7806 	while (current != NULL) {
7807 		next = current->next;
7808 		free_rdataset(rbtdb, rbtdb->common.mctx, current);
7809 		current = next;
7810 	}
7811 	NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
7812 }
7813 
7814 static bool
7815 issecure(dns_db_t *db) {
7816 	dns_rbtdb_t *rbtdb;
7817 	bool secure;
7818 
7819 	rbtdb = (dns_rbtdb_t *)db;
7820 
7821 	REQUIRE(VALID_RBTDB(rbtdb));
7822 
7823 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
7824 	secure = (rbtdb->current_version->secure == dns_db_secure);
7825 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
7826 
7827 	return (secure);
7828 }
7829 
7830 static bool
7831 isdnssec(dns_db_t *db) {
7832 	dns_rbtdb_t *rbtdb;
7833 	bool dnssec;
7834 
7835 	rbtdb = (dns_rbtdb_t *)db;
7836 
7837 	REQUIRE(VALID_RBTDB(rbtdb));
7838 
7839 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
7840 	dnssec = (rbtdb->current_version->secure != dns_db_insecure);
7841 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
7842 
7843 	return (dnssec);
7844 }
7845 
7846 static unsigned int
7847 nodecount(dns_db_t *db, dns_dbtree_t tree) {
7848 	dns_rbtdb_t *rbtdb;
7849 	unsigned int count;
7850 
7851 	rbtdb = (dns_rbtdb_t *)db;
7852 
7853 	REQUIRE(VALID_RBTDB(rbtdb));
7854 
7855 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7856 	switch (tree) {
7857 	case dns_dbtree_main:
7858 		count = dns_rbt_nodecount(rbtdb->tree);
7859 		break;
7860 	case dns_dbtree_nsec:
7861 		count = dns_rbt_nodecount(rbtdb->nsec);
7862 		break;
7863 	case dns_dbtree_nsec3:
7864 		count = dns_rbt_nodecount(rbtdb->nsec3);
7865 		break;
7866 	default:
7867 		UNREACHABLE();
7868 	}
7869 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7870 
7871 	return (count);
7872 }
7873 
7874 static size_t
7875 hashsize(dns_db_t *db) {
7876 	dns_rbtdb_t *rbtdb;
7877 	size_t size;
7878 
7879 	rbtdb = (dns_rbtdb_t *)db;
7880 
7881 	REQUIRE(VALID_RBTDB(rbtdb));
7882 
7883 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7884 	size = dns_rbt_hashsize(rbtdb->tree);
7885 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7886 
7887 	return (size);
7888 }
7889 
7890 static void
7891 settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) {
7892 	dns_rbtdb_t *rbtdb;
7893 
7894 	rbtdb = (dns_rbtdb_t *)db;
7895 
7896 	REQUIRE(VALID_RBTDB(rbtdb));
7897 
7898 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
7899 	if (rbtdb->task != NULL) {
7900 		isc_task_detach(&rbtdb->task);
7901 	}
7902 	if (task != NULL) {
7903 		isc_task_attach(task, &rbtdb->task);
7904 	}
7905 	if (rbtdb->prunetask != NULL) {
7906 		isc_task_detach(&rbtdb->prunetask);
7907 	}
7908 	if (prunetask != NULL) {
7909 		isc_task_attach(prunetask, &rbtdb->prunetask);
7910 	}
7911 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7912 }
7913 
7914 static bool
7915 ispersistent(dns_db_t *db) {
7916 	UNUSED(db);
7917 	return (false);
7918 }
7919 
7920 static isc_result_t
7921 getoriginnode(dns_db_t *db, dns_dbnode_t **nodep) {
7922 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
7923 	dns_rbtnode_t *onode;
7924 	isc_result_t result = ISC_R_SUCCESS;
7925 
7926 	REQUIRE(VALID_RBTDB(rbtdb));
7927 	REQUIRE(nodep != NULL && *nodep == NULL);
7928 
7929 	/* Note that the access to origin_node doesn't require a DB lock */
7930 	onode = (dns_rbtnode_t *)rbtdb->origin_node;
7931 	if (onode != NULL) {
7932 		new_reference(rbtdb, onode, isc_rwlocktype_none);
7933 		*nodep = rbtdb->origin_node;
7934 	} else {
7935 		INSIST(IS_CACHE(rbtdb));
7936 		result = ISC_R_NOTFOUND;
7937 	}
7938 
7939 	return (result);
7940 }
7941 
7942 static isc_result_t
7943 getnsec3parameters(dns_db_t *db, dns_dbversion_t *version, dns_hash_t *hash,
7944 		   uint8_t *flags, uint16_t *iterations, unsigned char *salt,
7945 		   size_t *salt_length) {
7946 	dns_rbtdb_t *rbtdb;
7947 	isc_result_t result = ISC_R_NOTFOUND;
7948 	rbtdb_version_t *rbtversion = version;
7949 
7950 	rbtdb = (dns_rbtdb_t *)db;
7951 
7952 	REQUIRE(VALID_RBTDB(rbtdb));
7953 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
7954 
7955 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
7956 	if (rbtversion == NULL) {
7957 		rbtversion = rbtdb->current_version;
7958 	}
7959 
7960 	if (rbtversion->havensec3) {
7961 		if (hash != NULL) {
7962 			*hash = rbtversion->hash;
7963 		}
7964 		if (salt != NULL && salt_length != NULL) {
7965 			REQUIRE(*salt_length >= rbtversion->salt_length);
7966 			memmove(salt, rbtversion->salt,
7967 				rbtversion->salt_length);
7968 		}
7969 		if (salt_length != NULL) {
7970 			*salt_length = rbtversion->salt_length;
7971 		}
7972 		if (iterations != NULL) {
7973 			*iterations = rbtversion->iterations;
7974 		}
7975 		if (flags != NULL) {
7976 			*flags = rbtversion->flags;
7977 		}
7978 		result = ISC_R_SUCCESS;
7979 	}
7980 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
7981 
7982 	return (result);
7983 }
7984 
7985 static isc_result_t
7986 getsize(dns_db_t *db, dns_dbversion_t *version, uint64_t *records,
7987 	uint64_t *xfrsize) {
7988 	dns_rbtdb_t *rbtdb;
7989 	isc_result_t result = ISC_R_SUCCESS;
7990 	rbtdb_version_t *rbtversion = version;
7991 
7992 	rbtdb = (dns_rbtdb_t *)db;
7993 
7994 	REQUIRE(VALID_RBTDB(rbtdb));
7995 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
7996 
7997 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
7998 	if (rbtversion == NULL) {
7999 		rbtversion = rbtdb->current_version;
8000 	}
8001 
8002 	RWLOCK(&rbtversion->rwlock, isc_rwlocktype_read);
8003 	if (records != NULL) {
8004 		*records = rbtversion->records;
8005 	}
8006 
8007 	if (xfrsize != NULL) {
8008 		*xfrsize = rbtversion->xfrsize;
8009 	}
8010 	RWUNLOCK(&rbtversion->rwlock, isc_rwlocktype_read);
8011 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
8012 
8013 	return (result);
8014 }
8015 
8016 static isc_result_t
8017 setsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, isc_stdtime_t resign) {
8018 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8019 	rdatasetheader_t *header, oldheader;
8020 
8021 	REQUIRE(VALID_RBTDB(rbtdb));
8022 	REQUIRE(!IS_CACHE(rbtdb));
8023 	REQUIRE(rdataset != NULL);
8024 
8025 	header = rdataset->private3;
8026 	header--;
8027 
8028 	NODE_LOCK(&rbtdb->node_locks[header->node->locknum].lock,
8029 		  isc_rwlocktype_write);
8030 
8031 	oldheader = *header;
8032 	/*
8033 	 * Only break the heap invariant (by adjusting resign and resign_lsb)
8034 	 * if we are going to be restoring it by calling isc_heap_increased
8035 	 * or isc_heap_decreased.
8036 	 */
8037 	if (resign != 0) {
8038 		header->resign = (isc_stdtime_t)(dns_time64_from32(resign) >>
8039 						 1);
8040 		header->resign_lsb = resign & 0x1;
8041 	}
8042 	if (header->heap_index != 0) {
8043 		INSIST(RESIGN(header));
8044 		if (resign == 0) {
8045 			isc_heap_delete(rbtdb->heaps[header->node->locknum],
8046 					header->heap_index);
8047 			header->heap_index = 0;
8048 		} else if (resign_sooner(header, &oldheader)) {
8049 			isc_heap_increased(rbtdb->heaps[header->node->locknum],
8050 					   header->heap_index);
8051 		} else if (resign_sooner(&oldheader, header)) {
8052 			isc_heap_decreased(rbtdb->heaps[header->node->locknum],
8053 					   header->heap_index);
8054 		}
8055 	} else if (resign != 0) {
8056 		RDATASET_ATTR_SET(header, RDATASET_ATTR_RESIGN);
8057 		resign_insert(rbtdb, header->node->locknum, header);
8058 	}
8059 	NODE_UNLOCK(&rbtdb->node_locks[header->node->locknum].lock,
8060 		    isc_rwlocktype_write);
8061 	return (ISC_R_SUCCESS);
8062 }
8063 
8064 static isc_result_t
8065 getsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, dns_name_t *foundname) {
8066 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8067 	rdatasetheader_t *header = NULL, *this;
8068 	unsigned int i;
8069 	isc_result_t result = ISC_R_NOTFOUND;
8070 	unsigned int locknum = 0;
8071 
8072 	REQUIRE(VALID_RBTDB(rbtdb));
8073 
8074 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8075 
8076 	for (i = 0; i < rbtdb->node_lock_count; i++) {
8077 		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_read);
8078 
8079 		/*
8080 		 * Find for the earliest signing time among all of the
8081 		 * heaps, each of which is covered by a different bucket
8082 		 * lock.
8083 		 */
8084 		this = isc_heap_element(rbtdb->heaps[i], 1);
8085 		if (this == NULL) {
8086 			/* Nothing found; unlock and try the next heap. */
8087 			NODE_UNLOCK(&rbtdb->node_locks[i].lock,
8088 				    isc_rwlocktype_read);
8089 			continue;
8090 		}
8091 
8092 		if (header == NULL) {
8093 			/*
8094 			 * Found a signing time: retain the bucket lock and
8095 			 * preserve the lock number so we can unlock it
8096 			 * later.
8097 			 */
8098 			header = this;
8099 			locknum = i;
8100 		} else if (resign_sooner(this, header)) {
8101 			/*
8102 			 * Found an earlier signing time; release the
8103 			 * previous bucket lock and retain this one instead.
8104 			 */
8105 			NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
8106 				    isc_rwlocktype_read);
8107 			header = this;
8108 			locknum = i;
8109 		} else {
8110 			/*
8111 			 * Earliest signing time in this heap isn't
8112 			 * an improvement; unlock and try the next heap.
8113 			 */
8114 			NODE_UNLOCK(&rbtdb->node_locks[i].lock,
8115 				    isc_rwlocktype_read);
8116 		}
8117 	}
8118 
8119 	if (header != NULL) {
8120 		/*
8121 		 * Found something; pass back the answer and unlock
8122 		 * the bucket.
8123 		 */
8124 		bind_rdataset(rbtdb, header->node, header, 0,
8125 			      isc_rwlocktype_read, rdataset);
8126 
8127 		if (foundname != NULL) {
8128 			dns_rbt_fullnamefromnode(header->node, foundname);
8129 		}
8130 
8131 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
8132 			    isc_rwlocktype_read);
8133 
8134 		result = ISC_R_SUCCESS;
8135 	}
8136 
8137 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8138 
8139 	return (result);
8140 }
8141 
8142 static void
8143 resigned(dns_db_t *db, dns_rdataset_t *rdataset, dns_dbversion_t *version) {
8144 	rbtdb_version_t *rbtversion = (rbtdb_version_t *)version;
8145 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8146 	dns_rbtnode_t *node;
8147 	rdatasetheader_t *header;
8148 
8149 	REQUIRE(VALID_RBTDB(rbtdb));
8150 	REQUIRE(rdataset != NULL);
8151 	REQUIRE(rdataset->methods == &rdataset_methods);
8152 	REQUIRE(rbtdb->future_version == rbtversion);
8153 	REQUIRE(rbtversion != NULL);
8154 	REQUIRE(rbtversion->writer);
8155 	REQUIRE(rbtversion->rbtdb == rbtdb);
8156 
8157 	node = rdataset->private2;
8158 	INSIST(node != NULL);
8159 	header = rdataset->private3;
8160 	INSIST(header != NULL);
8161 	header--;
8162 
8163 	if (header->heap_index == 0) {
8164 		return;
8165 	}
8166 
8167 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8168 	NODE_LOCK(&rbtdb->node_locks[node->locknum].lock, isc_rwlocktype_write);
8169 	/*
8170 	 * Delete from heap and save to re-signed list so that it can
8171 	 * be restored if we backout of this change.
8172 	 */
8173 	resign_delete(rbtdb, rbtversion, header);
8174 	NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
8175 		    isc_rwlocktype_write);
8176 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8177 }
8178 
8179 static isc_result_t
8180 setcachestats(dns_db_t *db, isc_stats_t *stats) {
8181 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8182 
8183 	REQUIRE(VALID_RBTDB(rbtdb));
8184 	REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
8185 	REQUIRE(stats != NULL);
8186 
8187 	isc_stats_attach(stats, &rbtdb->cachestats);
8188 	return (ISC_R_SUCCESS);
8189 }
8190 
8191 static isc_result_t
8192 setgluecachestats(dns_db_t *db, isc_stats_t *stats) {
8193 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8194 
8195 	REQUIRE(VALID_RBTDB(rbtdb));
8196 	REQUIRE(!IS_CACHE(rbtdb) && !IS_STUB(rbtdb));
8197 	REQUIRE(stats != NULL);
8198 
8199 	isc_stats_attach(stats, &rbtdb->gluecachestats);
8200 	return (ISC_R_SUCCESS);
8201 }
8202 
8203 static void
8204 setmaxrrperset(dns_db_t *db, uint32_t maxrrperset) {
8205 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8206 
8207 	REQUIRE(VALID_RBTDB(rbtdb));
8208 
8209 	rbtdb->maxrrperset = maxrrperset;
8210 }
8211 
8212 static void
8213 setmaxtypepername(dns_db_t *db, uint32_t maxtypepername) {
8214 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8215 
8216 	REQUIRE(VALID_RBTDB(rbtdb));
8217 
8218 	rbtdb->maxtypepername = maxtypepername;
8219 }
8220 
8221 static dns_stats_t *
8222 getrrsetstats(dns_db_t *db) {
8223 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8224 
8225 	REQUIRE(VALID_RBTDB(rbtdb));
8226 	REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
8227 
8228 	return (rbtdb->rrsetstats);
8229 }
8230 
8231 static isc_result_t
8232 nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name) {
8233 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8234 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
8235 	isc_result_t result;
8236 
8237 	REQUIRE(VALID_RBTDB(rbtdb));
8238 	REQUIRE(node != NULL);
8239 	REQUIRE(name != NULL);
8240 
8241 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8242 	result = dns_rbt_fullnamefromnode(rbtnode, name);
8243 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8244 
8245 	return (result);
8246 }
8247 
8248 static isc_result_t
8249 setservestalettl(dns_db_t *db, dns_ttl_t ttl) {
8250 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8251 
8252 	REQUIRE(VALID_RBTDB(rbtdb));
8253 	REQUIRE(IS_CACHE(rbtdb));
8254 
8255 	/* currently no bounds checking.  0 means disable. */
8256 	rbtdb->serve_stale_ttl = ttl;
8257 	return (ISC_R_SUCCESS);
8258 }
8259 
8260 static isc_result_t
8261 getservestalettl(dns_db_t *db, dns_ttl_t *ttl) {
8262 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8263 
8264 	REQUIRE(VALID_RBTDB(rbtdb));
8265 	REQUIRE(IS_CACHE(rbtdb));
8266 
8267 	*ttl = rbtdb->serve_stale_ttl;
8268 	return (ISC_R_SUCCESS);
8269 }
8270 
8271 static isc_result_t
8272 setservestalerefresh(dns_db_t *db, uint32_t interval) {
8273 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8274 
8275 	REQUIRE(VALID_RBTDB(rbtdb));
8276 	REQUIRE(IS_CACHE(rbtdb));
8277 
8278 	/* currently no bounds checking.  0 means disable. */
8279 	rbtdb->serve_stale_refresh = interval;
8280 	return (ISC_R_SUCCESS);
8281 }
8282 
8283 static isc_result_t
8284 getservestalerefresh(dns_db_t *db, uint32_t *interval) {
8285 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8286 
8287 	REQUIRE(VALID_RBTDB(rbtdb));
8288 	REQUIRE(IS_CACHE(rbtdb));
8289 
8290 	*interval = rbtdb->serve_stale_refresh;
8291 	return (ISC_R_SUCCESS);
8292 }
8293 
8294 static dns_dbmethods_t zone_methods = { attach,
8295 					detach,
8296 					beginload,
8297 					endload,
8298 					dump,
8299 					currentversion,
8300 					newversion,
8301 					attachversion,
8302 					closeversion,
8303 					findnode,
8304 					zone_find,
8305 					zone_findzonecut,
8306 					attachnode,
8307 					detachnode,
8308 					expirenode,
8309 					printnode,
8310 					createiterator,
8311 					zone_findrdataset,
8312 					allrdatasets,
8313 					addrdataset,
8314 					subtractrdataset,
8315 					deleterdataset,
8316 					issecure,
8317 					nodecount,
8318 					ispersistent,
8319 					overmem,
8320 					settask,
8321 					getoriginnode,
8322 					NULL, /* transfernode */
8323 					getnsec3parameters,
8324 					findnsec3node,
8325 					setsigningtime,
8326 					getsigningtime,
8327 					resigned,
8328 					isdnssec,
8329 					NULL, /* getrrsetstats */
8330 					NULL, /* rpz_attach */
8331 					NULL, /* rpz_ready */
8332 					NULL, /* findnodeext */
8333 					NULL, /* findext */
8334 					NULL, /* setcachestats */
8335 					hashsize,
8336 					nodefullname,
8337 					getsize,
8338 					NULL, /* setservestalettl */
8339 					NULL, /* getservestalettl */
8340 					NULL, /* setservestalerefresh */
8341 					NULL, /* getservestalerefresh */
8342 					setgluecachestats,
8343 					setmaxrrperset,
8344 					setmaxtypepername };
8345 
8346 static dns_dbmethods_t cache_methods = { attach,
8347 					 detach,
8348 					 beginload,
8349 					 endload,
8350 					 dump,
8351 					 currentversion,
8352 					 newversion,
8353 					 attachversion,
8354 					 closeversion,
8355 					 findnode,
8356 					 cache_find,
8357 					 cache_findzonecut,
8358 					 attachnode,
8359 					 detachnode,
8360 					 expirenode,
8361 					 printnode,
8362 					 createiterator,
8363 					 cache_findrdataset,
8364 					 allrdatasets,
8365 					 addrdataset,
8366 					 subtractrdataset,
8367 					 deleterdataset,
8368 					 issecure,
8369 					 nodecount,
8370 					 ispersistent,
8371 					 overmem,
8372 					 settask,
8373 					 getoriginnode,
8374 					 NULL, /* transfernode */
8375 					 NULL, /* getnsec3parameters */
8376 					 NULL, /* findnsec3node */
8377 					 NULL, /* setsigningtime */
8378 					 NULL, /* getsigningtime */
8379 					 NULL, /* resigned */
8380 					 isdnssec,
8381 					 getrrsetstats,
8382 					 NULL, /* rpz_attach */
8383 					 NULL, /* rpz_ready */
8384 					 NULL, /* findnodeext */
8385 					 NULL, /* findext */
8386 					 setcachestats,
8387 					 hashsize,
8388 					 nodefullname,
8389 					 NULL, /* getsize */
8390 					 setservestalettl,
8391 					 getservestalettl,
8392 					 setservestalerefresh,
8393 					 getservestalerefresh,
8394 					 NULL,
8395 					 setmaxrrperset,
8396 					 setmaxtypepername };
8397 
8398 isc_result_t
8399 dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
8400 		 dns_rdataclass_t rdclass, unsigned int argc, char *argv[],
8401 		 void *driverarg, dns_db_t **dbp) {
8402 	dns_rbtdb_t *rbtdb;
8403 	isc_result_t result;
8404 	int i;
8405 	dns_name_t name;
8406 	bool (*sooner)(void *, void *);
8407 	isc_mem_t *hmctx = mctx;
8408 
8409 	/* Keep the compiler happy. */
8410 	UNUSED(driverarg);
8411 
8412 	rbtdb = isc_mem_get(mctx, sizeof(*rbtdb));
8413 
8414 	/*
8415 	 * If argv[0] exists, it points to a memory context to use for heap
8416 	 */
8417 	if (argc != 0) {
8418 		hmctx = (isc_mem_t *)argv[0];
8419 	}
8420 
8421 	memset(rbtdb, '\0', sizeof(*rbtdb));
8422 	dns_name_init(&rbtdb->common.origin, NULL);
8423 	rbtdb->common.attributes = 0;
8424 	if (type == dns_dbtype_cache) {
8425 		rbtdb->common.methods = &cache_methods;
8426 		rbtdb->common.attributes |= DNS_DBATTR_CACHE;
8427 	} else if (type == dns_dbtype_stub) {
8428 		rbtdb->common.methods = &zone_methods;
8429 		rbtdb->common.attributes |= DNS_DBATTR_STUB;
8430 	} else {
8431 		rbtdb->common.methods = &zone_methods;
8432 	}
8433 	rbtdb->common.rdclass = rdclass;
8434 	rbtdb->common.mctx = NULL;
8435 
8436 	ISC_LIST_INIT(rbtdb->common.update_listeners);
8437 
8438 	RBTDB_INITLOCK(&rbtdb->lock);
8439 
8440 	isc_rwlock_init(&rbtdb->tree_lock, 0, 0);
8441 
8442 	/*
8443 	 * Initialize node_lock_count in a generic way to support future
8444 	 * extension which allows the user to specify this value on creation.
8445 	 * Note that when specified for a cache DB it must be larger than 1
8446 	 * as commented with the definition of DEFAULT_CACHE_NODE_LOCK_COUNT.
8447 	 */
8448 	if (rbtdb->node_lock_count == 0) {
8449 		if (IS_CACHE(rbtdb)) {
8450 			rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT;
8451 		} else {
8452 			rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT;
8453 		}
8454 	} else if (rbtdb->node_lock_count < 2 && IS_CACHE(rbtdb)) {
8455 		result = ISC_R_RANGE;
8456 		goto cleanup_tree_lock;
8457 	}
8458 	INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH));
8459 	rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count *
8460 						      sizeof(rbtdb_nodelock_t));
8461 
8462 	rbtdb->cachestats = NULL;
8463 	rbtdb->gluecachestats = NULL;
8464 
8465 	rbtdb->rrsetstats = NULL;
8466 	if (IS_CACHE(rbtdb)) {
8467 		result = dns_rdatasetstats_create(mctx, &rbtdb->rrsetstats);
8468 		if (result != ISC_R_SUCCESS) {
8469 			goto cleanup_node_locks;
8470 		}
8471 		rbtdb->rdatasets = isc_mem_get(
8472 			mctx,
8473 			rbtdb->node_lock_count * sizeof(rdatasetheaderlist_t));
8474 		for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8475 			ISC_LIST_INIT(rbtdb->rdatasets[i]);
8476 		}
8477 	} else {
8478 		rbtdb->rdatasets = NULL;
8479 	}
8480 
8481 	/*
8482 	 * Create the heaps.
8483 	 */
8484 	rbtdb->heaps = isc_mem_get(hmctx, rbtdb->node_lock_count *
8485 						  sizeof(isc_heap_t *));
8486 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8487 		rbtdb->heaps[i] = NULL;
8488 	}
8489 	sooner = IS_CACHE(rbtdb) ? ttl_sooner : resign_sooner;
8490 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8491 		isc_heap_create(hmctx, sooner, set_index, 0, &rbtdb->heaps[i]);
8492 	}
8493 
8494 	/*
8495 	 * Create deadnode lists.
8496 	 */
8497 	rbtdb->deadnodes = isc_mem_get(mctx, rbtdb->node_lock_count *
8498 						     sizeof(rbtnodelist_t));
8499 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8500 		ISC_LIST_INIT(rbtdb->deadnodes[i]);
8501 	}
8502 
8503 	rbtdb->active = rbtdb->node_lock_count;
8504 
8505 	for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
8506 		NODE_INITLOCK(&rbtdb->node_locks[i].lock);
8507 		isc_refcount_init(&rbtdb->node_locks[i].references, 0);
8508 		rbtdb->node_locks[i].exiting = false;
8509 	}
8510 
8511 	/*
8512 	 * Attach to the mctx.  The database will persist so long as there
8513 	 * are references to it, and attaching to the mctx ensures that our
8514 	 * mctx won't disappear out from under us.
8515 	 */
8516 	isc_mem_attach(mctx, &rbtdb->common.mctx);
8517 	isc_mem_attach(hmctx, &rbtdb->hmctx);
8518 
8519 	/*
8520 	 * Make a copy of the origin name.
8521 	 */
8522 	result = dns_name_dupwithoffsets(origin, mctx, &rbtdb->common.origin);
8523 	if (result != ISC_R_SUCCESS) {
8524 		free_rbtdb(rbtdb, false, NULL);
8525 		return (result);
8526 	}
8527 
8528 	/*
8529 	 * Make the Red-Black Trees.
8530 	 */
8531 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree);
8532 	if (result != ISC_R_SUCCESS) {
8533 		free_rbtdb(rbtdb, false, NULL);
8534 		return (result);
8535 	}
8536 
8537 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec);
8538 	if (result != ISC_R_SUCCESS) {
8539 		free_rbtdb(rbtdb, false, NULL);
8540 		return (result);
8541 	}
8542 
8543 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec3);
8544 	if (result != ISC_R_SUCCESS) {
8545 		free_rbtdb(rbtdb, false, NULL);
8546 		return (result);
8547 	}
8548 
8549 	/*
8550 	 * In order to set the node callback bit correctly in zone databases,
8551 	 * we need to know if the node has the origin name of the zone.
8552 	 * In loading_addrdataset() we could simply compare the new name
8553 	 * to the origin name, but this is expensive.  Also, we don't know the
8554 	 * node name in addrdataset(), so we need another way of knowing the
8555 	 * zone's top.
8556 	 *
8557 	 * We now explicitly create a node for the zone's origin, and then
8558 	 * we simply remember the node's address.  This is safe, because
8559 	 * the top-of-zone node can never be deleted, nor can its address
8560 	 * change.
8561 	 */
8562 	if (!IS_CACHE(rbtdb)) {
8563 		rbtdb->origin_node = NULL;
8564 		result = dns_rbt_addnode(rbtdb->tree, &rbtdb->common.origin,
8565 					 &rbtdb->origin_node);
8566 		if (result != ISC_R_SUCCESS) {
8567 			INSIST(result != ISC_R_EXISTS);
8568 			free_rbtdb(rbtdb, false, NULL);
8569 			return (result);
8570 		}
8571 		INSIST(rbtdb->origin_node != NULL);
8572 		rbtdb->origin_node->nsec = DNS_RBT_NSEC_NORMAL;
8573 		/*
8574 		 * We need to give the origin node the right locknum.
8575 		 */
8576 		dns_name_init(&name, NULL);
8577 		dns_rbt_namefromnode(rbtdb->origin_node, &name);
8578 		rbtdb->origin_node->locknum = rbtdb->origin_node->hashval %
8579 					      rbtdb->node_lock_count;
8580 		/*
8581 		 * Add an apex node to the NSEC3 tree so that NSEC3 searches
8582 		 * return partial matches when there is only a single NSEC3
8583 		 * record in the tree.
8584 		 */
8585 		rbtdb->nsec3_origin_node = NULL;
8586 		result = dns_rbt_addnode(rbtdb->nsec3, &rbtdb->common.origin,
8587 					 &rbtdb->nsec3_origin_node);
8588 		if (result != ISC_R_SUCCESS) {
8589 			INSIST(result != ISC_R_EXISTS);
8590 			free_rbtdb(rbtdb, false, NULL);
8591 			return (result);
8592 		}
8593 		rbtdb->nsec3_origin_node->nsec = DNS_RBT_NSEC_NSEC3;
8594 		/*
8595 		 * We need to give the nsec3 origin node the right locknum.
8596 		 */
8597 		dns_name_init(&name, NULL);
8598 		dns_rbt_namefromnode(rbtdb->nsec3_origin_node, &name);
8599 		rbtdb->nsec3_origin_node->locknum =
8600 			rbtdb->nsec3_origin_node->hashval %
8601 			rbtdb->node_lock_count;
8602 	}
8603 
8604 	/*
8605 	 * Misc. Initialization.
8606 	 */
8607 	isc_refcount_init(&rbtdb->references, 1);
8608 	rbtdb->attributes = 0;
8609 	rbtdb->task = NULL;
8610 	rbtdb->prunetask = NULL;
8611 	rbtdb->serve_stale_ttl = 0;
8612 
8613 	/*
8614 	 * Version Initialization.
8615 	 */
8616 	rbtdb->current_serial = 1;
8617 	rbtdb->least_serial = 1;
8618 	rbtdb->next_serial = 2;
8619 	rbtdb->current_version = allocate_version(mctx, 1, 1, false);
8620 	rbtdb->current_version->rbtdb = rbtdb;
8621 	rbtdb->current_version->secure = dns_db_insecure;
8622 	rbtdb->current_version->havensec3 = false;
8623 	rbtdb->current_version->flags = 0;
8624 	rbtdb->current_version->iterations = 0;
8625 	rbtdb->current_version->hash = 0;
8626 	rbtdb->current_version->salt_length = 0;
8627 	memset(rbtdb->current_version->salt, 0,
8628 	       sizeof(rbtdb->current_version->salt));
8629 	isc_rwlock_init(&rbtdb->current_version->rwlock, 0, 0);
8630 	rbtdb->current_version->records = 0;
8631 	rbtdb->current_version->xfrsize = 0;
8632 	rbtdb->future_version = NULL;
8633 	ISC_LIST_INIT(rbtdb->open_versions);
8634 	/*
8635 	 * Keep the current version in the open list so that list operation
8636 	 * won't happen in normal lookup operations.
8637 	 */
8638 	PREPEND(rbtdb->open_versions, rbtdb->current_version, link);
8639 
8640 	rbtdb->common.magic = DNS_DB_MAGIC;
8641 	rbtdb->common.impmagic = RBTDB_MAGIC;
8642 
8643 	*dbp = (dns_db_t *)rbtdb;
8644 
8645 	return (ISC_R_SUCCESS);
8646 
8647 cleanup_node_locks:
8648 	isc_mem_put(mctx, rbtdb->node_locks,
8649 		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
8650 
8651 cleanup_tree_lock:
8652 	isc_rwlock_destroy(&rbtdb->tree_lock);
8653 	RBTDB_DESTROYLOCK(&rbtdb->lock);
8654 	isc_mem_put(mctx, rbtdb, sizeof(*rbtdb));
8655 	return (result);
8656 }
8657 
8658 /*
8659  * Slabbed Rdataset Methods
8660  */
8661 
8662 static void
8663 rdataset_disassociate(dns_rdataset_t *rdataset) {
8664 	dns_db_t *db = rdataset->private1;
8665 	dns_dbnode_t *node = rdataset->private2;
8666 
8667 	detachnode(db, &node);
8668 }
8669 
8670 static isc_result_t
8671 rdataset_first(dns_rdataset_t *rdataset) {
8672 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
8673 	unsigned int count;
8674 
8675 	count = raw[0] * 256 + raw[1];
8676 	if (count == 0) {
8677 		rdataset->private5 = NULL;
8678 		return (ISC_R_NOMORE);
8679 	}
8680 
8681 	if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) {
8682 		raw += DNS_RDATASET_COUNT;
8683 	}
8684 
8685 	raw += DNS_RDATASET_LENGTH;
8686 
8687 	/*
8688 	 * The privateuint4 field is the number of rdata beyond the
8689 	 * cursor position, so we decrement the total count by one
8690 	 * before storing it.
8691 	 *
8692 	 * If DNS_RDATASETATTR_LOADORDER is not set 'raw' points to the
8693 	 * first record.  If DNS_RDATASETATTR_LOADORDER is set 'raw' points
8694 	 * to the first entry in the offset table.
8695 	 */
8696 	count--;
8697 	rdataset->privateuint4 = count;
8698 	rdataset->private5 = raw;
8699 
8700 	return (ISC_R_SUCCESS);
8701 }
8702 
8703 static isc_result_t
8704 rdataset_next(dns_rdataset_t *rdataset) {
8705 	unsigned int count;
8706 	unsigned int length;
8707 	unsigned char *raw; /* RDATASLAB */
8708 
8709 	count = rdataset->privateuint4;
8710 	if (count == 0) {
8711 		return (ISC_R_NOMORE);
8712 	}
8713 	count--;
8714 	rdataset->privateuint4 = count;
8715 
8716 	/*
8717 	 * Skip forward one record (length + 4) or one offset (4).
8718 	 */
8719 	raw = rdataset->private5;
8720 #if DNS_RDATASET_FIXED
8721 	if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0)
8722 #endif /* DNS_RDATASET_FIXED */
8723 	{
8724 		length = raw[0] * 256 + raw[1];
8725 		raw += length;
8726 	}
8727 
8728 	rdataset->private5 = raw + DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
8729 
8730 	return (ISC_R_SUCCESS);
8731 }
8732 
8733 static void
8734 rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) {
8735 	unsigned char *raw = rdataset->private5; /* RDATASLAB */
8736 	unsigned int length;
8737 	isc_region_t r;
8738 	unsigned int flags = 0;
8739 
8740 	REQUIRE(raw != NULL);
8741 
8742 	/*
8743 	 * Find the start of the record if not already in private5
8744 	 * then skip the length and order fields.
8745 	 */
8746 #if DNS_RDATASET_FIXED
8747 	if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) != 0) {
8748 		unsigned int offset;
8749 		offset = ((unsigned int)raw[0] << 24) +
8750 			 ((unsigned int)raw[1] << 16) +
8751 			 ((unsigned int)raw[2] << 8) + (unsigned int)raw[3];
8752 		raw = rdataset->private3;
8753 		raw += offset;
8754 	}
8755 #endif /* if DNS_RDATASET_FIXED */
8756 
8757 	length = raw[0] * 256 + raw[1];
8758 
8759 	raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
8760 
8761 	if (rdataset->type == dns_rdatatype_rrsig) {
8762 		if (*raw & DNS_RDATASLAB_OFFLINE) {
8763 			flags |= DNS_RDATA_OFFLINE;
8764 		}
8765 		length--;
8766 		raw++;
8767 	}
8768 	r.length = length;
8769 	r.base = raw;
8770 	dns_rdata_fromregion(rdata, rdataset->rdclass, rdataset->type, &r);
8771 	rdata->flags |= flags;
8772 }
8773 
8774 static void
8775 rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target) {
8776 	dns_db_t *db = source->private1;
8777 	dns_dbnode_t *node = source->private2;
8778 	dns_dbnode_t *cloned_node = NULL;
8779 
8780 	attachnode(db, node, &cloned_node);
8781 	INSIST(!ISC_LINK_LINKED(target, link));
8782 	*target = *source;
8783 	ISC_LINK_INIT(target, link);
8784 
8785 	/*
8786 	 * Reset iterator state.
8787 	 */
8788 	target->privateuint4 = 0;
8789 	target->private5 = NULL;
8790 }
8791 
8792 static unsigned int
8793 rdataset_count(dns_rdataset_t *rdataset) {
8794 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
8795 	unsigned int count;
8796 
8797 	count = raw[0] * 256 + raw[1];
8798 
8799 	return (count);
8800 }
8801 
8802 static isc_result_t
8803 rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
8804 		    dns_rdataset_t *nsec, dns_rdataset_t *nsecsig) {
8805 	dns_db_t *db = rdataset->private1;
8806 	dns_dbnode_t *node = rdataset->private2;
8807 	dns_dbnode_t *cloned_node;
8808 	const struct noqname *noqname = rdataset->private6;
8809 
8810 	cloned_node = NULL;
8811 	attachnode(db, node, &cloned_node);
8812 	nsec->methods = &slab_methods;
8813 	nsec->rdclass = db->rdclass;
8814 	nsec->type = noqname->type;
8815 	nsec->covers = 0;
8816 	nsec->ttl = rdataset->ttl;
8817 	nsec->trust = rdataset->trust;
8818 	nsec->private1 = rdataset->private1;
8819 	nsec->private2 = rdataset->private2;
8820 	nsec->private3 = noqname->neg;
8821 	nsec->privateuint4 = 0;
8822 	nsec->private5 = NULL;
8823 	nsec->private6 = NULL;
8824 	nsec->private7 = NULL;
8825 
8826 	cloned_node = NULL;
8827 	attachnode(db, node, &cloned_node);
8828 	nsecsig->methods = &slab_methods;
8829 	nsecsig->rdclass = db->rdclass;
8830 	nsecsig->type = dns_rdatatype_rrsig;
8831 	nsecsig->covers = noqname->type;
8832 	nsecsig->ttl = rdataset->ttl;
8833 	nsecsig->trust = rdataset->trust;
8834 	nsecsig->private1 = rdataset->private1;
8835 	nsecsig->private2 = rdataset->private2;
8836 	nsecsig->private3 = noqname->negsig;
8837 	nsecsig->privateuint4 = 0;
8838 	nsecsig->private5 = NULL;
8839 	nsec->private6 = NULL;
8840 	nsec->private7 = NULL;
8841 
8842 	dns_name_clone(&noqname->name, name);
8843 
8844 	return (ISC_R_SUCCESS);
8845 }
8846 
8847 static isc_result_t
8848 rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name,
8849 		    dns_rdataset_t *nsec, dns_rdataset_t *nsecsig) {
8850 	dns_db_t *db = rdataset->private1;
8851 	dns_dbnode_t *node = rdataset->private2;
8852 	dns_dbnode_t *cloned_node;
8853 	const struct noqname *closest = rdataset->private7;
8854 
8855 	cloned_node = NULL;
8856 	attachnode(db, node, &cloned_node);
8857 	nsec->methods = &slab_methods;
8858 	nsec->rdclass = db->rdclass;
8859 	nsec->type = closest->type;
8860 	nsec->covers = 0;
8861 	nsec->ttl = rdataset->ttl;
8862 	nsec->trust = rdataset->trust;
8863 	nsec->private1 = rdataset->private1;
8864 	nsec->private2 = rdataset->private2;
8865 	nsec->private3 = closest->neg;
8866 	nsec->privateuint4 = 0;
8867 	nsec->private5 = NULL;
8868 	nsec->private6 = NULL;
8869 	nsec->private7 = NULL;
8870 
8871 	cloned_node = NULL;
8872 	attachnode(db, node, &cloned_node);
8873 	nsecsig->methods = &slab_methods;
8874 	nsecsig->rdclass = db->rdclass;
8875 	nsecsig->type = dns_rdatatype_rrsig;
8876 	nsecsig->covers = closest->type;
8877 	nsecsig->ttl = rdataset->ttl;
8878 	nsecsig->trust = rdataset->trust;
8879 	nsecsig->private1 = rdataset->private1;
8880 	nsecsig->private2 = rdataset->private2;
8881 	nsecsig->private3 = closest->negsig;
8882 	nsecsig->privateuint4 = 0;
8883 	nsecsig->private5 = NULL;
8884 	nsec->private6 = NULL;
8885 	nsec->private7 = NULL;
8886 
8887 	dns_name_clone(&closest->name, name);
8888 
8889 	return (ISC_R_SUCCESS);
8890 }
8891 
8892 static void
8893 rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust) {
8894 	dns_rbtdb_t *rbtdb = rdataset->private1;
8895 	dns_rbtnode_t *rbtnode = rdataset->private2;
8896 	rdatasetheader_t *header = rdataset->private3;
8897 
8898 	header--;
8899 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
8900 		  isc_rwlocktype_write);
8901 	header->trust = rdataset->trust = trust;
8902 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
8903 		    isc_rwlocktype_write);
8904 }
8905 
8906 static void
8907 rdataset_expire(dns_rdataset_t *rdataset) {
8908 	dns_rbtdb_t *rbtdb = rdataset->private1;
8909 	dns_rbtnode_t *rbtnode = rdataset->private2;
8910 	rdatasetheader_t *header = rdataset->private3;
8911 
8912 	header--;
8913 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
8914 		  isc_rwlocktype_write);
8915 	expire_header(rbtdb, header, false, expire_flush);
8916 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
8917 		    isc_rwlocktype_write);
8918 }
8919 
8920 static void
8921 rdataset_clearprefetch(dns_rdataset_t *rdataset) {
8922 	dns_rbtdb_t *rbtdb = rdataset->private1;
8923 	dns_rbtnode_t *rbtnode = rdataset->private2;
8924 	rdatasetheader_t *header = rdataset->private3;
8925 
8926 	header--;
8927 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
8928 		  isc_rwlocktype_write);
8929 	RDATASET_ATTR_CLR(header, RDATASET_ATTR_PREFETCH);
8930 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
8931 		    isc_rwlocktype_write);
8932 }
8933 
8934 /*
8935  * Rdataset Iterator Methods
8936  */
8937 
8938 static void
8939 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp) {
8940 	rbtdb_rdatasetiter_t *rbtiterator;
8941 
8942 	rbtiterator = (rbtdb_rdatasetiter_t *)(*iteratorp);
8943 
8944 	if (rbtiterator->common.version != NULL) {
8945 		closeversion(rbtiterator->common.db,
8946 			     &rbtiterator->common.version, false);
8947 	}
8948 	detachnode(rbtiterator->common.db, &rbtiterator->common.node);
8949 	isc_mem_put(rbtiterator->common.db->mctx, rbtiterator,
8950 		    sizeof(*rbtiterator));
8951 
8952 	*iteratorp = NULL;
8953 }
8954 
8955 static bool
8956 iterator_active(dns_rbtdb_t *rbtdb, rbtdb_rdatasetiter_t *rbtiterator,
8957 		rdatasetheader_t *header) {
8958 	dns_ttl_t stale_ttl = header->rdh_ttl + STALE_TTL(header, rbtdb);
8959 
8960 	/*
8961 	 * Is this a "this rdataset doesn't exist" record?
8962 	 */
8963 	if (NONEXISTENT(header)) {
8964 		return (false);
8965 	}
8966 
8967 	/*
8968 	 * If this is a zone or this header still active then return it.
8969 	 */
8970 	if (!IS_CACHE(rbtdb) || ACTIVE(header, rbtiterator->common.now)) {
8971 		return (true);
8972 	}
8973 
8974 	/*
8975 	 * If we are not returning stale records or the rdataset is
8976 	 * too old don't return it.
8977 	 */
8978 	if (!STALEOK(rbtiterator) || (rbtiterator->common.now > stale_ttl)) {
8979 		return (false);
8980 	}
8981 	return (true);
8982 }
8983 
8984 static isc_result_t
8985 rdatasetiter_first(dns_rdatasetiter_t *iterator) {
8986 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
8987 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
8988 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
8989 	rbtdb_version_t *rbtversion = rbtiterator->common.version;
8990 	rdatasetheader_t *header, *top_next;
8991 	rbtdb_serial_t serial = IS_CACHE(rbtdb) ? 1 : rbtversion->serial;
8992 
8993 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
8994 		  isc_rwlocktype_read);
8995 
8996 	for (header = rbtnode->data; header != NULL; header = top_next) {
8997 		top_next = header->next;
8998 		do {
8999 			if (EXPIREDOK(rbtiterator)) {
9000 				if (!NONEXISTENT(header)) {
9001 					break;
9002 				}
9003 				header = header->down;
9004 			} else if (header->serial <= serial && !IGNORE(header))
9005 			{
9006 				if (!iterator_active(rbtdb, rbtiterator,
9007 						     header))
9008 				{
9009 					header = NULL;
9010 				}
9011 				break;
9012 			} else {
9013 				header = header->down;
9014 			}
9015 		} while (header != NULL);
9016 		if (header != NULL) {
9017 			break;
9018 		}
9019 	}
9020 
9021 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9022 		    isc_rwlocktype_read);
9023 
9024 	rbtiterator->current = header;
9025 
9026 	if (header == NULL) {
9027 		return (ISC_R_NOMORE);
9028 	}
9029 
9030 	return (ISC_R_SUCCESS);
9031 }
9032 
9033 static isc_result_t
9034 rdatasetiter_next(dns_rdatasetiter_t *iterator) {
9035 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
9036 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
9037 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
9038 	rbtdb_version_t *rbtversion = rbtiterator->common.version;
9039 	rdatasetheader_t *header, *top_next;
9040 	rbtdb_serial_t serial = IS_CACHE(rbtdb) ? 1 : rbtversion->serial;
9041 	rbtdb_rdatatype_t type, negtype;
9042 	dns_rdatatype_t rdtype, covers;
9043 	bool expiredok = EXPIREDOK(rbtiterator);
9044 
9045 	header = rbtiterator->current;
9046 	if (header == NULL) {
9047 		return (ISC_R_NOMORE);
9048 	}
9049 
9050 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9051 		  isc_rwlocktype_read);
9052 
9053 	type = header->type;
9054 	rdtype = RBTDB_RDATATYPE_BASE(header->type);
9055 	if (NEGATIVE(header)) {
9056 		covers = RBTDB_RDATATYPE_EXT(header->type);
9057 		negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
9058 	} else {
9059 		negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
9060 	}
9061 
9062 	/*
9063 	 * Find the start of the header chain for the next type
9064 	 * by walking back up the list.
9065 	 */
9066 	top_next = header->next;
9067 	while (top_next != NULL &&
9068 	       (top_next->type == type || top_next->type == negtype))
9069 	{
9070 		top_next = top_next->next;
9071 	}
9072 	if (expiredok) {
9073 		/*
9074 		 * Keep walking down the list if possible or
9075 		 * start the next type.
9076 		 */
9077 		header = header->down != NULL ? header->down : top_next;
9078 	} else {
9079 		header = top_next;
9080 	}
9081 	for (; header != NULL; header = top_next) {
9082 		top_next = header->next;
9083 		do {
9084 			if (expiredok) {
9085 				if (!NONEXISTENT(header)) {
9086 					break;
9087 				}
9088 				header = header->down;
9089 			} else if (header->serial <= serial && !IGNORE(header))
9090 			{
9091 				if (!iterator_active(rbtdb, rbtiterator,
9092 						     header))
9093 				{
9094 					header = NULL;
9095 				}
9096 				break;
9097 			} else {
9098 				header = header->down;
9099 			}
9100 		} while (header != NULL);
9101 		if (header != NULL) {
9102 			break;
9103 		}
9104 		/*
9105 		 * Find the start of the header chain for the next type
9106 		 * by walking back up the list.
9107 		 */
9108 		while (top_next != NULL &&
9109 		       (top_next->type == type || top_next->type == negtype))
9110 		{
9111 			top_next = top_next->next;
9112 		}
9113 	}
9114 
9115 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9116 		    isc_rwlocktype_read);
9117 
9118 	rbtiterator->current = header;
9119 
9120 	if (header == NULL) {
9121 		return (ISC_R_NOMORE);
9122 	}
9123 
9124 	return (ISC_R_SUCCESS);
9125 }
9126 
9127 static void
9128 rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset) {
9129 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
9130 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
9131 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
9132 	rdatasetheader_t *header;
9133 
9134 	header = rbtiterator->current;
9135 	REQUIRE(header != NULL);
9136 
9137 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9138 		  isc_rwlocktype_read);
9139 
9140 	bind_rdataset(rbtdb, rbtnode, header, rbtiterator->common.now,
9141 		      isc_rwlocktype_read, rdataset);
9142 
9143 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9144 		    isc_rwlocktype_read);
9145 }
9146 
9147 /*
9148  * Database Iterator Methods
9149  */
9150 
9151 static void
9152 reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
9153 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9154 	dns_rbtnode_t *node = rbtdbiter->node;
9155 
9156 	if (node == NULL) {
9157 		return;
9158 	}
9159 
9160 	INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none);
9161 	reactivate_node(rbtdb, node, rbtdbiter->tree_locked);
9162 }
9163 
9164 static void
9165 dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
9166 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9167 	dns_rbtnode_t *node = rbtdbiter->node;
9168 	nodelock_t *lock;
9169 
9170 	if (node == NULL) {
9171 		return;
9172 	}
9173 
9174 	lock = &rbtdb->node_locks[node->locknum].lock;
9175 	NODE_LOCK(lock, isc_rwlocktype_read);
9176 	decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
9177 			    rbtdbiter->tree_locked, false);
9178 	NODE_UNLOCK(lock, isc_rwlocktype_read);
9179 
9180 	rbtdbiter->node = NULL;
9181 }
9182 
9183 static void
9184 flush_deletions(rbtdb_dbiterator_t *rbtdbiter) {
9185 	dns_rbtnode_t *node;
9186 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9187 	bool was_read_locked = false;
9188 	nodelock_t *lock;
9189 	int i;
9190 
9191 	if (rbtdbiter->delcnt != 0) {
9192 		/*
9193 		 * Note that "%d node of %d in tree" can report things like
9194 		 * "flush_deletions: 59 nodes of 41 in tree".  This means
9195 		 * That some nodes appear on the deletions list more than
9196 		 * once.  Only the last occurrence will actually be deleted.
9197 		 */
9198 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
9199 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
9200 			      "flush_deletions: %d nodes of %d in tree",
9201 			      rbtdbiter->delcnt,
9202 			      dns_rbt_nodecount(rbtdb->tree));
9203 
9204 		if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
9205 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9206 			was_read_locked = true;
9207 		}
9208 		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
9209 		rbtdbiter->tree_locked = isc_rwlocktype_write;
9210 
9211 		for (i = 0; i < rbtdbiter->delcnt; i++) {
9212 			node = rbtdbiter->deletions[i];
9213 			lock = &rbtdb->node_locks[node->locknum].lock;
9214 
9215 			NODE_LOCK(lock, isc_rwlocktype_read);
9216 			decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
9217 					    rbtdbiter->tree_locked, false);
9218 			NODE_UNLOCK(lock, isc_rwlocktype_read);
9219 		}
9220 
9221 		rbtdbiter->delcnt = 0;
9222 
9223 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
9224 		if (was_read_locked) {
9225 			RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9226 			rbtdbiter->tree_locked = isc_rwlocktype_read;
9227 		} else {
9228 			rbtdbiter->tree_locked = isc_rwlocktype_none;
9229 		}
9230 	}
9231 }
9232 
9233 static void
9234 resume_iteration(rbtdb_dbiterator_t *rbtdbiter) {
9235 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9236 
9237 	REQUIRE(rbtdbiter->paused);
9238 	REQUIRE(rbtdbiter->tree_locked == isc_rwlocktype_none);
9239 
9240 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9241 	rbtdbiter->tree_locked = isc_rwlocktype_read;
9242 
9243 	rbtdbiter->paused = false;
9244 }
9245 
9246 static void
9247 dbiterator_destroy(dns_dbiterator_t **iteratorp) {
9248 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)(*iteratorp);
9249 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9250 	dns_db_t *db = NULL;
9251 
9252 	if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
9253 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9254 		rbtdbiter->tree_locked = isc_rwlocktype_none;
9255 	} else {
9256 		INSIST(rbtdbiter->tree_locked == isc_rwlocktype_none);
9257 	}
9258 
9259 	dereference_iter_node(rbtdbiter);
9260 
9261 	flush_deletions(rbtdbiter);
9262 
9263 	dns_db_attach(rbtdbiter->common.db, &db);
9264 	dns_db_detach(&rbtdbiter->common.db);
9265 
9266 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9267 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9268 	isc_mem_put(db->mctx, rbtdbiter, sizeof(*rbtdbiter));
9269 	dns_db_detach(&db);
9270 
9271 	*iteratorp = NULL;
9272 }
9273 
9274 static isc_result_t
9275 dbiterator_first(dns_dbiterator_t *iterator) {
9276 	isc_result_t result;
9277 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9278 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9279 	dns_name_t *name, *origin;
9280 
9281 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9282 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9283 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9284 	    rbtdbiter->result != ISC_R_NOMORE)
9285 	{
9286 		return (rbtdbiter->result);
9287 	}
9288 
9289 	if (rbtdbiter->paused) {
9290 		resume_iteration(rbtdbiter);
9291 	}
9292 
9293 	dereference_iter_node(rbtdbiter);
9294 
9295 	name = dns_fixedname_name(&rbtdbiter->name);
9296 	origin = dns_fixedname_name(&rbtdbiter->origin);
9297 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9298 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9299 
9300 	switch (rbtdbiter->nsec3mode) {
9301 	case nsec3only:
9302 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9303 		result = dns_rbtnodechain_first(rbtdbiter->current,
9304 						rbtdb->nsec3, name, origin);
9305 		break;
9306 	case nonsec3:
9307 		rbtdbiter->current = &rbtdbiter->chain;
9308 		result = dns_rbtnodechain_first(rbtdbiter->current, rbtdb->tree,
9309 						name, origin);
9310 		break;
9311 	case full:
9312 		rbtdbiter->current = &rbtdbiter->chain;
9313 		result = dns_rbtnodechain_first(rbtdbiter->current, rbtdb->tree,
9314 						name, origin);
9315 		if (result == ISC_R_NOTFOUND) {
9316 			rbtdbiter->current = &rbtdbiter->nsec3chain;
9317 			result = dns_rbtnodechain_first(
9318 				rbtdbiter->current, rbtdb->nsec3, name, origin);
9319 		}
9320 		break;
9321 	default:
9322 		UNREACHABLE();
9323 	}
9324 
9325 	if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
9326 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9327 						  NULL, &rbtdbiter->node);
9328 
9329 		/* If we're in the NSEC3 tree, skip the origin */
9330 		if (RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, rbtdbiter)) {
9331 			rbtdbiter->node = NULL;
9332 			result = dns_rbtnodechain_next(rbtdbiter->current, name,
9333 						       origin);
9334 			if (result == ISC_R_SUCCESS ||
9335 			    result == DNS_R_NEWORIGIN)
9336 			{
9337 				result = dns_rbtnodechain_current(
9338 					rbtdbiter->current, NULL, NULL,
9339 					&rbtdbiter->node);
9340 			}
9341 		}
9342 		if (result == ISC_R_SUCCESS) {
9343 			rbtdbiter->new_origin = true;
9344 			reference_iter_node(rbtdbiter);
9345 		}
9346 	} else {
9347 		INSIST(result == ISC_R_NOTFOUND);
9348 		result = ISC_R_NOMORE; /* The tree is empty. */
9349 	}
9350 
9351 	rbtdbiter->result = result;
9352 
9353 	if (result != ISC_R_SUCCESS) {
9354 		ENSURE(!rbtdbiter->paused);
9355 	}
9356 
9357 	return (result);
9358 }
9359 
9360 static isc_result_t
9361 dbiterator_last(dns_dbiterator_t *iterator) {
9362 	isc_result_t result;
9363 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9364 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9365 	dns_name_t *name, *origin;
9366 
9367 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9368 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9369 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9370 	    rbtdbiter->result != ISC_R_NOMORE)
9371 	{
9372 		return (rbtdbiter->result);
9373 	}
9374 
9375 	if (rbtdbiter->paused) {
9376 		resume_iteration(rbtdbiter);
9377 	}
9378 
9379 	dereference_iter_node(rbtdbiter);
9380 
9381 	name = dns_fixedname_name(&rbtdbiter->name);
9382 	origin = dns_fixedname_name(&rbtdbiter->origin);
9383 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9384 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9385 
9386 	switch (rbtdbiter->nsec3mode) {
9387 	case nsec3only:
9388 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9389 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->nsec3,
9390 					       name, origin);
9391 		break;
9392 	case nonsec3:
9393 		rbtdbiter->current = &rbtdbiter->chain;
9394 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree,
9395 					       name, origin);
9396 		break;
9397 	case full:
9398 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9399 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->nsec3,
9400 					       name, origin);
9401 		if (result == ISC_R_NOTFOUND) {
9402 			rbtdbiter->current = &rbtdbiter->chain;
9403 			result = dns_rbtnodechain_last(
9404 				rbtdbiter->current, rbtdb->tree, name, origin);
9405 		}
9406 		break;
9407 	default:
9408 		UNREACHABLE();
9409 	}
9410 
9411 	if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
9412 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9413 						  NULL, &rbtdbiter->node);
9414 		if (RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, rbtdbiter)) {
9415 			/*
9416 			 * NSEC3 tree only has an origin node.
9417 			 */
9418 			rbtdbiter->node = NULL;
9419 			switch (rbtdbiter->nsec3mode) {
9420 			case nsec3only:
9421 				result = ISC_R_NOMORE;
9422 				break;
9423 			case nonsec3:
9424 			case full:
9425 				rbtdbiter->current = &rbtdbiter->chain;
9426 				result = dns_rbtnodechain_last(
9427 					rbtdbiter->current, rbtdb->tree, name,
9428 					origin);
9429 				if (result == ISC_R_SUCCESS ||
9430 				    result == DNS_R_NEWORIGIN)
9431 				{
9432 					result = dns_rbtnodechain_current(
9433 						rbtdbiter->current, NULL, NULL,
9434 						&rbtdbiter->node);
9435 				}
9436 				break;
9437 			default:
9438 				UNREACHABLE();
9439 			}
9440 		}
9441 		if (result == ISC_R_SUCCESS) {
9442 			rbtdbiter->new_origin = true;
9443 			reference_iter_node(rbtdbiter);
9444 		}
9445 	} else {
9446 		INSIST(result == ISC_R_NOTFOUND);
9447 		result = ISC_R_NOMORE; /* The tree is empty. */
9448 	}
9449 
9450 	rbtdbiter->result = result;
9451 
9452 	return (result);
9453 }
9454 
9455 static isc_result_t
9456 dbiterator_seek(dns_dbiterator_t *iterator, const dns_name_t *name) {
9457 	isc_result_t result, tresult;
9458 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9459 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9460 	dns_name_t *iname, *origin;
9461 
9462 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9463 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9464 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9465 	    rbtdbiter->result != ISC_R_NOMORE)
9466 	{
9467 		return (rbtdbiter->result);
9468 	}
9469 
9470 	if (rbtdbiter->paused) {
9471 		resume_iteration(rbtdbiter);
9472 	}
9473 
9474 	dereference_iter_node(rbtdbiter);
9475 
9476 	iname = dns_fixedname_name(&rbtdbiter->name);
9477 	origin = dns_fixedname_name(&rbtdbiter->origin);
9478 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9479 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9480 
9481 	switch (rbtdbiter->nsec3mode) {
9482 	case nsec3only:
9483 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9484 		result = dns_rbt_findnode(rbtdb->nsec3, name, NULL,
9485 					  &rbtdbiter->node, rbtdbiter->current,
9486 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
9487 		break;
9488 	case nonsec3:
9489 		rbtdbiter->current = &rbtdbiter->chain;
9490 		result = dns_rbt_findnode(rbtdb->tree, name, NULL,
9491 					  &rbtdbiter->node, rbtdbiter->current,
9492 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
9493 		break;
9494 	case full:
9495 		/*
9496 		 * Stay on main chain if not found on either chain.
9497 		 */
9498 		rbtdbiter->current = &rbtdbiter->chain;
9499 		result = dns_rbt_findnode(rbtdb->tree, name, NULL,
9500 					  &rbtdbiter->node, rbtdbiter->current,
9501 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
9502 		if (result == DNS_R_PARTIALMATCH) {
9503 			dns_rbtnode_t *node = NULL;
9504 			tresult = dns_rbt_findnode(
9505 				rbtdb->nsec3, name, NULL, &node,
9506 				&rbtdbiter->nsec3chain, DNS_RBTFIND_EMPTYDATA,
9507 				NULL, NULL);
9508 			if (tresult == ISC_R_SUCCESS) {
9509 				rbtdbiter->node = node;
9510 				rbtdbiter->current = &rbtdbiter->nsec3chain;
9511 				result = tresult;
9512 			}
9513 		}
9514 		break;
9515 	default:
9516 		UNREACHABLE();
9517 	}
9518 
9519 	if (result == ISC_R_SUCCESS || result == DNS_R_PARTIALMATCH) {
9520 		tresult = dns_rbtnodechain_current(rbtdbiter->current, iname,
9521 						   origin, NULL);
9522 		if (tresult == ISC_R_SUCCESS) {
9523 			rbtdbiter->new_origin = true;
9524 			reference_iter_node(rbtdbiter);
9525 		} else {
9526 			result = tresult;
9527 			rbtdbiter->node = NULL;
9528 		}
9529 	} else {
9530 		rbtdbiter->node = NULL;
9531 	}
9532 
9533 	rbtdbiter->result = (result == DNS_R_PARTIALMATCH) ? ISC_R_SUCCESS
9534 							   : result;
9535 
9536 	return (result);
9537 }
9538 
9539 static isc_result_t
9540 dbiterator_prev(dns_dbiterator_t *iterator) {
9541 	isc_result_t result;
9542 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9543 	dns_name_t *name, *origin;
9544 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9545 
9546 	REQUIRE(rbtdbiter->node != NULL);
9547 
9548 	if (rbtdbiter->result != ISC_R_SUCCESS) {
9549 		return (rbtdbiter->result);
9550 	}
9551 
9552 	if (rbtdbiter->paused) {
9553 		resume_iteration(rbtdbiter);
9554 	}
9555 
9556 	dereference_iter_node(rbtdbiter);
9557 
9558 	name = dns_fixedname_name(&rbtdbiter->name);
9559 	origin = dns_fixedname_name(&rbtdbiter->origin);
9560 	result = dns_rbtnodechain_prev(rbtdbiter->current, name, origin);
9561 	if (rbtdbiter->current == &rbtdbiter->nsec3chain &&
9562 	    (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN))
9563 	{
9564 		/*
9565 		 * If we're in the NSEC3 tree, it's empty or we've
9566 		 * reached the origin, then we're done with it.
9567 		 */
9568 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9569 						  NULL, &rbtdbiter->node);
9570 		if (result == ISC_R_NOTFOUND ||
9571 		    RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, rbtdbiter))
9572 		{
9573 			rbtdbiter->node = NULL;
9574 			result = ISC_R_NOMORE;
9575 		}
9576 	}
9577 	if (result == ISC_R_NOMORE && rbtdbiter->nsec3mode != nsec3only &&
9578 	    &rbtdbiter->nsec3chain == rbtdbiter->current)
9579 	{
9580 		rbtdbiter->current = &rbtdbiter->chain;
9581 		dns_rbtnodechain_reset(rbtdbiter->current);
9582 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree,
9583 					       name, origin);
9584 		if (result == ISC_R_NOTFOUND) {
9585 			result = ISC_R_NOMORE;
9586 		}
9587 	}
9588 
9589 	if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
9590 		rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN);
9591 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9592 						  NULL, &rbtdbiter->node);
9593 	}
9594 
9595 	if (result == ISC_R_SUCCESS) {
9596 		reference_iter_node(rbtdbiter);
9597 	}
9598 
9599 	rbtdbiter->result = result;
9600 
9601 	return (result);
9602 }
9603 
9604 static isc_result_t
9605 dbiterator_next(dns_dbiterator_t *iterator) {
9606 	isc_result_t result;
9607 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9608 	dns_name_t *name, *origin;
9609 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9610 
9611 	REQUIRE(rbtdbiter->node != NULL);
9612 
9613 	if (rbtdbiter->result != ISC_R_SUCCESS) {
9614 		return (rbtdbiter->result);
9615 	}
9616 
9617 	if (rbtdbiter->paused) {
9618 		resume_iteration(rbtdbiter);
9619 	}
9620 
9621 	name = dns_fixedname_name(&rbtdbiter->name);
9622 	origin = dns_fixedname_name(&rbtdbiter->origin);
9623 	result = dns_rbtnodechain_next(rbtdbiter->current, name, origin);
9624 	if (result == ISC_R_NOMORE && rbtdbiter->nsec3mode != nonsec3 &&
9625 	    &rbtdbiter->chain == rbtdbiter->current)
9626 	{
9627 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9628 		dns_rbtnodechain_reset(rbtdbiter->current);
9629 		result = dns_rbtnodechain_first(rbtdbiter->current,
9630 						rbtdb->nsec3, name, origin);
9631 		if (result == ISC_R_NOTFOUND) {
9632 			result = ISC_R_NOMORE;
9633 		}
9634 	}
9635 
9636 	dereference_iter_node(rbtdbiter);
9637 
9638 	if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
9639 		/*
9640 		 * If we've just started the NSEC3 tree,
9641 		 * skip over the origin.
9642 		 */
9643 		rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN);
9644 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9645 						  NULL, &rbtdbiter->node);
9646 		if (RBTDBITER_NSEC3_ORIGIN_NODE(rbtdb, rbtdbiter)) {
9647 			rbtdbiter->node = NULL;
9648 			result = dns_rbtnodechain_next(rbtdbiter->current, name,
9649 						       origin);
9650 			if (result == ISC_R_SUCCESS ||
9651 			    result == DNS_R_NEWORIGIN)
9652 			{
9653 				result = dns_rbtnodechain_current(
9654 					rbtdbiter->current, NULL, NULL,
9655 					&rbtdbiter->node);
9656 			}
9657 		}
9658 	}
9659 	if (result == ISC_R_SUCCESS) {
9660 		reference_iter_node(rbtdbiter);
9661 	}
9662 
9663 	rbtdbiter->result = result;
9664 
9665 	return (result);
9666 }
9667 
9668 static isc_result_t
9669 dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
9670 		   dns_name_t *name) {
9671 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9672 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9673 	dns_rbtnode_t *node = rbtdbiter->node;
9674 	isc_result_t result;
9675 	dns_name_t *nodename = dns_fixedname_name(&rbtdbiter->name);
9676 	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
9677 
9678 	REQUIRE(rbtdbiter->result == ISC_R_SUCCESS);
9679 	REQUIRE(rbtdbiter->node != NULL);
9680 
9681 	if (rbtdbiter->paused) {
9682 		resume_iteration(rbtdbiter);
9683 	}
9684 
9685 	if (name != NULL) {
9686 		if (rbtdbiter->common.relative_names) {
9687 			origin = NULL;
9688 		}
9689 		result = dns_name_concatenate(nodename, origin, name, NULL);
9690 		if (result != ISC_R_SUCCESS) {
9691 			return (result);
9692 		}
9693 		if (rbtdbiter->common.relative_names && rbtdbiter->new_origin) {
9694 			result = DNS_R_NEWORIGIN;
9695 		}
9696 	} else {
9697 		result = ISC_R_SUCCESS;
9698 	}
9699 
9700 	new_reference(rbtdb, node, isc_rwlocktype_none);
9701 
9702 	*nodep = rbtdbiter->node;
9703 
9704 	if (iterator->cleaning && result == ISC_R_SUCCESS) {
9705 		isc_result_t expire_result;
9706 
9707 		/*
9708 		 * If the deletion array is full, flush it before trying
9709 		 * to expire the current node.  The current node can't
9710 		 * fully deleted while the iteration cursor is still on it.
9711 		 */
9712 		if (rbtdbiter->delcnt == DELETION_BATCH_MAX) {
9713 			flush_deletions(rbtdbiter);
9714 		}
9715 
9716 		expire_result = expirenode(iterator->db, *nodep, 0);
9717 
9718 		/*
9719 		 * expirenode() currently always returns success.
9720 		 */
9721 		if (expire_result == ISC_R_SUCCESS && node->down == NULL) {
9722 			rbtdbiter->deletions[rbtdbiter->delcnt++] = node;
9723 			isc_refcount_increment(&node->references);
9724 		}
9725 	}
9726 
9727 	return (result);
9728 }
9729 
9730 static isc_result_t
9731 dbiterator_pause(dns_dbiterator_t *iterator) {
9732 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9733 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9734 
9735 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9736 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9737 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9738 	    rbtdbiter->result != ISC_R_NOMORE)
9739 	{
9740 		return (rbtdbiter->result);
9741 	}
9742 
9743 	if (rbtdbiter->paused) {
9744 		return (ISC_R_SUCCESS);
9745 	}
9746 
9747 	rbtdbiter->paused = true;
9748 
9749 	if (rbtdbiter->tree_locked != isc_rwlocktype_none) {
9750 		INSIST(rbtdbiter->tree_locked == isc_rwlocktype_read);
9751 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9752 		rbtdbiter->tree_locked = isc_rwlocktype_none;
9753 	}
9754 
9755 	flush_deletions(rbtdbiter);
9756 
9757 	return (ISC_R_SUCCESS);
9758 }
9759 
9760 static isc_result_t
9761 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name) {
9762 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9763 	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
9764 
9765 	if (rbtdbiter->result != ISC_R_SUCCESS) {
9766 		return (rbtdbiter->result);
9767 	}
9768 
9769 	dns_name_copy(origin, name);
9770 	return (ISC_R_SUCCESS);
9771 }
9772 
9773 static void
9774 setownercase(rdatasetheader_t *header, const dns_name_t *name) {
9775 	unsigned int i;
9776 	bool fully_lower;
9777 
9778 	/*
9779 	 * We do not need to worry about label lengths as they are all
9780 	 * less than or equal to 63.
9781 	 */
9782 	memset(header->upper, 0, sizeof(header->upper));
9783 	fully_lower = true;
9784 	for (i = 0; i < name->length; i++) {
9785 		if (isupper(name->ndata[i])) {
9786 			header->upper[i / 8] |= 1 << (i % 8);
9787 			fully_lower = false;
9788 		}
9789 	}
9790 	RDATASET_ATTR_SET(header, RDATASET_ATTR_CASESET);
9791 	if (fully_lower) {
9792 		RDATASET_ATTR_SET(header, RDATASET_ATTR_CASEFULLYLOWER);
9793 	}
9794 }
9795 
9796 static void
9797 rdataset_setownercase(dns_rdataset_t *rdataset, const dns_name_t *name) {
9798 	dns_rbtdb_t *rbtdb = rdataset->private1;
9799 	dns_rbtnode_t *rbtnode = rdataset->private2;
9800 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
9801 	rdatasetheader_t *header;
9802 
9803 	header = (struct rdatasetheader *)(raw - sizeof(*header));
9804 
9805 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9806 		  isc_rwlocktype_write);
9807 	setownercase(header, name);
9808 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9809 		    isc_rwlocktype_write);
9810 }
9811 
9812 static void
9813 rdataset_getownercase(const dns_rdataset_t *rdataset, dns_name_t *name) {
9814 	dns_rbtdb_t *rbtdb = rdataset->private1;
9815 	dns_rbtnode_t *rbtnode = rdataset->private2;
9816 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
9817 	rdatasetheader_t *header = NULL;
9818 	uint8_t mask = (1 << 7);
9819 	uint8_t bits = 0;
9820 
9821 	header = (struct rdatasetheader *)(raw - sizeof(*header));
9822 
9823 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9824 		  isc_rwlocktype_read);
9825 
9826 	if (!CASESET(header)) {
9827 		goto unlock;
9828 	}
9829 
9830 	if (CASEFULLYLOWER(header)) {
9831 		for (size_t i = 0; i < name->length; i++) {
9832 			name->ndata[i] = tolower(name->ndata[i]);
9833 		}
9834 	} else {
9835 		for (size_t i = 0; i < name->length; i++) {
9836 			if (mask == (1 << 7)) {
9837 				bits = header->upper[i / 8];
9838 				mask = 1;
9839 			} else {
9840 				mask <<= 1;
9841 			}
9842 
9843 			name->ndata[i] = ((bits & mask) != 0)
9844 						 ? toupper(name->ndata[i])
9845 						 : tolower(name->ndata[i]);
9846 		}
9847 	}
9848 
9849 unlock:
9850 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9851 		    isc_rwlocktype_read);
9852 }
9853 
/*
 * One entry of a singly linked glue list.  glue_nsdname_cb() fills an
 * entry from A and AAAA glue lookups for one name; free_gluelist()
 * disassociates the rdatasets and frees each entry.
 */
struct rbtdb_glue {
	/* Next entry in the list; NULL terminates it. */
	struct rbtdb_glue *next;
	/* Storage for the name the glue was found at. */
	dns_fixedname_t fixedname;
	/* A glue rdataset; stays unassociated when no A glue was found. */
	dns_rdataset_t rdataset_a;
	/* Signature rdataset for the A glue, cloned only if associated. */
	dns_rdataset_t sigrdataset_a;
	/* AAAA glue rdataset; stays unassociated when none was found. */
	dns_rdataset_t rdataset_aaaa;
	/* Signature rdataset for the AAAA glue, cloned only if associated. */
	dns_rdataset_t sigrdataset_aaaa;
};
9862 
/*
 * Context that glue_nsdname_cb() receives through its opaque 'arg'
 * pointer.
 */
typedef struct {
	/*
	 * Glue list belonging to this context; presumably the callback
	 * links the entries it builds here -- TODO confirm.
	 */
	rbtdb_glue_t *glue_list;
	/* Database the callback's zone_find() lookups are made in. */
	dns_rbtdb_t *rbtdb;
	/* Database version passed to those lookups. */
	rbtdb_version_t *rbtversion;
} rbtdb_glue_additionaldata_ctx_t;
9868 
9869 static void
9870 free_gluelist(rbtdb_glue_t *glue_list, dns_rbtdb_t *rbtdb) {
9871 	rbtdb_glue_t *cur, *cur_next;
9872 
9873 	if (glue_list == (void *)-1) {
9874 		return;
9875 	}
9876 
9877 	cur = glue_list;
9878 	while (cur != NULL) {
9879 		cur_next = cur->next;
9880 
9881 		if (dns_rdataset_isassociated(&cur->rdataset_a)) {
9882 			dns_rdataset_disassociate(&cur->rdataset_a);
9883 		}
9884 		if (dns_rdataset_isassociated(&cur->sigrdataset_a)) {
9885 			dns_rdataset_disassociate(&cur->sigrdataset_a);
9886 		}
9887 
9888 		if (dns_rdataset_isassociated(&cur->rdataset_aaaa)) {
9889 			dns_rdataset_disassociate(&cur->rdataset_aaaa);
9890 		}
9891 		if (dns_rdataset_isassociated(&cur->sigrdataset_aaaa)) {
9892 			dns_rdataset_disassociate(&cur->sigrdataset_aaaa);
9893 		}
9894 
9895 		dns_rdataset_invalidate(&cur->rdataset_a);
9896 		dns_rdataset_invalidate(&cur->sigrdataset_a);
9897 		dns_rdataset_invalidate(&cur->rdataset_aaaa);
9898 		dns_rdataset_invalidate(&cur->sigrdataset_aaaa);
9899 
9900 		isc_mem_put(rbtdb->common.mctx, cur, sizeof(*cur));
9901 		cur = cur_next;
9902 	}
9903 }
9904 
9905 static void
9906 free_gluetable(rbtdb_version_t *version) {
9907 	dns_rbtdb_t *rbtdb;
9908 	size_t size, i;
9909 
9910 	RWLOCK(&version->glue_rwlock, isc_rwlocktype_write);
9911 
9912 	rbtdb = version->rbtdb;
9913 
9914 	for (i = 0; i < HASHSIZE(version->glue_table_bits); i++) {
9915 		rbtdb_glue_table_node_t *cur, *cur_next;
9916 
9917 		cur = version->glue_table[i];
9918 		while (cur != NULL) {
9919 			cur_next = cur->next;
9920 			/* isc_refcount_decrement(&cur->node->references); */
9921 			cur->node = NULL;
9922 			free_gluelist(cur->glue_list, rbtdb);
9923 			cur->glue_list = NULL;
9924 			isc_mem_put(rbtdb->common.mctx, cur, sizeof(*cur));
9925 			cur = cur_next;
9926 		}
9927 		version->glue_table[i] = NULL;
9928 	}
9929 
9930 	size = HASHSIZE(version->glue_table_bits) *
9931 	       sizeof(*version->glue_table);
9932 	isc_mem_put(rbtdb->common.mctx, version->glue_table, size);
9933 
9934 	RWUNLOCK(&version->glue_rwlock, isc_rwlocktype_write);
9935 }
9936 
9937 static uint32_t
9938 rehash_bits(rbtdb_version_t *version, size_t newcount) {
9939 	uint32_t oldbits = version->glue_table_bits;
9940 	uint32_t newbits = oldbits;
9941 
9942 	while (newcount >= HASHSIZE(newbits) &&
9943 	       newbits < RBTDB_GLUE_TABLE_MAX_BITS)
9944 	{
9945 		newbits += 1;
9946 	}
9947 
9948 	return (newbits);
9949 }
9950 
9951 /*%
9952  * Write lock (version->glue_rwlock) must be held.
9953  */
9954 static void
9955 rehash_gluetable(rbtdb_version_t *version) {
9956 	uint32_t oldbits, newbits;
9957 	size_t newsize, oldcount, i;
9958 	rbtdb_glue_table_node_t **oldtable;
9959 
9960 	oldbits = version->glue_table_bits;
9961 	oldcount = HASHSIZE(oldbits);
9962 	oldtable = version->glue_table;
9963 
9964 	newbits = rehash_bits(version, version->glue_table_nodecount);
9965 	newsize = HASHSIZE(newbits) * sizeof(version->glue_table[0]);
9966 
9967 	version->glue_table = isc_mem_get(version->rbtdb->common.mctx, newsize);
9968 	version->glue_table_bits = newbits;
9969 	memset(version->glue_table, 0, newsize);
9970 
9971 	for (i = 0; i < oldcount; i++) {
9972 		rbtdb_glue_table_node_t *gluenode;
9973 		rbtdb_glue_table_node_t *nextgluenode;
9974 		for (gluenode = oldtable[i]; gluenode != NULL;
9975 		     gluenode = nextgluenode)
9976 		{
9977 			uint32_t hash = isc_hash32(
9978 				&gluenode->node, sizeof(gluenode->node), true);
9979 			uint32_t idx = hash_32(hash, newbits);
9980 			nextgluenode = gluenode->next;
9981 			gluenode->next = version->glue_table[idx];
9982 			version->glue_table[idx] = gluenode;
9983 		}
9984 	}
9985 
9986 	isc_mem_put(version->rbtdb->common.mctx, oldtable,
9987 		    oldcount * sizeof(*version->glue_table));
9988 
9989 	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_ZONE,
9990 		      ISC_LOG_DEBUG(3),
9991 		      "rehash_gluetable(): "
9992 		      "resized glue table from %zu to "
9993 		      "%zu",
9994 		      oldcount, newsize / sizeof(version->glue_table[0]));
9995 }
9996 
9997 static void
9998 maybe_rehash_gluetable(rbtdb_version_t *version) {
9999 	size_t overcommit = HASHSIZE(version->glue_table_bits) *
10000 			    RBTDB_GLUE_TABLE_OVERCOMMIT;
10001 	if (version->glue_table_nodecount < overcommit) {
10002 		return;
10003 	}
10004 
10005 	rehash_gluetable(version);
10006 }
10007 
10008 static isc_result_t
10009 glue_nsdname_cb(void *arg, const dns_name_t *name, dns_rdatatype_t qtype,
10010 		dns_rdataset_t *unused) {
10011 	rbtdb_glue_additionaldata_ctx_t *ctx;
10012 	isc_result_t result;
10013 	dns_fixedname_t fixedname_a;
10014 	dns_name_t *name_a = NULL;
10015 	dns_rdataset_t rdataset_a, sigrdataset_a;
10016 	dns_rbtnode_t *node_a = NULL;
10017 	dns_fixedname_t fixedname_aaaa;
10018 	dns_name_t *name_aaaa = NULL;
10019 	dns_rdataset_t rdataset_aaaa, sigrdataset_aaaa;
10020 	dns_rbtnode_t *node_aaaa = NULL;
10021 	rbtdb_glue_t *glue = NULL;
10022 	dns_name_t *gluename = NULL;
10023 
10024 	UNUSED(unused);
10025 
10026 	/*
10027 	 * NS records want addresses in additional records.
10028 	 */
10029 	INSIST(qtype == dns_rdatatype_a);
10030 
10031 	ctx = (rbtdb_glue_additionaldata_ctx_t *)arg;
10032 
10033 	name_a = dns_fixedname_initname(&fixedname_a);
10034 	dns_rdataset_init(&rdataset_a);
10035 	dns_rdataset_init(&sigrdataset_a);
10036 
10037 	name_aaaa = dns_fixedname_initname(&fixedname_aaaa);
10038 	dns_rdataset_init(&rdataset_aaaa);
10039 	dns_rdataset_init(&sigrdataset_aaaa);
10040 
10041 	result = zone_find((dns_db_t *)ctx->rbtdb, name, ctx->rbtversion,
10042 			   dns_rdatatype_a, DNS_DBFIND_GLUEOK, 0,
10043 			   (dns_dbnode_t **)&node_a, name_a, &rdataset_a,
10044 			   &sigrdataset_a);
10045 	if (result == DNS_R_GLUE) {
10046 		glue = isc_mem_get(ctx->rbtdb->common.mctx, sizeof(*glue));
10047 
10048 		gluename = dns_fixedname_initname(&glue->fixedname);
10049 		dns_name_copy(name_a, gluename);
10050 
10051 		dns_rdataset_init(&glue->rdataset_a);
10052 		dns_rdataset_init(&glue->sigrdataset_a);
10053 		dns_rdataset_init(&glue->rdataset_aaaa);
10054 		dns_rdataset_init(&glue->sigrdataset_aaaa);
10055 
10056 		dns_rdataset_clone(&rdataset_a, &glue->rdataset_a);
10057 		if (dns_rdataset_isassociated(&sigrdataset_a)) {
10058 			dns_rdataset_clone(&sigrdataset_a,
10059 					   &glue->sigrdataset_a);
10060 		}
10061 	}
10062 
10063 	result = zone_find((dns_db_t *)ctx->rbtdb, name, ctx->rbtversion,
10064 			   dns_rdatatype_aaaa, DNS_DBFIND_GLUEOK, 0,
10065 			   (dns_dbnode_t **)&node_aaaa, name_aaaa,
10066 			   &rdataset_aaaa, &sigrdataset_aaaa);
10067 	if (result == DNS_R_GLUE) {
10068 		if (glue == NULL) {
10069 			glue = isc_mem_get(ctx->rbtdb->common.mctx,
10070 					   sizeof(*glue));
10071 
10072 			gluename = dns_fixedname_initname(&glue->fixedname);
10073 			dns_name_copy(name_aaaa, gluename);
10074 
10075 			dns_rdataset_init(&glue->rdataset_a);
10076 			dns_rdataset_init(&glue->sigrdataset_a);
10077 			dns_rdataset_init(&glue->rdataset_aaaa);
10078 			dns_rdataset_init(&glue->sigrdataset_aaaa);
10079 		} else {
10080 			INSIST(node_a == node_aaaa);
10081 			INSIST(dns_name_equal(name_a, name_aaaa));
10082 		}
10083 
10084 		dns_rdataset_clone(&rdataset_aaaa, &glue->rdataset_aaaa);
10085 		if (dns_rdataset_isassociated(&sigrdataset_aaaa)) {
10086 			dns_rdataset_clone(&sigrdataset_aaaa,
10087 					   &glue->sigrdataset_aaaa);
10088 		}
10089 	}
10090 
10091 	if (glue != NULL) {
10092 		glue->next = ctx->glue_list;
10093 		ctx->glue_list = glue;
10094 	}
10095 
10096 	result = ISC_R_SUCCESS;
10097 
10098 	if (dns_rdataset_isassociated(&rdataset_a)) {
10099 		rdataset_disassociate(&rdataset_a);
10100 	}
10101 	if (dns_rdataset_isassociated(&sigrdataset_a)) {
10102 		rdataset_disassociate(&sigrdataset_a);
10103 	}
10104 
10105 	if (dns_rdataset_isassociated(&rdataset_aaaa)) {
10106 		rdataset_disassociate(&rdataset_aaaa);
10107 	}
10108 	if (dns_rdataset_isassociated(&sigrdataset_aaaa)) {
10109 		rdataset_disassociate(&sigrdataset_aaaa);
10110 	}
10111 
10112 	if (node_a != NULL) {
10113 		detachnode((dns_db_t *)ctx->rbtdb, (dns_dbnode_t *)&node_a);
10114 	}
10115 	if (node_aaaa != NULL) {
10116 		detachnode((dns_db_t *)ctx->rbtdb, (dns_dbnode_t *)&node_aaaa);
10117 	}
10118 
10119 	return (result);
10120 }
10121 
10122 static isc_result_t
10123 rdataset_addglue(dns_rdataset_t *rdataset, dns_dbversion_t *version,
10124 		 dns_message_t *msg) {
10125 	dns_rbtdb_t *rbtdb = rdataset->private1;
10126 	dns_rbtnode_t *node = rdataset->private2;
10127 	rbtdb_version_t *rbtversion = version;
10128 	uint32_t idx;
10129 	rbtdb_glue_table_node_t *cur;
10130 	bool found = false;
10131 	bool restarted = false;
10132 	rbtdb_glue_t *ge;
10133 	rbtdb_glue_additionaldata_ctx_t ctx;
10134 	isc_result_t result;
10135 	uint64_t hash;
10136 
10137 	REQUIRE(rdataset->type == dns_rdatatype_ns);
10138 	REQUIRE(rbtdb == rbtversion->rbtdb);
10139 	REQUIRE(!IS_CACHE(rbtdb) && !IS_STUB(rbtdb));
10140 
10141 	/*
10142 	 * The glue table cache that forms a part of the DB version
10143 	 * structure is not explicitly bounded and there's no cache
10144 	 * cleaning. The zone data size itself is an implicit bound.
10145 	 *
10146 	 * The key into the glue hashtable is the node pointer. This is
10147 	 * because the glue hashtable is a property of the DB version,
10148 	 * and the glue is keyed for the ownername/NS tuple. We don't
10149 	 * bother with using an expensive dns_name_t comparison here as
10150 	 * the node pointer is a fixed value that won't change for a DB
10151 	 * version and can be compared directly.
10152 	 */
10153 	hash = isc_hash_function(&node, sizeof(node), true);
10154 
10155 restart:
10156 	/*
10157 	 * First, check if we have the additional entries already cached
10158 	 * in the glue table.
10159 	 */
10160 	RWLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_read);
10161 
10162 	idx = hash_32(hash, rbtversion->glue_table_bits);
10163 
10164 	for (cur = rbtversion->glue_table[idx]; cur != NULL; cur = cur->next) {
10165 		if (cur->node == node) {
10166 			break;
10167 		}
10168 	}
10169 
10170 	if (cur == NULL) {
10171 		goto no_glue;
10172 	}
10173 	/*
10174 	 * We found a cached result. Add it to the message and
10175 	 * return.
10176 	 */
10177 	found = true;
10178 	ge = cur->glue_list;
10179 
10180 	/*
10181 	 * (void *) -1 is a special value that means no glue is
10182 	 * present in the zone.
10183 	 */
10184 	if (ge == (void *)-1) {
10185 		if (!restarted && (rbtdb->gluecachestats != NULL)) {
10186 			isc_stats_increment(
10187 				rbtdb->gluecachestats,
10188 				dns_gluecachestatscounter_hits_absent);
10189 		}
10190 		goto no_glue;
10191 	} else {
10192 		if (!restarted && (rbtdb->gluecachestats != NULL)) {
10193 			isc_stats_increment(
10194 				rbtdb->gluecachestats,
10195 				dns_gluecachestatscounter_hits_present);
10196 		}
10197 	}
10198 
10199 	for (; ge != NULL; ge = ge->next) {
10200 		dns_name_t *name = NULL;
10201 		dns_rdataset_t *rdataset_a = NULL;
10202 		dns_rdataset_t *sigrdataset_a = NULL;
10203 		dns_rdataset_t *rdataset_aaaa = NULL;
10204 		dns_rdataset_t *sigrdataset_aaaa = NULL;
10205 		dns_name_t *gluename = dns_fixedname_name(&ge->fixedname);
10206 
10207 		result = dns_message_gettempname(msg, &name);
10208 		if (result != ISC_R_SUCCESS) {
10209 			goto no_glue;
10210 		}
10211 
10212 		dns_name_copy(gluename, name);
10213 
10214 		if (dns_rdataset_isassociated(&ge->rdataset_a)) {
10215 			result = dns_message_gettemprdataset(msg, &rdataset_a);
10216 			if (result != ISC_R_SUCCESS) {
10217 				dns_message_puttempname(msg, &name);
10218 				goto no_glue;
10219 			}
10220 		}
10221 
10222 		if (dns_rdataset_isassociated(&ge->sigrdataset_a)) {
10223 			result = dns_message_gettemprdataset(msg,
10224 							     &sigrdataset_a);
10225 			if (result != ISC_R_SUCCESS) {
10226 				if (rdataset_a != NULL) {
10227 					dns_message_puttemprdataset(
10228 						msg, &rdataset_a);
10229 				}
10230 				dns_message_puttempname(msg, &name);
10231 				goto no_glue;
10232 			}
10233 		}
10234 
10235 		if (dns_rdataset_isassociated(&ge->rdataset_aaaa)) {
10236 			result = dns_message_gettemprdataset(msg,
10237 							     &rdataset_aaaa);
10238 			if (result != ISC_R_SUCCESS) {
10239 				dns_message_puttempname(msg, &name);
10240 				if (rdataset_a != NULL) {
10241 					dns_message_puttemprdataset(
10242 						msg, &rdataset_a);
10243 				}
10244 				if (sigrdataset_a != NULL) {
10245 					dns_message_puttemprdataset(
10246 						msg, &sigrdataset_a);
10247 				}
10248 				goto no_glue;
10249 			}
10250 		}
10251 
10252 		if (dns_rdataset_isassociated(&ge->sigrdataset_aaaa)) {
10253 			result = dns_message_gettemprdataset(msg,
10254 							     &sigrdataset_aaaa);
10255 			if (result != ISC_R_SUCCESS) {
10256 				dns_message_puttempname(msg, &name);
10257 				if (rdataset_a != NULL) {
10258 					dns_message_puttemprdataset(
10259 						msg, &rdataset_a);
10260 				}
10261 				if (sigrdataset_a != NULL) {
10262 					dns_message_puttemprdataset(
10263 						msg, &sigrdataset_a);
10264 				}
10265 				if (rdataset_aaaa != NULL) {
10266 					dns_message_puttemprdataset(
10267 						msg, &rdataset_aaaa);
10268 				}
10269 				goto no_glue;
10270 			}
10271 		}
10272 
10273 		if (rdataset_a != NULL) {
10274 			dns_rdataset_clone(&ge->rdataset_a, rdataset_a);
10275 			ISC_LIST_APPEND(name->list, rdataset_a, link);
10276 		}
10277 
10278 		if (sigrdataset_a != NULL) {
10279 			dns_rdataset_clone(&ge->sigrdataset_a, sigrdataset_a);
10280 			ISC_LIST_APPEND(name->list, sigrdataset_a, link);
10281 		}
10282 
10283 		if (rdataset_aaaa != NULL) {
10284 			dns_rdataset_clone(&ge->rdataset_aaaa, rdataset_aaaa);
10285 			ISC_LIST_APPEND(name->list, rdataset_aaaa, link);
10286 		}
10287 		if (sigrdataset_aaaa != NULL) {
10288 			dns_rdataset_clone(&ge->sigrdataset_aaaa,
10289 					   sigrdataset_aaaa);
10290 			ISC_LIST_APPEND(name->list, sigrdataset_aaaa, link);
10291 		}
10292 
10293 		dns_message_addname(msg, name, DNS_SECTION_ADDITIONAL);
10294 	}
10295 
10296 no_glue:
10297 	RWUNLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_read);
10298 
10299 	if (found) {
10300 		return (ISC_R_SUCCESS);
10301 	}
10302 
10303 	if (restarted) {
10304 		return (ISC_R_FAILURE);
10305 	}
10306 
10307 	/*
10308 	 * No cached glue was found in the table. Cache it and restart
10309 	 * this function.
10310 	 *
10311 	 * Due to the gap between the read lock and the write lock, it's
10312 	 * possible that we may cache a duplicate glue table entry, but
10313 	 * we don't care.
10314 	 */
10315 
10316 	ctx.glue_list = NULL;
10317 	ctx.rbtdb = rbtdb;
10318 	ctx.rbtversion = rbtversion;
10319 
10320 	RWLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_write);
10321 
10322 	maybe_rehash_gluetable(rbtversion);
10323 	idx = hash_32(hash, rbtversion->glue_table_bits);
10324 
10325 	(void)dns_rdataset_additionaldata(rdataset, dns_rootname,
10326 					  glue_nsdname_cb, &ctx);
10327 
10328 	cur = isc_mem_get(rbtdb->common.mctx, sizeof(*cur));
10329 
10330 	/*
10331 	 * XXXMUKS: it looks like the dns_dbversion is not destroyed
10332 	 * when named is terminated by a keyboard break. This doesn't
10333 	 * cleanup the node reference and keeps the process dangling.
10334 	 */
10335 	/* isc_refcount_increment0(&node->references); */
10336 	cur->node = node;
10337 
10338 	if (ctx.glue_list == NULL) {
10339 		/*
10340 		 * No glue was found. Cache it so.
10341 		 */
10342 		cur->glue_list = (void *)-1;
10343 		if (rbtdb->gluecachestats != NULL) {
10344 			isc_stats_increment(
10345 				rbtdb->gluecachestats,
10346 				dns_gluecachestatscounter_inserts_absent);
10347 		}
10348 	} else {
10349 		cur->glue_list = ctx.glue_list;
10350 		if (rbtdb->gluecachestats != NULL) {
10351 			isc_stats_increment(
10352 				rbtdb->gluecachestats,
10353 				dns_gluecachestatscounter_inserts_present);
10354 		}
10355 	}
10356 
10357 	cur->next = rbtversion->glue_table[idx];
10358 	rbtversion->glue_table[idx] = cur;
10359 	rbtversion->glue_table_nodecount++;
10360 
10361 	RWUNLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_write);
10362 
10363 	restarted = true;
10364 	goto restart;
10365 
10366 	/* UNREACHABLE */
10367 }
10368 
10369 /*%
10370  * Routines for LRU-based cache management.
10371  */
10372 
10373 /*%
10374  * See if a given cache entry that is being reused needs to be updated
10375  * in the LRU-list.  From the LRU management point of view, this function is
10376  * expected to return true for almost all cases.  When used with threads,
10377  * however, this may cause a non-negligible performance penalty because a
10378  * writer lock will have to be acquired before updating the list.
10379  * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
10380  * function returns true if the entry has not been updated for some period of
10381  * time.  We differentiate the NS or glue address case and the others since
10382  * experiments have shown that the former tends to be accessed relatively
10383  * infrequently and the cost of cache miss is higher (e.g., a missing NS records
10384  * may cause external queries at a higher level zone, involving more
10385  * transactions).
10386  *
10387  * Caller must hold the node (read or write) lock.
10388  */
10389 static bool
10390 need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now) {
10391 	if (RDATASET_ATTR_GET(header, (RDATASET_ATTR_NONEXISTENT |
10392 				       RDATASET_ATTR_ANCIENT |
10393 				       RDATASET_ATTR_ZEROTTL)) != 0)
10394 	{
10395 		return (false);
10396 	}
10397 
10398 #if DNS_RBTDB_LIMITLRUUPDATE
10399 	if (header->type == dns_rdatatype_ns ||
10400 	    (header->trust == dns_trust_glue &&
10401 	     (header->type == dns_rdatatype_a ||
10402 	      header->type == dns_rdatatype_aaaa)))
10403 	{
10404 		/*
10405 		 * Glue records are updated if at least DNS_RBTDB_LRUUPDATE_GLUE
10406 		 * seconds have passed since the previous update time.
10407 		 */
10408 		return (header->last_used + DNS_RBTDB_LRUUPDATE_GLUE <= now);
10409 	}
10410 
10411 	/*
10412 	 * Other records are updated if DNS_RBTDB_LRUUPDATE_REGULAR seconds
10413 	 * have passed.
10414 	 */
10415 	return (header->last_used + DNS_RBTDB_LRUUPDATE_REGULAR <= now);
10416 #else
10417 	UNUSED(now);
10418 
10419 	return (true);
10420 #endif /* if DNS_RBTDB_LIMITLRUUPDATE */
10421 }
10422 
10423 /*%
10424  * Update the timestamp of a given cache entry and move it to the head
10425  * of the corresponding LRU list.
10426  *
10427  * Caller must hold the node (write) lock.
10428  *
10429  * Note that the we do NOT touch the heap here, as the TTL has not changed.
10430  */
10431 static void
10432 update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now) {
10433 	INSIST(IS_CACHE(rbtdb));
10434 
10435 	/* To be checked: can we really assume this? XXXMLG */
10436 	INSIST(ISC_LINK_LINKED(header, link));
10437 
10438 	ISC_LIST_UNLINK(rbtdb->rdatasets[header->node->locknum], header, link);
10439 	header->last_used = now;
10440 	ISC_LIST_PREPEND(rbtdb->rdatasets[header->node->locknum], header, link);
10441 }
10442 
10443 static size_t
10444 expire_lru_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, size_t purgesize,
10445 		   bool tree_locked) {
10446 	rdatasetheader_t *header;
10447 	size_t purged = 0;
10448 
10449 	for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
10450 	     header != NULL &&
10451 	     header->last_used <= atomic_load(&rbtdb->last_used) &&
10452 	     purged <= purgesize;
10453 	     header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]))
10454 	{
10455 		/*
10456 		 * Unlink the entry at this point to avoid checking it
10457 		 * again even if it's currently used someone else and
10458 		 * cannot be purged at this moment.  This entry won't be
10459 		 * referenced any more (so unlinking is safe) since the
10460 		 * TTL will be reset to 0.
10461 		 */
10462 		ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header, link);
10463 		size_t header_size = rdataset_size(header);
10464 		expire_header(rbtdb, header, tree_locked, expire_lru);
10465 		purged += header_size;
10466 	}
10467 
10468 	return (purged);
10469 }
10470 
10471 /*%
10472  * Purge some stale (i.e. unused for some period - LRU based cleaning) cache
10473  * entries under the overmem condition.  To recover from this condition quickly,
10474  * we cleanup entries up to the size of newly added rdata (passed as purgesize).
10475  *
10476  * The LRU lists tails are processed in LRU order to the nearest second.
10477  *
10478  * A write lock on the tree must be held.
10479  */
10480 static void
10481 overmem_purge(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
10482 	      bool tree_locked) {
10483 	uint32_t locknum_start = atomic_fetch_add(&rbtdb->lru_sweep, 1) %
10484 				 rbtdb->node_lock_count;
10485 	uint32_t locknum = locknum_start;
10486 	/* Size of added data, possible node and possible ENT node. */
10487 	size_t purgesize = rdataset_size(newheader) +
10488 			   2 * dns__rbtnode_getsize(newheader->node);
10489 	size_t purged = 0;
10490 	isc_stdtime_t min_last_used = 0;
10491 	size_t max_passes = 8;
10492 
10493 again:
10494 	do {
10495 		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
10496 			  isc_rwlocktype_write);
10497 
10498 		purged += expire_lru_headers(rbtdb, locknum, purgesize - purged,
10499 					     tree_locked);
10500 
10501 		/*
10502 		 * Work out the oldest remaining last_used values of the list
10503 		 * tails as we walk across the array of lru lists.
10504 		 */
10505 		rdatasetheader_t *header =
10506 			ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
10507 		if (header != NULL &&
10508 		    (min_last_used == 0 || header->last_used < min_last_used))
10509 		{
10510 			min_last_used = header->last_used;
10511 		}
10512 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
10513 			    isc_rwlocktype_write);
10514 		locknum = (locknum + 1) % rbtdb->node_lock_count;
10515 	} while (locknum != locknum_start && purged <= purgesize);
10516 
10517 	/*
10518 	 * Update rbtdb->last_used if we have walked all the list tails and have
10519 	 * not freed the required amount of memory.
10520 	 */
10521 	if (purged < purgesize) {
10522 		if (min_last_used != 0) {
10523 			atomic_store(&rbtdb->last_used, min_last_used);
10524 			if (max_passes-- > 0) {
10525 				goto again;
10526 			}
10527 		}
10528 	}
10529 }
10530 
10531 static void
10532 expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked,
10533 	      expire_t reason) {
10534 	set_ttl(rbtdb, header, 0);
10535 	mark_header_ancient(rbtdb, header);
10536 
10537 	/*
10538 	 * Caller must hold the node (write) lock.
10539 	 */
10540 
10541 	if (isc_refcount_current(&header->node->references) == 0) {
10542 		/*
10543 		 * If no one else is using the node, we can clean it up now.
10544 		 * We first need to gain a new reference to the node to meet a
10545 		 * requirement of decrement_reference().
10546 		 */
10547 		new_reference(rbtdb, header->node, isc_rwlocktype_write);
10548 		decrement_reference(rbtdb, header->node, 0,
10549 				    isc_rwlocktype_write,
10550 				    tree_locked ? isc_rwlocktype_write
10551 						: isc_rwlocktype_none,
10552 				    false);
10553 
10554 		if (rbtdb->cachestats == NULL) {
10555 			return;
10556 		}
10557 
10558 		switch (reason) {
10559 		case expire_ttl:
10560 			isc_stats_increment(rbtdb->cachestats,
10561 					    dns_cachestatscounter_deletettl);
10562 			break;
10563 		case expire_lru:
10564 			isc_stats_increment(rbtdb->cachestats,
10565 					    dns_cachestatscounter_deletelru);
10566 			break;
10567 		default:
10568 			break;
10569 		}
10570 	}
10571 }
10572 
10573 /*
10574  * Caller must be holding the node write lock.
10575  */
10576 static void
10577 expire_ttl_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, bool tree_locked,
10578 		   isc_stdtime_t now) {
10579 	isc_heap_t *heap = rbtdb->heaps[locknum];
10580 
10581 	for (size_t i = 0; i < DNS_RBTDB_EXPIRE_TTL_COUNT; i++) {
10582 		rdatasetheader_t *header = isc_heap_element(heap, 1);
10583 
10584 		if (header == NULL) {
10585 			/* No headers left on this TTL heap; exit cleaning */
10586 			return;
10587 		}
10588 
10589 		dns_ttl_t ttl = header->rdh_ttl;
10590 
10591 		if (!isc_mem_isovermem(rbtdb->common.mctx)) {
10592 			/* Only account for stale TTL if cache is not overmem */
10593 			ttl += STALE_TTL(header, rbtdb);
10594 		}
10595 
10596 		if (ttl >= now - RBTDB_VIRTUAL) {
10597 			/*
10598 			 * The header at the top of this TTL heap is not yet
10599 			 * eligible for expiry, so none of the other headers on
10600 			 * the same heap can be eligible for expiry, either;
10601 			 * exit cleaning.
10602 			 */
10603 			return;
10604 		}
10605 
10606 		expire_header(rbtdb, header, tree_locked, expire_ttl);
10607 	}
10608 }
10609