xref: /netbsd-src/external/mpl/dhcp/bind/dist/lib/dns/rbtdb.c (revision 4afad4b7fa6d4a0d3dedf41d1587a7250710ae54)
1 /*	$NetBSD: rbtdb.c,v 1.1 2024/02/18 20:57:33 christos Exp $	*/
2 
3 /*
4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5  *
6  * SPDX-License-Identifier: MPL-2.0
7  *
8  * This Source Code Form is subject to the terms of the Mozilla Public
9  * License, v. 2.0. If a copy of the MPL was not distributed with this
10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11  *
12  * See the COPYRIGHT file distributed with this work for additional
13  * information regarding copyright ownership.
14  */
15 
16 /*! \file */
17 
18 #include <ctype.h>
19 #include <inttypes.h>
20 #include <stdbool.h>
21 
22 #include <isc/atomic.h>
23 #include <isc/crc64.h>
24 #include <isc/event.h>
25 #include <isc/file.h>
26 #include <isc/hash.h>
27 #include <isc/heap.h>
28 #include <isc/hex.h>
29 #include <isc/mem.h>
30 #include <isc/mutex.h>
31 #include <isc/once.h>
32 #include <isc/platform.h>
33 #include <isc/print.h>
34 #include <isc/random.h>
35 #include <isc/refcount.h>
36 #include <isc/rwlock.h>
37 #include <isc/serial.h>
38 #include <isc/socket.h>
39 #include <isc/stdio.h>
40 #include <isc/string.h>
41 #include <isc/task.h>
42 #include <isc/time.h>
43 #include <isc/util.h>
44 
45 #include <dns/callbacks.h>
46 #include <dns/db.h>
47 #include <dns/dbiterator.h>
48 #include <dns/events.h>
49 #include <dns/fixedname.h>
50 #include <dns/lib.h>
51 #include <dns/log.h>
52 #include <dns/masterdump.h>
53 #include <dns/nsec.h>
54 #include <dns/nsec3.h>
55 #include <dns/rbt.h>
56 #include <dns/rdata.h>
57 #include <dns/rdataset.h>
58 #include <dns/rdatasetiter.h>
59 #include <dns/rdataslab.h>
60 #include <dns/rdatastruct.h>
61 #include <dns/result.h>
62 #include <dns/stats.h>
63 #include <dns/time.h>
64 #include <dns/version.h>
65 #include <dns/view.h>
66 #include <dns/zone.h>
67 #include <dns/zonekey.h>
68 
69 #ifndef WIN32
70 #include <sys/mman.h>
71 #else /* ifndef WIN32 */
72 #define PROT_READ   0x01
73 #define PROT_WRITE  0x02
74 #define MAP_PRIVATE 0x0002
75 #define MAP_FAILED  ((void *)-1)
76 #endif /* ifndef WIN32 */
77 
78 #include "rbtdb.h"
79 
/* Magic value that VALID_RBTDB() compares against common.impmagic. */
#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4')

/*
 * Evaluate 'op', store its value in the caller's 'result' variable and
 * jump to the caller's 'failure' label unless the value is ISC_R_SUCCESS.
 */
#define CHECK(op)                                       \
	do {                                            \
		if ((result = (op)) != ISC_R_SUCCESS) { \
			goto failure;                   \
		}                                       \
	} while (0)
88 
/*
 * Map file header for RBTDB images.
 *
 * The header area is initially zero and the populated header is written
 * as the LAST step of producing the file, so a valid header implies a
 * valid RBTDB image.
 */
typedef struct rbtdb_file_header rbtdb_file_header_t;

/* Space reserved for the header; fixed, whatever sizeof() the struct is. */
#define RBTDB_HEADER_LENGTH 1024

struct rbtdb_file_header {
	char version1[32];
	uint32_t ptrsize;
	unsigned int bigendian : 1;
	uint64_t tree;
	uint64_t nsec;
	uint64_t nsec3;

	/* Repeated copy of the version string; must match version1. */
	char version2[32];
};
110 
/*%
 * True when 'rbtdb' is non-NULL and carries the RBTDB implementation magic.
 * "impmagic" is not the first four bytes of the struct, so ISC_MAGIC_VALID
 * cannot be used here.
 */
#define VALID_RBTDB(rbtdb)  \
	((rbtdb) != NULL && \
	 (rbtdb)->common.impmagic == RBTDB_MAGIC)
117 
typedef uint32_t rbtdb_serial_t;

/*
 * Combined rdataset type: the low 16 bits hold the "base" type and the
 * high 16 bits hold the "ext" type (see the accessors below).
 */
typedef uint32_t rbtdb_rdatatype_t;

/* Extract the low / high 16-bit half of a combined type. */
#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)  ((dns_rdatatype_t)((type) >> 16))

/*
 * Build a combined type from its two halves.  Both arguments are fully
 * parenthesized before being cast, so arbitrary expressions may be passed,
 * and the cast to rbtdb_rdatatype_t covers the whole result.
 */
#define RBTDB_RDATATYPE_VALUE(base, ext)                 \
	((rbtdb_rdatatype_t)((((uint32_t)(ext)) << 16) | \
			     (((uint32_t)(base)) & 0xffff)))

/* Pre-built combined types: RRSIG as base with the named type as ext. */
#define RBTDB_RDATATYPE_SIGNSEC \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
#define RBTDB_RDATATYPE_SIGNSEC3 \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3)
#define RBTDB_RDATATYPE_SIGNS \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
#define RBTDB_RDATATYPE_SIGCNAME \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
#define RBTDB_RDATATYPE_SIGDNAME \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
#define RBTDB_RDATATYPE_SIGDS \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ds)
#define RBTDB_RDATATYPE_SIGSOA \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_soa)
/* Base 0 with ext "any". */
#define RBTDB_RDATATYPE_NCACHEANY RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
142 
/* Database-level lock wrappers: thin aliases for the isc rwlock primitives. */
#define RBTDB_INITLOCK(l)    isc_rwlock_init((l), 0, 0)
#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l)
/* 't' is the lock type passed straight through to RWLOCK()/RWUNLOCK(). */
#define RBTDB_LOCK(l, t)     RWLOCK((l), (t))
#define RBTDB_UNLOCK(l, t)   RWUNLOCK((l), (t))
147 
148 /*
149  * Since node locking is sensitive to both performance and memory footprint,
150  * we need some trick here.  If we have both high-performance rwlock and
151  * high performance and small-memory reference counters, we use rwlock for
152  * node lock and isc_refcount for node references.  In this case, we don't have
153  * to protect the access to the counters by locks.
154  * Otherwise, we simply use ordinary mutex lock for node locking, and use
155  * simple integers as reference counters which is protected by the lock.
156  * In most cases, we can simply use wrapper macros such as NODE_LOCK and
157  * NODE_UNLOCK.  In some other cases, however, we need to protect reference
158  * counters first and then protect other parts of a node as read-only data.
159  * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
160  * provided for these special cases.  When we can use the efficient backend
161  * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
162  * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
163  * section including the access to the reference counter.
164  * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
165  * section is also protected by NODE_STRONGLOCK().
166  */
167 typedef isc_rwlock_t nodelock_t;
168 
169 #define NODE_INITLOCK(l)    isc_rwlock_init((l), 0, 0)
170 #define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l)
171 #define NODE_LOCK(l, t)	    RWLOCK((l), (t))
172 #define NODE_UNLOCK(l, t)   RWUNLOCK((l), (t))
173 #define NODE_TRYUPGRADE(l)  isc_rwlock_tryupgrade(l)
174 #define NODE_DOWNGRADE(l)   isc_rwlock_downgrade(l)
175 
/*%
 * Whether LRU updates are rate-limited to avoid possible thread contention.
 * Updating the LRU requires write locking, so it is not done every time a
 * record is touched - only after some time has passed.  May be overridden
 * at compile time.
 */
#if !defined(DNS_RBTDB_LIMITLRUUPDATE)
#define DNS_RBTDB_LIMITLRUUPDATE 1
#endif

/*% Seconds after which the LRU is updated for glue records (5 minutes). */
#define DNS_RBTDB_LRUUPDATE_GLUE 300
/*% Seconds after which the LRU is updated for all other records (10 minutes). */
#define DNS_RBTDB_LRUUPDATE_REGULAR 600

/*
 * Allow clients with a virtual time of up to 5 minutes in the past to see
 * records that would otherwise have expired.
 */
#define RBTDB_VIRTUAL 300
195 
196 struct noqname {
197 	dns_name_t name;
198 	void *neg;
199 	void *negsig;
200 	dns_rdatatype_t type;
201 };
202 
203 typedef struct rdatasetheader {
204 	/*%
205 	 * Locked by the owning node's lock.
206 	 */
207 	rbtdb_serial_t serial;
208 	dns_ttl_t rdh_ttl;
209 	rbtdb_rdatatype_t type;
210 	atomic_uint_least16_t attributes;
211 	dns_trust_t trust;
212 	atomic_uint_fast32_t last_refresh_fail_ts;
213 	struct noqname *noqname;
214 	struct noqname *closest;
215 	unsigned int is_mmapped	      : 1;
216 	unsigned int next_is_relative : 1;
217 	unsigned int node_is_relative : 1;
218 	unsigned int resign_lsb	      : 1;
219 	/*%<
220 	 * We don't use the LIST macros, because the LIST structure has
221 	 * both head and tail pointers, and is doubly linked.
222 	 */
223 
224 	struct rdatasetheader *next;
225 	/*%<
226 	 * If this is the top header for an rdataset, 'next' points
227 	 * to the top header for the next rdataset (i.e., the next type).
228 	 * Otherwise, it points up to the header whose down pointer points
229 	 * at this header.
230 	 */
231 
232 	struct rdatasetheader *down;
233 	/*%<
234 	 * Points to the header for the next older version of
235 	 * this rdataset.
236 	 */
237 
238 	atomic_uint_fast32_t count;
239 	/*%<
240 	 * Monotonously increased every time this rdataset is bound so that
241 	 * it is used as the base of the starting point in DNS responses
242 	 * when the "cyclic" rrset-order is required.
243 	 */
244 
245 	dns_rbtnode_t *node;
246 	isc_stdtime_t last_used;
247 	ISC_LINK(struct rdatasetheader) link;
248 
249 	unsigned int heap_index;
250 	/*%<
251 	 * Used for TTL-based cache cleaning.
252 	 */
253 	isc_stdtime_t resign;
254 	/*%<
255 	 * Case vector.  If the bit is set then the corresponding
256 	 * character in the owner name needs to be AND'd with 0x20,
257 	 * rendering that character upper case.
258 	 */
259 	unsigned char upper[32];
260 } rdatasetheader_t;
261 
262 typedef ISC_LIST(rdatasetheader_t) rdatasetheaderlist_t;
263 typedef ISC_LIST(dns_rbtnode_t) rbtnodelist_t;
264 
/* Bit flags stored in the 'attributes' word of a rdatasetheader. */
#define RDATASET_ATTR_NONEXISTENT    0x0001
/*%< May be potentially served as stale data. */
#define RDATASET_ATTR_STALE	     0x0002
#define RDATASET_ATTR_IGNORE	     0x0004
#define RDATASET_ATTR_RETAIN	     0x0008
#define RDATASET_ATTR_NXDOMAIN	     0x0010
#define RDATASET_ATTR_RESIGN	     0x0020
#define RDATASET_ATTR_STATCOUNT	     0x0040
#define RDATASET_ATTR_OPTOUT	     0x0080
#define RDATASET_ATTR_NEGATIVE	     0x0100
#define RDATASET_ATTR_PREFETCH	     0x0200
#define RDATASET_ATTR_CASESET	     0x0400
#define RDATASET_ATTR_ZEROTTL	     0x0800
#define RDATASET_ATTR_CASEFULLYLOWER 0x1000
/*%< Ancient - awaiting cleanup. */
#define RDATASET_ATTR_ANCIENT	     0x2000
#define RDATASET_ATTR_STALE_WINDOW   0x4000
282 
/*
 * XXX
 * When the cache will pre-expire data (due to memory low or other
 * situations) before the rdataset's TTL has expired, it MUST
 * respect the RETAIN bit and not expire the data until its TTL is
 * expired.
 */

#undef IGNORE /* WIN32 winbase.h defines this. */

/*
 * Raw accessors for the atomic 'attributes' word of a header.
 *
 * 'attribute' may be any integer expression, e.g. an OR of several
 * RDATASET_ATTR_* bits; it is parenthesized so that it is combined with
 * the stored word as a whole.  GET yields the masked bits; SET and CLR
 * are atomic read-modify-write operations.
 */
#define RDATASET_ATTR_GET(header, attribute) \
	(atomic_load_acquire(&(header)->attributes) & (attribute))
#define RDATASET_ATTR_SET(header, attribute) \
	atomic_fetch_or_release(&(header)->attributes, (attribute))
#define RDATASET_ATTR_CLR(header, attribute) \
	atomic_fetch_and_release(&(header)->attributes, ~(attribute))

/*
 * Single-flag predicates.  Each performs exactly one acquire load of
 * 'attributes' and evaluates to 0 or 1.
 */
#define EXISTS(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_NONEXISTENT) == 0)
#define NONEXISTENT(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_NONEXISTENT) != 0)
#define IGNORE(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_IGNORE) != 0)
#define RETAIN(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_RETAIN) != 0)
#define NXDOMAIN(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_NXDOMAIN) != 0)
#define STALE(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_STALE) != 0)
#define STALE_WINDOW(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_STALE_WINDOW) != 0)
#define RESIGN(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_RESIGN) != 0)
#define OPTOUT(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_OPTOUT) != 0)
#define NEGATIVE(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_NEGATIVE) != 0)
#define PREFETCH(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_PREFETCH) != 0)
#define CASESET(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_CASESET) != 0)
#define ZEROTTL(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_ZEROTTL) != 0)
#define CASEFULLYLOWER(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_CASEFULLYLOWER) != 0)
#define ANCIENT(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_ANCIENT) != 0)
#define STATCOUNT(header) \
	(RDATASET_ATTR_GET(header, RDATASET_ATTR_STATCOUNT) != 0)

/*
 * True while the header's rdh_ttl is later than 'now', or equal to 'now'
 * when the ZEROTTL attribute is set.  Note that 'header' and 'now' are
 * evaluated more than once.
 */
#define ACTIVE(header, now)             \
	(((header)->rdh_ttl > (now)) || \
	 ((header)->rdh_ttl == (now) && ZEROTTL(header)))
352 
#define DEFAULT_NODE_LOCK_COUNT	    7 /*%< Should be prime. */
#define RBTDB_GLUE_TABLE_INIT_BITS  2U
#define RBTDB_GLUE_TABLE_MAX_BITS   32U
#define RBTDB_GLUE_TABLE_OVERCOMMIT 3

#define GOLDEN_RATIO_32 0x61C88647
/* Number of slots in a table indexed by a 'bits'-bit hash. */
#define HASHSIZE(bits)	(UINT64_C(1) << (bits))

/*
 * Reduce 'val' to a 'bits'-bit hash: multiply by GOLDEN_RATIO_32 and keep
 * the top 'bits' bits of the 32-bit product.
 *
 * 'bits' must not exceed RBTDB_GLUE_TABLE_MAX_BITS.  For bits == 0 the
 * only possible hash value is 0; it is returned directly because shifting
 * a 32-bit value right by 32 is undefined behaviour in C.
 */
static uint32_t
hash_32(uint32_t val, unsigned int bits) {
	REQUIRE(bits <= RBTDB_GLUE_TABLE_MAX_BITS);
	if (bits == 0) {
		return (0);
	}
	/* High bits are more random. */
	return (val * GOLDEN_RATIO_32 >> (32 - bits));
}
367 
/* True when the iterator's common.options has DNS_DB_EXPIREDOK set. */
#define EXPIREDOK(rbtiterator) \
	(((rbtiterator)->common.options & DNS_DB_EXPIREDOK) != 0)

/* True when the iterator's common.options has DNS_DB_STALEOK set. */
#define STALEOK(rbtiterator) \
	(((rbtiterator)->common.options & DNS_DB_STALEOK) != 0)
373 
/*%
 * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
 *
 * This is a tradeoff: too small a value may cause heavier contention
 * between threads; too large a value makes the LRU purge algorithm work
 * poorly (entries tend to be purged prematurely).  The default should suit
 * most environments; it can be overridden at compilation time by defining
 * DNS_RBTDB_CACHE_NODE_LOCK_COUNT, which must be larger than 1 due to the
 * assumption of overmem_purge().
 */
#if !defined(DNS_RBTDB_CACHE_NODE_LOCK_COUNT)
#define DEFAULT_CACHE_NODE_LOCK_COUNT 17
#elif DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
#else /* valid compile-time override */
#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
393 
394 typedef struct {
395 	nodelock_t lock;
396 	/* Protected in the refcount routines. */
397 	isc_refcount_t references;
398 	/* Locked by lock. */
399 	bool exiting;
400 } rbtdb_nodelock_t;
401 
402 typedef struct rbtdb_changed {
403 	dns_rbtnode_t *node;
404 	bool dirty;
405 	ISC_LINK(struct rbtdb_changed) link;
406 } rbtdb_changed_t;
407 
408 typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t;
409 
/* Security state recorded in rbtdb_version_t.secure. */
typedef enum {
	dns_db_insecure,
	dns_db_partial,
	dns_db_secure
} dns_db_secure_t;

typedef struct dns_rbtdb dns_rbtdb_t;

/* Reason for expiring a record from cache */
typedef enum {
	expire_lru,
	expire_ttl,
	expire_flush
} expire_t;

typedef struct rbtdb_glue rbtdb_glue_t;
418 
419 typedef struct rbtdb_glue_table_node {
420 	struct rbtdb_glue_table_node *next;
421 	dns_rbtnode_t *node;
422 	rbtdb_glue_t *glue_list;
423 } rbtdb_glue_table_node_t;
424 
/* TTL classification of an rdataset: fresh, stale or ancient. */
typedef enum {
	rdataset_ttl_fresh,
	rdataset_ttl_stale,
	rdataset_ttl_ancient
} rdataset_ttl_t;
430 
431 typedef struct rbtdb_version {
432 	/* Not locked */
433 	rbtdb_serial_t serial;
434 	dns_rbtdb_t *rbtdb;
435 	/*
436 	 * Protected in the refcount routines.
437 	 * XXXJT: should we change the lock policy based on the refcount
438 	 * performance?
439 	 */
440 	isc_refcount_t references;
441 	/* Locked by database lock. */
442 	bool writer;
443 	bool commit_ok;
444 	rbtdb_changedlist_t changed_list;
445 	rdatasetheaderlist_t resigned_list;
446 	ISC_LINK(struct rbtdb_version) link;
447 	dns_db_secure_t secure;
448 	bool havensec3;
449 	/* NSEC3 parameters */
450 	dns_hash_t hash;
451 	uint8_t flags;
452 	uint16_t iterations;
453 	uint8_t salt_length;
454 	unsigned char salt[DNS_NSEC3_SALTSIZE];
455 
456 	/*
457 	 * records and xfrsize are covered by rwlock.
458 	 */
459 	isc_rwlock_t rwlock;
460 	uint64_t records;
461 	uint64_t xfrsize;
462 
463 	isc_rwlock_t glue_rwlock;
464 	size_t glue_table_bits;
465 	size_t glue_table_nodecount;
466 	rbtdb_glue_table_node_t **glue_table;
467 } rbtdb_version_t;
468 
469 typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t;
470 
471 struct dns_rbtdb {
472 	/* Unlocked. */
473 	dns_db_t common;
474 	/* Locks the data in this struct */
475 	isc_rwlock_t lock;
476 	/* Locks the tree structure (prevents nodes appearing/disappearing) */
477 	isc_rwlock_t tree_lock;
478 	/* Locks for individual tree nodes */
479 	unsigned int node_lock_count;
480 	rbtdb_nodelock_t *node_locks;
481 	dns_rbtnode_t *origin_node;
482 	dns_rbtnode_t *nsec3_origin_node;
483 	dns_stats_t *rrsetstats;     /* cache DB only */
484 	isc_stats_t *cachestats;     /* cache DB only */
485 	isc_stats_t *gluecachestats; /* zone DB only */
486 	/* Locked by lock. */
487 	unsigned int active;
488 	isc_refcount_t references;
489 	unsigned int attributes;
490 	rbtdb_serial_t current_serial;
491 	rbtdb_serial_t least_serial;
492 	rbtdb_serial_t next_serial;
493 	rbtdb_version_t *current_version;
494 	rbtdb_version_t *future_version;
495 	rbtdb_versionlist_t open_versions;
496 	isc_task_t *task;
497 	dns_dbnode_t *soanode;
498 	dns_dbnode_t *nsnode;
499 
500 	/*
501 	 * Maximum length of time to keep using a stale answer past its
502 	 * normal TTL expiry.
503 	 */
504 	dns_ttl_t serve_stale_ttl;
505 
506 	/*
507 	 * The time after a failed lookup, where stale answers from cache
508 	 * may be used directly in a DNS response without attempting a
509 	 * new iterative lookup.
510 	 */
511 	uint32_t serve_stale_refresh;
512 
513 	/*
514 	 * This is a linked list used to implement the LRU cache.  There will
515 	 * be node_lock_count linked lists here.  Nodes in bucket 1 will be
516 	 * placed on the linked list rdatasets[1].
517 	 */
518 	rdatasetheaderlist_t *rdatasets;
519 
520 	/*%
521 	 * Temporary storage for stale cache nodes and dynamically deleted
522 	 * nodes that await being cleaned up.
523 	 */
524 	rbtnodelist_t *deadnodes;
525 
526 	/* List of nodes from which recursive tree pruning can be started from.
527 	 * Locked by tree_lock. */
528 	rbtnodelist_t prunenodes;
529 
530 	/*
531 	 * Heaps.  These are used for TTL based expiry in a cache,
532 	 * or for zone resigning in a zone DB.  hmctx is the memory
533 	 * context to use for the heap (which differs from the main
534 	 * database memory context in the case of a cache).
535 	 */
536 	isc_mem_t *hmctx;
537 	isc_heap_t **heaps;
538 
539 	/*
540 	 * Base values for the mmap() code.
541 	 */
542 	void *mmap_location;
543 	size_t mmap_size;
544 
545 	/* Locked by tree_lock. */
546 	dns_rbt_t *tree;
547 	dns_rbt_t *nsec;
548 	dns_rbt_t *nsec3;
549 
550 	/* Unlocked */
551 	unsigned int quantum;
552 };
553 
/* Load-state flag bits. */
#define RBTDB_ATTR_LOADED  0x01
#define RBTDB_ATTR_LOADING 0x02

/* True when the database has a non-zero serve_stale_ttl. */
#define KEEPSTALE(rbtdb) ((rbtdb)->serve_stale_ttl > 0)
558 
559 /*%
560  * Search Context
561  */
562 typedef struct {
563 	dns_rbtdb_t *rbtdb;
564 	rbtdb_version_t *rbtversion;
565 	rbtdb_serial_t serial;
566 	unsigned int options;
567 	dns_rbtnodechain_t chain;
568 	bool copy_name;
569 	bool need_cleanup;
570 	bool wild;
571 	dns_rbtnode_t *zonecut;
572 	rdatasetheader_t *zonecut_rdataset;
573 	rdatasetheader_t *zonecut_sigrdataset;
574 	dns_fixedname_t zonecut_name;
575 	isc_stdtime_t now;
576 } rbtdb_search_t;
577 
578 /*%
579  * Load Context
580  */
581 typedef struct {
582 	dns_rbtdb_t *rbtdb;
583 	isc_stdtime_t now;
584 } rbtdb_load_t;
585 
586 static void
587 delete_callback(void *data, void *arg);
588 static void
589 rdataset_disassociate(dns_rdataset_t *rdataset);
590 static isc_result_t
591 rdataset_first(dns_rdataset_t *rdataset);
592 static isc_result_t
593 rdataset_next(dns_rdataset_t *rdataset);
594 static void
595 rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
596 static void
597 rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
598 static unsigned int
599 rdataset_count(dns_rdataset_t *rdataset);
600 static isc_result_t
601 rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
602 		    dns_rdataset_t *neg, dns_rdataset_t *negsig);
603 static isc_result_t
604 rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name,
605 		    dns_rdataset_t *neg, dns_rdataset_t *negsig);
606 static bool
607 need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now);
608 static void
609 update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now);
610 static void
611 expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked,
612 	      expire_t reason);
613 static void
614 overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, size_t purgesize,
615 	      bool tree_locked);
616 static void
617 resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader);
618 static void
619 resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
620 	      rdatasetheader_t *header);
621 static void
622 prune_tree(isc_task_t *task, isc_event_t *event);
623 static void
624 rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust);
625 static void
626 rdataset_expire(dns_rdataset_t *rdataset);
627 static void
628 rdataset_clearprefetch(dns_rdataset_t *rdataset);
629 static void
630 rdataset_setownercase(dns_rdataset_t *rdataset, const dns_name_t *name);
631 static void
632 rdataset_getownercase(const dns_rdataset_t *rdataset, dns_name_t *name);
633 static isc_result_t
634 rdataset_addglue(dns_rdataset_t *rdataset, dns_dbversion_t *version,
635 		 dns_message_t *msg);
636 static void
637 free_gluetable(rbtdb_version_t *version);
638 static isc_result_t
639 nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name);
640 
641 static dns_rdatasetmethods_t rdataset_methods = { rdataset_disassociate,
642 						  rdataset_first,
643 						  rdataset_next,
644 						  rdataset_current,
645 						  rdataset_clone,
646 						  rdataset_count,
647 						  NULL, /* addnoqname */
648 						  rdataset_getnoqname,
649 						  NULL, /* addclosest */
650 						  rdataset_getclosest,
651 						  rdataset_settrust,
652 						  rdataset_expire,
653 						  rdataset_clearprefetch,
654 						  rdataset_setownercase,
655 						  rdataset_getownercase,
656 						  rdataset_addglue };
657 
658 static dns_rdatasetmethods_t slab_methods = {
659 	rdataset_disassociate,
660 	rdataset_first,
661 	rdataset_next,
662 	rdataset_current,
663 	rdataset_clone,
664 	rdataset_count,
665 	NULL, /* addnoqname */
666 	NULL, /* getnoqname */
667 	NULL, /* addclosest */
668 	NULL, /* getclosest */
669 	NULL, /* settrust */
670 	NULL, /* expire */
671 	NULL, /* clearprefetch */
672 	NULL, /* setownercase */
673 	NULL, /* getownercase */
674 	NULL  /* addglue */
675 };
676 
677 static void
678 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
679 static isc_result_t
680 rdatasetiter_first(dns_rdatasetiter_t *iterator);
681 static isc_result_t
682 rdatasetiter_next(dns_rdatasetiter_t *iterator);
683 static void
684 rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset);
685 
686 static dns_rdatasetitermethods_t rdatasetiter_methods = {
687 	rdatasetiter_destroy, rdatasetiter_first, rdatasetiter_next,
688 	rdatasetiter_current
689 };
690 
691 typedef struct rbtdb_rdatasetiter {
692 	dns_rdatasetiter_t common;
693 	rdatasetheader_t *current;
694 } rbtdb_rdatasetiter_t;
695 
696 /*
697  * Note that these iterators, unless created with either DNS_DB_NSEC3ONLY or
698  * DNS_DB_NONSEC3, will transparently move between the last node of the
699  * "regular" RBT ("chain" field) and the root node of the NSEC3 RBT
700  * ("nsec3chain" field) of the database in question, as if the latter was a
701  * successor to the former in lexical order.  The "current" field always holds
702  * the address of either "chain" or "nsec3chain", depending on which RBT is
703  * being traversed at given time.
704  */
705 static void
706 dbiterator_destroy(dns_dbiterator_t **iteratorp);
707 static isc_result_t
708 dbiterator_first(dns_dbiterator_t *iterator);
709 static isc_result_t
710 dbiterator_last(dns_dbiterator_t *iterator);
711 static isc_result_t
712 dbiterator_seek(dns_dbiterator_t *iterator, const dns_name_t *name);
713 static isc_result_t
714 dbiterator_prev(dns_dbiterator_t *iterator);
715 static isc_result_t
716 dbiterator_next(dns_dbiterator_t *iterator);
717 static isc_result_t
718 dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
719 		   dns_name_t *name);
720 static isc_result_t
721 dbiterator_pause(dns_dbiterator_t *iterator);
722 static isc_result_t
723 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name);
724 
725 static dns_dbiteratormethods_t dbiterator_methods = {
726 	dbiterator_destroy, dbiterator_first, dbiterator_last,
727 	dbiterator_seek,    dbiterator_prev,  dbiterator_next,
728 	dbiterator_current, dbiterator_pause, dbiterator_origin
729 };
730 
731 #define DELETION_BATCH_MAX 64
732 
733 /*
734  * If 'paused' is true, then the tree lock is not being held.
735  */
736 typedef struct rbtdb_dbiterator {
737 	dns_dbiterator_t common;
738 	bool paused;
739 	bool new_origin;
740 	isc_rwlocktype_t tree_locked;
741 	isc_result_t result;
742 	dns_fixedname_t name;
743 	dns_fixedname_t origin;
744 	dns_rbtnodechain_t chain;
745 	dns_rbtnodechain_t nsec3chain;
746 	dns_rbtnodechain_t *current;
747 	dns_rbtnode_t *node;
748 	dns_rbtnode_t *deletions[DELETION_BATCH_MAX];
749 	int delcnt;
750 	bool nsec3only;
751 	bool nonsec3;
752 } rbtdb_dbiterator_t;
753 
/* Database kind tests, based on the DNS_DBATTR_* bits in common.attributes. */
#define IS_STUB(rbtdb) \
	(((rbtdb)->common.attributes & DNS_DBATTR_STUB) != 0)
#define IS_CACHE(rbtdb) \
	(((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
756 
757 static void
758 free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event);
759 static void
760 overmem(dns_db_t *db, bool over);
761 static void
762 setnsec3parameters(dns_db_t *db, rbtdb_version_t *version);
763 static void
764 setownercase(rdatasetheader_t *header, const dns_name_t *name);
765 
766 static bool
767 match_header_version(rbtdb_file_header_t *header);
768 
/* Version string buffer, padded to 32 bytes; starts out empty. */
static char FILE_VERSION[32] = "\0";

/*%
 * 'init_count' is used to initialize 'newheader->count', which in turn
 * is used to determine where in the cycle rrset-order cyclic starts.
 * We don't lock this as we don't care about simultaneous updates.
 *
 * Note:
 *      Both init_count and header->count can be UINT32_MAX.
 *      The count on the returned rdataset however can't be, as
 *      that indicates that the database does not implement cyclic
 *      processing.
 */
static atomic_uint_fast32_t init_count = 0;
784 
785 /*
786  * Locking
787  *
788  * If a routine is going to lock more than one lock in this module, then
789  * the locking must be done in the following order:
790  *
791  *      Tree Lock
792  *
793  *      Node Lock       (Only one from the set may be locked at one time by
794  *                       any caller)
795  *
796  *      Database Lock
797  *
798  * Failure to follow this hierarchy can result in deadlock.
799  */
800 
801 /*
802  * Deleting Nodes
803  *
804  * For zone databases the node for the origin of the zone MUST NOT be deleted.
805  */
806 
/*
 * Debugging routines
 */
#ifdef DEBUG
/*
 * Write "<desc>: " followed by the hexadecimal text of 'size' bytes at
 * 'data' and a newline to stderr.  The data is converted in chunks of at
 * most BUFSIZ bytes; one (possibly empty) chunk is always emitted.
 */
static void
hexdump(const char *desc, unsigned char *data, size_t size) {
	/* Two hex digits per byte plus the terminating NUL. */
	char text[BUFSIZ * 2 + 1];

	fprintf(stderr, "%s: ", desc);
	do {
		isc_buffer_t b;
		isc_region_t r;
		isc_result_t result;
		size_t bytes;

		isc_buffer_init(&b, text, sizeof(text));
		r.base = data;
		r.length = bytes = (size > BUFSIZ) ? BUFSIZ : size;
		result = isc_hex_totext(&r, 0, "", &b);
		RUNTIME_CHECK(result == ISC_R_SUCCESS);
		/* NUL-terminate so the buffer can be printed as a string. */
		isc_buffer_putuint8(&b, 0);
		fprintf(stderr, "%s", text);
		data += bytes;
		size -= bytes;
	} while (size > 0);
	fprintf(stderr, "\n");
}
#endif /* ifdef DEBUG */
834 
/* Fixed RRSet helper macros */

/*
 * Deliberately defined without a trailing semicolon so that it expands to
 * a plain constant and can be used inside larger expressions.
 * NOTE(review): presumably the size in bytes of a length field in the raw
 * rdata slab - confirm against the code that uses it.
 */
#define DNS_RDATASET_LENGTH 2

#if DNS_RDATASET_FIXED
#define DNS_RDATASET_ORDER 2
/* Expands in terms of a variable named 'count' at the point of use. */
#define DNS_RDATASET_COUNT (count * 4)
#else /* !DNS_RDATASET_FIXED */
#define DNS_RDATASET_ORDER 0
#define DNS_RDATASET_COUNT 0
#endif /* DNS_RDATASET_FIXED */
846 
847 /*
848  * DB Routines
849  */
850 
851 static void
attach(dns_db_t * source,dns_db_t ** targetp)852 attach(dns_db_t *source, dns_db_t **targetp) {
853 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
854 
855 	REQUIRE(VALID_RBTDB(rbtdb));
856 
857 	isc_refcount_increment(&rbtdb->references);
858 
859 	*targetp = source;
860 }
861 
862 static void
free_rbtdb_callback(isc_task_t * task,isc_event_t * event)863 free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
864 	dns_rbtdb_t *rbtdb = event->ev_arg;
865 
866 	UNUSED(task);
867 
868 	free_rbtdb(rbtdb, true, event);
869 }
870 
871 static void
update_cachestats(dns_rbtdb_t * rbtdb,isc_result_t result)872 update_cachestats(dns_rbtdb_t *rbtdb, isc_result_t result) {
873 	INSIST(IS_CACHE(rbtdb));
874 
875 	if (rbtdb->cachestats == NULL) {
876 		return;
877 	}
878 
879 	switch (result) {
880 	case ISC_R_SUCCESS:
881 	case DNS_R_CNAME:
882 	case DNS_R_DNAME:
883 	case DNS_R_DELEGATION:
884 	case DNS_R_NCACHENXDOMAIN:
885 	case DNS_R_NCACHENXRRSET:
886 		isc_stats_increment(rbtdb->cachestats,
887 				    dns_cachestatscounter_hits);
888 		break;
889 	default:
890 		isc_stats_increment(rbtdb->cachestats,
891 				    dns_cachestatscounter_misses);
892 	}
893 }
894 
895 static bool
do_stats(rdatasetheader_t * header)896 do_stats(rdatasetheader_t *header) {
897 	return (EXISTS(header) && STATCOUNT(header));
898 }
899 
900 static void
update_rrsetstats(dns_rbtdb_t * rbtdb,const rbtdb_rdatatype_t htype,const uint_least16_t hattributes,const bool increment)901 update_rrsetstats(dns_rbtdb_t *rbtdb, const rbtdb_rdatatype_t htype,
902 		  const uint_least16_t hattributes, const bool increment) {
903 	dns_rdatastatstype_t statattributes = 0;
904 	dns_rdatastatstype_t base = 0;
905 	dns_rdatastatstype_t type;
906 	rdatasetheader_t *header = &(rdatasetheader_t){
907 		.type = htype,
908 		.attributes = hattributes,
909 	};
910 
911 	if (!do_stats(header)) {
912 		return;
913 	}
914 
915 	/* At the moment we count statistics only for cache DB */
916 	INSIST(IS_CACHE(rbtdb));
917 
918 	if (NEGATIVE(header)) {
919 		if (NXDOMAIN(header)) {
920 			statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
921 		} else {
922 			statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
923 			base = RBTDB_RDATATYPE_EXT(header->type);
924 		}
925 	} else {
926 		base = RBTDB_RDATATYPE_BASE(header->type);
927 	}
928 
929 	if (STALE(header)) {
930 		statattributes |= DNS_RDATASTATSTYPE_ATTR_STALE;
931 	}
932 	if (ANCIENT(header)) {
933 		statattributes |= DNS_RDATASTATSTYPE_ATTR_ANCIENT;
934 	}
935 
936 	type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
937 	if (increment) {
938 		dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
939 	} else {
940 		dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
941 	}
942 }
943 
944 static void
set_ttl(dns_rbtdb_t * rbtdb,rdatasetheader_t * header,dns_ttl_t newttl)945 set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
946 	int idx;
947 	isc_heap_t *heap;
948 	dns_ttl_t oldttl;
949 
950 	if (!IS_CACHE(rbtdb)) {
951 		header->rdh_ttl = newttl;
952 		return;
953 	}
954 
955 	oldttl = header->rdh_ttl;
956 	header->rdh_ttl = newttl;
957 
958 	/*
959 	 * It's possible the rbtdb is not a cache.  If this is the case,
960 	 * we will not have a heap, and we move on.  If we do, though,
961 	 * we might need to adjust things.
962 	 */
963 	if (header->heap_index == 0 || newttl == oldttl) {
964 		return;
965 	}
966 	idx = header->node->locknum;
967 	if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL) {
968 		return;
969 	}
970 	heap = rbtdb->heaps[idx];
971 
972 	if (newttl < oldttl) {
973 		isc_heap_increased(heap, header->heap_index);
974 	} else {
975 		isc_heap_decreased(heap, header->heap_index);
976 	}
977 }
978 
979 /*%
980  * These functions allow the heap code to rank the priority of each
981  * element.  It returns true if v1 happens "sooner" than v2.
982  */
983 static bool
ttl_sooner(void * v1,void * v2)984 ttl_sooner(void *v1, void *v2) {
985 	rdatasetheader_t *h1 = v1;
986 	rdatasetheader_t *h2 = v2;
987 
988 	return (h1->rdh_ttl < h2->rdh_ttl);
989 }
990 
991 /*%
992  * Return which RRset should be resigned sooner.  If the RRsets have the
993  * same signing time, prefer the other RRset over the SOA RRset.
994  */
995 static bool
resign_sooner(void * v1,void * v2)996 resign_sooner(void *v1, void *v2) {
997 	rdatasetheader_t *h1 = v1;
998 	rdatasetheader_t *h2 = v2;
999 
1000 	return (h1->resign < h2->resign ||
1001 		(h1->resign == h2->resign && h1->resign_lsb < h2->resign_lsb) ||
1002 		(h1->resign == h2->resign && h1->resign_lsb == h2->resign_lsb &&
1003 		 h2->type == RBTDB_RDATATYPE_SIGSOA));
1004 }
1005 
1006 /*%
1007  * This function sets the heap index into the header.
1008  */
1009 static void
set_index(void * what,unsigned int idx)1010 set_index(void *what, unsigned int idx) {
1011 	rdatasetheader_t *h = what;
1012 
1013 	h->heap_index = idx;
1014 }
1015 
1016 /*%
1017  * Work out how many nodes can be deleted in the time between two
1018  * requests to the nameserver.  Smooth the resulting number and use it
1019  * as a estimate for the number of nodes to be deleted in the next
1020  * iteration.
1021  */
1022 static unsigned int
adjust_quantum(unsigned int old,isc_time_t * start)1023 adjust_quantum(unsigned int old, isc_time_t *start) {
1024 	unsigned int pps = dns_pps; /* packets per second */
1025 	unsigned int interval;
1026 	uint64_t usecs;
1027 	isc_time_t end;
1028 	unsigned int nodes;
1029 
1030 	if (pps < 100) {
1031 		pps = 100;
1032 	}
1033 	isc_time_now(&end);
1034 
1035 	interval = 1000000 / pps; /* interval in usec */
1036 	if (interval == 0) {
1037 		interval = 1;
1038 	}
1039 	usecs = isc_time_microdiff(&end, start);
1040 	if (usecs == 0) {
1041 		/*
1042 		 * We were unable to measure the amount of time taken.
1043 		 * Double the nodes deleted next time.
1044 		 */
1045 		old *= 2;
1046 		if (old > 1000) {
1047 			old = 1000;
1048 		}
1049 		return (old);
1050 	}
1051 	nodes = old * interval;
1052 	nodes /= (unsigned int)usecs;
1053 	if (nodes == 0) {
1054 		nodes = 1;
1055 	} else if (nodes > 1000) {
1056 		nodes = 1000;
1057 	}
1058 
1059 	/* Smooth */
1060 	nodes = (nodes + old * 3) / 4;
1061 
1062 	if (nodes != old) {
1063 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1064 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1065 			      "adjust_quantum: old=%d, new=%d", old, nodes);
1066 	}
1067 
1068 	return (nodes);
1069 }
1070 
1071 static void
free_rbtdb(dns_rbtdb_t * rbtdb,bool log,isc_event_t * event)1072 free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
1073 	unsigned int i;
1074 	isc_result_t result;
1075 	char buf[DNS_NAME_FORMATSIZE];
1076 	dns_rbtnode_t *node = NULL;
1077 	dns_rbt_t **treep;
1078 	isc_time_t start;
1079 
1080 	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
1081 		overmem((dns_db_t *)rbtdb, (bool)-1);
1082 	}
1083 
1084 	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
1085 	REQUIRE(rbtdb->future_version == NULL);
1086 
1087 	if (rbtdb->current_version != NULL) {
1088 		isc_refcount_decrementz(&rbtdb->current_version->references);
1089 		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
1090 		isc_rwlock_destroy(&rbtdb->current_version->glue_rwlock);
1091 		isc_refcount_destroy(&rbtdb->current_version->references);
1092 		isc_rwlock_destroy(&rbtdb->current_version->rwlock);
1093 		isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
1094 			    sizeof(rbtdb_version_t));
1095 	}
1096 
1097 	/*
1098 	 * We assume the number of remaining dead nodes is reasonably small;
1099 	 * the overhead of unlinking all nodes here should be negligible.
1100 	 */
1101 	for (i = 0; i < rbtdb->node_lock_count; i++) {
1102 		node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
1103 		while (node != NULL) {
1104 			ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
1105 			node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
1106 		}
1107 	}
1108 
1109 	node = ISC_LIST_HEAD(rbtdb->prunenodes);
1110 	while (node != NULL) {
1111 		ISC_LIST_UNLINK(rbtdb->prunenodes, node, prunelink);
1112 		node = ISC_LIST_HEAD(rbtdb->prunenodes);
1113 	}
1114 
1115 	if (event == NULL) {
1116 		rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
1117 	}
1118 
1119 	for (;;) {
1120 		/*
1121 		 * pick the next tree to (start to) destroy
1122 		 */
1123 		treep = &rbtdb->tree;
1124 		if (*treep == NULL) {
1125 			treep = &rbtdb->nsec;
1126 			if (*treep == NULL) {
1127 				treep = &rbtdb->nsec3;
1128 				/*
1129 				 * we're finished after clear cutting
1130 				 */
1131 				if (*treep == NULL) {
1132 					break;
1133 				}
1134 			}
1135 		}
1136 
1137 		isc_time_now(&start);
1138 		result = dns_rbt_destroy2(treep, rbtdb->quantum);
1139 		if (result == ISC_R_QUOTA) {
1140 			INSIST(rbtdb->task != NULL);
1141 			if (rbtdb->quantum != 0) {
1142 				rbtdb->quantum = adjust_quantum(rbtdb->quantum,
1143 								&start);
1144 			}
1145 			if (event == NULL) {
1146 				event = isc_event_allocate(
1147 					rbtdb->common.mctx, NULL,
1148 					DNS_EVENT_FREESTORAGE,
1149 					free_rbtdb_callback, rbtdb,
1150 					sizeof(isc_event_t));
1151 			}
1152 			isc_task_send(rbtdb->task, &event);
1153 			return;
1154 		}
1155 		INSIST(result == ISC_R_SUCCESS && *treep == NULL);
1156 	}
1157 
1158 	if (event != NULL) {
1159 		isc_event_free(&event);
1160 	}
1161 	if (log) {
1162 		if (dns_name_dynamic(&rbtdb->common.origin)) {
1163 			dns_name_format(&rbtdb->common.origin, buf,
1164 					sizeof(buf));
1165 		} else {
1166 			strlcpy(buf, "<UNKNOWN>", sizeof(buf));
1167 		}
1168 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1169 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1170 			      "done free_rbtdb(%s)", buf);
1171 	}
1172 	if (dns_name_dynamic(&rbtdb->common.origin)) {
1173 		dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
1174 	}
1175 	for (i = 0; i < rbtdb->node_lock_count; i++) {
1176 		isc_refcount_destroy(&rbtdb->node_locks[i].references);
1177 		NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
1178 	}
1179 
1180 	/*
1181 	 * Clean up LRU / re-signing order lists.
1182 	 */
1183 	if (rbtdb->rdatasets != NULL) {
1184 		for (i = 0; i < rbtdb->node_lock_count; i++) {
1185 			INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
1186 		}
1187 		isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
1188 			    rbtdb->node_lock_count *
1189 				    sizeof(rdatasetheaderlist_t));
1190 	}
1191 	/*
1192 	 * Clean up dead node buckets.
1193 	 */
1194 	if (rbtdb->deadnodes != NULL) {
1195 		for (i = 0; i < rbtdb->node_lock_count; i++) {
1196 			INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
1197 		}
1198 		isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
1199 			    rbtdb->node_lock_count * sizeof(rbtnodelist_t));
1200 	}
1201 	/*
1202 	 * Clean up heap objects.
1203 	 */
1204 	if (rbtdb->heaps != NULL) {
1205 		for (i = 0; i < rbtdb->node_lock_count; i++) {
1206 			isc_heap_destroy(&rbtdb->heaps[i]);
1207 		}
1208 		isc_mem_put(rbtdb->hmctx, rbtdb->heaps,
1209 			    rbtdb->node_lock_count * sizeof(isc_heap_t *));
1210 	}
1211 
1212 	if (rbtdb->rrsetstats != NULL) {
1213 		dns_stats_detach(&rbtdb->rrsetstats);
1214 	}
1215 	if (rbtdb->cachestats != NULL) {
1216 		isc_stats_detach(&rbtdb->cachestats);
1217 	}
1218 	if (rbtdb->gluecachestats != NULL) {
1219 		isc_stats_detach(&rbtdb->gluecachestats);
1220 	}
1221 
1222 	isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
1223 		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
1224 	isc_rwlock_destroy(&rbtdb->tree_lock);
1225 	isc_refcount_destroy(&rbtdb->references);
1226 	if (rbtdb->task != NULL) {
1227 		isc_task_detach(&rbtdb->task);
1228 	}
1229 
1230 	RBTDB_DESTROYLOCK(&rbtdb->lock);
1231 	rbtdb->common.magic = 0;
1232 	rbtdb->common.impmagic = 0;
1233 	isc_mem_detach(&rbtdb->hmctx);
1234 
1235 	if (rbtdb->mmap_location != NULL) {
1236 		isc_file_munmap(rbtdb->mmap_location, (size_t)rbtdb->mmap_size);
1237 	}
1238 
1239 	INSIST(ISC_LIST_EMPTY(rbtdb->common.update_listeners));
1240 
1241 	isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
1242 }
1243 
1244 static void
maybe_free_rbtdb(dns_rbtdb_t * rbtdb)1245 maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
1246 	bool want_free = false;
1247 	unsigned int i;
1248 	unsigned int inactive = 0;
1249 
1250 	/* XXX check for open versions here */
1251 
1252 	if (rbtdb->soanode != NULL) {
1253 		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
1254 	}
1255 	if (rbtdb->nsnode != NULL) {
1256 		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
1257 	}
1258 
1259 	/*
1260 	 * The current version's glue table needs to be freed early
1261 	 * so the nodes are dereferenced before we check the active
1262 	 * node count below.
1263 	 */
1264 	if (rbtdb->current_version != NULL) {
1265 		free_gluetable(rbtdb->current_version);
1266 	}
1267 
1268 	/*
1269 	 * Even though there are no external direct references, there still
1270 	 * may be nodes in use.
1271 	 */
1272 	for (i = 0; i < rbtdb->node_lock_count; i++) {
1273 		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
1274 		rbtdb->node_locks[i].exiting = true;
1275 		if (isc_refcount_current(&rbtdb->node_locks[i].references) == 0)
1276 		{
1277 			inactive++;
1278 		}
1279 		NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
1280 	}
1281 
1282 	if (inactive != 0) {
1283 		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1284 		rbtdb->active -= inactive;
1285 		if (rbtdb->active == 0) {
1286 			want_free = true;
1287 		}
1288 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1289 		if (want_free) {
1290 			char buf[DNS_NAME_FORMATSIZE];
1291 			if (dns_name_dynamic(&rbtdb->common.origin)) {
1292 				dns_name_format(&rbtdb->common.origin, buf,
1293 						sizeof(buf));
1294 			} else {
1295 				strlcpy(buf, "<UNKNOWN>", sizeof(buf));
1296 			}
1297 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1298 				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1299 				      "calling free_rbtdb(%s)", buf);
1300 			free_rbtdb(rbtdb, true, NULL);
1301 		}
1302 	}
1303 }
1304 
1305 static void
detach(dns_db_t ** dbp)1306 detach(dns_db_t **dbp) {
1307 	REQUIRE(dbp != NULL && VALID_RBTDB((dns_rbtdb_t *)(*dbp)));
1308 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
1309 	*dbp = NULL;
1310 
1311 	if (isc_refcount_decrement(&rbtdb->references) == 1) {
1312 		maybe_free_rbtdb(rbtdb);
1313 	}
1314 }
1315 
1316 static void
currentversion(dns_db_t * db,dns_dbversion_t ** versionp)1317 currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
1318 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1319 	rbtdb_version_t *version;
1320 
1321 	REQUIRE(VALID_RBTDB(rbtdb));
1322 
1323 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1324 	version = rbtdb->current_version;
1325 	isc_refcount_increment(&version->references);
1326 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1327 
1328 	*versionp = (dns_dbversion_t *)version;
1329 }
1330 
1331 static rbtdb_version_t *
allocate_version(isc_mem_t * mctx,rbtdb_serial_t serial,unsigned int references,bool writer)1332 allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
1333 		 unsigned int references, bool writer) {
1334 	rbtdb_version_t *version;
1335 	size_t size;
1336 
1337 	version = isc_mem_get(mctx, sizeof(*version));
1338 	version->serial = serial;
1339 
1340 	isc_refcount_init(&version->references, references);
1341 	isc_rwlock_init(&version->glue_rwlock, 0, 0);
1342 
1343 	version->glue_table_bits = RBTDB_GLUE_TABLE_INIT_BITS;
1344 	version->glue_table_nodecount = 0U;
1345 
1346 	size = HASHSIZE(version->glue_table_bits) *
1347 	       sizeof(version->glue_table[0]);
1348 	version->glue_table = isc_mem_get(mctx, size);
1349 	memset(version->glue_table, 0, size);
1350 
1351 	version->writer = writer;
1352 	version->commit_ok = false;
1353 	ISC_LIST_INIT(version->changed_list);
1354 	ISC_LIST_INIT(version->resigned_list);
1355 	ISC_LINK_INIT(version, link);
1356 
1357 	return (version);
1358 }
1359 
1360 static isc_result_t
newversion(dns_db_t * db,dns_dbversion_t ** versionp)1361 newversion(dns_db_t *db, dns_dbversion_t **versionp) {
1362 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1363 	rbtdb_version_t *version;
1364 
1365 	REQUIRE(VALID_RBTDB(rbtdb));
1366 	REQUIRE(versionp != NULL && *versionp == NULL);
1367 	REQUIRE(rbtdb->future_version == NULL);
1368 
1369 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1370 	RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? */
1371 	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
1372 				   true);
1373 	version->rbtdb = rbtdb;
1374 	version->commit_ok = true;
1375 	version->secure = rbtdb->current_version->secure;
1376 	version->havensec3 = rbtdb->current_version->havensec3;
1377 	if (version->havensec3) {
1378 		version->flags = rbtdb->current_version->flags;
1379 		version->iterations = rbtdb->current_version->iterations;
1380 		version->hash = rbtdb->current_version->hash;
1381 		version->salt_length = rbtdb->current_version->salt_length;
1382 		memmove(version->salt, rbtdb->current_version->salt,
1383 			version->salt_length);
1384 	} else {
1385 		version->flags = 0;
1386 		version->iterations = 0;
1387 		version->hash = 0;
1388 		version->salt_length = 0;
1389 		memset(version->salt, 0, sizeof(version->salt));
1390 	}
1391 	isc_rwlock_init(&version->rwlock, 0, 0);
1392 	RWLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
1393 	version->records = rbtdb->current_version->records;
1394 	version->xfrsize = rbtdb->current_version->xfrsize;
1395 	RWUNLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
1396 	rbtdb->next_serial++;
1397 	rbtdb->future_version = version;
1398 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1399 
1400 	*versionp = version;
1401 
1402 	return (ISC_R_SUCCESS);
1403 }
1404 
1405 static void
attachversion(dns_db_t * db,dns_dbversion_t * source,dns_dbversion_t ** targetp)1406 attachversion(dns_db_t *db, dns_dbversion_t *source,
1407 	      dns_dbversion_t **targetp) {
1408 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1409 	rbtdb_version_t *rbtversion = source;
1410 
1411 	REQUIRE(VALID_RBTDB(rbtdb));
1412 	INSIST(rbtversion != NULL && rbtversion->rbtdb == rbtdb);
1413 
1414 	isc_refcount_increment(&rbtversion->references);
1415 
1416 	*targetp = rbtversion;
1417 }
1418 
1419 static rbtdb_changed_t *
add_changed(dns_rbtdb_t * rbtdb,rbtdb_version_t * version,dns_rbtnode_t * node)1420 add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version, dns_rbtnode_t *node) {
1421 	rbtdb_changed_t *changed;
1422 
1423 	/*
1424 	 * Caller must be holding the node lock if its reference must be
1425 	 * protected by the lock.
1426 	 */
1427 
1428 	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
1429 
1430 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1431 
1432 	REQUIRE(version->writer);
1433 
1434 	if (changed != NULL) {
1435 		isc_refcount_increment(&node->references);
1436 		changed->node = node;
1437 		changed->dirty = false;
1438 		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
1439 	} else {
1440 		version->commit_ok = false;
1441 	}
1442 
1443 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1444 
1445 	return (changed);
1446 }
1447 
1448 static void
free_noqname(isc_mem_t * mctx,struct noqname ** noqname)1449 free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
1450 	if (dns_name_dynamic(&(*noqname)->name)) {
1451 		dns_name_free(&(*noqname)->name, mctx);
1452 	}
1453 	if ((*noqname)->neg != NULL) {
1454 		isc_mem_put(mctx, (*noqname)->neg,
1455 			    dns_rdataslab_size((*noqname)->neg, 0));
1456 	}
1457 	if ((*noqname)->negsig != NULL) {
1458 		isc_mem_put(mctx, (*noqname)->negsig,
1459 			    dns_rdataslab_size((*noqname)->negsig, 0));
1460 	}
1461 	isc_mem_put(mctx, *noqname, sizeof(**noqname));
1462 	*noqname = NULL;
1463 }
1464 
1465 static void
init_rdataset(dns_rbtdb_t * rbtdb,rdatasetheader_t * h)1466 init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h) {
1467 	ISC_LINK_INIT(h, link);
1468 	h->heap_index = 0;
1469 	h->is_mmapped = 0;
1470 	h->next_is_relative = 0;
1471 	h->node_is_relative = 0;
1472 	atomic_init(&h->attributes, 0);
1473 	atomic_init(&h->last_refresh_fail_ts, 0);
1474 
1475 	STATIC_ASSERT((sizeof(h->attributes) == 2),
1476 		      "The .attributes field of rdatasetheader_t needs to be "
1477 		      "16-bit int type exactly.");
1478 
1479 #if TRACE_HEADER
1480 	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
1481 		fprintf(stderr, "initialized header: %p\n", h);
1482 	}
1483 #else  /* if TRACE_HEADER */
1484 	UNUSED(rbtdb);
1485 #endif /* if TRACE_HEADER */
1486 }
1487 
1488 /*
1489  * Update the copied values of 'next' and 'node' if they are relative.
1490  */
1491 static void
update_newheader(rdatasetheader_t * newh,rdatasetheader_t * old)1492 update_newheader(rdatasetheader_t *newh, rdatasetheader_t *old) {
1493 	char *p;
1494 
1495 	if (old->next_is_relative) {
1496 		p = (char *)old;
1497 		p += (uintptr_t)old->next;
1498 		newh->next = (rdatasetheader_t *)p;
1499 	}
1500 	if (old->node_is_relative) {
1501 		p = (char *)old;
1502 		p += (uintptr_t)old->node;
1503 		newh->node = (dns_rbtnode_t *)p;
1504 	}
1505 	if (CASESET(old)) {
1506 		uint_least16_t attr = RDATASET_ATTR_GET(
1507 			old,
1508 			(RDATASET_ATTR_CASESET | RDATASET_ATTR_CASEFULLYLOWER));
1509 		RDATASET_ATTR_SET(newh, attr);
1510 		memmove(newh->upper, old->upper, sizeof(old->upper));
1511 	}
1512 }
1513 
1514 static rdatasetheader_t *
new_rdataset(dns_rbtdb_t * rbtdb,isc_mem_t * mctx)1515 new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx) {
1516 	rdatasetheader_t *h;
1517 
1518 	h = isc_mem_get(mctx, sizeof(*h));
1519 
1520 #if TRACE_HEADER
1521 	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
1522 		fprintf(stderr, "allocated header: %p\n", h);
1523 	}
1524 #endif /* if TRACE_HEADER */
1525 	memset(h->upper, 0xeb, sizeof(h->upper));
1526 	init_rdataset(rbtdb, h);
1527 	h->rdh_ttl = 0;
1528 	return (h);
1529 }
1530 
1531 static void
free_rdataset(dns_rbtdb_t * rbtdb,isc_mem_t * mctx,rdatasetheader_t * rdataset)1532 free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset) {
1533 	unsigned int size;
1534 	int idx;
1535 
1536 	update_rrsetstats(rbtdb, rdataset->type,
1537 			  atomic_load_acquire(&rdataset->attributes), false);
1538 
1539 	idx = rdataset->node->locknum;
1540 	if (ISC_LINK_LINKED(rdataset, link)) {
1541 		INSIST(IS_CACHE(rbtdb));
1542 		ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, link);
1543 	}
1544 
1545 	if (rdataset->heap_index != 0) {
1546 		isc_heap_delete(rbtdb->heaps[idx], rdataset->heap_index);
1547 	}
1548 	rdataset->heap_index = 0;
1549 
1550 	if (rdataset->noqname != NULL) {
1551 		free_noqname(mctx, &rdataset->noqname);
1552 	}
1553 	if (rdataset->closest != NULL) {
1554 		free_noqname(mctx, &rdataset->closest);
1555 	}
1556 
1557 	if (NONEXISTENT(rdataset)) {
1558 		size = sizeof(*rdataset);
1559 	} else {
1560 		size = dns_rdataslab_size((unsigned char *)rdataset,
1561 					  sizeof(*rdataset));
1562 	}
1563 
1564 	if (rdataset->is_mmapped == 1) {
1565 		return;
1566 	}
1567 
1568 	isc_mem_put(mctx, rdataset, size);
1569 }
1570 
1571 static void
rollback_node(dns_rbtnode_t * node,rbtdb_serial_t serial)1572 rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
1573 	rdatasetheader_t *header, *dcurrent;
1574 	bool make_dirty = false;
1575 
1576 	/*
1577 	 * Caller must hold the node lock.
1578 	 */
1579 
1580 	/*
1581 	 * We set the IGNORE attribute on rdatasets with serial number
1582 	 * 'serial'.  When the reference count goes to zero, these rdatasets
1583 	 * will be cleaned up; until that time, they will be ignored.
1584 	 */
1585 	for (header = node->data; header != NULL; header = header->next) {
1586 		if (header->serial == serial) {
1587 			RDATASET_ATTR_SET(header, RDATASET_ATTR_IGNORE);
1588 			make_dirty = true;
1589 		}
1590 		for (dcurrent = header->down; dcurrent != NULL;
1591 		     dcurrent = dcurrent->down)
1592 		{
1593 			if (dcurrent->serial == serial) {
1594 				RDATASET_ATTR_SET(dcurrent,
1595 						  RDATASET_ATTR_IGNORE);
1596 				make_dirty = true;
1597 			}
1598 		}
1599 	}
1600 	if (make_dirty) {
1601 		node->dirty = 1;
1602 	}
1603 }
1604 
1605 static void
mark_header_ancient(dns_rbtdb_t * rbtdb,rdatasetheader_t * header)1606 mark_header_ancient(dns_rbtdb_t *rbtdb, rdatasetheader_t *header) {
1607 	uint_least16_t attributes = atomic_load_acquire(&header->attributes);
1608 	uint_least16_t newattributes = 0;
1609 
1610 	/*
1611 	 * If we are already ancient there is nothing to do.
1612 	 */
1613 	do {
1614 		if ((attributes & RDATASET_ATTR_ANCIENT) != 0) {
1615 			return;
1616 		}
1617 		newattributes = attributes | RDATASET_ATTR_ANCIENT;
1618 	} while (!atomic_compare_exchange_weak_acq_rel(
1619 		&header->attributes, &attributes, newattributes));
1620 
1621 	/*
1622 	 * Decrement the stats counter for the appropriate RRtype.
1623 	 * If the STALE attribute is set, this will decrement the
1624 	 * stale type counter, otherwise it decrements the active
1625 	 * stats type counter.
1626 	 */
1627 	update_rrsetstats(rbtdb, header->type, attributes, false);
1628 	header->node->dirty = 1;
1629 
1630 	/* Increment the stats counter for the ancient RRtype. */
1631 	update_rrsetstats(rbtdb, header->type, newattributes, true);
1632 }
1633 
1634 static void
mark_header_stale(dns_rbtdb_t * rbtdb,rdatasetheader_t * header)1635 mark_header_stale(dns_rbtdb_t *rbtdb, rdatasetheader_t *header) {
1636 	uint_least16_t attributes = atomic_load_acquire(&header->attributes);
1637 	uint_least16_t newattributes = 0;
1638 
1639 	INSIST((attributes & RDATASET_ATTR_ZEROTTL) == 0);
1640 
1641 	/*
1642 	 * If we are already stale there is nothing to do.
1643 	 */
1644 	do {
1645 		if ((attributes & RDATASET_ATTR_STALE) != 0) {
1646 			return;
1647 		}
1648 		newattributes = attributes | RDATASET_ATTR_STALE;
1649 	} while (!atomic_compare_exchange_weak_acq_rel(
1650 		&header->attributes, &attributes, newattributes));
1651 
1652 	/* Decrement the stats counter for the appropriate RRtype.
1653 	 * If the ANCIENT attribute is set (although it is very
1654 	 * unlikely that an RRset goes from ANCIENT to STALE), this
1655 	 * will decrement the ancient stale type counter, otherwise it
1656 	 * decrements the active stats type counter.
1657 	 */
1658 
1659 	update_rrsetstats(rbtdb, header->type, attributes, false);
1660 	update_rrsetstats(rbtdb, header->type, newattributes, true);
1661 }
1662 
1663 static void
clean_stale_headers(dns_rbtdb_t * rbtdb,isc_mem_t * mctx,rdatasetheader_t * top)1664 clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx,
1665 		    rdatasetheader_t *top) {
1666 	rdatasetheader_t *d, *down_next;
1667 
1668 	for (d = top->down; d != NULL; d = down_next) {
1669 		down_next = d->down;
1670 		free_rdataset(rbtdb, mctx, d);
1671 	}
1672 	top->down = NULL;
1673 }
1674 
1675 static void
clean_cache_node(dns_rbtdb_t * rbtdb,dns_rbtnode_t * node)1676 clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1677 	rdatasetheader_t *current, *top_prev, *top_next;
1678 	isc_mem_t *mctx = rbtdb->common.mctx;
1679 
1680 	/*
1681 	 * Caller must be holding the node lock.
1682 	 */
1683 
1684 	top_prev = NULL;
1685 	for (current = node->data; current != NULL; current = top_next) {
1686 		top_next = current->next;
1687 		clean_stale_headers(rbtdb, mctx, current);
1688 		/*
1689 		 * If current is nonexistent, ancient, or stale and
1690 		 * we are not keeping stale, we can clean it up.
1691 		 */
1692 		if (NONEXISTENT(current) || ANCIENT(current) ||
1693 		    (STALE(current) && !KEEPSTALE(rbtdb)))
1694 		{
1695 			if (top_prev != NULL) {
1696 				top_prev->next = current->next;
1697 			} else {
1698 				node->data = current->next;
1699 			}
1700 			free_rdataset(rbtdb, mctx, current);
1701 		} else {
1702 			top_prev = current;
1703 		}
1704 	}
1705 	node->dirty = 0;
1706 }
1707 
1708 static void
clean_zone_node(dns_rbtdb_t * rbtdb,dns_rbtnode_t * node,rbtdb_serial_t least_serial)1709 clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1710 		rbtdb_serial_t least_serial) {
1711 	rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
1712 	rdatasetheader_t *top_prev, *top_next;
1713 	isc_mem_t *mctx = rbtdb->common.mctx;
1714 	bool still_dirty = false;
1715 
1716 	/*
1717 	 * Caller must be holding the node lock.
1718 	 */
1719 	REQUIRE(least_serial != 0);
1720 
1721 	top_prev = NULL;
1722 	for (current = node->data; current != NULL; current = top_next) {
1723 		top_next = current->next;
1724 
1725 		/*
1726 		 * First, we clean up any instances of multiple rdatasets
1727 		 * with the same serial number, or that have the IGNORE
1728 		 * attribute.
1729 		 */
1730 		dparent = current;
1731 		for (dcurrent = current->down; dcurrent != NULL;
1732 		     dcurrent = down_next)
1733 		{
1734 			down_next = dcurrent->down;
1735 			INSIST(dcurrent->serial <= dparent->serial);
1736 			if (dcurrent->serial == dparent->serial ||
1737 			    IGNORE(dcurrent))
1738 			{
1739 				if (down_next != NULL) {
1740 					down_next->next = dparent;
1741 				}
1742 				dparent->down = down_next;
1743 				free_rdataset(rbtdb, mctx, dcurrent);
1744 			} else {
1745 				dparent = dcurrent;
1746 			}
1747 		}
1748 
1749 		/*
1750 		 * We've now eliminated all IGNORE datasets with the possible
1751 		 * exception of current, which we now check.
1752 		 */
1753 		if (IGNORE(current)) {
1754 			down_next = current->down;
1755 			if (down_next == NULL) {
1756 				if (top_prev != NULL) {
1757 					top_prev->next = current->next;
1758 				} else {
1759 					node->data = current->next;
1760 				}
1761 				free_rdataset(rbtdb, mctx, current);
1762 				/*
1763 				 * current no longer exists, so we can
1764 				 * just continue with the loop.
1765 				 */
1766 				continue;
1767 			} else {
1768 				/*
1769 				 * Pull up current->down, making it the new
1770 				 * current.
1771 				 */
1772 				if (top_prev != NULL) {
1773 					top_prev->next = down_next;
1774 				} else {
1775 					node->data = down_next;
1776 				}
1777 				down_next->next = top_next;
1778 				free_rdataset(rbtdb, mctx, current);
1779 				current = down_next;
1780 			}
1781 		}
1782 
1783 		/*
1784 		 * We now try to find the first down node less than the
1785 		 * least serial.
1786 		 */
1787 		dparent = current;
1788 		for (dcurrent = current->down; dcurrent != NULL;
1789 		     dcurrent = down_next)
1790 		{
1791 			down_next = dcurrent->down;
1792 			if (dcurrent->serial < least_serial) {
1793 				break;
1794 			}
1795 			dparent = dcurrent;
1796 		}
1797 
1798 		/*
1799 		 * If there is a such an rdataset, delete it and any older
1800 		 * versions.
1801 		 */
1802 		if (dcurrent != NULL) {
1803 			do {
1804 				down_next = dcurrent->down;
1805 				INSIST(dcurrent->serial <= least_serial);
1806 				free_rdataset(rbtdb, mctx, dcurrent);
1807 				dcurrent = down_next;
1808 			} while (dcurrent != NULL);
1809 			dparent->down = NULL;
1810 		}
1811 
1812 		/*
1813 		 * Note.  The serial number of 'current' might be less than
1814 		 * least_serial too, but we cannot delete it because it is
1815 		 * the most recent version, unless it is a NONEXISTENT
1816 		 * rdataset.
1817 		 */
1818 		if (current->down != NULL) {
1819 			still_dirty = true;
1820 			top_prev = current;
1821 		} else {
1822 			/*
1823 			 * If this is a NONEXISTENT rdataset, we can delete it.
1824 			 */
1825 			if (NONEXISTENT(current)) {
1826 				if (top_prev != NULL) {
1827 					top_prev->next = current->next;
1828 				} else {
1829 					node->data = current->next;
1830 				}
1831 				free_rdataset(rbtdb, mctx, current);
1832 			} else {
1833 				top_prev = current;
1834 			}
1835 		}
1836 	}
1837 	if (!still_dirty) {
1838 		node->dirty = 0;
1839 	}
1840 }
1841 
1842 /*
1843  * tree_lock(write) must be held.
1844  */
1845 static void
delete_node(dns_rbtdb_t * rbtdb,dns_rbtnode_t * node)1846 delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1847 	dns_rbtnode_t *nsecnode;
1848 	dns_fixedname_t fname;
1849 	dns_name_t *name;
1850 	isc_result_t result = ISC_R_UNEXPECTED;
1851 
1852 	INSIST(!ISC_LINK_LINKED(node, deadlink));
1853 
1854 	if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
1855 		char printname[DNS_NAME_FORMATSIZE];
1856 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1857 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1858 			      "delete_node(): %p %s (bucket %d)", node,
1859 			      dns_rbt_formatnodename(node, printname,
1860 						     sizeof(printname)),
1861 			      node->locknum);
1862 	}
1863 
1864 	switch (node->nsec) {
1865 	case DNS_RBT_NSEC_NORMAL:
1866 		/*
1867 		 * Though this may be wasteful, it has to be done before
1868 		 * node is deleted.
1869 		 */
1870 		name = dns_fixedname_initname(&fname);
1871 		dns_rbt_fullnamefromnode(node, name);
1872 
1873 		result = dns_rbt_deletenode(rbtdb->tree, node, false);
1874 		break;
1875 	case DNS_RBT_NSEC_HAS_NSEC:
1876 		name = dns_fixedname_initname(&fname);
1877 		dns_rbt_fullnamefromnode(node, name);
1878 		/*
1879 		 * Delete the corresponding node from the auxiliary NSEC
1880 		 * tree before deleting from the main tree.
1881 		 */
1882 		nsecnode = NULL;
1883 		result = dns_rbt_findnode(rbtdb->nsec, name, NULL, &nsecnode,
1884 					  NULL, DNS_RBTFIND_EMPTYDATA, NULL,
1885 					  NULL);
1886 		if (result != ISC_R_SUCCESS) {
1887 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1888 				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1889 				      "delete_node: "
1890 				      "dns_rbt_findnode(nsec): %s",
1891 				      isc_result_totext(result));
1892 		} else {
1893 			result = dns_rbt_deletenode(rbtdb->nsec, nsecnode,
1894 						    false);
1895 			if (result != ISC_R_SUCCESS) {
1896 				isc_log_write(
1897 					dns_lctx, DNS_LOGCATEGORY_DATABASE,
1898 					DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1899 					"delete_node(): "
1900 					"dns_rbt_deletenode(nsecnode): %s",
1901 					isc_result_totext(result));
1902 			}
1903 		}
1904 		result = dns_rbt_deletenode(rbtdb->tree, node, false);
1905 		break;
1906 	case DNS_RBT_NSEC_NSEC:
1907 		result = dns_rbt_deletenode(rbtdb->nsec, node, false);
1908 		break;
1909 	case DNS_RBT_NSEC_NSEC3:
1910 		result = dns_rbt_deletenode(rbtdb->nsec3, node, false);
1911 		break;
1912 	}
1913 	if (result != ISC_R_SUCCESS) {
1914 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1915 			      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1916 			      "delete_node(): "
1917 			      "dns_rbt_deletenode: %s",
1918 			      isc_result_totext(result));
1919 	}
1920 }
1921 
1922 /*
1923  * Caller must be holding the node lock.
1924  */
1925 static void
new_reference(dns_rbtdb_t * rbtdb,dns_rbtnode_t * node,isc_rwlocktype_t locktype)1926 new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1927 	      isc_rwlocktype_t locktype) {
1928 	if (locktype == isc_rwlocktype_write && ISC_LINK_LINKED(node, deadlink))
1929 	{
1930 		ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
1931 				deadlink);
1932 	}
1933 	if (isc_refcount_increment0(&node->references) == 0) {
1934 		/* this is the first reference to the node */
1935 		isc_refcount_increment0(
1936 			&rbtdb->node_locks[node->locknum].references);
1937 	}
1938 }
1939 
1940 /*%
1941  * The tree lock must be held for the result to be valid.
1942  */
1943 static bool
is_leaf(dns_rbtnode_t * node)1944 is_leaf(dns_rbtnode_t *node) {
1945 	return (node->parent != NULL && node->parent->down == node &&
1946 		node->left == NULL && node->right == NULL);
1947 }
1948 
1949 /*%
1950  * The tree lock must be held when this function is called as it reads and
1951  * updates rbtdb->prunenodes.
1952  */
1953 static void
send_to_prune_tree(dns_rbtdb_t * rbtdb,dns_rbtnode_t * node,isc_rwlocktype_t locktype)1954 send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1955 		   isc_rwlocktype_t locktype) {
1956 	bool pruning_queued = (ISC_LIST_HEAD(rbtdb->prunenodes) != NULL);
1957 
1958 	INSIST(locktype == isc_rwlocktype_write);
1959 
1960 	new_reference(rbtdb, node, locktype);
1961 	INSIST(!ISC_LINK_LINKED(node, prunelink));
1962 	ISC_LIST_APPEND(rbtdb->prunenodes, node, prunelink);
1963 
1964 	if (!pruning_queued) {
1965 		isc_event_t *ev = NULL;
1966 		dns_db_t *db = NULL;
1967 
1968 		attach((dns_db_t *)rbtdb, &db);
1969 
1970 		ev = isc_event_allocate(rbtdb->common.mctx, NULL,
1971 					DNS_EVENT_RBTPRUNE, prune_tree, db,
1972 					sizeof(isc_event_t));
1973 		isc_task_send(rbtdb->task, &ev);
1974 	}
1975 }
1976 
1977 /*%
1978  * Clean up dead nodes.  These are nodes which have no references, and
1979  * have no data.  They are dead but we could not or chose not to delete
1980  * them when we deleted all the data at that node because we did not want
1981  * to wait for the tree write lock.
1982  *
1983  * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1984  */
1985 static void
cleanup_dead_nodes(dns_rbtdb_t * rbtdb,int bucketnum)1986 cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
1987 	dns_rbtnode_t *node;
1988 	int count = 10; /* XXXJT: should be adjustable */
1989 
1990 	node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1991 	while (node != NULL && count > 0) {
1992 		ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1993 
1994 		/*
1995 		 * We might have reactivated this node without a tree write
1996 		 * lock, so we couldn't remove this node from deadnodes then
1997 		 * and we have to do it now.
1998 		 */
1999 		if (isc_refcount_current(&node->references) != 0 ||
2000 		    node->data != NULL)
2001 		{
2002 			node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
2003 			count--;
2004 			continue;
2005 		}
2006 
2007 		if (is_leaf(node) && rbtdb->task != NULL) {
2008 			send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
2009 		} else if (node->down == NULL && node->data == NULL) {
2010 			/*
2011 			 * Not a interior node and not needing to be
2012 			 * reactivated.
2013 			 */
2014 			delete_node(rbtdb, node);
2015 		} else if (node->data == NULL) {
2016 			/*
2017 			 * A interior node without data. Leave linked to
2018 			 * to be cleaned up when node->down becomes NULL.
2019 			 */
2020 			ISC_LIST_APPEND(rbtdb->deadnodes[bucketnum], node,
2021 					deadlink);
2022 		}
2023 		node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
2024 		count--;
2025 	}
2026 }
2027 
2028 /*
2029  * This function is assumed to be called when a node is newly referenced
2030  * and can be in the deadnode list.  In that case the node must be retrieved
2031  * from the list because it is going to be used.  In addition, if the caller
2032  * happens to hold a write lock on the tree, it's a good chance to purge dead
2033  * nodes.
2034  * Note: while a new reference is gained in multiple places, there are only very
2035  * few cases where the node can be in the deadnode list (only empty nodes can
2036  * have been added to the list).
2037  */
2038 static void
reactivate_node(dns_rbtdb_t * rbtdb,dns_rbtnode_t * node,isc_rwlocktype_t treelocktype)2039 reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2040 		isc_rwlocktype_t treelocktype) {
2041 	isc_rwlocktype_t locktype = isc_rwlocktype_read;
2042 	nodelock_t *nodelock = &rbtdb->node_locks[node->locknum].lock;
2043 	bool maybe_cleanup = false;
2044 
2045 	POST(locktype);
2046 
2047 	NODE_LOCK(nodelock, locktype);
2048 
2049 	/*
2050 	 * Check if we can possibly cleanup the dead node.  If so, upgrade
2051 	 * the node lock below to perform the cleanup.
2052 	 */
2053 	if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
2054 	    treelocktype == isc_rwlocktype_write)
2055 	{
2056 		maybe_cleanup = true;
2057 	}
2058 
2059 	if (ISC_LINK_LINKED(node, deadlink) || maybe_cleanup) {
2060 		/*
2061 		 * Upgrade the lock and test if we still need to unlink.
2062 		 */
2063 		NODE_UNLOCK(nodelock, locktype);
2064 		locktype = isc_rwlocktype_write;
2065 		POST(locktype);
2066 		NODE_LOCK(nodelock, locktype);
2067 		if (ISC_LINK_LINKED(node, deadlink)) {
2068 			ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
2069 					deadlink);
2070 		}
2071 		if (maybe_cleanup) {
2072 			cleanup_dead_nodes(rbtdb, node->locknum);
2073 		}
2074 	}
2075 
2076 	new_reference(rbtdb, node, locktype);
2077 
2078 	NODE_UNLOCK(nodelock, locktype);
2079 }
2080 
2081 /*
2082  * Caller must be holding the node lock; either the "strong", read or write
2083  * lock.  Note that the lock must be held even when node references are
2084  * atomically modified; in that case the decrement operation itself does not
2085  * have to be protected, but we must avoid a race condition where multiple
2086  * threads are decreasing the reference to zero simultaneously and at least
2087  * one of them is going to free the node.
2088  *
2089  * This function returns true if and only if the node reference decreases
2090  * to zero.
2091  *
2092  * NOTE: Decrementing the reference count of a node to zero does not mean it
2093  * will be immediately freed.
2094  */
2095 static bool
decrement_reference(dns_rbtdb_t * rbtdb,dns_rbtnode_t * node,rbtdb_serial_t least_serial,isc_rwlocktype_t nlock,isc_rwlocktype_t tlock,bool pruning)2096 decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2097 		    rbtdb_serial_t least_serial, isc_rwlocktype_t nlock,
2098 		    isc_rwlocktype_t tlock, bool pruning) {
2099 	isc_result_t result;
2100 	bool write_locked;
2101 	bool locked = tlock != isc_rwlocktype_none;
2102 	rbtdb_nodelock_t *nodelock;
2103 	int bucket = node->locknum;
2104 	bool no_reference = true;
2105 	uint_fast32_t refs;
2106 
2107 	nodelock = &rbtdb->node_locks[bucket];
2108 
2109 #define KEEP_NODE(n, r, l)                                  \
2110 	((n)->data != NULL || ((l) && (n)->down != NULL) || \
2111 	 (n) == (r)->origin_node || (n) == (r)->nsec3_origin_node)
2112 
2113 	/* Handle easy and typical case first. */
2114 	if (!node->dirty && KEEP_NODE(node, rbtdb, locked)) {
2115 		if (isc_refcount_decrement(&node->references) == 1) {
2116 			refs = isc_refcount_decrement(&nodelock->references);
2117 			INSIST(refs > 0);
2118 			return (true);
2119 		} else {
2120 			return (false);
2121 		}
2122 	}
2123 
2124 	/* Upgrade the lock? */
2125 	if (nlock == isc_rwlocktype_read) {
2126 		NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
2127 		NODE_LOCK(&nodelock->lock, isc_rwlocktype_write);
2128 	}
2129 
2130 	if (isc_refcount_decrement(&node->references) > 1) {
2131 		/* Restore the lock? */
2132 		if (nlock == isc_rwlocktype_read) {
2133 			NODE_DOWNGRADE(&nodelock->lock);
2134 		}
2135 		return (false);
2136 	}
2137 
2138 	if (node->dirty) {
2139 		if (IS_CACHE(rbtdb)) {
2140 			clean_cache_node(rbtdb, node);
2141 		} else {
2142 			if (least_serial == 0) {
2143 				/*
2144 				 * Caller doesn't know the least serial.
2145 				 * Get it.
2146 				 */
2147 				RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
2148 				least_serial = rbtdb->least_serial;
2149 				RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
2150 			}
2151 			clean_zone_node(rbtdb, node, least_serial);
2152 		}
2153 	}
2154 
2155 	/*
2156 	 * Attempt to switch to a write lock on the tree.  If this fails,
2157 	 * we will add this node to a linked list of nodes in this locking
2158 	 * bucket which we will free later.
2159 	 */
2160 	if (tlock != isc_rwlocktype_write) {
2161 		/*
2162 		 * Locking hierarchy notwithstanding, we don't need to free
2163 		 * the node lock before acquiring the tree write lock because
2164 		 * we only do a trylock.
2165 		 */
2166 		if (tlock == isc_rwlocktype_read) {
2167 			result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
2168 		} else {
2169 			result = isc_rwlock_trylock(&rbtdb->tree_lock,
2170 						    isc_rwlocktype_write);
2171 		}
2172 		RUNTIME_CHECK(result == ISC_R_SUCCESS ||
2173 			      result == ISC_R_LOCKBUSY);
2174 
2175 		write_locked = (result == ISC_R_SUCCESS);
2176 	} else {
2177 		write_locked = true;
2178 	}
2179 
2180 	refs = isc_refcount_decrement(&nodelock->references);
2181 	INSIST(refs > 0);
2182 
2183 	if (KEEP_NODE(node, rbtdb, locked || write_locked)) {
2184 		goto restore_locks;
2185 	}
2186 
2187 #undef KEEP_NODE
2188 
2189 	if (write_locked) {
2190 		/*
2191 		 * We can now delete the node.
2192 		 */
2193 
2194 		/*
2195 		 * If this node is the only one in the level it's in, deleting
2196 		 * this node may recursively make its parent the only node in
2197 		 * the parent level; if so, and if no one is currently using
2198 		 * the parent node, this is almost the only opportunity to
2199 		 * clean it up.  But the recursive cleanup is not that trivial
2200 		 * since the child and parent may be in different lock buckets,
2201 		 * which would cause a lock order reversal problem.  To avoid
2202 		 * the trouble, we'll dispatch a separate event for batch
2203 		 * cleaning.  We need to check whether we're deleting the node
2204 		 * as a result of pruning to avoid infinite dispatching.
2205 		 * Note: pruning happens only when a task has been set for the
2206 		 * rbtdb.  If the user of the rbtdb chooses not to set a task,
2207 		 * it's their responsibility to purge stale leaves (e.g. by
2208 		 * periodic walk-through).
2209 		 */
2210 		if (!pruning && is_leaf(node) && rbtdb->task != NULL) {
2211 			send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
2212 			no_reference = false;
2213 		} else {
2214 			delete_node(rbtdb, node);
2215 		}
2216 	} else {
2217 		INSIST(node->data == NULL);
2218 		if (!ISC_LINK_LINKED(node, deadlink)) {
2219 			ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
2220 					deadlink);
2221 		}
2222 	}
2223 
2224 restore_locks:
2225 	/* Restore the lock? */
2226 	if (nlock == isc_rwlocktype_read) {
2227 		NODE_DOWNGRADE(&nodelock->lock);
2228 	}
2229 
2230 	/*
2231 	 * Relock a read lock, or unlock the write lock if no lock was held.
2232 	 */
2233 	if (tlock == isc_rwlocktype_none) {
2234 		if (write_locked) {
2235 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2236 		}
2237 	}
2238 
2239 	if (tlock == isc_rwlocktype_read) {
2240 		if (write_locked) {
2241 			isc_rwlock_downgrade(&rbtdb->tree_lock);
2242 		}
2243 	}
2244 
2245 	return (no_reference);
2246 }
2247 
2248 /*
2249  * Prune the tree by recursively cleaning up single leaves.  Go through all
2250  * nodes stored in the rbtdb->prunenodes list; for each of them, in the worst
2251  * case, it will be necessary to traverse a number of tree levels equal to the
2252  * maximum legal number of domain name labels (127); in practice, the number of
2253  * tree levels to traverse will virtually always be much smaller (a few levels
2254  * at most).  While holding the tree lock throughout this entire operation is
2255  * less than ideal, so is splitting the latter up by queueing a separate
2256  * prune_tree() run for each node to start pruning from (as queueing requires
2257  * allocating memory and can therefore potentially be exploited to exhaust
2258  * available memory).  Also note that actually freeing up the memory used by
2259  * RBTDB nodes (which is what this function does) is essential to keeping cache
2260  * memory use in check, so since the tree lock needs to be acquired anyway,
2261  * freeing as many nodes as possible before the tree lock gets released is
2262  * prudent.
2263  */
2264 static void
prune_tree(isc_task_t * task,isc_event_t * event)2265 prune_tree(isc_task_t *task, isc_event_t *event) {
2266 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)event->ev_arg;
2267 	dns_rbtnode_t *node = NULL;
2268 	dns_rbtnode_t *parent = NULL;
2269 	unsigned int locknum;
2270 
2271 	UNUSED(task);
2272 
2273 	isc_event_free(&event);
2274 
2275 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2276 
2277 	while ((node = ISC_LIST_HEAD(rbtdb->prunenodes)) != NULL) {
2278 		locknum = node->locknum;
2279 		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
2280 			  isc_rwlocktype_write);
2281 		do {
2282 			if (ISC_LINK_LINKED(node, prunelink)) {
2283 				ISC_LIST_UNLINK(rbtdb->prunenodes, node,
2284 						prunelink);
2285 			}
2286 
2287 			parent = node->parent;
2288 			decrement_reference(rbtdb, node, 0,
2289 					    isc_rwlocktype_write,
2290 					    isc_rwlocktype_write, true);
2291 
2292 			if (parent != NULL && parent->down == NULL) {
2293 				/*
2294 				 * node was the only down child of the parent
2295 				 * and has just been removed.  We'll then need
2296 				 * to examine the parent.  Keep the lock if
2297 				 * possible; otherwise, release the old lock and
2298 				 * acquire one for the parent.
2299 				 */
2300 				if (parent->locknum != locknum) {
2301 					NODE_UNLOCK(
2302 						&rbtdb->node_locks[locknum].lock,
2303 						isc_rwlocktype_write);
2304 					locknum = parent->locknum;
2305 					NODE_LOCK(
2306 						&rbtdb->node_locks[locknum].lock,
2307 						isc_rwlocktype_write);
2308 				}
2309 
2310 				/*
2311 				 * We need to gain a reference to the node
2312 				 * before decrementing it in the next iteration.
2313 				 */
2314 				if (ISC_LINK_LINKED(parent, deadlink)) {
2315 					ISC_LIST_UNLINK(
2316 						rbtdb->deadnodes[locknum],
2317 						parent, deadlink);
2318 				}
2319 				new_reference(rbtdb, parent,
2320 					      isc_rwlocktype_write);
2321 			} else {
2322 				parent = NULL;
2323 			}
2324 
2325 			node = parent;
2326 		} while (node != NULL);
2327 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
2328 			    isc_rwlocktype_write);
2329 	}
2330 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2331 
2332 	detach((dns_db_t **)(void *)&rbtdb);
2333 }
2334 
2335 static void
make_least_version(dns_rbtdb_t * rbtdb,rbtdb_version_t * version,rbtdb_changedlist_t * cleanup_list)2336 make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
2337 		   rbtdb_changedlist_t *cleanup_list) {
2338 	/*
2339 	 * Caller must be holding the database lock.
2340 	 */
2341 
2342 	rbtdb->least_serial = version->serial;
2343 	*cleanup_list = version->changed_list;
2344 	ISC_LIST_INIT(version->changed_list);
2345 }
2346 
2347 static void
cleanup_nondirty(rbtdb_version_t * version,rbtdb_changedlist_t * cleanup_list)2348 cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
2349 	rbtdb_changed_t *changed, *next_changed;
2350 
2351 	/*
2352 	 * If the changed record is dirty, then
2353 	 * an update created multiple versions of
2354 	 * a given rdataset.  We keep this list
2355 	 * until we're the least open version, at
2356 	 * which point it's safe to get rid of any
2357 	 * older versions.
2358 	 *
2359 	 * If the changed record isn't dirty, then
2360 	 * we don't need it anymore since we're
2361 	 * committing and not rolling back.
2362 	 *
2363 	 * The caller must be holding the database lock.
2364 	 */
2365 	for (changed = HEAD(version->changed_list); changed != NULL;
2366 	     changed = next_changed)
2367 	{
2368 		next_changed = NEXT(changed, link);
2369 		if (!changed->dirty) {
2370 			UNLINK(version->changed_list, changed, link);
2371 			APPEND(*cleanup_list, changed, link);
2372 		}
2373 	}
2374 }
2375 
2376 static void
iszonesecure(dns_db_t * db,rbtdb_version_t * version,dns_dbnode_t * origin)2377 iszonesecure(dns_db_t *db, rbtdb_version_t *version, dns_dbnode_t *origin) {
2378 	dns_rdataset_t keyset;
2379 	dns_rdataset_t nsecset, signsecset;
2380 	bool haszonekey = false;
2381 	bool hasnsec = false;
2382 	isc_result_t result;
2383 
2384 	dns_rdataset_init(&keyset);
2385 	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_dnskey,
2386 				     0, 0, &keyset, NULL);
2387 	if (result == ISC_R_SUCCESS) {
2388 		result = dns_rdataset_first(&keyset);
2389 		while (result == ISC_R_SUCCESS) {
2390 			dns_rdata_t keyrdata = DNS_RDATA_INIT;
2391 			dns_rdataset_current(&keyset, &keyrdata);
2392 			if (dns_zonekey_iszonekey(&keyrdata)) {
2393 				haszonekey = true;
2394 				break;
2395 			}
2396 			result = dns_rdataset_next(&keyset);
2397 		}
2398 		dns_rdataset_disassociate(&keyset);
2399 	}
2400 	if (!haszonekey) {
2401 		version->secure = dns_db_insecure;
2402 		version->havensec3 = false;
2403 		return;
2404 	}
2405 
2406 	dns_rdataset_init(&nsecset);
2407 	dns_rdataset_init(&signsecset);
2408 	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_nsec, 0,
2409 				     0, &nsecset, &signsecset);
2410 	if (result == ISC_R_SUCCESS) {
2411 		if (dns_rdataset_isassociated(&signsecset)) {
2412 			hasnsec = true;
2413 			dns_rdataset_disassociate(&signsecset);
2414 		}
2415 		dns_rdataset_disassociate(&nsecset);
2416 	}
2417 
2418 	setnsec3parameters(db, version);
2419 
2420 	/*
2421 	 * Do we have a valid NSEC/NSEC3 chain?
2422 	 */
2423 	if (version->havensec3 || hasnsec) {
2424 		version->secure = dns_db_secure;
2425 	} else {
2426 		version->secure = dns_db_insecure;
2427 	}
2428 }
2429 
2430 /*%<
2431  * Walk the origin node looking for NSEC3PARAM records.
2432  * Cache the nsec3 parameters.
2433  */
2434 static void
setnsec3parameters(dns_db_t * db,rbtdb_version_t * version)2435 setnsec3parameters(dns_db_t *db, rbtdb_version_t *version) {
2436 	dns_rbtnode_t *node;
2437 	dns_rdata_nsec3param_t nsec3param;
2438 	dns_rdata_t rdata = DNS_RDATA_INIT;
2439 	isc_region_t region;
2440 	isc_result_t result;
2441 	rdatasetheader_t *header, *header_next;
2442 	unsigned char *raw; /* RDATASLAB */
2443 	unsigned int count, length;
2444 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2445 
2446 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
2447 	version->havensec3 = false;
2448 	node = rbtdb->origin_node;
2449 	NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2450 		  isc_rwlocktype_read);
2451 	for (header = node->data; header != NULL; header = header_next) {
2452 		header_next = header->next;
2453 		do {
2454 			if (header->serial <= version->serial &&
2455 			    !IGNORE(header))
2456 			{
2457 				if (NONEXISTENT(header)) {
2458 					header = NULL;
2459 				}
2460 				break;
2461 			} else {
2462 				header = header->down;
2463 			}
2464 		} while (header != NULL);
2465 
2466 		if (header != NULL &&
2467 		    (header->type == dns_rdatatype_nsec3param))
2468 		{
2469 			/*
2470 			 * Find A NSEC3PARAM with a supported algorithm.
2471 			 */
2472 			raw = (unsigned char *)header + sizeof(*header);
2473 			count = raw[0] * 256 + raw[1]; /* count */
2474 			raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
2475 			while (count-- > 0U) {
2476 				length = raw[0] * 256 + raw[1];
2477 				raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
2478 				region.base = raw;
2479 				region.length = length;
2480 				raw += length;
2481 				dns_rdata_fromregion(
2482 					&rdata, rbtdb->common.rdclass,
2483 					dns_rdatatype_nsec3param, &region);
2484 				result = dns_rdata_tostruct(&rdata, &nsec3param,
2485 							    NULL);
2486 				INSIST(result == ISC_R_SUCCESS);
2487 				dns_rdata_reset(&rdata);
2488 
2489 				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG &&
2490 				    !dns_nsec3_supportedhash(nsec3param.hash))
2491 				{
2492 					continue;
2493 				}
2494 
2495 				if (nsec3param.flags != 0) {
2496 					continue;
2497 				}
2498 
2499 				memmove(version->salt, nsec3param.salt,
2500 					nsec3param.salt_length);
2501 				version->hash = nsec3param.hash;
2502 				version->salt_length = nsec3param.salt_length;
2503 				version->iterations = nsec3param.iterations;
2504 				version->flags = nsec3param.flags;
2505 				version->havensec3 = true;
2506 				/*
2507 				 * Look for a better algorithm than the
2508 				 * unknown test algorithm.
2509 				 */
2510 				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG) {
2511 					goto unlock;
2512 				}
2513 			}
2514 		}
2515 	}
2516 unlock:
2517 	NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2518 		    isc_rwlocktype_read);
2519 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
2520 }
2521 
2522 static void
cleanup_dead_nodes_callback(isc_task_t * task,isc_event_t * event)2523 cleanup_dead_nodes_callback(isc_task_t *task, isc_event_t *event) {
2524 	dns_rbtdb_t *rbtdb = event->ev_arg;
2525 	bool again = false;
2526 	unsigned int locknum;
2527 
2528 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2529 	for (locknum = 0; locknum < rbtdb->node_lock_count; locknum++) {
2530 		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
2531 			  isc_rwlocktype_write);
2532 		cleanup_dead_nodes(rbtdb, locknum);
2533 		if (ISC_LIST_HEAD(rbtdb->deadnodes[locknum]) != NULL) {
2534 			again = true;
2535 		}
2536 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
2537 			    isc_rwlocktype_write);
2538 	}
2539 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2540 	if (again) {
2541 		isc_task_send(task, &event);
2542 	} else {
2543 		isc_event_free(&event);
2544 		if (isc_refcount_decrement(&rbtdb->references) == 1) {
2545 			(void)isc_refcount_current(&rbtdb->references);
2546 			maybe_free_rbtdb(rbtdb);
2547 		}
2548 	}
2549 }
2550 
2551 static void
closeversion(dns_db_t * db,dns_dbversion_t ** versionp,bool commit)2552 closeversion(dns_db_t *db, dns_dbversion_t **versionp, bool commit) {
2553 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2554 	rbtdb_version_t *version, *cleanup_version, *least_greater;
2555 	bool rollback = false;
2556 	rbtdb_changedlist_t cleanup_list;
2557 	rdatasetheaderlist_t resigned_list;
2558 	rbtdb_changed_t *changed, *next_changed;
2559 	rbtdb_serial_t serial, least_serial;
2560 	dns_rbtnode_t *rbtnode;
2561 	rdatasetheader_t *header;
2562 
2563 	REQUIRE(VALID_RBTDB(rbtdb));
2564 	version = (rbtdb_version_t *)*versionp;
2565 	INSIST(version->rbtdb == rbtdb);
2566 
2567 	cleanup_version = NULL;
2568 	ISC_LIST_INIT(cleanup_list);
2569 	ISC_LIST_INIT(resigned_list);
2570 
2571 	if (isc_refcount_decrement(&version->references) > 1) {
2572 		/* typical and easy case first */
2573 		if (commit) {
2574 			RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
2575 			INSIST(!version->writer);
2576 			RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
2577 		}
2578 		goto end;
2579 	}
2580 
2581 	/*
2582 	 * Update the zone's secure status in version before making
2583 	 * it the current version.
2584 	 */
2585 	if (version->writer && commit && !IS_CACHE(rbtdb)) {
2586 		iszonesecure(db, version, rbtdb->origin_node);
2587 	}
2588 
2589 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
2590 	serial = version->serial;
2591 	if (version->writer) {
2592 		if (commit) {
2593 			unsigned cur_ref;
2594 			rbtdb_version_t *cur_version;
2595 
2596 			INSIST(version->commit_ok);
2597 			INSIST(version == rbtdb->future_version);
2598 			/*
2599 			 * The current version is going to be replaced.
2600 			 * Release the (likely last) reference to it from the
2601 			 * DB itself and unlink it from the open list.
2602 			 */
2603 			cur_version = rbtdb->current_version;
2604 			cur_ref = isc_refcount_decrement(
2605 				&cur_version->references);
2606 			if (cur_ref == 1) {
2607 				(void)isc_refcount_current(
2608 					&cur_version->references);
2609 				if (cur_version->serial == rbtdb->least_serial)
2610 				{
2611 					INSIST(EMPTY(
2612 						cur_version->changed_list));
2613 				}
2614 				UNLINK(rbtdb->open_versions, cur_version, link);
2615 			}
2616 			if (EMPTY(rbtdb->open_versions)) {
2617 				/*
2618 				 * We're going to become the least open
2619 				 * version.
2620 				 */
2621 				make_least_version(rbtdb, version,
2622 						   &cleanup_list);
2623 			} else {
2624 				/*
2625 				 * Some other open version is the
2626 				 * least version.  We can't cleanup
2627 				 * records that were changed in this
2628 				 * version because the older versions
2629 				 * may still be in use by an open
2630 				 * version.
2631 				 *
2632 				 * We can, however, discard the
2633 				 * changed records for things that
2634 				 * we've added that didn't exist in
2635 				 * prior versions.
2636 				 */
2637 				cleanup_nondirty(version, &cleanup_list);
2638 			}
2639 			/*
2640 			 * If the (soon to be former) current version
2641 			 * isn't being used by anyone, we can clean
2642 			 * it up.
2643 			 */
2644 			if (cur_ref == 1) {
2645 				cleanup_version = cur_version;
2646 				APPENDLIST(version->changed_list,
2647 					   cleanup_version->changed_list, link);
2648 			}
2649 			/*
2650 			 * Become the current version.
2651 			 */
2652 			version->writer = false;
2653 			rbtdb->current_version = version;
2654 			rbtdb->current_serial = version->serial;
2655 			rbtdb->future_version = NULL;
2656 
2657 			/*
2658 			 * Keep the current version in the open list, and
2659 			 * gain a reference for the DB itself (see the DB
2660 			 * creation function below).  This must be the only
2661 			 * case where we need to increment the counter from
2662 			 * zero and need to use isc_refcount_increment0().
2663 			 */
2664 			INSIST(isc_refcount_increment0(&version->references) ==
2665 			       0);
2666 			PREPEND(rbtdb->open_versions, rbtdb->current_version,
2667 				link);
2668 			resigned_list = version->resigned_list;
2669 			ISC_LIST_INIT(version->resigned_list);
2670 		} else {
2671 			/*
2672 			 * We're rolling back this transaction.
2673 			 */
2674 			cleanup_list = version->changed_list;
2675 			ISC_LIST_INIT(version->changed_list);
2676 			resigned_list = version->resigned_list;
2677 			ISC_LIST_INIT(version->resigned_list);
2678 			rollback = true;
2679 			cleanup_version = version;
2680 			rbtdb->future_version = NULL;
2681 		}
2682 	} else {
2683 		if (version != rbtdb->current_version) {
2684 			/*
2685 			 * There are no external or internal references
2686 			 * to this version and it can be cleaned up.
2687 			 */
2688 			cleanup_version = version;
2689 
2690 			/*
2691 			 * Find the version with the least serial
2692 			 * number greater than ours.
2693 			 */
2694 			least_greater = PREV(version, link);
2695 			if (least_greater == NULL) {
2696 				least_greater = rbtdb->current_version;
2697 			}
2698 
2699 			INSIST(version->serial < least_greater->serial);
2700 			/*
2701 			 * Is this the least open version?
2702 			 */
2703 			if (version->serial == rbtdb->least_serial) {
2704 				/*
2705 				 * Yes.  Install the new least open
2706 				 * version.
2707 				 */
2708 				make_least_version(rbtdb, least_greater,
2709 						   &cleanup_list);
2710 			} else {
2711 				/*
2712 				 * Add any unexecuted cleanups to
2713 				 * those of the least greater version.
2714 				 */
2715 				APPENDLIST(least_greater->changed_list,
2716 					   version->changed_list, link);
2717 			}
2718 		} else if (version->serial == rbtdb->least_serial) {
2719 			INSIST(EMPTY(version->changed_list));
2720 		}
2721 		UNLINK(rbtdb->open_versions, version, link);
2722 	}
2723 	least_serial = rbtdb->least_serial;
2724 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
2725 
2726 	if (cleanup_version != NULL) {
2727 		INSIST(EMPTY(cleanup_version->changed_list));
2728 		free_gluetable(cleanup_version);
2729 		isc_rwlock_destroy(&cleanup_version->glue_rwlock);
2730 		isc_rwlock_destroy(&cleanup_version->rwlock);
2731 		isc_mem_put(rbtdb->common.mctx, cleanup_version,
2732 			    sizeof(*cleanup_version));
2733 	}
2734 
2735 	/*
2736 	 * Commit/rollback re-signed headers.
2737 	 */
2738 	for (header = HEAD(resigned_list); header != NULL;
2739 	     header = HEAD(resigned_list))
2740 	{
2741 		nodelock_t *lock;
2742 
2743 		ISC_LIST_UNLINK(resigned_list, header, link);
2744 
2745 		lock = &rbtdb->node_locks[header->node->locknum].lock;
2746 		NODE_LOCK(lock, isc_rwlocktype_write);
2747 		if (rollback && !IGNORE(header)) {
2748 			resign_insert(rbtdb, header->node->locknum, header);
2749 		}
2750 		decrement_reference(rbtdb, header->node, least_serial,
2751 				    isc_rwlocktype_write, isc_rwlocktype_none,
2752 				    false);
2753 		NODE_UNLOCK(lock, isc_rwlocktype_write);
2754 	}
2755 
2756 	if (!EMPTY(cleanup_list)) {
2757 		isc_event_t *event = NULL;
2758 		isc_rwlocktype_t tlock = isc_rwlocktype_none;
2759 
2760 		if (rbtdb->task != NULL) {
2761 			event = isc_event_allocate(rbtdb->common.mctx, NULL,
2762 						   DNS_EVENT_RBTDEADNODES,
2763 						   cleanup_dead_nodes_callback,
2764 						   rbtdb, sizeof(isc_event_t));
2765 		}
2766 		if (event == NULL) {
2767 			/*
2768 			 * We acquire a tree write lock here in order to make
2769 			 * sure that stale nodes will be removed in
2770 			 * decrement_reference().  If we didn't have the lock,
2771 			 * those nodes could miss the chance to be removed
2772 			 * until the server stops.  The write lock is
2773 			 * expensive, but this event should be rare enough
2774 			 * to justify the cost.
2775 			 */
2776 			RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2777 			tlock = isc_rwlocktype_write;
2778 		}
2779 
2780 		for (changed = HEAD(cleanup_list); changed != NULL;
2781 		     changed = next_changed)
2782 		{
2783 			nodelock_t *lock;
2784 
2785 			next_changed = NEXT(changed, link);
2786 			rbtnode = changed->node;
2787 			lock = &rbtdb->node_locks[rbtnode->locknum].lock;
2788 
2789 			NODE_LOCK(lock, isc_rwlocktype_write);
2790 			/*
2791 			 * This is a good opportunity to purge any dead nodes,
2792 			 * so use it.
2793 			 */
2794 			if (event == NULL) {
2795 				cleanup_dead_nodes(rbtdb, rbtnode->locknum);
2796 			}
2797 
2798 			if (rollback) {
2799 				rollback_node(rbtnode, serial);
2800 			}
2801 			decrement_reference(rbtdb, rbtnode, least_serial,
2802 					    isc_rwlocktype_write, tlock, false);
2803 
2804 			NODE_UNLOCK(lock, isc_rwlocktype_write);
2805 
2806 			isc_mem_put(rbtdb->common.mctx, changed,
2807 				    sizeof(*changed));
2808 		}
2809 		if (event != NULL) {
2810 			isc_refcount_increment(&rbtdb->references);
2811 			isc_task_send(rbtdb->task, &event);
2812 		} else {
2813 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2814 		}
2815 	}
2816 
2817 end:
2818 	*versionp = NULL;
2819 }
2820 
2821 /*
2822  * Add the necessary magic for the wildcard name 'name'
2823  * to be found in 'rbtdb'.
2824  *
2825  * In order for wildcard matching to work correctly in
2826  * zone_find(), we must ensure that a node for the wildcarding
2827  * level exists in the database, and has its 'find_callback'
2828  * and 'wild' bits set.
2829  *
2830  * E.g. if the wildcard name is "*.sub.example." then we
2831  * must ensure that "sub.example." exists and is marked as
2832  * a wildcard level.
2833  *
2834  * tree_lock(write) must be held.
2835  */
2836 static isc_result_t
add_wildcard_magic(dns_rbtdb_t * rbtdb,const dns_name_t * name,bool lock)2837 add_wildcard_magic(dns_rbtdb_t *rbtdb, const dns_name_t *name, bool lock) {
2838 	isc_result_t result;
2839 	dns_name_t foundname;
2840 	dns_offsets_t offsets;
2841 	unsigned int n;
2842 	dns_rbtnode_t *node = NULL;
2843 
2844 	dns_name_init(&foundname, offsets);
2845 	n = dns_name_countlabels(name);
2846 	INSIST(n >= 2);
2847 	n--;
2848 	dns_name_getlabelsequence(name, 1, n, &foundname);
2849 	result = dns_rbt_addnode(rbtdb->tree, &foundname, &node);
2850 	if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
2851 		return (result);
2852 	}
2853 	if (result == ISC_R_SUCCESS) {
2854 		node->nsec = DNS_RBT_NSEC_NORMAL;
2855 	}
2856 	node->find_callback = 1;
2857 	if (lock) {
2858 		NODE_LOCK(&rbtdb->node_locks[node->locknum].lock,
2859 			  isc_rwlocktype_write);
2860 	}
2861 	node->wild = 1;
2862 	if (lock) {
2863 		NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
2864 			    isc_rwlocktype_write);
2865 	}
2866 	return (ISC_R_SUCCESS);
2867 }
2868 
2869 /*
2870  * tree_lock(write) must be held.
2871  */
2872 static isc_result_t
add_empty_wildcards(dns_rbtdb_t * rbtdb,const dns_name_t * name,bool lock)2873 add_empty_wildcards(dns_rbtdb_t *rbtdb, const dns_name_t *name, bool lock) {
2874 	isc_result_t result;
2875 	dns_name_t foundname;
2876 	dns_offsets_t offsets;
2877 	unsigned int n, l, i;
2878 
2879 	dns_name_init(&foundname, offsets);
2880 	n = dns_name_countlabels(name);
2881 	l = dns_name_countlabels(&rbtdb->common.origin);
2882 	i = l + 1;
2883 	while (i < n) {
2884 		dns_rbtnode_t *node = NULL; /* dummy */
2885 		dns_name_getlabelsequence(name, n - i, i, &foundname);
2886 		if (dns_name_iswildcard(&foundname)) {
2887 			result = add_wildcard_magic(rbtdb, &foundname, lock);
2888 			if (result != ISC_R_SUCCESS) {
2889 				return (result);
2890 			}
2891 			result = dns_rbt_addnode(rbtdb->tree, &foundname,
2892 						 &node);
2893 			if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
2894 				return (result);
2895 			}
2896 			if (result == ISC_R_SUCCESS) {
2897 				node->nsec = DNS_RBT_NSEC_NORMAL;
2898 			}
2899 		}
2900 		i++;
2901 	}
2902 	return (ISC_R_SUCCESS);
2903 }
2904 
2905 static isc_result_t
findnodeintree(dns_rbtdb_t * rbtdb,dns_rbt_t * tree,const dns_name_t * name,bool create,dns_dbnode_t ** nodep)2906 findnodeintree(dns_rbtdb_t *rbtdb, dns_rbt_t *tree, const dns_name_t *name,
2907 	       bool create, dns_dbnode_t **nodep) {
2908 	dns_rbtnode_t *node = NULL;
2909 	dns_name_t nodename;
2910 	isc_result_t result;
2911 	isc_rwlocktype_t locktype = isc_rwlocktype_read;
2912 
2913 	INSIST(tree == rbtdb->tree || tree == rbtdb->nsec3);
2914 
2915 	dns_name_init(&nodename, NULL);
2916 	RWLOCK(&rbtdb->tree_lock, locktype);
2917 	result = dns_rbt_findnode(tree, name, NULL, &node, NULL,
2918 				  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
2919 	if (result != ISC_R_SUCCESS) {
2920 		RWUNLOCK(&rbtdb->tree_lock, locktype);
2921 		if (!create) {
2922 			if (result == DNS_R_PARTIALMATCH) {
2923 				result = ISC_R_NOTFOUND;
2924 			}
2925 			return (result);
2926 		}
2927 		/*
2928 		 * It would be nice to try to upgrade the lock instead of
2929 		 * unlocking then relocking.
2930 		 */
2931 		locktype = isc_rwlocktype_write;
2932 		RWLOCK(&rbtdb->tree_lock, locktype);
2933 		node = NULL;
2934 		result = dns_rbt_addnode(tree, name, &node);
2935 		if (result == ISC_R_SUCCESS) {
2936 			dns_rbt_namefromnode(node, &nodename);
2937 			node->locknum = node->hashval % rbtdb->node_lock_count;
2938 			if (tree == rbtdb->tree) {
2939 				add_empty_wildcards(rbtdb, name, true);
2940 
2941 				if (dns_name_iswildcard(name)) {
2942 					result = add_wildcard_magic(rbtdb, name,
2943 								    true);
2944 					if (result != ISC_R_SUCCESS) {
2945 						RWUNLOCK(&rbtdb->tree_lock,
2946 							 locktype);
2947 						return (result);
2948 					}
2949 				}
2950 			}
2951 			if (tree == rbtdb->nsec3) {
2952 				node->nsec = DNS_RBT_NSEC_NSEC3;
2953 			}
2954 		} else if (result != ISC_R_EXISTS) {
2955 			RWUNLOCK(&rbtdb->tree_lock, locktype);
2956 			return (result);
2957 		}
2958 	}
2959 
2960 	if (tree == rbtdb->nsec3) {
2961 		INSIST(node->nsec == DNS_RBT_NSEC_NSEC3);
2962 	}
2963 
2964 	reactivate_node(rbtdb, node, locktype);
2965 
2966 	RWUNLOCK(&rbtdb->tree_lock, locktype);
2967 
2968 	*nodep = (dns_dbnode_t *)node;
2969 
2970 	return (ISC_R_SUCCESS);
2971 }
2972 
2973 static isc_result_t
findnode(dns_db_t * db,const dns_name_t * name,bool create,dns_dbnode_t ** nodep)2974 findnode(dns_db_t *db, const dns_name_t *name, bool create,
2975 	 dns_dbnode_t **nodep) {
2976 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2977 
2978 	REQUIRE(VALID_RBTDB(rbtdb));
2979 
2980 	return (findnodeintree(rbtdb, rbtdb->tree, name, create, nodep));
2981 }
2982 
2983 static isc_result_t
findnsec3node(dns_db_t * db,const dns_name_t * name,bool create,dns_dbnode_t ** nodep)2984 findnsec3node(dns_db_t *db, const dns_name_t *name, bool create,
2985 	      dns_dbnode_t **nodep) {
2986 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2987 
2988 	REQUIRE(VALID_RBTDB(rbtdb));
2989 
2990 	return (findnodeintree(rbtdb, rbtdb->nsec3, name, create, nodep));
2991 }
2992 
2993 static isc_result_t
zone_zonecut_callback(dns_rbtnode_t * node,dns_name_t * name,void * arg)2994 zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
2995 	rbtdb_search_t *search = arg;
2996 	rdatasetheader_t *header, *header_next;
2997 	rdatasetheader_t *dname_header, *sigdname_header, *ns_header;
2998 	rdatasetheader_t *found;
2999 	isc_result_t result;
3000 	dns_rbtnode_t *onode;
3001 
3002 	/*
3003 	 * We only want to remember the topmost zone cut, since it's the one
3004 	 * that counts, so we'll just continue if we've already found a
3005 	 * zonecut.
3006 	 */
3007 	if (search->zonecut != NULL) {
3008 		return (DNS_R_CONTINUE);
3009 	}
3010 
3011 	found = NULL;
3012 	result = DNS_R_CONTINUE;
3013 	onode = search->rbtdb->origin_node;
3014 
3015 	NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3016 		  isc_rwlocktype_read);
3017 
3018 	/*
3019 	 * Look for an NS or DNAME rdataset active in our version.
3020 	 */
3021 	ns_header = NULL;
3022 	dname_header = NULL;
3023 	sigdname_header = NULL;
3024 	for (header = node->data; header != NULL; header = header_next) {
3025 		header_next = header->next;
3026 		if (header->type == dns_rdatatype_ns ||
3027 		    header->type == dns_rdatatype_dname ||
3028 		    header->type == RBTDB_RDATATYPE_SIGDNAME)
3029 		{
3030 			do {
3031 				if (header->serial <= search->serial &&
3032 				    !IGNORE(header))
3033 				{
3034 					/*
3035 					 * Is this a "this rdataset doesn't
3036 					 * exist" record?
3037 					 */
3038 					if (NONEXISTENT(header)) {
3039 						header = NULL;
3040 					}
3041 					break;
3042 				} else {
3043 					header = header->down;
3044 				}
3045 			} while (header != NULL);
3046 			if (header != NULL) {
3047 				if (header->type == dns_rdatatype_dname) {
3048 					dname_header = header;
3049 				} else if (header->type ==
3050 					   RBTDB_RDATATYPE_SIGDNAME)
3051 				{
3052 					sigdname_header = header;
3053 				} else if (node != onode ||
3054 					   IS_STUB(search->rbtdb))
3055 				{
3056 					/*
3057 					 * We've found an NS rdataset that
3058 					 * isn't at the origin node.  We check
3059 					 * that they're not at the origin node,
3060 					 * because otherwise we'd erroneously
3061 					 * treat the zone top as if it were
3062 					 * a delegation.
3063 					 */
3064 					ns_header = header;
3065 				}
3066 			}
3067 		}
3068 	}
3069 
3070 	/*
3071 	 * Did we find anything?
3072 	 */
3073 	if (!IS_CACHE(search->rbtdb) && !IS_STUB(search->rbtdb) &&
3074 	    ns_header != NULL)
3075 	{
3076 		/*
3077 		 * Note that NS has precedence over DNAME if both exist
3078 		 * in a zone.  Otherwise DNAME take precedence over NS.
3079 		 */
3080 		found = ns_header;
3081 		search->zonecut_sigrdataset = NULL;
3082 	} else if (dname_header != NULL) {
3083 		found = dname_header;
3084 		search->zonecut_sigrdataset = sigdname_header;
3085 	} else if (ns_header != NULL) {
3086 		found = ns_header;
3087 		search->zonecut_sigrdataset = NULL;
3088 	}
3089 
3090 	if (found != NULL) {
3091 		/*
3092 		 * We increment the reference count on node to ensure that
3093 		 * search->zonecut_rdataset will still be valid later.
3094 		 */
3095 		new_reference(search->rbtdb, node, isc_rwlocktype_read);
3096 		search->zonecut = node;
3097 		search->zonecut_rdataset = found;
3098 		search->need_cleanup = true;
3099 		/*
3100 		 * Since we've found a zonecut, anything beneath it is
3101 		 * glue and is not subject to wildcard matching, so we
3102 		 * may clear search->wild.
3103 		 */
3104 		search->wild = false;
3105 		if ((search->options & DNS_DBFIND_GLUEOK) == 0) {
3106 			/*
3107 			 * If the caller does not want to find glue, then
3108 			 * this is the best answer and the search should
3109 			 * stop now.
3110 			 */
3111 			result = DNS_R_PARTIALMATCH;
3112 		} else {
3113 			dns_name_t *zcname;
3114 
3115 			/*
3116 			 * The search will continue beneath the zone cut.
3117 			 * This may or may not be the best match.  In case it
3118 			 * is, we need to remember the node name.
3119 			 */
3120 			zcname = dns_fixedname_name(&search->zonecut_name);
3121 			dns_name_copynf(name, zcname);
3122 			search->copy_name = true;
3123 		}
3124 	} else {
3125 		/*
3126 		 * There is no zonecut at this node which is active in this
3127 		 * version.
3128 		 *
3129 		 * If this is a "wild" node and the caller hasn't disabled
3130 		 * wildcard matching, remember that we've seen a wild node
3131 		 * in case we need to go searching for wildcard matches
3132 		 * later on.
3133 		 */
3134 		if (node->wild && (search->options & DNS_DBFIND_NOWILD) == 0) {
3135 			search->wild = true;
3136 		}
3137 	}
3138 
3139 	NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3140 		    isc_rwlocktype_read);
3141 
3142 	return (result);
3143 }
3144 
3145 static void
bind_rdataset(dns_rbtdb_t * rbtdb,dns_rbtnode_t * node,rdatasetheader_t * header,isc_stdtime_t now,isc_rwlocktype_t locktype,dns_rdataset_t * rdataset)3146 bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rdatasetheader_t *header,
3147 	      isc_stdtime_t now, isc_rwlocktype_t locktype,
3148 	      dns_rdataset_t *rdataset) {
3149 	unsigned char *raw; /* RDATASLAB */
3150 	bool stale = STALE(header);
3151 	bool ancient = ANCIENT(header);
3152 
3153 	/*
3154 	 * Caller must be holding the node reader lock.
3155 	 * XXXJT: technically, we need a writer lock, since we'll increment
3156 	 * the header count below.  However, since the actual counter value
3157 	 * doesn't matter, we prioritize performance here.  (We may want to
3158 	 * use atomic increment when available).
3159 	 */
3160 
3161 	if (rdataset == NULL) {
3162 		return;
3163 	}
3164 
3165 	new_reference(rbtdb, node, locktype);
3166 
3167 	INSIST(rdataset->methods == NULL); /* We must be disassociated. */
3168 
3169 	/*
3170 	 * Mark header stale or ancient if the RRset is no longer active.
3171 	 */
3172 	if (!ACTIVE(header, now)) {
3173 		dns_ttl_t stale_ttl = header->rdh_ttl + rbtdb->serve_stale_ttl;
3174 		/*
3175 		 * If this data is in the stale window keep it and if
3176 		 * DNS_DBFIND_STALEOK is not set we tell the caller to
3177 		 * skip this record.  We skip the records with ZEROTTL
3178 		 * (these records should not be cached anyway).
3179 		 */
3180 
3181 		if (KEEPSTALE(rbtdb) && stale_ttl > now) {
3182 			stale = true;
3183 		} else {
3184 			/*
3185 			 * We are not keeping stale, or it is outside the
3186 			 * stale window. Mark ancient, i.e. ready for cleanup.
3187 			 */
3188 			ancient = true;
3189 		}
3190 	}
3191 
3192 	rdataset->methods = &rdataset_methods;
3193 	rdataset->rdclass = rbtdb->common.rdclass;
3194 	rdataset->type = RBTDB_RDATATYPE_BASE(header->type);
3195 	rdataset->covers = RBTDB_RDATATYPE_EXT(header->type);
3196 	rdataset->ttl = header->rdh_ttl - now;
3197 	rdataset->trust = header->trust;
3198 
3199 	if (NEGATIVE(header)) {
3200 		rdataset->attributes |= DNS_RDATASETATTR_NEGATIVE;
3201 	}
3202 	if (NXDOMAIN(header)) {
3203 		rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
3204 	}
3205 	if (OPTOUT(header)) {
3206 		rdataset->attributes |= DNS_RDATASETATTR_OPTOUT;
3207 	}
3208 	if (PREFETCH(header)) {
3209 		rdataset->attributes |= DNS_RDATASETATTR_PREFETCH;
3210 	}
3211 
3212 	if (stale && !ancient) {
3213 		dns_ttl_t stale_ttl = header->rdh_ttl + rbtdb->serve_stale_ttl;
3214 		if (stale_ttl > now) {
3215 			rdataset->ttl = stale_ttl - now;
3216 		} else {
3217 			rdataset->ttl = 0;
3218 		}
3219 		if (STALE_WINDOW(header)) {
3220 			rdataset->attributes |= DNS_RDATASETATTR_STALE_WINDOW;
3221 		}
3222 		rdataset->attributes |= DNS_RDATASETATTR_STALE;
3223 	} else if (IS_CACHE(rbtdb) && !ACTIVE(header, now)) {
3224 		rdataset->attributes |= DNS_RDATASETATTR_ANCIENT;
3225 		rdataset->ttl = header->rdh_ttl;
3226 	}
3227 
3228 	rdataset->private1 = rbtdb;
3229 	rdataset->private2 = node;
3230 	raw = (unsigned char *)header + sizeof(*header);
3231 	rdataset->private3 = raw;
3232 	rdataset->count = atomic_fetch_add_relaxed(&header->count, 1);
3233 	if (rdataset->count == UINT32_MAX) {
3234 		rdataset->count = 0;
3235 	}
3236 
3237 	/*
3238 	 * Reset iterator state.
3239 	 */
3240 	rdataset->privateuint4 = 0;
3241 	rdataset->private5 = NULL;
3242 
3243 	/*
3244 	 * Add noqname proof.
3245 	 */
3246 	rdataset->private6 = header->noqname;
3247 	if (rdataset->private6 != NULL) {
3248 		rdataset->attributes |= DNS_RDATASETATTR_NOQNAME;
3249 	}
3250 	rdataset->private7 = header->closest;
3251 	if (rdataset->private7 != NULL) {
3252 		rdataset->attributes |= DNS_RDATASETATTR_CLOSEST;
3253 	}
3254 
3255 	/*
3256 	 * Copy out re-signing information.
3257 	 */
3258 	if (RESIGN(header)) {
3259 		rdataset->attributes |= DNS_RDATASETATTR_RESIGN;
3260 		rdataset->resign = (header->resign << 1) | header->resign_lsb;
3261 	} else {
3262 		rdataset->resign = 0;
3263 	}
3264 }
3265 
3266 static isc_result_t
setup_delegation(rbtdb_search_t * search,dns_dbnode_t ** nodep,dns_name_t * foundname,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset)3267 setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
3268 		 dns_name_t *foundname, dns_rdataset_t *rdataset,
3269 		 dns_rdataset_t *sigrdataset) {
3270 	dns_name_t *zcname;
3271 	rbtdb_rdatatype_t type;
3272 	dns_rbtnode_t *node;
3273 
3274 	/*
3275 	 * The caller MUST NOT be holding any node locks.
3276 	 */
3277 
3278 	node = search->zonecut;
3279 	type = search->zonecut_rdataset->type;
3280 
3281 	/*
3282 	 * If we have to set foundname, we do it before anything else.
3283 	 * If we were to set foundname after we had set nodep or bound the
3284 	 * rdataset, then we'd have to undo that work if dns_name_copy()
3285 	 * failed.  By setting foundname first, there's nothing to undo if
3286 	 * we have trouble.
3287 	 */
3288 	if (foundname != NULL && search->copy_name) {
3289 		zcname = dns_fixedname_name(&search->zonecut_name);
3290 		dns_name_copynf(zcname, foundname);
3291 	}
3292 	if (nodep != NULL) {
3293 		/*
3294 		 * Note that we don't have to increment the node's reference
3295 		 * count here because we're going to use the reference we
3296 		 * already have in the search block.
3297 		 */
3298 		*nodep = node;
3299 		search->need_cleanup = false;
3300 	}
3301 	if (rdataset != NULL) {
3302 		NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3303 			  isc_rwlocktype_read);
3304 		bind_rdataset(search->rbtdb, node, search->zonecut_rdataset,
3305 			      search->now, isc_rwlocktype_read, rdataset);
3306 		if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL)
3307 		{
3308 			bind_rdataset(search->rbtdb, node,
3309 				      search->zonecut_sigrdataset, search->now,
3310 				      isc_rwlocktype_read, sigrdataset);
3311 		}
3312 		NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3313 			    isc_rwlocktype_read);
3314 	}
3315 
3316 	if (type == dns_rdatatype_dname) {
3317 		return (DNS_R_DNAME);
3318 	}
3319 	return (DNS_R_DELEGATION);
3320 }
3321 
3322 static bool
valid_glue(rbtdb_search_t * search,dns_name_t * name,rbtdb_rdatatype_t type,dns_rbtnode_t * node)3323 valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type,
3324 	   dns_rbtnode_t *node) {
3325 	unsigned char *raw; /* RDATASLAB */
3326 	unsigned int count, size;
3327 	dns_name_t ns_name;
3328 	bool valid = false;
3329 	dns_offsets_t offsets;
3330 	isc_region_t region;
3331 	rdatasetheader_t *header;
3332 
3333 	/*
3334 	 * No additional locking is required.
3335 	 */
3336 
3337 	/*
3338 	 * Valid glue types are A, AAAA, A6.  NS is also a valid glue type
3339 	 * if it occurs at a zone cut, but is not valid below it.
3340 	 */
3341 	if (type == dns_rdatatype_ns) {
3342 		if (node != search->zonecut) {
3343 			return (false);
3344 		}
3345 	} else if (type != dns_rdatatype_a && type != dns_rdatatype_aaaa &&
3346 		   type != dns_rdatatype_a6)
3347 	{
3348 		return (false);
3349 	}
3350 
3351 	header = search->zonecut_rdataset;
3352 	raw = (unsigned char *)header + sizeof(*header);
3353 	count = raw[0] * 256 + raw[1];
3354 	raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
3355 
3356 	while (count > 0) {
3357 		count--;
3358 		size = raw[0] * 256 + raw[1];
3359 		raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
3360 		region.base = raw;
3361 		region.length = size;
3362 		raw += size;
3363 		/*
3364 		 * XXX Until we have rdata structures, we have no choice but
3365 		 * to directly access the rdata format.
3366 		 */
3367 		dns_name_init(&ns_name, offsets);
3368 		dns_name_fromregion(&ns_name, &region);
3369 		if (dns_name_compare(&ns_name, name) == 0) {
3370 			valid = true;
3371 			break;
3372 		}
3373 	}
3374 
3375 	return (valid);
3376 }
3377 
3378 static bool
activeempty(rbtdb_search_t * search,dns_rbtnodechain_t * chain,const dns_name_t * name)3379 activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain,
3380 	    const dns_name_t *name) {
3381 	dns_fixedname_t fnext;
3382 	dns_fixedname_t forigin;
3383 	dns_name_t *next;
3384 	dns_name_t *origin;
3385 	dns_name_t prefix;
3386 	dns_rbtdb_t *rbtdb;
3387 	dns_rbtnode_t *node;
3388 	isc_result_t result;
3389 	bool answer = false;
3390 	rdatasetheader_t *header;
3391 
3392 	rbtdb = search->rbtdb;
3393 
3394 	dns_name_init(&prefix, NULL);
3395 	next = dns_fixedname_initname(&fnext);
3396 	origin = dns_fixedname_initname(&forigin);
3397 
3398 	result = dns_rbtnodechain_next(chain, NULL, NULL);
3399 	while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
3400 		node = NULL;
3401 		result = dns_rbtnodechain_current(chain, &prefix, origin,
3402 						  &node);
3403 		if (result != ISC_R_SUCCESS) {
3404 			break;
3405 		}
3406 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3407 			  isc_rwlocktype_read);
3408 		for (header = node->data; header != NULL; header = header->next)
3409 		{
3410 			if (header->serial <= search->serial &&
3411 			    !IGNORE(header) && EXISTS(header))
3412 			{
3413 				break;
3414 			}
3415 		}
3416 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3417 			    isc_rwlocktype_read);
3418 		if (header != NULL) {
3419 			break;
3420 		}
3421 		result = dns_rbtnodechain_next(chain, NULL, NULL);
3422 	}
3423 	if (result == ISC_R_SUCCESS) {
3424 		result = dns_name_concatenate(&prefix, origin, next, NULL);
3425 	}
3426 	if (result == ISC_R_SUCCESS && dns_name_issubdomain(next, name)) {
3427 		answer = true;
3428 	}
3429 	return (answer);
3430 }
3431 
3432 static bool
activeemptynode(rbtdb_search_t * search,const dns_name_t * qname,dns_name_t * wname)3433 activeemptynode(rbtdb_search_t *search, const dns_name_t *qname,
3434 		dns_name_t *wname) {
3435 	dns_fixedname_t fnext;
3436 	dns_fixedname_t forigin;
3437 	dns_fixedname_t fprev;
3438 	dns_name_t *next;
3439 	dns_name_t *origin;
3440 	dns_name_t *prev;
3441 	dns_name_t name;
3442 	dns_name_t rname;
3443 	dns_name_t tname;
3444 	dns_rbtdb_t *rbtdb;
3445 	dns_rbtnode_t *node;
3446 	dns_rbtnodechain_t chain;
3447 	bool check_next = true;
3448 	bool check_prev = true;
3449 	bool answer = false;
3450 	isc_result_t result;
3451 	rdatasetheader_t *header;
3452 	unsigned int n;
3453 
3454 	rbtdb = search->rbtdb;
3455 
3456 	dns_name_init(&name, NULL);
3457 	dns_name_init(&tname, NULL);
3458 	dns_name_init(&rname, NULL);
3459 	next = dns_fixedname_initname(&fnext);
3460 	prev = dns_fixedname_initname(&fprev);
3461 	origin = dns_fixedname_initname(&forigin);
3462 
3463 	/*
3464 	 * Find if qname is at or below a empty node.
3465 	 * Use our own copy of the chain.
3466 	 */
3467 
3468 	chain = search->chain;
3469 	do {
3470 		node = NULL;
3471 		result = dns_rbtnodechain_current(&chain, &name, origin, &node);
3472 		if (result != ISC_R_SUCCESS) {
3473 			break;
3474 		}
3475 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3476 			  isc_rwlocktype_read);
3477 		for (header = node->data; header != NULL; header = header->next)
3478 		{
3479 			if (header->serial <= search->serial &&
3480 			    !IGNORE(header) && EXISTS(header))
3481 			{
3482 				break;
3483 			}
3484 		}
3485 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3486 			    isc_rwlocktype_read);
3487 		if (header != NULL) {
3488 			break;
3489 		}
3490 		result = dns_rbtnodechain_prev(&chain, NULL, NULL);
3491 	} while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN);
3492 	if (result == ISC_R_SUCCESS) {
3493 		result = dns_name_concatenate(&name, origin, prev, NULL);
3494 	}
3495 	if (result != ISC_R_SUCCESS) {
3496 		check_prev = false;
3497 	}
3498 
3499 	result = dns_rbtnodechain_next(&chain, NULL, NULL);
3500 	while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
3501 		node = NULL;
3502 		result = dns_rbtnodechain_current(&chain, &name, origin, &node);
3503 		if (result != ISC_R_SUCCESS) {
3504 			break;
3505 		}
3506 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3507 			  isc_rwlocktype_read);
3508 		for (header = node->data; header != NULL; header = header->next)
3509 		{
3510 			if (header->serial <= search->serial &&
3511 			    !IGNORE(header) && EXISTS(header))
3512 			{
3513 				break;
3514 			}
3515 		}
3516 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3517 			    isc_rwlocktype_read);
3518 		if (header != NULL) {
3519 			break;
3520 		}
3521 		result = dns_rbtnodechain_next(&chain, NULL, NULL);
3522 	}
3523 	if (result == ISC_R_SUCCESS) {
3524 		result = dns_name_concatenate(&name, origin, next, NULL);
3525 	}
3526 	if (result != ISC_R_SUCCESS) {
3527 		check_next = false;
3528 	}
3529 
3530 	dns_name_clone(qname, &rname);
3531 
3532 	/*
3533 	 * Remove the wildcard label to find the terminal name.
3534 	 */
3535 	n = dns_name_countlabels(wname);
3536 	dns_name_getlabelsequence(wname, 1, n - 1, &tname);
3537 
3538 	do {
3539 		if ((check_prev && dns_name_issubdomain(prev, &rname)) ||
3540 		    (check_next && dns_name_issubdomain(next, &rname)))
3541 		{
3542 			answer = true;
3543 			break;
3544 		}
3545 		/*
3546 		 * Remove the left hand label.
3547 		 */
3548 		n = dns_name_countlabels(&rname);
3549 		dns_name_getlabelsequence(&rname, 1, n - 1, &rname);
3550 	} while (!dns_name_equal(&rname, &tname));
3551 	return (answer);
3552 }
3553 
3554 static isc_result_t
find_wildcard(rbtdb_search_t * search,dns_rbtnode_t ** nodep,const dns_name_t * qname)3555 find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep,
3556 	      const dns_name_t *qname) {
3557 	unsigned int i, j;
3558 	dns_rbtnode_t *node, *level_node, *wnode;
3559 	rdatasetheader_t *header;
3560 	isc_result_t result = ISC_R_NOTFOUND;
3561 	dns_name_t name;
3562 	dns_name_t *wname;
3563 	dns_fixedname_t fwname;
3564 	dns_rbtdb_t *rbtdb;
3565 	bool done, wild, active;
3566 	dns_rbtnodechain_t wchain;
3567 
3568 	/*
3569 	 * Caller must be holding the tree lock and MUST NOT be holding
3570 	 * any node locks.
3571 	 */
3572 
3573 	/*
3574 	 * Examine each ancestor level.  If the level's wild bit
3575 	 * is set, then construct the corresponding wildcard name and
3576 	 * search for it.  If the wildcard node exists, and is active in
3577 	 * this version, we're done.  If not, then we next check to see
3578 	 * if the ancestor is active in this version.  If so, then there
3579 	 * can be no possible wildcard match and again we're done.  If not,
3580 	 * continue the search.
3581 	 */
3582 
3583 	rbtdb = search->rbtdb;
3584 	i = search->chain.level_matches;
3585 	done = false;
3586 	node = *nodep;
3587 	do {
3588 		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
3589 			  isc_rwlocktype_read);
3590 
3591 		/*
3592 		 * First we try to figure out if this node is active in
3593 		 * the search's version.  We do this now, even though we
3594 		 * may not need the information, because it simplifies the
3595 		 * locking and code flow.
3596 		 */
3597 		for (header = node->data; header != NULL; header = header->next)
3598 		{
3599 			if (header->serial <= search->serial &&
3600 			    !IGNORE(header) && EXISTS(header) &&
3601 			    !ANCIENT(header))
3602 			{
3603 				break;
3604 			}
3605 		}
3606 		if (header != NULL) {
3607 			active = true;
3608 		} else {
3609 			active = false;
3610 		}
3611 
3612 		if (node->wild) {
3613 			wild = true;
3614 		} else {
3615 			wild = false;
3616 		}
3617 
3618 		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
3619 			    isc_rwlocktype_read);
3620 
3621 		if (wild) {
3622 			/*
3623 			 * Construct the wildcard name for this level.
3624 			 */
3625 			dns_name_init(&name, NULL);
3626 			dns_rbt_namefromnode(node, &name);
3627 			wname = dns_fixedname_initname(&fwname);
3628 			result = dns_name_concatenate(dns_wildcardname, &name,
3629 						      wname, NULL);
3630 			j = i;
3631 			while (result == ISC_R_SUCCESS && j != 0) {
3632 				j--;
3633 				level_node = search->chain.levels[j];
3634 				dns_name_init(&name, NULL);
3635 				dns_rbt_namefromnode(level_node, &name);
3636 				result = dns_name_concatenate(wname, &name,
3637 							      wname, NULL);
3638 			}
3639 			if (result != ISC_R_SUCCESS) {
3640 				break;
3641 			}
3642 
3643 			wnode = NULL;
3644 			dns_rbtnodechain_init(&wchain);
3645 			result = dns_rbt_findnode(
3646 				rbtdb->tree, wname, NULL, &wnode, &wchain,
3647 				DNS_RBTFIND_EMPTYDATA, NULL, NULL);
3648 			if (result == ISC_R_SUCCESS) {
3649 				nodelock_t *lock;
3650 
3651 				/*
3652 				 * We have found the wildcard node.  If it
3653 				 * is active in the search's version, we're
3654 				 * done.
3655 				 */
3656 				lock = &rbtdb->node_locks[wnode->locknum].lock;
3657 				NODE_LOCK(lock, isc_rwlocktype_read);
3658 				for (header = wnode->data; header != NULL;
3659 				     header = header->next)
3660 				{
3661 					if (header->serial <= search->serial &&
3662 					    !IGNORE(header) && EXISTS(header) &&
3663 					    !ANCIENT(header))
3664 					{
3665 						break;
3666 					}
3667 				}
3668 				NODE_UNLOCK(lock, isc_rwlocktype_read);
3669 				if (header != NULL ||
3670 				    activeempty(search, &wchain, wname))
3671 				{
3672 					if (activeemptynode(search, qname,
3673 							    wname))
3674 					{
3675 						return (ISC_R_NOTFOUND);
3676 					}
3677 					/*
3678 					 * The wildcard node is active!
3679 					 *
3680 					 * Note: result is still ISC_R_SUCCESS
3681 					 * so we don't have to set it.
3682 					 */
3683 					*nodep = wnode;
3684 					break;
3685 				}
3686 			} else if (result != ISC_R_NOTFOUND &&
3687 				   result != DNS_R_PARTIALMATCH)
3688 			{
3689 				/*
3690 				 * An error has occurred.  Bail out.
3691 				 */
3692 				break;
3693 			}
3694 		}
3695 
3696 		if (active) {
3697 			/*
3698 			 * The level node is active.  Any wildcarding
3699 			 * present at higher levels has no
3700 			 * effect and we're done.
3701 			 */
3702 			result = ISC_R_NOTFOUND;
3703 			break;
3704 		}
3705 
3706 		if (i > 0) {
3707 			i--;
3708 			node = search->chain.levels[i];
3709 		} else {
3710 			done = true;
3711 		}
3712 	} while (!done);
3713 
3714 	return (result);
3715 }
3716 
3717 static bool
matchparams(rdatasetheader_t * header,rbtdb_search_t * search)3718 matchparams(rdatasetheader_t *header, rbtdb_search_t *search) {
3719 	dns_rdata_t rdata = DNS_RDATA_INIT;
3720 	dns_rdata_nsec3_t nsec3;
3721 	unsigned char *raw; /* RDATASLAB */
3722 	unsigned int rdlen, count;
3723 	isc_region_t region;
3724 	isc_result_t result;
3725 
3726 	REQUIRE(header->type == dns_rdatatype_nsec3);
3727 
3728 	raw = (unsigned char *)header + sizeof(*header);
3729 	count = raw[0] * 256 + raw[1]; /* count */
3730 	raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
3731 
3732 	while (count-- > 0) {
3733 		rdlen = raw[0] * 256 + raw[1];
3734 		raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
3735 		region.base = raw;
3736 		region.length = rdlen;
3737 		dns_rdata_fromregion(&rdata, search->rbtdb->common.rdclass,
3738 				     dns_rdatatype_nsec3, &region);
3739 		raw += rdlen;
3740 		result = dns_rdata_tostruct(&rdata, &nsec3, NULL);
3741 		INSIST(result == ISC_R_SUCCESS);
3742 		if (nsec3.hash == search->rbtversion->hash &&
3743 		    nsec3.iterations == search->rbtversion->iterations &&
3744 		    nsec3.salt_length == search->rbtversion->salt_length &&
3745 		    memcmp(nsec3.salt, search->rbtversion->salt,
3746 			   nsec3.salt_length) == 0)
3747 		{
3748 			return (true);
3749 		}
3750 		dns_rdata_reset(&rdata);
3751 	}
3752 	return (false);
3753 }
3754 
3755 /*
3756  * Find node of the NSEC/NSEC3 record that is 'name'.
3757  */
3758 static isc_result_t
previous_closest_nsec(dns_rdatatype_t type,rbtdb_search_t * search,dns_name_t * name,dns_name_t * origin,dns_rbtnode_t ** nodep,dns_rbtnodechain_t * nsecchain,bool * firstp)3759 previous_closest_nsec(dns_rdatatype_t type, rbtdb_search_t *search,
3760 		      dns_name_t *name, dns_name_t *origin,
3761 		      dns_rbtnode_t **nodep, dns_rbtnodechain_t *nsecchain,
3762 		      bool *firstp) {
3763 	dns_fixedname_t ftarget;
3764 	dns_name_t *target;
3765 	dns_rbtnode_t *nsecnode;
3766 	isc_result_t result;
3767 
3768 	REQUIRE(nodep != NULL && *nodep == NULL);
3769 	REQUIRE(type == dns_rdatatype_nsec3 || firstp != NULL);
3770 
3771 	if (type == dns_rdatatype_nsec3) {
3772 		result = dns_rbtnodechain_prev(&search->chain, NULL, NULL);
3773 		if (result != ISC_R_SUCCESS && result != DNS_R_NEWORIGIN) {
3774 			return (result);
3775 		}
3776 		result = dns_rbtnodechain_current(&search->chain, name, origin,
3777 						  nodep);
3778 		return (result);
3779 	}
3780 
3781 	target = dns_fixedname_initname(&ftarget);
3782 
3783 	for (;;) {
3784 		if (*firstp) {
3785 			/*
3786 			 * Construct the name of the second node to check.
3787 			 * It is the first node sought in the NSEC tree.
3788 			 */
3789 			*firstp = false;
3790 			dns_rbtnodechain_init(nsecchain);
3791 			result = dns_name_concatenate(name, origin, target,
3792 						      NULL);
3793 			if (result != ISC_R_SUCCESS) {
3794 				return (result);
3795 			}
3796 			nsecnode = NULL;
3797 			result = dns_rbt_findnode(
3798 				search->rbtdb->nsec, target, NULL, &nsecnode,
3799 				nsecchain, DNS_RBTFIND_EMPTYDATA, NULL, NULL);
3800 			if (result == ISC_R_SUCCESS) {
3801 				/*
3802 				 * Since this was the first loop, finding the
3803 				 * name in the NSEC tree implies that the first
3804 				 * node checked in the main tree had an
3805 				 * unacceptable NSEC record.
3806 				 * Try the previous node in the NSEC tree.
3807 				 */
3808 				result = dns_rbtnodechain_prev(nsecchain, name,
3809 							       origin);
3810 				if (result == DNS_R_NEWORIGIN) {
3811 					result = ISC_R_SUCCESS;
3812 				}
3813 			} else if (result == ISC_R_NOTFOUND ||
3814 				   result == DNS_R_PARTIALMATCH)
3815 			{
3816 				result = dns_rbtnodechain_current(
3817 					nsecchain, name, origin, NULL);
3818 				if (result == ISC_R_NOTFOUND) {
3819 					result = ISC_R_NOMORE;
3820 				}
3821 			}
3822 		} else {
3823 			/*
3824 			 * This is a second or later trip through the auxiliary
3825 			 * tree for the name of a third or earlier NSEC node in
3826 			 * the main tree.  Previous trips through the NSEC tree
3827 			 * must have found nodes in the main tree with NSEC
3828 			 * records.  Perhaps they lacked signature records.
3829 			 */
3830 			result = dns_rbtnodechain_prev(nsecchain, name, origin);
3831 			if (result == DNS_R_NEWORIGIN) {
3832 				result = ISC_R_SUCCESS;
3833 			}
3834 		}
3835 		if (result != ISC_R_SUCCESS) {
3836 			return (result);
3837 		}
3838 
3839 		/*
3840 		 * Construct the name to seek in the main tree.
3841 		 */
3842 		result = dns_name_concatenate(name, origin, target, NULL);
3843 		if (result != ISC_R_SUCCESS) {
3844 			return (result);
3845 		}
3846 
3847 		*nodep = NULL;
3848 		result = dns_rbt_findnode(search->rbtdb->tree, target, NULL,
3849 					  nodep, &search->chain,
3850 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
3851 		if (result == ISC_R_SUCCESS) {
3852 			return (result);
3853 		}
3854 
3855 		/*
3856 		 * There should always be a node in the main tree with the
3857 		 * same name as the node in the auxiliary NSEC tree, except for
3858 		 * nodes in the auxiliary tree that are awaiting deletion.
3859 		 */
3860 		if (result != DNS_R_PARTIALMATCH && result != ISC_R_NOTFOUND) {
3861 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
3862 				      DNS_LOGMODULE_CACHE, ISC_LOG_ERROR,
3863 				      "previous_closest_nsec(): %s",
3864 				      isc_result_totext(result));
3865 			return (DNS_R_BADDB);
3866 		}
3867 	}
3868 }
3869 
3870 /*
3871  * Find the NSEC/NSEC3 which is or before the current point on the
3872  * search chain.  For NSEC3 records only NSEC3 records that match the
3873  * current NSEC3PARAM record are considered.
3874  */
3875 static isc_result_t
find_closest_nsec(rbtdb_search_t * search,dns_dbnode_t ** nodep,dns_name_t * foundname,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset,dns_rbt_t * tree,dns_db_secure_t secure)3876 find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
3877 		  dns_name_t *foundname, dns_rdataset_t *rdataset,
3878 		  dns_rdataset_t *sigrdataset, dns_rbt_t *tree,
3879 		  dns_db_secure_t secure) {
3880 	dns_rbtnode_t *node, *prevnode;
3881 	rdatasetheader_t *header, *header_next, *found, *foundsig;
3882 	dns_rbtnodechain_t nsecchain;
3883 	bool empty_node;
3884 	isc_result_t result;
3885 	dns_fixedname_t fname, forigin;
3886 	dns_name_t *name, *origin;
3887 	dns_rdatatype_t type;
3888 	rbtdb_rdatatype_t sigtype;
3889 	bool wraps;
3890 	bool first = true;
3891 	bool need_sig = (secure == dns_db_secure);
3892 
3893 	if (tree == search->rbtdb->nsec3) {
3894 		type = dns_rdatatype_nsec3;
3895 		sigtype = RBTDB_RDATATYPE_SIGNSEC3;
3896 		wraps = true;
3897 	} else {
3898 		type = dns_rdatatype_nsec;
3899 		sigtype = RBTDB_RDATATYPE_SIGNSEC;
3900 		wraps = false;
3901 	}
3902 
3903 	/*
3904 	 * Use the auxiliary tree only starting with the second node in the
3905 	 * hope that the original node will be right much of the time.
3906 	 */
3907 	name = dns_fixedname_initname(&fname);
3908 	origin = dns_fixedname_initname(&forigin);
3909 again:
3910 	node = NULL;
3911 	prevnode = NULL;
3912 	result = dns_rbtnodechain_current(&search->chain, name, origin, &node);
3913 	if (result != ISC_R_SUCCESS) {
3914 		return (result);
3915 	}
3916 	do {
3917 		NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
3918 			  isc_rwlocktype_read);
3919 		found = NULL;
3920 		foundsig = NULL;
3921 		empty_node = true;
3922 		for (header = node->data; header != NULL; header = header_next)
3923 		{
3924 			header_next = header->next;
3925 			/*
3926 			 * Look for an active, extant NSEC or RRSIG NSEC.
3927 			 */
3928 			do {
3929 				if (header->serial <= search->serial &&
3930 				    !IGNORE(header))
3931 				{
3932 					/*
3933 					 * Is this a "this rdataset doesn't
3934 					 * exist" record?
3935 					 */
3936 					if (NONEXISTENT(header)) {
3937 						header = NULL;
3938 					}
3939 					break;
3940 				} else {
3941 					header = header->down;
3942 				}
3943 			} while (header != NULL);
3944 			if (header != NULL) {
3945 				/*
3946 				 * We now know that there is at least one
3947 				 * active rdataset at this node.
3948 				 */
3949 				empty_node = false;
3950 				if (header->type == type) {
3951 					found = header;
3952 					if (foundsig != NULL) {
3953 						break;
3954 					}
3955 				} else if (header->type == sigtype) {
3956 					foundsig = header;
3957 					if (found != NULL) {
3958 						break;
3959 					}
3960 				}
3961 			}
3962 		}
3963 		if (!empty_node) {
3964 			if (found != NULL && search->rbtversion->havensec3 &&
3965 			    found->type == dns_rdatatype_nsec3 &&
3966 			    !matchparams(found, search))
3967 			{
3968 				empty_node = true;
3969 				found = NULL;
3970 				foundsig = NULL;
3971 				result = previous_closest_nsec(
3972 					type, search, name, origin, &prevnode,
3973 					NULL, NULL);
3974 			} else if (found != NULL &&
3975 				   (foundsig != NULL || !need_sig))
3976 			{
3977 				/*
3978 				 * We've found the right NSEC/NSEC3 record.
3979 				 *
3980 				 * Note: for this to really be the right
3981 				 * NSEC record, it's essential that the NSEC
3982 				 * records of any nodes obscured by a zone
3983 				 * cut have been removed; we assume this is
3984 				 * the case.
3985 				 */
3986 				result = dns_name_concatenate(name, origin,
3987 							      foundname, NULL);
3988 				if (result == ISC_R_SUCCESS) {
3989 					if (nodep != NULL) {
3990 						new_reference(
3991 							search->rbtdb, node,
3992 							isc_rwlocktype_read);
3993 						*nodep = node;
3994 					}
3995 					bind_rdataset(search->rbtdb, node,
3996 						      found, search->now,
3997 						      isc_rwlocktype_read,
3998 						      rdataset);
3999 					if (foundsig != NULL) {
4000 						bind_rdataset(
4001 							search->rbtdb, node,
4002 							foundsig, search->now,
4003 							isc_rwlocktype_read,
4004 							sigrdataset);
4005 					}
4006 				}
4007 			} else if (found == NULL && foundsig == NULL) {
4008 				/*
4009 				 * This node is active, but has no NSEC or
4010 				 * RRSIG NSEC.  That means it's glue or
4011 				 * other obscured zone data that isn't
4012 				 * relevant for our search.  Treat the
4013 				 * node as if it were empty and keep looking.
4014 				 */
4015 				empty_node = true;
4016 				result = previous_closest_nsec(
4017 					type, search, name, origin, &prevnode,
4018 					&nsecchain, &first);
4019 			} else {
4020 				/*
4021 				 * We found an active node, but either the
4022 				 * NSEC or the RRSIG NSEC is missing.  This
4023 				 * shouldn't happen.
4024 				 */
4025 				result = DNS_R_BADDB;
4026 			}
4027 		} else {
4028 			/*
4029 			 * This node isn't active.  We've got to keep
4030 			 * looking.
4031 			 */
4032 			result = previous_closest_nsec(type, search, name,
4033 						       origin, &prevnode,
4034 						       &nsecchain, &first);
4035 		}
4036 		NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
4037 			    isc_rwlocktype_read);
4038 		node = prevnode;
4039 		prevnode = NULL;
4040 	} while (empty_node && result == ISC_R_SUCCESS);
4041 
4042 	if (!first) {
4043 		dns_rbtnodechain_invalidate(&nsecchain);
4044 	}
4045 
4046 	if (result == ISC_R_NOMORE && wraps) {
4047 		result = dns_rbtnodechain_last(&search->chain, tree, NULL,
4048 					       NULL);
4049 		if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
4050 			wraps = false;
4051 			goto again;
4052 		}
4053 	}
4054 
4055 	/*
4056 	 * If the result is ISC_R_NOMORE, then we got to the beginning of
4057 	 * the database and didn't find a NSEC record.  This shouldn't
4058 	 * happen.
4059 	 */
4060 	if (result == ISC_R_NOMORE) {
4061 		result = DNS_R_BADDB;
4062 	}
4063 
4064 	return (result);
4065 }
4066 
4067 static isc_result_t
zone_find(dns_db_t * db,const dns_name_t * name,dns_dbversion_t * version,dns_rdatatype_t type,unsigned int options,isc_stdtime_t now,dns_dbnode_t ** nodep,dns_name_t * foundname,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset)4068 zone_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
4069 	  dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
4070 	  dns_dbnode_t **nodep, dns_name_t *foundname, dns_rdataset_t *rdataset,
4071 	  dns_rdataset_t *sigrdataset) {
4072 	dns_rbtnode_t *node = NULL;
4073 	isc_result_t result;
4074 	rbtdb_search_t search;
4075 	bool cname_ok = true;
4076 	bool close_version = false;
4077 	bool maybe_zonecut = false;
4078 	bool at_zonecut = false;
4079 	bool wild;
4080 	bool empty_node;
4081 	rdatasetheader_t *header, *header_next, *found, *nsecheader;
4082 	rdatasetheader_t *foundsig, *cnamesig, *nsecsig;
4083 	rbtdb_rdatatype_t sigtype;
4084 	bool active;
4085 	nodelock_t *lock;
4086 	dns_rbt_t *tree;
4087 
4088 	search.rbtdb = (dns_rbtdb_t *)db;
4089 
4090 	REQUIRE(VALID_RBTDB(search.rbtdb));
4091 	INSIST(version == NULL ||
4092 	       ((rbtdb_version_t *)version)->rbtdb == (dns_rbtdb_t *)db);
4093 
4094 	/*
4095 	 * We don't care about 'now'.
4096 	 */
4097 	UNUSED(now);
4098 
4099 	/*
4100 	 * If the caller didn't supply a version, attach to the current
4101 	 * version.
4102 	 */
4103 	if (version == NULL) {
4104 		currentversion(db, &version);
4105 		close_version = true;
4106 	}
4107 
4108 	search.rbtversion = version;
4109 	search.serial = search.rbtversion->serial;
4110 	search.options = options;
4111 	search.copy_name = false;
4112 	search.need_cleanup = false;
4113 	search.wild = false;
4114 	search.zonecut = NULL;
4115 	dns_fixedname_init(&search.zonecut_name);
4116 	dns_rbtnodechain_init(&search.chain);
4117 	search.now = 0;
4118 
4119 	/*
4120 	 * 'wild' will be true iff. we've matched a wildcard.
4121 	 */
4122 	wild = false;
4123 
4124 	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4125 
4126 	/*
4127 	 * Search down from the root of the tree.  If, while going down, we
4128 	 * encounter a callback node, zone_zonecut_callback() will search the
4129 	 * rdatasets at the zone cut for active DNAME or NS rdatasets.
4130 	 */
4131 	tree = (options & DNS_DBFIND_FORCENSEC3) != 0 ? search.rbtdb->nsec3
4132 						      : search.rbtdb->tree;
4133 	result = dns_rbt_findnode(tree, name, foundname, &node, &search.chain,
4134 				  DNS_RBTFIND_EMPTYDATA, zone_zonecut_callback,
4135 				  &search);
4136 
4137 	if (result == DNS_R_PARTIALMATCH) {
4138 	partial_match:
4139 		if (search.zonecut != NULL) {
4140 			result = setup_delegation(&search, nodep, foundname,
4141 						  rdataset, sigrdataset);
4142 			goto tree_exit;
4143 		}
4144 
4145 		if (search.wild) {
4146 			/*
4147 			 * At least one of the levels in the search chain
4148 			 * potentially has a wildcard.  For each such level,
4149 			 * we must see if there's a matching wildcard active
4150 			 * in the current version.
4151 			 */
4152 			result = find_wildcard(&search, &node, name);
4153 			if (result == ISC_R_SUCCESS) {
4154 				dns_name_copynf(name, foundname);
4155 				wild = true;
4156 				goto found;
4157 			} else if (result != ISC_R_NOTFOUND) {
4158 				goto tree_exit;
4159 			}
4160 		}
4161 
4162 		active = false;
4163 		if ((options & DNS_DBFIND_FORCENSEC3) == 0) {
4164 			/*
4165 			 * The NSEC3 tree won't have empty nodes,
4166 			 * so it isn't necessary to check for them.
4167 			 */
4168 			dns_rbtnodechain_t chain = search.chain;
4169 			active = activeempty(&search, &chain, name);
4170 		}
4171 
4172 		/*
4173 		 * If we're here, then the name does not exist, is not
4174 		 * beneath a zonecut, and there's no matching wildcard.
4175 		 */
4176 		if ((search.rbtversion->secure == dns_db_secure &&
4177 		     !search.rbtversion->havensec3) ||
4178 		    (search.options & DNS_DBFIND_FORCENSEC) != 0 ||
4179 		    (search.options & DNS_DBFIND_FORCENSEC3) != 0)
4180 		{
4181 			result = find_closest_nsec(&search, nodep, foundname,
4182 						   rdataset, sigrdataset, tree,
4183 						   search.rbtversion->secure);
4184 			if (result == ISC_R_SUCCESS) {
4185 				result = active ? DNS_R_EMPTYNAME
4186 						: DNS_R_NXDOMAIN;
4187 			}
4188 		} else {
4189 			result = active ? DNS_R_EMPTYNAME : DNS_R_NXDOMAIN;
4190 		}
4191 		goto tree_exit;
4192 	} else if (result != ISC_R_SUCCESS) {
4193 		goto tree_exit;
4194 	}
4195 
4196 found:
4197 	/*
4198 	 * We have found a node whose name is the desired name, or we
4199 	 * have matched a wildcard.
4200 	 */
4201 
4202 	if (search.zonecut != NULL) {
4203 		/*
4204 		 * If we're beneath a zone cut, we don't want to look for
4205 		 * CNAMEs because they're not legitimate zone glue.
4206 		 */
4207 		cname_ok = false;
4208 	} else {
4209 		/*
4210 		 * The node may be a zone cut itself.  If it might be one,
4211 		 * make sure we check for it later.
4212 		 *
4213 		 * DS records live above the zone cut in ordinary zone so
4214 		 * we want to ignore any referral.
4215 		 *
4216 		 * Stub zones don't have anything "above" the delegation so
4217 		 * we always return a referral.
4218 		 */
4219 		if (node->find_callback &&
4220 		    ((node != search.rbtdb->origin_node &&
4221 		      !dns_rdatatype_atparent(type)) ||
4222 		     IS_STUB(search.rbtdb)))
4223 		{
4224 			maybe_zonecut = true;
4225 		}
4226 	}
4227 
4228 	/*
4229 	 * Certain DNSSEC types are not subject to CNAME matching
4230 	 * (RFC4035, section 2.5 and RFC3007).
4231 	 *
4232 	 * We don't check for RRSIG, because we don't store RRSIG records
4233 	 * directly.
4234 	 */
4235 	if (type == dns_rdatatype_key || type == dns_rdatatype_nsec) {
4236 		cname_ok = false;
4237 	}
4238 
4239 	/*
4240 	 * We now go looking for rdata...
4241 	 */
4242 
4243 	lock = &search.rbtdb->node_locks[node->locknum].lock;
4244 	NODE_LOCK(lock, isc_rwlocktype_read);
4245 
4246 	found = NULL;
4247 	foundsig = NULL;
4248 	sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
4249 	nsecheader = NULL;
4250 	nsecsig = NULL;
4251 	cnamesig = NULL;
4252 	empty_node = true;
4253 	for (header = node->data; header != NULL; header = header_next) {
4254 		header_next = header->next;
4255 		/*
4256 		 * Look for an active, extant rdataset.
4257 		 */
4258 		do {
4259 			if (header->serial <= search.serial && !IGNORE(header))
4260 			{
4261 				/*
4262 				 * Is this a "this rdataset doesn't
4263 				 * exist" record?
4264 				 */
4265 				if (NONEXISTENT(header)) {
4266 					header = NULL;
4267 				}
4268 				break;
4269 			} else {
4270 				header = header->down;
4271 			}
4272 		} while (header != NULL);
4273 		if (header != NULL) {
4274 			/*
4275 			 * We now know that there is at least one active
4276 			 * rdataset at this node.
4277 			 */
4278 			empty_node = false;
4279 
4280 			/*
4281 			 * Do special zone cut handling, if requested.
4282 			 */
4283 			if (maybe_zonecut && header->type == dns_rdatatype_ns) {
4284 				/*
4285 				 * We increment the reference count on node to
4286 				 * ensure that search->zonecut_rdataset will
4287 				 * still be valid later.
4288 				 */
4289 				new_reference(search.rbtdb, node,
4290 					      isc_rwlocktype_read);
4291 				search.zonecut = node;
4292 				search.zonecut_rdataset = header;
4293 				search.zonecut_sigrdataset = NULL;
4294 				search.need_cleanup = true;
4295 				maybe_zonecut = false;
4296 				at_zonecut = true;
4297 				/*
4298 				 * It is not clear if KEY should still be
4299 				 * allowed at the parent side of the zone
4300 				 * cut or not.  It is needed for RFC3007
4301 				 * validated updates.
4302 				 */
4303 				if ((search.options & DNS_DBFIND_GLUEOK) == 0 &&
4304 				    type != dns_rdatatype_nsec &&
4305 				    type != dns_rdatatype_key)
4306 				{
4307 					/*
4308 					 * Glue is not OK, but any answer we
4309 					 * could return would be glue.  Return
4310 					 * the delegation.
4311 					 */
4312 					found = NULL;
4313 					break;
4314 				}
4315 				if (found != NULL && foundsig != NULL) {
4316 					break;
4317 				}
4318 			}
4319 
4320 			/*
4321 			 * If the NSEC3 record doesn't match the chain
4322 			 * we are using behave as if it isn't here.
4323 			 */
4324 			if (header->type == dns_rdatatype_nsec3 &&
4325 			    !matchparams(header, &search))
4326 			{
4327 				NODE_UNLOCK(lock, isc_rwlocktype_read);
4328 				goto partial_match;
4329 			}
4330 			/*
4331 			 * If we found a type we were looking for,
4332 			 * remember it.
4333 			 */
4334 			if (header->type == type || type == dns_rdatatype_any ||
4335 			    (header->type == dns_rdatatype_cname && cname_ok))
4336 			{
4337 				/*
4338 				 * We've found the answer!
4339 				 */
4340 				found = header;
4341 				if (header->type == dns_rdatatype_cname &&
4342 				    cname_ok)
4343 				{
4344 					/*
4345 					 * We may be finding a CNAME instead
4346 					 * of the desired type.
4347 					 *
4348 					 * If we've already got the CNAME RRSIG,
4349 					 * use it, otherwise change sigtype
4350 					 * so that we find it.
4351 					 */
4352 					if (cnamesig != NULL) {
4353 						foundsig = cnamesig;
4354 					} else {
4355 						sigtype =
4356 							RBTDB_RDATATYPE_SIGCNAME;
4357 					}
4358 				}
4359 				/*
4360 				 * If we've got all we need, end the search.
4361 				 */
4362 				if (!maybe_zonecut && foundsig != NULL) {
4363 					break;
4364 				}
4365 			} else if (header->type == sigtype) {
4366 				/*
4367 				 * We've found the RRSIG rdataset for our
4368 				 * target type.  Remember it.
4369 				 */
4370 				foundsig = header;
4371 				/*
4372 				 * If we've got all we need, end the search.
4373 				 */
4374 				if (!maybe_zonecut && found != NULL) {
4375 					break;
4376 				}
4377 			} else if (header->type == dns_rdatatype_nsec &&
4378 				   !search.rbtversion->havensec3)
4379 			{
4380 				/*
4381 				 * Remember a NSEC rdataset even if we're
4382 				 * not specifically looking for it, because
4383 				 * we might need it later.
4384 				 */
4385 				nsecheader = header;
4386 			} else if (header->type == RBTDB_RDATATYPE_SIGNSEC &&
4387 				   !search.rbtversion->havensec3)
4388 			{
4389 				/*
4390 				 * If we need the NSEC rdataset, we'll also
4391 				 * need its signature.
4392 				 */
4393 				nsecsig = header;
4394 			} else if (cname_ok &&
4395 				   header->type == RBTDB_RDATATYPE_SIGCNAME)
4396 			{
4397 				/*
4398 				 * If we get a CNAME match, we'll also need
4399 				 * its signature.
4400 				 */
4401 				cnamesig = header;
4402 			}
4403 		}
4404 	}
4405 
4406 	if (empty_node) {
4407 		/*
4408 		 * We have an exact match for the name, but there are no
4409 		 * active rdatasets in the desired version.  That means that
4410 		 * this node doesn't exist in the desired version, and that
4411 		 * we really have a partial match.
4412 		 */
4413 		if (!wild) {
4414 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4415 			goto partial_match;
4416 		}
4417 	}
4418 
4419 	/*
4420 	 * If we didn't find what we were looking for...
4421 	 */
4422 	if (found == NULL) {
4423 		if (search.zonecut != NULL) {
4424 			/*
4425 			 * We were trying to find glue at a node beneath a
4426 			 * zone cut, but didn't.
4427 			 *
4428 			 * Return the delegation.
4429 			 */
4430 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4431 			result = setup_delegation(&search, nodep, foundname,
4432 						  rdataset, sigrdataset);
4433 			goto tree_exit;
4434 		}
4435 		/*
4436 		 * The desired type doesn't exist.
4437 		 */
4438 		result = DNS_R_NXRRSET;
4439 		if (search.rbtversion->secure == dns_db_secure &&
4440 		    !search.rbtversion->havensec3 &&
4441 		    (nsecheader == NULL || nsecsig == NULL))
4442 		{
4443 			/*
4444 			 * The zone is secure but there's no NSEC,
4445 			 * or the NSEC has no signature!
4446 			 */
4447 			if (!wild) {
4448 				result = DNS_R_BADDB;
4449 				goto node_exit;
4450 			}
4451 
4452 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4453 			result = find_closest_nsec(&search, nodep, foundname,
4454 						   rdataset, sigrdataset,
4455 						   search.rbtdb->tree,
4456 						   search.rbtversion->secure);
4457 			if (result == ISC_R_SUCCESS) {
4458 				result = DNS_R_EMPTYWILD;
4459 			}
4460 			goto tree_exit;
4461 		}
4462 		if ((search.options & DNS_DBFIND_FORCENSEC) != 0 &&
4463 		    nsecheader == NULL)
4464 		{
4465 			/*
4466 			 * There's no NSEC record, and we were told
4467 			 * to find one.
4468 			 */
4469 			result = DNS_R_BADDB;
4470 			goto node_exit;
4471 		}
4472 		if (nodep != NULL) {
4473 			new_reference(search.rbtdb, node, isc_rwlocktype_read);
4474 			*nodep = node;
4475 		}
4476 		if ((search.rbtversion->secure == dns_db_secure &&
4477 		     !search.rbtversion->havensec3) ||
4478 		    (search.options & DNS_DBFIND_FORCENSEC) != 0)
4479 		{
4480 			bind_rdataset(search.rbtdb, node, nsecheader, 0,
4481 				      isc_rwlocktype_read, rdataset);
4482 			if (nsecsig != NULL) {
4483 				bind_rdataset(search.rbtdb, node, nsecsig, 0,
4484 					      isc_rwlocktype_read, sigrdataset);
4485 			}
4486 		}
4487 		if (wild) {
4488 			foundname->attributes |= DNS_NAMEATTR_WILDCARD;
4489 		}
4490 		goto node_exit;
4491 	}
4492 
4493 	/*
4494 	 * We found what we were looking for, or we found a CNAME.
4495 	 */
4496 
4497 	if (type != found->type && type != dns_rdatatype_any &&
4498 	    found->type == dns_rdatatype_cname)
4499 	{
4500 		/*
4501 		 * We weren't doing an ANY query and we found a CNAME instead
4502 		 * of the type we were looking for, so we need to indicate
4503 		 * that result to the caller.
4504 		 */
4505 		result = DNS_R_CNAME;
4506 	} else if (search.zonecut != NULL) {
4507 		/*
4508 		 * If we're beneath a zone cut, we must indicate that the
4509 		 * result is glue, unless we're actually at the zone cut
4510 		 * and the type is NSEC or KEY.
4511 		 */
4512 		if (search.zonecut == node) {
4513 			/*
4514 			 * It is not clear if KEY should still be
4515 			 * allowed at the parent side of the zone
4516 			 * cut or not.  It is needed for RFC3007
4517 			 * validated updates.
4518 			 */
4519 			if (type == dns_rdatatype_nsec ||
4520 			    type == dns_rdatatype_nsec3 ||
4521 			    type == dns_rdatatype_key)
4522 			{
4523 				result = ISC_R_SUCCESS;
4524 			} else if (type == dns_rdatatype_any) {
4525 				result = DNS_R_ZONECUT;
4526 			} else {
4527 				result = DNS_R_GLUE;
4528 			}
4529 		} else {
4530 			result = DNS_R_GLUE;
4531 		}
4532 		/*
4533 		 * We might have found data that isn't glue, but was occluded
4534 		 * by a dynamic update.  If the caller cares about this, they
4535 		 * will have told us to validate glue.
4536 		 *
4537 		 * XXX We should cache the glue validity state!
4538 		 */
4539 		if (result == DNS_R_GLUE &&
4540 		    (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 &&
4541 		    !valid_glue(&search, foundname, type, node))
4542 		{
4543 			NODE_UNLOCK(lock, isc_rwlocktype_read);
4544 			result = setup_delegation(&search, nodep, foundname,
4545 						  rdataset, sigrdataset);
4546 			goto tree_exit;
4547 		}
4548 	} else {
4549 		/*
4550 		 * An ordinary successful query!
4551 		 */
4552 		result = ISC_R_SUCCESS;
4553 	}
4554 
4555 	if (nodep != NULL) {
4556 		if (!at_zonecut) {
4557 			new_reference(search.rbtdb, node, isc_rwlocktype_read);
4558 		} else {
4559 			search.need_cleanup = false;
4560 		}
4561 		*nodep = node;
4562 	}
4563 
4564 	if (type != dns_rdatatype_any) {
4565 		bind_rdataset(search.rbtdb, node, found, 0, isc_rwlocktype_read,
4566 			      rdataset);
4567 		if (foundsig != NULL) {
4568 			bind_rdataset(search.rbtdb, node, foundsig, 0,
4569 				      isc_rwlocktype_read, sigrdataset);
4570 		}
4571 	}
4572 
4573 	if (wild) {
4574 		foundname->attributes |= DNS_NAMEATTR_WILDCARD;
4575 	}
4576 
4577 node_exit:
4578 	NODE_UNLOCK(lock, isc_rwlocktype_read);
4579 
4580 tree_exit:
4581 	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4582 
4583 	/*
4584 	 * If we found a zonecut but aren't going to use it, we have to
4585 	 * let go of it.
4586 	 */
4587 	if (search.need_cleanup) {
4588 		node = search.zonecut;
4589 		INSIST(node != NULL);
4590 		lock = &(search.rbtdb->node_locks[node->locknum].lock);
4591 
4592 		NODE_LOCK(lock, isc_rwlocktype_read);
4593 		decrement_reference(search.rbtdb, node, 0, isc_rwlocktype_read,
4594 				    isc_rwlocktype_none, false);
4595 		NODE_UNLOCK(lock, isc_rwlocktype_read);
4596 	}
4597 
4598 	if (close_version) {
4599 		closeversion(db, &version, false);
4600 	}
4601 
4602 	dns_rbtnodechain_reset(&search.chain);
4603 
4604 	return (result);
4605 }
4606 
4607 static isc_result_t
zone_findzonecut(dns_db_t * db,const dns_name_t * name,unsigned int options,isc_stdtime_t now,dns_dbnode_t ** nodep,dns_name_t * foundname,dns_name_t * dcname,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset)4608 zone_findzonecut(dns_db_t *db, const dns_name_t *name, unsigned int options,
4609 		 isc_stdtime_t now, dns_dbnode_t **nodep, dns_name_t *foundname,
4610 		 dns_name_t *dcname, dns_rdataset_t *rdataset,
4611 		 dns_rdataset_t *sigrdataset) {
4612 	UNUSED(db);
4613 	UNUSED(name);
4614 	UNUSED(options);
4615 	UNUSED(now);
4616 	UNUSED(nodep);
4617 	UNUSED(foundname);
4618 	UNUSED(dcname);
4619 	UNUSED(rdataset);
4620 	UNUSED(sigrdataset);
4621 
4622 	FATAL_ERROR(__FILE__, __LINE__, "zone_findzonecut() called!");
4623 
4624 	UNREACHABLE();
4625 	return (ISC_R_NOTIMPLEMENTED);
4626 }
4627 
4628 static bool
check_stale_header(dns_rbtnode_t * node,rdatasetheader_t * header,isc_rwlocktype_t * locktype,nodelock_t * lock,rbtdb_search_t * search,rdatasetheader_t ** header_prev)4629 check_stale_header(dns_rbtnode_t *node, rdatasetheader_t *header,
4630 		   isc_rwlocktype_t *locktype, nodelock_t *lock,
4631 		   rbtdb_search_t *search, rdatasetheader_t **header_prev) {
4632 	if (!ACTIVE(header, search->now)) {
4633 		dns_ttl_t stale = header->rdh_ttl +
4634 				  search->rbtdb->serve_stale_ttl;
4635 		/*
4636 		 * If this data is in the stale window keep it and if
4637 		 * DNS_DBFIND_STALEOK is not set we tell the caller to
4638 		 * skip this record.  We skip the records with ZEROTTL
4639 		 * (these records should not be cached anyway).
4640 		 */
4641 
4642 		RDATASET_ATTR_CLR(header, RDATASET_ATTR_STALE_WINDOW);
4643 		if (!ZEROTTL(header) && KEEPSTALE(search->rbtdb) &&
4644 		    stale > search->now)
4645 		{
4646 			mark_header_stale(search->rbtdb, header);
4647 			*header_prev = header;
4648 			/*
4649 			 * If DNS_DBFIND_STALESTART is set then it means we
4650 			 * failed to resolve the name during recursion, in
4651 			 * this case we mark the time in which the refresh
4652 			 * failed.
4653 			 */
4654 			if ((search->options & DNS_DBFIND_STALESTART) != 0) {
4655 				atomic_store_release(
4656 					&header->last_refresh_fail_ts,
4657 					search->now);
4658 			} else if ((search->options &
4659 				    DNS_DBFIND_STALEENABLED) != 0 &&
4660 				   search->now <
4661 					   (atomic_load_acquire(
4662 						    &header->last_refresh_fail_ts) +
4663 					    search->rbtdb->serve_stale_refresh))
4664 			{
4665 				/*
4666 				 * If we are within interval between last
4667 				 * refresh failure time + 'stale-refresh-time',
4668 				 * then don't skip this stale entry but use it
4669 				 * instead.
4670 				 */
4671 				RDATASET_ATTR_SET(header,
4672 						  RDATASET_ATTR_STALE_WINDOW);
4673 				return (false);
4674 			} else if ((search->options &
4675 				    DNS_DBFIND_STALETIMEOUT) != 0)
4676 			{
4677 				/*
4678 				 * We want stale RRset due to timeout, so we
4679 				 * don't skip it.
4680 				 */
4681 				return (false);
4682 			}
4683 			return ((search->options & DNS_DBFIND_STALEOK) == 0);
4684 		}
4685 
4686 		/*
4687 		 * This rdataset is stale.  If no one else is using the
4688 		 * node, we can clean it up right now, otherwise we mark
4689 		 * it as ancient, and the node as dirty, so it will get
4690 		 * cleaned up later.
4691 		 */
4692 		if ((header->rdh_ttl < search->now - RBTDB_VIRTUAL) &&
4693 		    (*locktype == isc_rwlocktype_write ||
4694 		     NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS))
4695 		{
4696 			/*
4697 			 * We update the node's status only when we can
4698 			 * get write access; otherwise, we leave others
4699 			 * to this work.  Periodical cleaning will
4700 			 * eventually take the job as the last resort.
4701 			 * We won't downgrade the lock, since other
4702 			 * rdatasets are probably stale, too.
4703 			 */
4704 			*locktype = isc_rwlocktype_write;
4705 
4706 			if (isc_refcount_current(&node->references) == 0) {
4707 				isc_mem_t *mctx;
4708 
4709 				/*
4710 				 * header->down can be non-NULL if the
4711 				 * refcount has just decremented to 0
4712 				 * but decrement_reference() has not
4713 				 * performed clean_cache_node(), in
4714 				 * which case we need to purge the stale
4715 				 * headers first.
4716 				 */
4717 				mctx = search->rbtdb->common.mctx;
4718 				clean_stale_headers(search->rbtdb, mctx,
4719 						    header);
4720 				if (*header_prev != NULL) {
4721 					(*header_prev)->next = header->next;
4722 				} else {
4723 					node->data = header->next;
4724 				}
4725 				free_rdataset(search->rbtdb, mctx, header);
4726 			} else {
4727 				mark_header_ancient(search->rbtdb, header);
4728 				*header_prev = header;
4729 			}
4730 		} else {
4731 			*header_prev = header;
4732 		}
4733 		return (true);
4734 	}
4735 	return (false);
4736 }
4737 
4738 static isc_result_t
cache_zonecut_callback(dns_rbtnode_t * node,dns_name_t * name,void * arg)4739 cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
4740 	rbtdb_search_t *search = arg;
4741 	rdatasetheader_t *header, *header_prev, *header_next;
4742 	rdatasetheader_t *dname_header, *sigdname_header;
4743 	isc_result_t result;
4744 	nodelock_t *lock;
4745 	isc_rwlocktype_t locktype;
4746 
4747 	/* XXX comment */
4748 
4749 	REQUIRE(search->zonecut == NULL);
4750 
4751 	/*
4752 	 * Keep compiler silent.
4753 	 */
4754 	UNUSED(name);
4755 
4756 	lock = &(search->rbtdb->node_locks[node->locknum].lock);
4757 	locktype = isc_rwlocktype_read;
4758 	NODE_LOCK(lock, locktype);
4759 
4760 	/*
4761 	 * Look for a DNAME or RRSIG DNAME rdataset.
4762 	 */
4763 	dname_header = NULL;
4764 	sigdname_header = NULL;
4765 	header_prev = NULL;
4766 	for (header = node->data; header != NULL; header = header_next) {
4767 		header_next = header->next;
4768 		if (check_stale_header(node, header, &locktype, lock, search,
4769 				       &header_prev))
4770 		{
4771 			/* Do nothing. */
4772 		} else if (header->type == dns_rdatatype_dname &&
4773 			   EXISTS(header) && !ANCIENT(header))
4774 		{
4775 			dname_header = header;
4776 			header_prev = header;
4777 		} else if (header->type == RBTDB_RDATATYPE_SIGDNAME &&
4778 			   EXISTS(header) && !ANCIENT(header))
4779 		{
4780 			sigdname_header = header;
4781 			header_prev = header;
4782 		} else {
4783 			header_prev = header;
4784 		}
4785 	}
4786 
4787 	if (dname_header != NULL &&
4788 	    (!DNS_TRUST_PENDING(dname_header->trust) ||
4789 	     (search->options & DNS_DBFIND_PENDINGOK) != 0))
4790 	{
4791 		/*
4792 		 * We increment the reference count on node to ensure that
4793 		 * search->zonecut_rdataset will still be valid later.
4794 		 */
4795 		new_reference(search->rbtdb, node, locktype);
4796 		search->zonecut = node;
4797 		search->zonecut_rdataset = dname_header;
4798 		search->zonecut_sigrdataset = sigdname_header;
4799 		search->need_cleanup = true;
4800 		result = DNS_R_PARTIALMATCH;
4801 	} else {
4802 		result = DNS_R_CONTINUE;
4803 	}
4804 
4805 	NODE_UNLOCK(lock, locktype);
4806 
4807 	return (result);
4808 }
4809 
4810 static isc_result_t
find_deepest_zonecut(rbtdb_search_t * search,dns_rbtnode_t * node,dns_dbnode_t ** nodep,dns_name_t * foundname,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset)4811 find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node,
4812 		     dns_dbnode_t **nodep, dns_name_t *foundname,
4813 		     dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
4814 	unsigned int i;
4815 	dns_rbtnode_t *level_node;
4816 	rdatasetheader_t *header, *header_prev, *header_next;
4817 	rdatasetheader_t *found, *foundsig;
4818 	isc_result_t result = ISC_R_NOTFOUND;
4819 	dns_name_t name;
4820 	dns_rbtdb_t *rbtdb;
4821 	bool done;
4822 	nodelock_t *lock;
4823 	isc_rwlocktype_t locktype;
4824 
4825 	/*
4826 	 * Caller must be holding the tree lock.
4827 	 */
4828 
4829 	rbtdb = search->rbtdb;
4830 	i = search->chain.level_matches;
4831 	done = false;
4832 	do {
4833 		locktype = isc_rwlocktype_read;
4834 		lock = &rbtdb->node_locks[node->locknum].lock;
4835 		NODE_LOCK(lock, locktype);
4836 
4837 		/*
4838 		 * Look for NS and RRSIG NS rdatasets.
4839 		 */
4840 		found = NULL;
4841 		foundsig = NULL;
4842 		header_prev = NULL;
4843 		for (header = node->data; header != NULL; header = header_next)
4844 		{
4845 			header_next = header->next;
4846 			if (check_stale_header(node, header, &locktype, lock,
4847 					       search, &header_prev))
4848 			{
4849 				/* Do nothing. */
4850 			} else if (EXISTS(header) && !ANCIENT(header)) {
4851 				/*
4852 				 * We've found an extant rdataset.  See if
4853 				 * we're interested in it.
4854 				 */
4855 				if (header->type == dns_rdatatype_ns) {
4856 					found = header;
4857 					if (foundsig != NULL) {
4858 						break;
4859 					}
4860 				} else if (header->type ==
4861 					   RBTDB_RDATATYPE_SIGNS)
4862 				{
4863 					foundsig = header;
4864 					if (found != NULL) {
4865 						break;
4866 					}
4867 				}
4868 				header_prev = header;
4869 			} else {
4870 				header_prev = header;
4871 			}
4872 		}
4873 
4874 		if (found != NULL) {
4875 			/*
4876 			 * If we have to set foundname, we do it before
4877 			 * anything else.  If we were to set foundname after
4878 			 * we had set nodep or bound the rdataset, then we'd
4879 			 * have to undo that work if dns_name_concatenate()
4880 			 * failed.  By setting foundname first, there's
4881 			 * nothing to undo if we have trouble.
4882 			 */
4883 			if (foundname != NULL) {
4884 				dns_name_init(&name, NULL);
4885 				dns_rbt_namefromnode(node, &name);
4886 				dns_name_copynf(&name, foundname);
4887 				while (i > 0) {
4888 					i--;
4889 					level_node = search->chain.levels[i];
4890 					dns_name_init(&name, NULL);
4891 					dns_rbt_namefromnode(level_node, &name);
4892 					result = dns_name_concatenate(
4893 						foundname, &name, foundname,
4894 						NULL);
4895 					if (result != ISC_R_SUCCESS) {
4896 						if (nodep != NULL) {
4897 							*nodep = NULL;
4898 						}
4899 						goto node_exit;
4900 					}
4901 				}
4902 			}
4903 			result = DNS_R_DELEGATION;
4904 			if (nodep != NULL) {
4905 				new_reference(search->rbtdb, node, locktype);
4906 				*nodep = node;
4907 			}
4908 			bind_rdataset(search->rbtdb, node, found, search->now,
4909 				      locktype, rdataset);
4910 			if (foundsig != NULL) {
4911 				bind_rdataset(search->rbtdb, node, foundsig,
4912 					      search->now, locktype,
4913 					      sigrdataset);
4914 			}
4915 			if (need_headerupdate(found, search->now) ||
4916 			    (foundsig != NULL &&
4917 			     need_headerupdate(foundsig, search->now)))
4918 			{
4919 				if (locktype != isc_rwlocktype_write) {
4920 					NODE_UNLOCK(lock, locktype);
4921 					NODE_LOCK(lock, isc_rwlocktype_write);
4922 					locktype = isc_rwlocktype_write;
4923 					POST(locktype);
4924 				}
4925 				if (need_headerupdate(found, search->now)) {
4926 					update_header(search->rbtdb, found,
4927 						      search->now);
4928 				}
4929 				if (foundsig != NULL &&
4930 				    need_headerupdate(foundsig, search->now))
4931 				{
4932 					update_header(search->rbtdb, foundsig,
4933 						      search->now);
4934 				}
4935 			}
4936 		}
4937 
4938 	node_exit:
4939 		NODE_UNLOCK(lock, locktype);
4940 
4941 		if (found == NULL && i > 0) {
4942 			i--;
4943 			node = search->chain.levels[i];
4944 		} else {
4945 			done = true;
4946 		}
4947 	} while (!done);
4948 
4949 	return (result);
4950 }
4951 
4952 static isc_result_t
find_coveringnsec(rbtdb_search_t * search,dns_dbnode_t ** nodep,isc_stdtime_t now,dns_name_t * foundname,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset)4953 find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
4954 		  isc_stdtime_t now, dns_name_t *foundname,
4955 		  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
4956 	dns_rbtnode_t *node;
4957 	rdatasetheader_t *header, *header_next, *header_prev;
4958 	rdatasetheader_t *found, *foundsig;
4959 	bool empty_node;
4960 	isc_result_t result;
4961 	dns_fixedname_t fname, forigin;
4962 	dns_name_t *name, *origin;
4963 	rbtdb_rdatatype_t matchtype, sigmatchtype;
4964 	nodelock_t *lock;
4965 	isc_rwlocktype_t locktype;
4966 	dns_rbtnodechain_t chain;
4967 
4968 	chain = search->chain;
4969 
4970 	matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0);
4971 	sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig,
4972 					     dns_rdatatype_nsec);
4973 
4974 	do {
4975 		node = NULL;
4976 		name = dns_fixedname_initname(&fname);
4977 		origin = dns_fixedname_initname(&forigin);
4978 		result = dns_rbtnodechain_current(&chain, name, origin, &node);
4979 		if (result != ISC_R_SUCCESS) {
4980 			return (result);
4981 		}
4982 		locktype = isc_rwlocktype_read;
4983 		lock = &(search->rbtdb->node_locks[node->locknum].lock);
4984 		NODE_LOCK(lock, locktype);
4985 		found = NULL;
4986 		foundsig = NULL;
4987 		empty_node = true;
4988 		header_prev = NULL;
4989 		for (header = node->data; header != NULL; header = header_next)
4990 		{
4991 			header_next = header->next;
4992 			if (check_stale_header(node, header, &locktype, lock,
4993 					       search, &header_prev))
4994 			{
4995 				continue;
4996 			}
4997 			if (NONEXISTENT(header) ||
4998 			    RBTDB_RDATATYPE_BASE(header->type) == 0)
4999 			{
5000 				header_prev = header;
5001 				continue;
5002 			}
5003 			/*
5004 			 * Don't stop on provable noqname / RRSIG.
5005 			 */
5006 			if (header->noqname == NULL &&
5007 			    RBTDB_RDATATYPE_BASE(header->type) !=
5008 				    dns_rdatatype_rrsig)
5009 			{
5010 				empty_node = false;
5011 			}
5012 			if (header->type == matchtype) {
5013 				found = header;
5014 			} else if (header->type == sigmatchtype) {
5015 				foundsig = header;
5016 			}
5017 			header_prev = header;
5018 		}
5019 		if (found != NULL) {
5020 			result = dns_name_concatenate(name, origin, foundname,
5021 						      NULL);
5022 			if (result != ISC_R_SUCCESS) {
5023 				goto unlock_node;
5024 			}
5025 			bind_rdataset(search->rbtdb, node, found, now, locktype,
5026 				      rdataset);
5027 			if (foundsig != NULL) {
5028 				bind_rdataset(search->rbtdb, node, foundsig,
5029 					      now, locktype, sigrdataset);
5030 			}
5031 			new_reference(search->rbtdb, node, locktype);
5032 			*nodep = node;
5033 			result = DNS_R_COVERINGNSEC;
5034 		} else if (!empty_node) {
5035 			result = ISC_R_NOTFOUND;
5036 		} else {
5037 			result = dns_rbtnodechain_prev(&chain, NULL, NULL);
5038 		}
5039 	unlock_node:
5040 		NODE_UNLOCK(lock, locktype);
5041 	} while (empty_node && result == ISC_R_SUCCESS);
5042 	return (result);
5043 }
5044 
5045 static isc_result_t
cache_find(dns_db_t * db,const dns_name_t * name,dns_dbversion_t * version,dns_rdatatype_t type,unsigned int options,isc_stdtime_t now,dns_dbnode_t ** nodep,dns_name_t * foundname,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset)5046 cache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
5047 	   dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
5048 	   dns_dbnode_t **nodep, dns_name_t *foundname,
5049 	   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
5050 	dns_rbtnode_t *node = NULL;
5051 	isc_result_t result;
5052 	rbtdb_search_t search;
5053 	bool cname_ok = true;
5054 	bool empty_node;
5055 	nodelock_t *lock;
5056 	isc_rwlocktype_t locktype;
5057 	rdatasetheader_t *header, *header_prev, *header_next;
5058 	rdatasetheader_t *found, *nsheader;
5059 	rdatasetheader_t *foundsig, *nssig, *cnamesig;
5060 	rdatasetheader_t *update, *updatesig;
5061 	rdatasetheader_t *nsecheader, *nsecsig;
5062 	rbtdb_rdatatype_t sigtype, negtype;
5063 
5064 	UNUSED(version);
5065 
5066 	search.rbtdb = (dns_rbtdb_t *)db;
5067 
5068 	REQUIRE(VALID_RBTDB(search.rbtdb));
5069 	REQUIRE(version == NULL);
5070 
5071 	if (now == 0) {
5072 		isc_stdtime_get(&now);
5073 	}
5074 
5075 	search.rbtversion = NULL;
5076 	search.serial = 1;
5077 	search.options = options;
5078 	search.copy_name = false;
5079 	search.need_cleanup = false;
5080 	search.wild = false;
5081 	search.zonecut = NULL;
5082 	dns_fixedname_init(&search.zonecut_name);
5083 	dns_rbtnodechain_init(&search.chain);
5084 	search.now = now;
5085 	update = NULL;
5086 	updatesig = NULL;
5087 
5088 	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5089 
5090 	/*
5091 	 * Search down from the root of the tree.  If, while going down, we
5092 	 * encounter a callback node, cache_zonecut_callback() will search the
5093 	 * rdatasets at the zone cut for a DNAME rdataset.
5094 	 */
5095 	result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
5096 				  &search.chain, DNS_RBTFIND_EMPTYDATA,
5097 				  cache_zonecut_callback, &search);
5098 
5099 	if (result == DNS_R_PARTIALMATCH) {
5100 		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0) {
5101 			result = find_coveringnsec(&search, nodep, now,
5102 						   foundname, rdataset,
5103 						   sigrdataset);
5104 			if (result == DNS_R_COVERINGNSEC) {
5105 				goto tree_exit;
5106 			}
5107 		}
5108 		if (search.zonecut != NULL) {
5109 			result = setup_delegation(&search, nodep, foundname,
5110 						  rdataset, sigrdataset);
5111 			goto tree_exit;
5112 		} else {
5113 		find_ns:
5114 			result = find_deepest_zonecut(&search, node, nodep,
5115 						      foundname, rdataset,
5116 						      sigrdataset);
5117 			goto tree_exit;
5118 		}
5119 	} else if (result != ISC_R_SUCCESS) {
5120 		goto tree_exit;
5121 	}
5122 
5123 	/*
5124 	 * Certain DNSSEC types are not subject to CNAME matching
5125 	 * (RFC4035, section 2.5 and RFC3007).
5126 	 *
5127 	 * We don't check for RRSIG, because we don't store RRSIG records
5128 	 * directly.
5129 	 */
5130 	if (type == dns_rdatatype_key || type == dns_rdatatype_nsec) {
5131 		cname_ok = false;
5132 	}
5133 
5134 	/*
5135 	 * We now go looking for rdata...
5136 	 */
5137 
5138 	lock = &(search.rbtdb->node_locks[node->locknum].lock);
5139 	locktype = isc_rwlocktype_read;
5140 	NODE_LOCK(lock, locktype);
5141 
5142 	found = NULL;
5143 	foundsig = NULL;
5144 	sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
5145 	negtype = RBTDB_RDATATYPE_VALUE(0, type);
5146 	nsheader = NULL;
5147 	nsecheader = NULL;
5148 	nssig = NULL;
5149 	nsecsig = NULL;
5150 	cnamesig = NULL;
5151 	empty_node = true;
5152 	header_prev = NULL;
5153 	for (header = node->data; header != NULL; header = header_next) {
5154 		header_next = header->next;
5155 		if (check_stale_header(node, header, &locktype, lock, &search,
5156 				       &header_prev))
5157 		{
5158 			/* Do nothing. */
5159 		} else if (EXISTS(header) && !ANCIENT(header)) {
5160 			/*
5161 			 * We now know that there is at least one active
5162 			 * non-stale rdataset at this node.
5163 			 */
5164 			empty_node = false;
5165 
5166 			/*
5167 			 * If we found a type we were looking for, remember
5168 			 * it.
5169 			 */
5170 			if (header->type == type ||
5171 			    (type == dns_rdatatype_any &&
5172 			     RBTDB_RDATATYPE_BASE(header->type) != 0) ||
5173 			    (cname_ok && header->type == dns_rdatatype_cname))
5174 			{
5175 				/*
5176 				 * We've found the answer.
5177 				 */
5178 				found = header;
5179 				if (header->type == dns_rdatatype_cname &&
5180 				    cname_ok && cnamesig != NULL)
5181 				{
5182 					/*
5183 					 * If we've already got the
5184 					 * CNAME RRSIG, use it.
5185 					 */
5186 					foundsig = cnamesig;
5187 				}
5188 			} else if (header->type == sigtype) {
5189 				/*
5190 				 * We've found the RRSIG rdataset for our
5191 				 * target type.  Remember it.
5192 				 */
5193 				foundsig = header;
5194 			} else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
5195 				   header->type == negtype)
5196 			{
5197 				/*
5198 				 * We've found a negative cache entry.
5199 				 */
5200 				found = header;
5201 			} else if (header->type == dns_rdatatype_ns) {
5202 				/*
5203 				 * Remember a NS rdataset even if we're
5204 				 * not specifically looking for it, because
5205 				 * we might need it later.
5206 				 */
5207 				nsheader = header;
5208 			} else if (header->type == RBTDB_RDATATYPE_SIGNS) {
5209 				/*
5210 				 * If we need the NS rdataset, we'll also
5211 				 * need its signature.
5212 				 */
5213 				nssig = header;
5214 			} else if (header->type == dns_rdatatype_nsec) {
5215 				nsecheader = header;
5216 			} else if (header->type == RBTDB_RDATATYPE_SIGNSEC) {
5217 				nsecsig = header;
5218 			} else if (cname_ok &&
5219 				   header->type == RBTDB_RDATATYPE_SIGCNAME)
5220 			{
5221 				/*
5222 				 * If we get a CNAME match, we'll also need
5223 				 * its signature.
5224 				 */
5225 				cnamesig = header;
5226 			}
5227 			header_prev = header;
5228 		} else {
5229 			header_prev = header;
5230 		}
5231 	}
5232 
5233 	if (empty_node) {
5234 		/*
5235 		 * We have an exact match for the name, but there are no
5236 		 * extant rdatasets.  That means that this node doesn't
5237 		 * meaningfully exist, and that we really have a partial match.
5238 		 */
5239 		NODE_UNLOCK(lock, locktype);
5240 		goto find_ns;
5241 	}
5242 
5243 	/*
5244 	 * If we didn't find what we were looking for...
5245 	 */
5246 	if (found == NULL ||
5247 	    (DNS_TRUST_ADDITIONAL(found->trust) &&
5248 	     ((options & DNS_DBFIND_ADDITIONALOK) == 0)) ||
5249 	    (found->trust == dns_trust_glue &&
5250 	     ((options & DNS_DBFIND_GLUEOK) == 0)) ||
5251 	    (DNS_TRUST_PENDING(found->trust) &&
5252 	     ((options & DNS_DBFIND_PENDINGOK) == 0)))
5253 	{
5254 		/*
5255 		 * Return covering NODATA NSEC record.
5256 		 */
5257 		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0 &&
5258 		    nsecheader != NULL)
5259 		{
5260 			if (nodep != NULL) {
5261 				new_reference(search.rbtdb, node, locktype);
5262 				*nodep = node;
5263 			}
5264 			bind_rdataset(search.rbtdb, node, nsecheader,
5265 				      search.now, locktype, rdataset);
5266 			if (need_headerupdate(nsecheader, search.now)) {
5267 				update = nsecheader;
5268 			}
5269 			if (nsecsig != NULL) {
5270 				bind_rdataset(search.rbtdb, node, nsecsig,
5271 					      search.now, locktype,
5272 					      sigrdataset);
5273 				if (need_headerupdate(nsecsig, search.now)) {
5274 					updatesig = nsecsig;
5275 				}
5276 			}
5277 			result = DNS_R_COVERINGNSEC;
5278 			goto node_exit;
5279 		}
5280 
5281 		/*
5282 		 * If there is an NS rdataset at this node, then this is the
5283 		 * deepest zone cut.
5284 		 */
5285 		if (nsheader != NULL) {
5286 			if (nodep != NULL) {
5287 				new_reference(search.rbtdb, node, locktype);
5288 				*nodep = node;
5289 			}
5290 			bind_rdataset(search.rbtdb, node, nsheader, search.now,
5291 				      locktype, rdataset);
5292 			if (need_headerupdate(nsheader, search.now)) {
5293 				update = nsheader;
5294 			}
5295 			if (nssig != NULL) {
5296 				bind_rdataset(search.rbtdb, node, nssig,
5297 					      search.now, locktype,
5298 					      sigrdataset);
5299 				if (need_headerupdate(nssig, search.now)) {
5300 					updatesig = nssig;
5301 				}
5302 			}
5303 			result = DNS_R_DELEGATION;
5304 			goto node_exit;
5305 		}
5306 
5307 		/*
5308 		 * Go find the deepest zone cut.
5309 		 */
5310 		NODE_UNLOCK(lock, locktype);
5311 		goto find_ns;
5312 	}
5313 
5314 	/*
5315 	 * We found what we were looking for, or we found a CNAME.
5316 	 */
5317 
5318 	if (nodep != NULL) {
5319 		new_reference(search.rbtdb, node, locktype);
5320 		*nodep = node;
5321 	}
5322 
5323 	if (NEGATIVE(found)) {
5324 		/*
5325 		 * We found a negative cache entry.
5326 		 */
5327 		if (NXDOMAIN(found)) {
5328 			result = DNS_R_NCACHENXDOMAIN;
5329 		} else {
5330 			result = DNS_R_NCACHENXRRSET;
5331 		}
5332 	} else if (type != found->type && type != dns_rdatatype_any &&
5333 		   found->type == dns_rdatatype_cname)
5334 	{
5335 		/*
5336 		 * We weren't doing an ANY query and we found a CNAME instead
5337 		 * of the type we were looking for, so we need to indicate
5338 		 * that result to the caller.
5339 		 */
5340 		result = DNS_R_CNAME;
5341 	} else {
5342 		/*
5343 		 * An ordinary successful query!
5344 		 */
5345 		result = ISC_R_SUCCESS;
5346 	}
5347 
5348 	if (type != dns_rdatatype_any || result == DNS_R_NCACHENXDOMAIN ||
5349 	    result == DNS_R_NCACHENXRRSET)
5350 	{
5351 		bind_rdataset(search.rbtdb, node, found, search.now, locktype,
5352 			      rdataset);
5353 		if (need_headerupdate(found, search.now)) {
5354 			update = found;
5355 		}
5356 		if (!NEGATIVE(found) && foundsig != NULL) {
5357 			bind_rdataset(search.rbtdb, node, foundsig, search.now,
5358 				      locktype, sigrdataset);
5359 			if (need_headerupdate(foundsig, search.now)) {
5360 				updatesig = foundsig;
5361 			}
5362 		}
5363 	}
5364 
5365 node_exit:
5366 	if ((update != NULL || updatesig != NULL) &&
5367 	    locktype != isc_rwlocktype_write)
5368 	{
5369 		NODE_UNLOCK(lock, locktype);
5370 		NODE_LOCK(lock, isc_rwlocktype_write);
5371 		locktype = isc_rwlocktype_write;
5372 		POST(locktype);
5373 	}
5374 	if (update != NULL && need_headerupdate(update, search.now)) {
5375 		update_header(search.rbtdb, update, search.now);
5376 	}
5377 	if (updatesig != NULL && need_headerupdate(updatesig, search.now)) {
5378 		update_header(search.rbtdb, updatesig, search.now);
5379 	}
5380 
5381 	NODE_UNLOCK(lock, locktype);
5382 
5383 tree_exit:
5384 	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5385 
5386 	/*
5387 	 * If we found a zonecut but aren't going to use it, we have to
5388 	 * let go of it.
5389 	 */
5390 	if (search.need_cleanup) {
5391 		node = search.zonecut;
5392 		INSIST(node != NULL);
5393 		lock = &(search.rbtdb->node_locks[node->locknum].lock);
5394 
5395 		NODE_LOCK(lock, isc_rwlocktype_read);
5396 		decrement_reference(search.rbtdb, node, 0, isc_rwlocktype_read,
5397 				    isc_rwlocktype_none, false);
5398 		NODE_UNLOCK(lock, isc_rwlocktype_read);
5399 	}
5400 
5401 	dns_rbtnodechain_reset(&search.chain);
5402 
5403 	update_cachestats(search.rbtdb, result);
5404 	return (result);
5405 }
5406 
5407 static isc_result_t
cache_findzonecut(dns_db_t * db,const dns_name_t * name,unsigned int options,isc_stdtime_t now,dns_dbnode_t ** nodep,dns_name_t * foundname,dns_name_t * dcname,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset)5408 cache_findzonecut(dns_db_t *db, const dns_name_t *name, unsigned int options,
5409 		  isc_stdtime_t now, dns_dbnode_t **nodep,
5410 		  dns_name_t *foundname, dns_name_t *dcname,
5411 		  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
5412 	dns_rbtnode_t *node = NULL;
5413 	nodelock_t *lock;
5414 	isc_result_t result;
5415 	rbtdb_search_t search;
5416 	rdatasetheader_t *header, *header_prev, *header_next;
5417 	rdatasetheader_t *found, *foundsig;
5418 	unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA;
5419 	isc_rwlocktype_t locktype;
5420 	bool dcnull = (dcname == NULL);
5421 
5422 	search.rbtdb = (dns_rbtdb_t *)db;
5423 
5424 	REQUIRE(VALID_RBTDB(search.rbtdb));
5425 
5426 	if (now == 0) {
5427 		isc_stdtime_get(&now);
5428 	}
5429 
5430 	search.rbtversion = NULL;
5431 	search.serial = 1;
5432 	search.options = options;
5433 	search.copy_name = false;
5434 	search.need_cleanup = false;
5435 	search.wild = false;
5436 	search.zonecut = NULL;
5437 	dns_fixedname_init(&search.zonecut_name);
5438 	dns_rbtnodechain_init(&search.chain);
5439 	search.now = now;
5440 
5441 	if (dcnull) {
5442 		dcname = foundname;
5443 	}
5444 
5445 	if ((options & DNS_DBFIND_NOEXACT) != 0) {
5446 		rbtoptions |= DNS_RBTFIND_NOEXACT;
5447 	}
5448 
5449 	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5450 
5451 	/*
5452 	 * Search down from the root of the tree.
5453 	 */
5454 	result = dns_rbt_findnode(search.rbtdb->tree, name, dcname, &node,
5455 				  &search.chain, rbtoptions, NULL, &search);
5456 
5457 	if (result == DNS_R_PARTIALMATCH) {
5458 		result = find_deepest_zonecut(&search, node, nodep, foundname,
5459 					      rdataset, sigrdataset);
5460 		goto tree_exit;
5461 	} else if (result != ISC_R_SUCCESS) {
5462 		goto tree_exit;
5463 	} else if (!dcnull) {
5464 		dns_name_copynf(dcname, foundname);
5465 	}
5466 
5467 	/*
5468 	 * We now go looking for an NS rdataset at the node.
5469 	 */
5470 
5471 	lock = &(search.rbtdb->node_locks[node->locknum].lock);
5472 	locktype = isc_rwlocktype_read;
5473 	NODE_LOCK(lock, locktype);
5474 
5475 	found = NULL;
5476 	foundsig = NULL;
5477 	header_prev = NULL;
5478 	for (header = node->data; header != NULL; header = header_next) {
5479 		header_next = header->next;
5480 		if (check_stale_header(node, header, &locktype, lock, &search,
5481 				       &header_prev))
5482 		{
5483 			/*
5484 			 * The function dns_rbt_findnode found us the a matching
5485 			 * node for 'name' and stored the result in 'dcname'.
5486 			 * This is the deepest known zonecut in our database.
5487 			 * However, this node may be stale and if serve-stale
5488 			 * is not enabled (in other words 'stale-answer-enable'
5489 			 * is set to no), this node may not be used as a
5490 			 * zonecut we know about. If so, find the deepest
5491 			 * zonecut from this node up and return that instead.
5492 			 */
5493 			NODE_UNLOCK(lock, locktype);
5494 			result = find_deepest_zonecut(&search, node, nodep,
5495 						      foundname, rdataset,
5496 						      sigrdataset);
5497 			dns_name_copynf(foundname, dcname);
5498 			goto tree_exit;
5499 		} else if (EXISTS(header) && !ANCIENT(header)) {
5500 			/*
5501 			 * If we found a type we were looking for, remember
5502 			 * it.
5503 			 */
5504 			if (header->type == dns_rdatatype_ns) {
5505 				/*
5506 				 * Remember a NS rdataset even if we're
5507 				 * not specifically looking for it, because
5508 				 * we might need it later.
5509 				 */
5510 				found = header;
5511 			} else if (header->type == RBTDB_RDATATYPE_SIGNS) {
5512 				/*
5513 				 * If we need the NS rdataset, we'll also
5514 				 * need its signature.
5515 				 */
5516 				foundsig = header;
5517 			}
5518 			header_prev = header;
5519 		} else {
5520 			header_prev = header;
5521 		}
5522 	}
5523 
5524 	if (found == NULL) {
5525 		/*
5526 		 * No NS records here.
5527 		 */
5528 		NODE_UNLOCK(lock, locktype);
5529 		result = find_deepest_zonecut(&search, node, nodep, foundname,
5530 					      rdataset, sigrdataset);
5531 		goto tree_exit;
5532 	}
5533 
5534 	if (nodep != NULL) {
5535 		new_reference(search.rbtdb, node, locktype);
5536 		*nodep = node;
5537 	}
5538 
5539 	bind_rdataset(search.rbtdb, node, found, search.now, locktype,
5540 		      rdataset);
5541 	if (foundsig != NULL) {
5542 		bind_rdataset(search.rbtdb, node, foundsig, search.now,
5543 			      locktype, sigrdataset);
5544 	}
5545 
5546 	if (need_headerupdate(found, search.now) ||
5547 	    (foundsig != NULL && need_headerupdate(foundsig, search.now)))
5548 	{
5549 		if (locktype != isc_rwlocktype_write) {
5550 			NODE_UNLOCK(lock, locktype);
5551 			NODE_LOCK(lock, isc_rwlocktype_write);
5552 			locktype = isc_rwlocktype_write;
5553 			POST(locktype);
5554 		}
5555 		if (need_headerupdate(found, search.now)) {
5556 			update_header(search.rbtdb, found, search.now);
5557 		}
5558 		if (foundsig != NULL && need_headerupdate(foundsig, search.now))
5559 		{
5560 			update_header(search.rbtdb, foundsig, search.now);
5561 		}
5562 	}
5563 
5564 	NODE_UNLOCK(lock, locktype);
5565 
5566 tree_exit:
5567 	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
5568 
5569 	INSIST(!search.need_cleanup);
5570 
5571 	dns_rbtnodechain_reset(&search.chain);
5572 
5573 	if (result == DNS_R_DELEGATION) {
5574 		result = ISC_R_SUCCESS;
5575 	}
5576 
5577 	return (result);
5578 }
5579 
5580 static void
attachnode(dns_db_t * db,dns_dbnode_t * source,dns_dbnode_t ** targetp)5581 attachnode(dns_db_t *db, dns_dbnode_t *source, dns_dbnode_t **targetp) {
5582 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5583 	dns_rbtnode_t *node = (dns_rbtnode_t *)source;
5584 
5585 	REQUIRE(VALID_RBTDB(rbtdb));
5586 	REQUIRE(targetp != NULL && *targetp == NULL);
5587 
5588 	isc_refcount_increment(&node->references);
5589 
5590 	*targetp = source;
5591 }
5592 
5593 static void
detachnode(dns_db_t * db,dns_dbnode_t ** targetp)5594 detachnode(dns_db_t *db, dns_dbnode_t **targetp) {
5595 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5596 	dns_rbtnode_t *node;
5597 	bool want_free = false;
5598 	bool inactive = false;
5599 	rbtdb_nodelock_t *nodelock;
5600 
5601 	REQUIRE(VALID_RBTDB(rbtdb));
5602 	REQUIRE(targetp != NULL && *targetp != NULL);
5603 
5604 	node = (dns_rbtnode_t *)(*targetp);
5605 	nodelock = &rbtdb->node_locks[node->locknum];
5606 
5607 	NODE_LOCK(&nodelock->lock, isc_rwlocktype_read);
5608 
5609 	if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
5610 				isc_rwlocktype_none, false))
5611 	{
5612 		if (isc_refcount_current(&nodelock->references) == 0 &&
5613 		    nodelock->exiting)
5614 		{
5615 			inactive = true;
5616 		}
5617 	}
5618 
5619 	NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
5620 
5621 	*targetp = NULL;
5622 
5623 	if (inactive) {
5624 		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5625 		rbtdb->active--;
5626 		if (rbtdb->active == 0) {
5627 			want_free = true;
5628 		}
5629 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5630 		if (want_free) {
5631 			char buf[DNS_NAME_FORMATSIZE];
5632 			if (dns_name_dynamic(&rbtdb->common.origin)) {
5633 				dns_name_format(&rbtdb->common.origin, buf,
5634 						sizeof(buf));
5635 			} else {
5636 				strlcpy(buf, "<UNKNOWN>", sizeof(buf));
5637 			}
5638 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
5639 				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
5640 				      "calling free_rbtdb(%s)", buf);
5641 			free_rbtdb(rbtdb, true, NULL);
5642 		}
5643 	}
5644 }
5645 
5646 static isc_result_t
expirenode(dns_db_t * db,dns_dbnode_t * node,isc_stdtime_t now)5647 expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) {
5648 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5649 	dns_rbtnode_t *rbtnode = node;
5650 	rdatasetheader_t *header;
5651 	bool force_expire = false;
5652 	/*
5653 	 * These are the category and module used by the cache cleaner.
5654 	 */
5655 	bool log = false;
5656 	isc_logcategory_t *category = DNS_LOGCATEGORY_DATABASE;
5657 	isc_logmodule_t *module = DNS_LOGMODULE_CACHE;
5658 	int level = ISC_LOG_DEBUG(2);
5659 	char printname[DNS_NAME_FORMATSIZE];
5660 
5661 	REQUIRE(VALID_RBTDB(rbtdb));
5662 
5663 	/*
5664 	 * Caller must hold a tree lock.
5665 	 */
5666 
5667 	if (now == 0) {
5668 		isc_stdtime_get(&now);
5669 	}
5670 
5671 	if (isc_mem_isovermem(rbtdb->common.mctx)) {
5672 		/*
5673 		 * Force expire with 25% probability.
5674 		 * XXXDCL Could stand to have a better policy, like LRU.
5675 		 */
5676 		force_expire = (rbtnode->down == NULL &&
5677 				(isc_random32() % 4) == 0);
5678 
5679 		/*
5680 		 * Note that 'log' can be true IFF overmem is also true.
5681 		 * overmem can currently only be true for cache
5682 		 * databases -- hence all of the "overmem cache" log strings.
5683 		 */
5684 		log = isc_log_wouldlog(dns_lctx, level);
5685 		if (log) {
5686 			isc_log_write(
5687 				dns_lctx, category, module, level,
5688 				"overmem cache: %s %s",
5689 				force_expire ? "FORCE" : "check",
5690 				dns_rbt_formatnodename(rbtnode, printname,
5691 						       sizeof(printname)));
5692 		}
5693 	}
5694 
5695 	/*
5696 	 * We may not need write access, but this code path is not performance
5697 	 * sensitive, so it should be okay to always lock as a writer.
5698 	 */
5699 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5700 		  isc_rwlocktype_write);
5701 
5702 	for (header = rbtnode->data; header != NULL; header = header->next) {
5703 		if (header->rdh_ttl + rbtdb->serve_stale_ttl <=
5704 		    now - RBTDB_VIRTUAL)
5705 		{
5706 			/*
5707 			 * We don't check if refcurrent(rbtnode) == 0 and try
5708 			 * to free like we do in cache_find(), because
5709 			 * refcurrent(rbtnode) must be non-zero.  This is so
5710 			 * because 'node' is an argument to the function.
5711 			 */
5712 			mark_header_ancient(rbtdb, header);
5713 			if (log) {
5714 				isc_log_write(dns_lctx, category, module, level,
5715 					      "overmem cache: ancient %s",
5716 					      printname);
5717 			}
5718 		} else if (force_expire) {
5719 			if (!RETAIN(header)) {
5720 				set_ttl(rbtdb, header, 0);
5721 				mark_header_ancient(rbtdb, header);
5722 			} else if (log) {
5723 				isc_log_write(dns_lctx, category, module, level,
5724 					      "overmem cache: "
5725 					      "reprieve by RETAIN() %s",
5726 					      printname);
5727 			}
5728 		} else if (isc_mem_isovermem(rbtdb->common.mctx) && log) {
5729 			isc_log_write(dns_lctx, category, module, level,
5730 				      "overmem cache: saved %s", printname);
5731 		}
5732 	}
5733 
5734 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5735 		    isc_rwlocktype_write);
5736 
5737 	return (ISC_R_SUCCESS);
5738 }
5739 
5740 static void
overmem(dns_db_t * db,bool over)5741 overmem(dns_db_t *db, bool over) {
5742 	/* This is an empty callback.  See adb.c:water() */
5743 
5744 	UNUSED(db);
5745 	UNUSED(over);
5746 
5747 	return;
5748 }
5749 
5750 static void
printnode(dns_db_t * db,dns_dbnode_t * node,FILE * out)5751 printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) {
5752 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5753 	dns_rbtnode_t *rbtnode = node;
5754 	bool first;
5755 	uint32_t refs;
5756 
5757 	REQUIRE(VALID_RBTDB(rbtdb));
5758 
5759 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5760 		  isc_rwlocktype_read);
5761 
5762 	refs = isc_refcount_current(&rbtnode->references);
5763 	fprintf(out, "node %p, %" PRIu32 " references, locknum = %u\n", rbtnode,
5764 		refs, rbtnode->locknum);
5765 	if (rbtnode->data != NULL) {
5766 		rdatasetheader_t *current, *top_next;
5767 
5768 		for (current = rbtnode->data; current != NULL;
5769 		     current = top_next)
5770 		{
5771 			top_next = current->next;
5772 			first = true;
5773 			fprintf(out, "\ttype %u", current->type);
5774 			do {
5775 				uint_least16_t attributes = atomic_load_acquire(
5776 					&current->attributes);
5777 				if (!first) {
5778 					fprintf(out, "\t");
5779 				}
5780 				first = false;
5781 				fprintf(out,
5782 					"\tserial = %lu, ttl = %u, "
5783 					"trust = %u, attributes = %" PRIuLEAST16
5784 					", "
5785 					"resign = %u\n",
5786 					(unsigned long)current->serial,
5787 					current->rdh_ttl, current->trust,
5788 					attributes,
5789 					(current->resign << 1) |
5790 						current->resign_lsb);
5791 				current = current->down;
5792 			} while (current != NULL);
5793 		}
5794 	} else {
5795 		fprintf(out, "(empty)\n");
5796 	}
5797 
5798 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5799 		    isc_rwlocktype_read);
5800 }
5801 
5802 static isc_result_t
createiterator(dns_db_t * db,unsigned int options,dns_dbiterator_t ** iteratorp)5803 createiterator(dns_db_t *db, unsigned int options,
5804 	       dns_dbiterator_t **iteratorp) {
5805 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5806 	rbtdb_dbiterator_t *rbtdbiter;
5807 
5808 	REQUIRE(VALID_RBTDB(rbtdb));
5809 
5810 	rbtdbiter = isc_mem_get(rbtdb->common.mctx, sizeof(*rbtdbiter));
5811 
5812 	rbtdbiter->common.methods = &dbiterator_methods;
5813 	rbtdbiter->common.db = NULL;
5814 	dns_db_attach(db, &rbtdbiter->common.db);
5815 	rbtdbiter->common.relative_names = ((options & DNS_DB_RELATIVENAMES) !=
5816 					    0);
5817 	rbtdbiter->common.magic = DNS_DBITERATOR_MAGIC;
5818 	rbtdbiter->common.cleaning = false;
5819 	rbtdbiter->paused = true;
5820 	rbtdbiter->tree_locked = isc_rwlocktype_none;
5821 	rbtdbiter->result = ISC_R_SUCCESS;
5822 	dns_fixedname_init(&rbtdbiter->name);
5823 	dns_fixedname_init(&rbtdbiter->origin);
5824 	rbtdbiter->node = NULL;
5825 	rbtdbiter->delcnt = 0;
5826 	rbtdbiter->nsec3only = ((options & DNS_DB_NSEC3ONLY) != 0);
5827 	rbtdbiter->nonsec3 = ((options & DNS_DB_NONSEC3) != 0);
5828 	memset(rbtdbiter->deletions, 0, sizeof(rbtdbiter->deletions));
5829 	dns_rbtnodechain_init(&rbtdbiter->chain);
5830 	dns_rbtnodechain_init(&rbtdbiter->nsec3chain);
5831 	if (rbtdbiter->nsec3only) {
5832 		rbtdbiter->current = &rbtdbiter->nsec3chain;
5833 	} else {
5834 		rbtdbiter->current = &rbtdbiter->chain;
5835 	}
5836 
5837 	*iteratorp = (dns_dbiterator_t *)rbtdbiter;
5838 
5839 	return (ISC_R_SUCCESS);
5840 }
5841 
5842 static isc_result_t
zone_findrdataset(dns_db_t * db,dns_dbnode_t * node,dns_dbversion_t * version,dns_rdatatype_t type,dns_rdatatype_t covers,isc_stdtime_t now,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset)5843 zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5844 		  dns_rdatatype_t type, dns_rdatatype_t covers,
5845 		  isc_stdtime_t now, dns_rdataset_t *rdataset,
5846 		  dns_rdataset_t *sigrdataset) {
5847 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5848 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5849 	rdatasetheader_t *header, *header_next, *found, *foundsig;
5850 	rbtdb_serial_t serial;
5851 	rbtdb_version_t *rbtversion = version;
5852 	bool close_version = false;
5853 	rbtdb_rdatatype_t matchtype, sigmatchtype;
5854 
5855 	REQUIRE(VALID_RBTDB(rbtdb));
5856 	REQUIRE(type != dns_rdatatype_any);
5857 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
5858 
5859 	if (rbtversion == NULL) {
5860 		currentversion(db, (dns_dbversion_t **)(void *)(&rbtversion));
5861 		close_version = true;
5862 	}
5863 	serial = rbtversion->serial;
5864 	now = 0;
5865 
5866 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5867 		  isc_rwlocktype_read);
5868 
5869 	found = NULL;
5870 	foundsig = NULL;
5871 	matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
5872 	if (covers == 0) {
5873 		sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
5874 	} else {
5875 		sigmatchtype = 0;
5876 	}
5877 
5878 	for (header = rbtnode->data; header != NULL; header = header_next) {
5879 		header_next = header->next;
5880 		do {
5881 			if (header->serial <= serial && !IGNORE(header)) {
5882 				/*
5883 				 * Is this a "this rdataset doesn't
5884 				 * exist" record?
5885 				 */
5886 				if (NONEXISTENT(header)) {
5887 					header = NULL;
5888 				}
5889 				break;
5890 			} else {
5891 				header = header->down;
5892 			}
5893 		} while (header != NULL);
5894 		if (header != NULL) {
5895 			/*
5896 			 * We have an active, extant rdataset.  If it's a
5897 			 * type we're looking for, remember it.
5898 			 */
5899 			if (header->type == matchtype) {
5900 				found = header;
5901 				if (foundsig != NULL) {
5902 					break;
5903 				}
5904 			} else if (header->type == sigmatchtype) {
5905 				foundsig = header;
5906 				if (found != NULL) {
5907 					break;
5908 				}
5909 			}
5910 		}
5911 	}
5912 	if (found != NULL) {
5913 		bind_rdataset(rbtdb, rbtnode, found, now, isc_rwlocktype_read,
5914 			      rdataset);
5915 		if (foundsig != NULL) {
5916 			bind_rdataset(rbtdb, rbtnode, foundsig, now,
5917 				      isc_rwlocktype_read, sigrdataset);
5918 		}
5919 	}
5920 
5921 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5922 		    isc_rwlocktype_read);
5923 
5924 	if (close_version) {
5925 		closeversion(db, (dns_dbversion_t **)(void *)(&rbtversion),
5926 			     false);
5927 	}
5928 
5929 	if (found == NULL) {
5930 		return (ISC_R_NOTFOUND);
5931 	}
5932 
5933 	return (ISC_R_SUCCESS);
5934 }
5935 
5936 static isc_result_t
cache_findrdataset(dns_db_t * db,dns_dbnode_t * node,dns_dbversion_t * version,dns_rdatatype_t type,dns_rdatatype_t covers,isc_stdtime_t now,dns_rdataset_t * rdataset,dns_rdataset_t * sigrdataset)5937 cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5938 		   dns_rdatatype_t type, dns_rdatatype_t covers,
5939 		   isc_stdtime_t now, dns_rdataset_t *rdataset,
5940 		   dns_rdataset_t *sigrdataset) {
5941 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5942 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5943 	rdatasetheader_t *header, *header_next, *found, *foundsig;
5944 	rbtdb_rdatatype_t matchtype, sigmatchtype, negtype;
5945 	isc_result_t result;
5946 	nodelock_t *lock;
5947 	isc_rwlocktype_t locktype;
5948 
5949 	REQUIRE(VALID_RBTDB(rbtdb));
5950 	REQUIRE(type != dns_rdatatype_any);
5951 
5952 	UNUSED(version);
5953 
5954 	result = ISC_R_SUCCESS;
5955 
5956 	if (now == 0) {
5957 		isc_stdtime_get(&now);
5958 	}
5959 
5960 	lock = &rbtdb->node_locks[rbtnode->locknum].lock;
5961 	locktype = isc_rwlocktype_read;
5962 	NODE_LOCK(lock, locktype);
5963 
5964 	found = NULL;
5965 	foundsig = NULL;
5966 	matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
5967 	negtype = RBTDB_RDATATYPE_VALUE(0, type);
5968 	if (covers == 0) {
5969 		sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
5970 	} else {
5971 		sigmatchtype = 0;
5972 	}
5973 
5974 	for (header = rbtnode->data; header != NULL; header = header_next) {
5975 		header_next = header->next;
5976 		if (!ACTIVE(header, now)) {
5977 			if ((header->rdh_ttl + rbtdb->serve_stale_ttl <
5978 			     now - RBTDB_VIRTUAL) &&
5979 			    (locktype == isc_rwlocktype_write ||
5980 			     NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS))
5981 			{
5982 				/*
5983 				 * We update the node's status only when we
5984 				 * can get write access.
5985 				 */
5986 				locktype = isc_rwlocktype_write;
5987 
5988 				/*
5989 				 * We don't check if refcurrent(rbtnode) == 0
5990 				 * and try to free like we do in cache_find(),
5991 				 * because refcurrent(rbtnode) must be
5992 				 * non-zero.  This is so because 'node' is an
5993 				 * argument to the function.
5994 				 */
5995 				mark_header_ancient(rbtdb, header);
5996 			}
5997 		} else if (EXISTS(header) && !ANCIENT(header)) {
5998 			if (header->type == matchtype) {
5999 				found = header;
6000 			} else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
6001 				   header->type == negtype)
6002 			{
6003 				found = header;
6004 			} else if (header->type == sigmatchtype) {
6005 				foundsig = header;
6006 			}
6007 		}
6008 	}
6009 	if (found != NULL) {
6010 		bind_rdataset(rbtdb, rbtnode, found, now, locktype, rdataset);
6011 		if (!NEGATIVE(found) && foundsig != NULL) {
6012 			bind_rdataset(rbtdb, rbtnode, foundsig, now, locktype,
6013 				      sigrdataset);
6014 		}
6015 	}
6016 
6017 	NODE_UNLOCK(lock, locktype);
6018 
6019 	if (found == NULL) {
6020 		return (ISC_R_NOTFOUND);
6021 	}
6022 
6023 	if (NEGATIVE(found)) {
6024 		/*
6025 		 * We found a negative cache entry.
6026 		 */
6027 		if (NXDOMAIN(found)) {
6028 			result = DNS_R_NCACHENXDOMAIN;
6029 		} else {
6030 			result = DNS_R_NCACHENXRRSET;
6031 		}
6032 	}
6033 
6034 	update_cachestats(rbtdb, result);
6035 
6036 	return (result);
6037 }
6038 
6039 static isc_result_t
allrdatasets(dns_db_t * db,dns_dbnode_t * node,dns_dbversion_t * version,unsigned int options,isc_stdtime_t now,dns_rdatasetiter_t ** iteratorp)6040 allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
6041 	     unsigned int options, isc_stdtime_t now,
6042 	     dns_rdatasetiter_t **iteratorp) {
6043 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
6044 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
6045 	rbtdb_version_t *rbtversion = version;
6046 	rbtdb_rdatasetiter_t *iterator;
6047 
6048 	REQUIRE(VALID_RBTDB(rbtdb));
6049 
6050 	iterator = isc_mem_get(rbtdb->common.mctx, sizeof(*iterator));
6051 
6052 	if ((db->attributes & DNS_DBATTR_CACHE) == 0) {
6053 		now = 0;
6054 		if (rbtversion == NULL) {
6055 			currentversion(
6056 				db, (dns_dbversion_t **)(void *)(&rbtversion));
6057 		} else {
6058 			INSIST(rbtversion->rbtdb == rbtdb);
6059 
6060 			(void)isc_refcount_increment(&rbtversion->references);
6061 		}
6062 	} else {
6063 		if (now == 0) {
6064 			isc_stdtime_get(&now);
6065 		}
6066 		rbtversion = NULL;
6067 	}
6068 
6069 	iterator->common.magic = DNS_RDATASETITER_MAGIC;
6070 	iterator->common.methods = &rdatasetiter_methods;
6071 	iterator->common.db = db;
6072 	iterator->common.node = node;
6073 	iterator->common.version = (dns_dbversion_t *)rbtversion;
6074 	iterator->common.options = options;
6075 	iterator->common.now = now;
6076 
6077 	isc_refcount_increment(&rbtnode->references);
6078 
6079 	iterator->current = NULL;
6080 
6081 	*iteratorp = (dns_rdatasetiter_t *)iterator;
6082 
6083 	return (ISC_R_SUCCESS);
6084 }
6085 
6086 static bool
cname_and_other_data(dns_rbtnode_t * node,rbtdb_serial_t serial)6087 cname_and_other_data(dns_rbtnode_t *node, rbtdb_serial_t serial) {
6088 	rdatasetheader_t *header, *header_next;
6089 	bool cname, other_data;
6090 	dns_rdatatype_t rdtype;
6091 
6092 	/*
6093 	 * The caller must hold the node lock.
6094 	 */
6095 
6096 	/*
6097 	 * Look for CNAME and "other data" rdatasets active in our version.
6098 	 */
6099 	cname = false;
6100 	other_data = false;
6101 	for (header = node->data; header != NULL; header = header_next) {
6102 		header_next = header->next;
6103 		if (header->type == dns_rdatatype_cname) {
6104 			/*
6105 			 * Look for an active extant CNAME.
6106 			 */
6107 			do {
6108 				if (header->serial <= serial && !IGNORE(header))
6109 				{
6110 					/*
6111 					 * Is this a "this rdataset doesn't
6112 					 * exist" record?
6113 					 */
6114 					if (NONEXISTENT(header)) {
6115 						header = NULL;
6116 					}
6117 					break;
6118 				} else {
6119 					header = header->down;
6120 				}
6121 			} while (header != NULL);
6122 			if (header != NULL) {
6123 				cname = true;
6124 			}
6125 		} else {
6126 			/*
6127 			 * Look for active extant "other data".
6128 			 *
6129 			 * "Other data" is any rdataset whose type is not
6130 			 * KEY, NSEC, SIG or RRSIG.
6131 			 */
6132 			rdtype = RBTDB_RDATATYPE_BASE(header->type);
6133 			if (rdtype != dns_rdatatype_key &&
6134 			    rdtype != dns_rdatatype_sig &&
6135 			    rdtype != dns_rdatatype_nsec &&
6136 			    rdtype != dns_rdatatype_rrsig)
6137 			{
6138 				/*
6139 				 * Is it active and extant?
6140 				 */
6141 				do {
6142 					if (header->serial <= serial &&
6143 					    !IGNORE(header))
6144 					{
6145 						/*
6146 						 * Is this a "this rdataset
6147 						 * doesn't exist" record?
6148 						 */
6149 						if (NONEXISTENT(header)) {
6150 							header = NULL;
6151 						}
6152 						break;
6153 					} else {
6154 						header = header->down;
6155 					}
6156 				} while (header != NULL);
6157 				if (header != NULL) {
6158 					other_data = true;
6159 				}
6160 			}
6161 		}
6162 	}
6163 
6164 	if (cname && other_data) {
6165 		return (true);
6166 	}
6167 
6168 	return (false);
6169 }
6170 
6171 static void
resign_insert(dns_rbtdb_t * rbtdb,int idx,rdatasetheader_t * newheader)6172 resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader) {
6173 	INSIST(!IS_CACHE(rbtdb));
6174 	INSIST(newheader->heap_index == 0);
6175 	INSIST(!ISC_LINK_LINKED(newheader, link));
6176 
6177 	isc_heap_insert(rbtdb->heaps[idx], newheader);
6178 }
6179 
6180 /*
6181  * node write lock must be held.
6182  */
6183 static void
resign_delete(dns_rbtdb_t * rbtdb,rbtdb_version_t * version,rdatasetheader_t * header)6184 resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
6185 	      rdatasetheader_t *header) {
6186 	/*
6187 	 * Remove the old header from the heap
6188 	 */
6189 	if (header != NULL && header->heap_index != 0) {
6190 		isc_heap_delete(rbtdb->heaps[header->node->locknum],
6191 				header->heap_index);
6192 		header->heap_index = 0;
6193 		if (version != NULL) {
6194 			new_reference(rbtdb, header->node,
6195 				      isc_rwlocktype_write);
6196 			ISC_LIST_APPEND(version->resigned_list, header, link);
6197 		}
6198 	}
6199 }
6200 
6201 static uint64_t
recordsize(rdatasetheader_t * header,unsigned int namelen)6202 recordsize(rdatasetheader_t *header, unsigned int namelen) {
6203 	return (dns_rdataslab_rdatasize((unsigned char *)header,
6204 					sizeof(*header)) +
6205 		sizeof(dns_ttl_t) + sizeof(dns_rdatatype_t) +
6206 		sizeof(dns_rdataclass_t) + namelen);
6207 }
6208 
6209 static void
update_recordsandxfrsize(bool add,rbtdb_version_t * rbtversion,rdatasetheader_t * header,unsigned int namelen)6210 update_recordsandxfrsize(bool add, rbtdb_version_t *rbtversion,
6211 			 rdatasetheader_t *header, unsigned int namelen) {
6212 	unsigned char *hdr = (unsigned char *)header;
6213 	size_t hdrsize = sizeof(*header);
6214 
6215 	RWLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
6216 	if (add) {
6217 		rbtversion->records += dns_rdataslab_count(hdr, hdrsize);
6218 		rbtversion->xfrsize += recordsize(header, namelen);
6219 	} else {
6220 		rbtversion->records -= dns_rdataslab_count(hdr, hdrsize);
6221 		rbtversion->xfrsize -= recordsize(header, namelen);
6222 	}
6223 	RWUNLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
6224 }
6225 
6226 /*
6227  * write lock on rbtnode must be held.
6228  */
6229 static isc_result_t
add32(dns_rbtdb_t * rbtdb,dns_rbtnode_t * rbtnode,const dns_name_t * nodename,rbtdb_version_t * rbtversion,rdatasetheader_t * newheader,unsigned int options,bool loading,dns_rdataset_t * addedrdataset,isc_stdtime_t now)6230 add32(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, const dns_name_t *nodename,
6231       rbtdb_version_t *rbtversion, rdatasetheader_t *newheader,
6232       unsigned int options, bool loading, dns_rdataset_t *addedrdataset,
6233       isc_stdtime_t now) {
6234 	rbtdb_changed_t *changed = NULL;
6235 	rdatasetheader_t *topheader = NULL, *topheader_prev = NULL;
6236 	rdatasetheader_t *header = NULL, *sigheader = NULL;
6237 	unsigned char *merged = NULL;
6238 	isc_result_t result;
6239 	bool header_nx;
6240 	bool newheader_nx;
6241 	bool merge;
6242 	dns_rdatatype_t rdtype, covers;
6243 	rbtdb_rdatatype_t negtype, sigtype;
6244 	dns_trust_t trust;
6245 	int idx;
6246 
6247 	/*
6248 	 * Add an rdatasetheader_t to a node.
6249 	 */
6250 
6251 	/*
6252 	 * Caller must be holding the node lock.
6253 	 */
6254 
6255 	if ((options & DNS_DBADD_MERGE) != 0) {
6256 		REQUIRE(rbtversion != NULL);
6257 		merge = true;
6258 	} else {
6259 		merge = false;
6260 	}
6261 
6262 	if ((options & DNS_DBADD_FORCE) != 0) {
6263 		trust = dns_trust_ultimate;
6264 	} else {
6265 		trust = newheader->trust;
6266 	}
6267 
6268 	if (rbtversion != NULL && !loading) {
6269 		/*
6270 		 * We always add a changed record, even if no changes end up
6271 		 * being made to this node, because it's harmless and
6272 		 * simplifies the code.
6273 		 */
6274 		changed = add_changed(rbtdb, rbtversion, rbtnode);
6275 		if (changed == NULL) {
6276 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6277 			return (ISC_R_NOMEMORY);
6278 		}
6279 	}
6280 
6281 	newheader_nx = NONEXISTENT(newheader) ? true : false;
6282 	topheader_prev = NULL;
6283 	sigheader = NULL;
6284 	negtype = 0;
6285 	if (rbtversion == NULL && !newheader_nx) {
6286 		rdtype = RBTDB_RDATATYPE_BASE(newheader->type);
6287 		covers = RBTDB_RDATATYPE_EXT(newheader->type);
6288 		sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, covers);
6289 		if (NEGATIVE(newheader)) {
6290 			/*
6291 			 * We're adding a negative cache entry.
6292 			 */
6293 			if (covers == dns_rdatatype_any) {
6294 				/*
6295 				 * If we're adding an negative cache entry
6296 				 * which covers all types (NXDOMAIN,
6297 				 * NODATA(QTYPE=ANY)),
6298 				 *
6299 				 * We make all other data ancient so that the
6300 				 * only rdataset that can be found at this
6301 				 * node is the negative cache entry.
6302 				 */
6303 				for (topheader = rbtnode->data;
6304 				     topheader != NULL;
6305 				     topheader = topheader->next)
6306 				{
6307 					set_ttl(rbtdb, topheader, 0);
6308 					mark_header_ancient(rbtdb, topheader);
6309 				}
6310 				goto find_header;
6311 			}
6312 			/*
6313 			 * Otherwise look for any RRSIGs of the given
6314 			 * type so they can be marked ancient later.
6315 			 */
6316 			for (topheader = rbtnode->data; topheader != NULL;
6317 			     topheader = topheader->next)
6318 			{
6319 				if (topheader->type == sigtype) {
6320 					sigheader = topheader;
6321 				}
6322 			}
6323 			negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
6324 		} else {
6325 			/*
6326 			 * We're adding something that isn't a
6327 			 * negative cache entry.  Look for an extant
6328 			 * non-ancient NXDOMAIN/NODATA(QTYPE=ANY) negative
6329 			 * cache entry.  If we're adding an RRSIG, also
6330 			 * check for an extant non-ancient NODATA ncache
6331 			 * entry which covers the same type as the RRSIG.
6332 			 */
6333 			for (topheader = rbtnode->data; topheader != NULL;
6334 			     topheader = topheader->next)
6335 			{
6336 				if ((topheader->type ==
6337 				     RBTDB_RDATATYPE_NCACHEANY) ||
6338 				    (newheader->type == sigtype &&
6339 				     topheader->type ==
6340 					     RBTDB_RDATATYPE_VALUE(0, covers)))
6341 				{
6342 					break;
6343 				}
6344 			}
6345 			if (topheader != NULL && EXISTS(topheader) &&
6346 			    ACTIVE(topheader, now))
6347 			{
6348 				/*
6349 				 * Found one.
6350 				 */
6351 				if (trust < topheader->trust) {
6352 					/*
6353 					 * The NXDOMAIN/NODATA(QTYPE=ANY)
6354 					 * is more trusted.
6355 					 */
6356 					free_rdataset(rbtdb, rbtdb->common.mctx,
6357 						      newheader);
6358 					if (addedrdataset != NULL) {
6359 						bind_rdataset(
6360 							rbtdb, rbtnode,
6361 							topheader, now,
6362 							isc_rwlocktype_write,
6363 							addedrdataset);
6364 					}
6365 					return (DNS_R_UNCHANGED);
6366 				}
6367 				/*
6368 				 * The new rdataset is better.  Expire the
6369 				 * ncache entry.
6370 				 */
6371 				set_ttl(rbtdb, topheader, 0);
6372 				mark_header_ancient(rbtdb, topheader);
6373 				topheader = NULL;
6374 				goto find_header;
6375 			}
6376 			negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
6377 		}
6378 	}
6379 
6380 	for (topheader = rbtnode->data; topheader != NULL;
6381 	     topheader = topheader->next)
6382 	{
6383 		if (topheader->type == newheader->type ||
6384 		    topheader->type == negtype)
6385 		{
6386 			break;
6387 		}
6388 		topheader_prev = topheader;
6389 	}
6390 
6391 find_header:
6392 	/*
6393 	 * If header isn't NULL, we've found the right type.  There may be
6394 	 * IGNORE rdatasets between the top of the chain and the first real
6395 	 * data.  We skip over them.
6396 	 */
6397 	header = topheader;
6398 	while (header != NULL && IGNORE(header)) {
6399 		header = header->down;
6400 	}
6401 	if (header != NULL) {
6402 		header_nx = NONEXISTENT(header) ? true : false;
6403 
6404 		/*
6405 		 * Deleting an already non-existent rdataset has no effect.
6406 		 */
6407 		if (header_nx && newheader_nx) {
6408 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6409 			return (DNS_R_UNCHANGED);
6410 		}
6411 
6412 		/*
6413 		 * Trying to add an rdataset with lower trust to a cache
6414 		 * DB has no effect, provided that the cache data isn't
6415 		 * stale. If the cache data is stale, new lower trust
6416 		 * data will supersede it below. Unclear what the best
6417 		 * policy is here.
6418 		 */
6419 		if (rbtversion == NULL && trust < header->trust &&
6420 		    (ACTIVE(header, now) || header_nx))
6421 		{
6422 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6423 			if (addedrdataset != NULL) {
6424 				bind_rdataset(rbtdb, rbtnode, header, now,
6425 					      isc_rwlocktype_write,
6426 					      addedrdataset);
6427 			}
6428 			return (DNS_R_UNCHANGED);
6429 		}
6430 
6431 		/*
6432 		 * Don't merge if a nonexistent rdataset is involved.
6433 		 */
6434 		if (merge && (header_nx || newheader_nx)) {
6435 			merge = false;
6436 		}
6437 
6438 		/*
6439 		 * If 'merge' is true, we'll try to create a new rdataset
6440 		 * that is the union of 'newheader' and 'header'.
6441 		 */
6442 		if (merge) {
6443 			unsigned int flags = 0;
6444 			INSIST(rbtversion->serial >= header->serial);
6445 			merged = NULL;
6446 			result = ISC_R_SUCCESS;
6447 
6448 			if ((options & DNS_DBADD_EXACT) != 0) {
6449 				flags |= DNS_RDATASLAB_EXACT;
6450 			}
6451 			/*
6452 			 * TTL use here is irrelevant to the cache;
6453 			 * merge is only done with zonedbs.
6454 			 */
6455 			if ((options & DNS_DBADD_EXACTTTL) != 0 &&
6456 			    newheader->rdh_ttl != header->rdh_ttl)
6457 			{
6458 				result = DNS_R_NOTEXACT;
6459 			} else if (newheader->rdh_ttl != header->rdh_ttl) {
6460 				flags |= DNS_RDATASLAB_FORCE;
6461 			}
6462 			if (result == ISC_R_SUCCESS) {
6463 				result = dns_rdataslab_merge(
6464 					(unsigned char *)header,
6465 					(unsigned char *)newheader,
6466 					(unsigned int)(sizeof(*newheader)),
6467 					rbtdb->common.mctx,
6468 					rbtdb->common.rdclass,
6469 					(dns_rdatatype_t)header->type, flags,
6470 					&merged);
6471 			}
6472 			if (result == ISC_R_SUCCESS) {
6473 				/*
6474 				 * If 'header' has the same serial number as
6475 				 * we do, we could clean it up now if we knew
6476 				 * that our caller had no references to it.
6477 				 * We don't know this, however, so we leave it
6478 				 * alone.  It will get cleaned up when
6479 				 * clean_zone_node() runs.
6480 				 */
6481 				free_rdataset(rbtdb, rbtdb->common.mctx,
6482 					      newheader);
6483 				newheader = (rdatasetheader_t *)merged;
6484 				init_rdataset(rbtdb, newheader);
6485 				update_newheader(newheader, header);
6486 				if (loading && RESIGN(newheader) &&
6487 				    RESIGN(header) &&
6488 				    resign_sooner(header, newheader))
6489 				{
6490 					newheader->resign = header->resign;
6491 					newheader->resign_lsb =
6492 						header->resign_lsb;
6493 				}
6494 			} else {
6495 				free_rdataset(rbtdb, rbtdb->common.mctx,
6496 					      newheader);
6497 				return (result);
6498 			}
6499 		}
6500 		/*
6501 		 * Don't replace existing NS, A and AAAA RRsets in the
6502 		 * cache if they are already exist. This prevents named
6503 		 * being locked to old servers. Don't lower trust of
6504 		 * existing record if the update is forced. Nothing
6505 		 * special to be done w.r.t stale data; it gets replaced
6506 		 * normally further down.
6507 		 */
6508 		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
6509 		    header->type == dns_rdatatype_ns && !header_nx &&
6510 		    !newheader_nx && header->trust >= newheader->trust &&
6511 		    dns_rdataslab_equalx((unsigned char *)header,
6512 					 (unsigned char *)newheader,
6513 					 (unsigned int)(sizeof(*newheader)),
6514 					 rbtdb->common.rdclass,
6515 					 (dns_rdatatype_t)header->type))
6516 		{
6517 			/*
6518 			 * Honour the new ttl if it is less than the
6519 			 * older one.
6520 			 */
6521 			if (header->rdh_ttl > newheader->rdh_ttl) {
6522 				set_ttl(rbtdb, header, newheader->rdh_ttl);
6523 			}
6524 			if (header->noqname == NULL &&
6525 			    newheader->noqname != NULL)
6526 			{
6527 				header->noqname = newheader->noqname;
6528 				newheader->noqname = NULL;
6529 			}
6530 			if (header->closest == NULL &&
6531 			    newheader->closest != NULL)
6532 			{
6533 				header->closest = newheader->closest;
6534 				newheader->closest = NULL;
6535 			}
6536 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6537 			if (addedrdataset != NULL) {
6538 				bind_rdataset(rbtdb, rbtnode, header, now,
6539 					      isc_rwlocktype_write,
6540 					      addedrdataset);
6541 			}
6542 			return (ISC_R_SUCCESS);
6543 		}
6544 		/*
6545 		 * If we have will be replacing a NS RRset force its TTL
6546 		 * to be no more than the current NS RRset's TTL.  This
6547 		 * ensures the delegations that are withdrawn are honoured.
6548 		 */
6549 		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
6550 		    header->type == dns_rdatatype_ns && !header_nx &&
6551 		    !newheader_nx && header->trust <= newheader->trust)
6552 		{
6553 			if (newheader->rdh_ttl > header->rdh_ttl) {
6554 				newheader->rdh_ttl = header->rdh_ttl;
6555 			}
6556 		}
6557 		if (IS_CACHE(rbtdb) && ACTIVE(header, now) &&
6558 		    (options & DNS_DBADD_PREFETCH) == 0 &&
6559 		    (header->type == dns_rdatatype_a ||
6560 		     header->type == dns_rdatatype_aaaa ||
6561 		     header->type == dns_rdatatype_ds ||
6562 		     header->type == RBTDB_RDATATYPE_SIGDS) &&
6563 		    !header_nx && !newheader_nx &&
6564 		    header->trust >= newheader->trust &&
6565 		    dns_rdataslab_equal((unsigned char *)header,
6566 					(unsigned char *)newheader,
6567 					(unsigned int)(sizeof(*newheader))))
6568 		{
6569 			/*
6570 			 * Honour the new ttl if it is less than the
6571 			 * older one.
6572 			 */
6573 			if (header->rdh_ttl > newheader->rdh_ttl) {
6574 				set_ttl(rbtdb, header, newheader->rdh_ttl);
6575 			}
6576 			if (header->noqname == NULL &&
6577 			    newheader->noqname != NULL)
6578 			{
6579 				header->noqname = newheader->noqname;
6580 				newheader->noqname = NULL;
6581 			}
6582 			if (header->closest == NULL &&
6583 			    newheader->closest != NULL)
6584 			{
6585 				header->closest = newheader->closest;
6586 				newheader->closest = NULL;
6587 			}
6588 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6589 			if (addedrdataset != NULL) {
6590 				bind_rdataset(rbtdb, rbtnode, header, now,
6591 					      isc_rwlocktype_write,
6592 					      addedrdataset);
6593 			}
6594 			return (ISC_R_SUCCESS);
6595 		}
6596 		INSIST(rbtversion == NULL ||
6597 		       rbtversion->serial >= topheader->serial);
6598 		if (loading) {
6599 			newheader->down = NULL;
6600 			idx = newheader->node->locknum;
6601 			if (IS_CACHE(rbtdb)) {
6602 				if (ZEROTTL(newheader)) {
6603 					ISC_LIST_APPEND(rbtdb->rdatasets[idx],
6604 							newheader, link);
6605 				} else {
6606 					ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
6607 							 newheader, link);
6608 				}
6609 				INSIST(rbtdb->heaps != NULL);
6610 				isc_heap_insert(rbtdb->heaps[idx], newheader);
6611 			} else if (RESIGN(newheader)) {
6612 				resign_insert(rbtdb, idx, newheader);
6613 				/*
6614 				 * Don't call resign_delete as we don't need
6615 				 * to reverse the delete.  The free_rdataset
6616 				 * call below will clean up the heap entry.
6617 				 */
6618 			}
6619 
6620 			/*
6621 			 * There are no other references to 'header' when
6622 			 * loading, so we MAY clean up 'header' now.
6623 			 * Since we don't generate changed records when
6624 			 * loading, we MUST clean up 'header' now.
6625 			 */
6626 			if (topheader_prev != NULL) {
6627 				topheader_prev->next = newheader;
6628 			} else {
6629 				rbtnode->data = newheader;
6630 			}
6631 			newheader->next = topheader->next;
6632 			if (rbtversion != NULL && !header_nx) {
6633 				update_recordsandxfrsize(false, rbtversion,
6634 							 header,
6635 							 nodename->length);
6636 			}
6637 			free_rdataset(rbtdb, rbtdb->common.mctx, header);
6638 		} else {
6639 			idx = newheader->node->locknum;
6640 			if (IS_CACHE(rbtdb)) {
6641 				INSIST(rbtdb->heaps != NULL);
6642 				isc_heap_insert(rbtdb->heaps[idx], newheader);
6643 				if (ZEROTTL(newheader)) {
6644 					ISC_LIST_APPEND(rbtdb->rdatasets[idx],
6645 							newheader, link);
6646 				} else {
6647 					ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
6648 							 newheader, link);
6649 				}
6650 			} else if (RESIGN(newheader)) {
6651 				resign_insert(rbtdb, idx, newheader);
6652 				resign_delete(rbtdb, rbtversion, header);
6653 			}
6654 			if (topheader_prev != NULL) {
6655 				topheader_prev->next = newheader;
6656 			} else {
6657 				rbtnode->data = newheader;
6658 			}
6659 			newheader->next = topheader->next;
6660 			newheader->down = topheader;
6661 			topheader->next = newheader;
6662 			rbtnode->dirty = 1;
6663 			if (changed != NULL) {
6664 				changed->dirty = true;
6665 			}
6666 			if (rbtversion == NULL) {
6667 				set_ttl(rbtdb, header, 0);
6668 				mark_header_ancient(rbtdb, header);
6669 				if (sigheader != NULL) {
6670 					set_ttl(rbtdb, sigheader, 0);
6671 					mark_header_ancient(rbtdb, sigheader);
6672 				}
6673 			}
6674 			if (rbtversion != NULL && !header_nx) {
6675 				update_recordsandxfrsize(false, rbtversion,
6676 							 header,
6677 							 nodename->length);
6678 			}
6679 		}
6680 	} else {
6681 		/*
6682 		 * No non-IGNORED rdatasets of the given type exist at
6683 		 * this node.
6684 		 */
6685 
6686 		/*
6687 		 * If we're trying to delete the type, don't bother.
6688 		 */
6689 		if (newheader_nx) {
6690 			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
6691 			return (DNS_R_UNCHANGED);
6692 		}
6693 
6694 		idx = newheader->node->locknum;
6695 		if (IS_CACHE(rbtdb)) {
6696 			isc_heap_insert(rbtdb->heaps[idx], newheader);
6697 			if (ZEROTTL(newheader)) {
6698 				ISC_LIST_APPEND(rbtdb->rdatasets[idx],
6699 						newheader, link);
6700 			} else {
6701 				ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
6702 						 newheader, link);
6703 			}
6704 		} else if (RESIGN(newheader)) {
6705 			resign_insert(rbtdb, idx, newheader);
6706 			resign_delete(rbtdb, rbtversion, header);
6707 		}
6708 
6709 		if (topheader != NULL) {
6710 			/*
6711 			 * We have an list of rdatasets of the given type,
6712 			 * but they're all marked IGNORE.  We simply insert
6713 			 * the new rdataset at the head of the list.
6714 			 *
6715 			 * Ignored rdatasets cannot occur during loading, so
6716 			 * we INSIST on it.
6717 			 */
6718 			INSIST(!loading);
6719 			INSIST(rbtversion == NULL ||
6720 			       rbtversion->serial >= topheader->serial);
6721 			if (topheader_prev != NULL) {
6722 				topheader_prev->next = newheader;
6723 			} else {
6724 				rbtnode->data = newheader;
6725 			}
6726 			newheader->next = topheader->next;
6727 			newheader->down = topheader;
6728 			topheader->next = newheader;
6729 			rbtnode->dirty = 1;
6730 			if (changed != NULL) {
6731 				changed->dirty = true;
6732 			}
6733 		} else {
6734 			/*
6735 			 * No rdatasets of the given type exist at the node.
6736 			 */
6737 			newheader->next = rbtnode->data;
6738 			newheader->down = NULL;
6739 			rbtnode->data = newheader;
6740 		}
6741 	}
6742 
6743 	if (rbtversion != NULL && !newheader_nx) {
6744 		update_recordsandxfrsize(true, rbtversion, newheader,
6745 					 nodename->length);
6746 	}
6747 
6748 	/*
6749 	 * Check if the node now contains CNAME and other data.
6750 	 */
6751 	if (rbtversion != NULL &&
6752 	    cname_and_other_data(rbtnode, rbtversion->serial))
6753 	{
6754 		return (DNS_R_CNAMEANDOTHER);
6755 	}
6756 
6757 	if (addedrdataset != NULL) {
6758 		bind_rdataset(rbtdb, rbtnode, newheader, now,
6759 			      isc_rwlocktype_write, addedrdataset);
6760 	}
6761 
6762 	return (ISC_R_SUCCESS);
6763 }
6764 
6765 static bool
delegating_type(dns_rbtdb_t * rbtdb,dns_rbtnode_t * node,rbtdb_rdatatype_t type)6766 delegating_type(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
6767 		rbtdb_rdatatype_t type) {
6768 	if (IS_CACHE(rbtdb)) {
6769 		if (type == dns_rdatatype_dname) {
6770 			return (true);
6771 		} else {
6772 			return (false);
6773 		}
6774 	} else if (type == dns_rdatatype_dname ||
6775 		   (type == dns_rdatatype_ns &&
6776 		    (node != rbtdb->origin_node || IS_STUB(rbtdb))))
6777 	{
6778 		return (true);
6779 	}
6780 	return (false);
6781 }
6782 
6783 static isc_result_t
addnoqname(dns_rbtdb_t * rbtdb,rdatasetheader_t * newheader,dns_rdataset_t * rdataset)6784 addnoqname(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
6785 	   dns_rdataset_t *rdataset) {
6786 	struct noqname *noqname;
6787 	isc_mem_t *mctx = rbtdb->common.mctx;
6788 	dns_name_t name;
6789 	dns_rdataset_t neg, negsig;
6790 	isc_result_t result;
6791 	isc_region_t r;
6792 
6793 	dns_name_init(&name, NULL);
6794 	dns_rdataset_init(&neg);
6795 	dns_rdataset_init(&negsig);
6796 
6797 	result = dns_rdataset_getnoqname(rdataset, &name, &neg, &negsig);
6798 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
6799 
6800 	noqname = isc_mem_get(mctx, sizeof(*noqname));
6801 	dns_name_init(&noqname->name, NULL);
6802 	noqname->neg = NULL;
6803 	noqname->negsig = NULL;
6804 	noqname->type = neg.type;
6805 	dns_name_dup(&name, mctx, &noqname->name);
6806 	result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0);
6807 	if (result != ISC_R_SUCCESS) {
6808 		goto cleanup;
6809 	}
6810 	noqname->neg = r.base;
6811 	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0);
6812 	if (result != ISC_R_SUCCESS) {
6813 		goto cleanup;
6814 	}
6815 	noqname->negsig = r.base;
6816 	dns_rdataset_disassociate(&neg);
6817 	dns_rdataset_disassociate(&negsig);
6818 	newheader->noqname = noqname;
6819 	return (ISC_R_SUCCESS);
6820 
6821 cleanup:
6822 	dns_rdataset_disassociate(&neg);
6823 	dns_rdataset_disassociate(&negsig);
6824 	free_noqname(mctx, &noqname);
6825 	return (result);
6826 }
6827 
6828 static isc_result_t
addclosest(dns_rbtdb_t * rbtdb,rdatasetheader_t * newheader,dns_rdataset_t * rdataset)6829 addclosest(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
6830 	   dns_rdataset_t *rdataset) {
6831 	struct noqname *closest;
6832 	isc_mem_t *mctx = rbtdb->common.mctx;
6833 	dns_name_t name;
6834 	dns_rdataset_t neg, negsig;
6835 	isc_result_t result;
6836 	isc_region_t r;
6837 
6838 	dns_name_init(&name, NULL);
6839 	dns_rdataset_init(&neg);
6840 	dns_rdataset_init(&negsig);
6841 
6842 	result = dns_rdataset_getclosest(rdataset, &name, &neg, &negsig);
6843 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
6844 
6845 	closest = isc_mem_get(mctx, sizeof(*closest));
6846 	dns_name_init(&closest->name, NULL);
6847 	closest->neg = NULL;
6848 	closest->negsig = NULL;
6849 	closest->type = neg.type;
6850 	dns_name_dup(&name, mctx, &closest->name);
6851 	result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0);
6852 	if (result != ISC_R_SUCCESS) {
6853 		goto cleanup;
6854 	}
6855 	closest->neg = r.base;
6856 	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0);
6857 	if (result != ISC_R_SUCCESS) {
6858 		goto cleanup;
6859 	}
6860 	closest->negsig = r.base;
6861 	dns_rdataset_disassociate(&neg);
6862 	dns_rdataset_disassociate(&negsig);
6863 	newheader->closest = closest;
6864 	return (ISC_R_SUCCESS);
6865 
6866 cleanup:
6867 	dns_rdataset_disassociate(&neg);
6868 	dns_rdataset_disassociate(&negsig);
6869 	free_noqname(mctx, &closest);
6870 	return (result);
6871 }
6872 
/*
 * Declared here, ahead of its use, so that the functions below can test
 * 'rbtdb->common.methods == &zone_methods' to recognise a database that
 * uses this method table.  The table's contents are not given at this
 * point (presumably it is initialized later in the file).
 */
static dns_dbmethods_t zone_methods;
6874 
6875 static size_t
rdataset_size(rdatasetheader_t * header)6876 rdataset_size(rdatasetheader_t *header) {
6877 	if (!NONEXISTENT(header)) {
6878 		return (dns_rdataslab_size((unsigned char *)header,
6879 					   sizeof(*header)));
6880 	}
6881 
6882 	return (sizeof(*header));
6883 }
6884 
6885 static isc_result_t
addrdataset(dns_db_t * db,dns_dbnode_t * node,dns_dbversion_t * version,isc_stdtime_t now,dns_rdataset_t * rdataset,unsigned int options,dns_rdataset_t * addedrdataset)6886 addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
6887 	    isc_stdtime_t now, dns_rdataset_t *rdataset, unsigned int options,
6888 	    dns_rdataset_t *addedrdataset) {
6889 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
6890 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
6891 	rbtdb_version_t *rbtversion = version;
6892 	isc_region_t region;
6893 	rdatasetheader_t *newheader;
6894 	rdatasetheader_t *header;
6895 	isc_result_t result;
6896 	bool delegating;
6897 	bool newnsec;
6898 	bool tree_locked = false;
6899 	bool cache_is_overmem = false;
6900 	dns_fixedname_t fixed;
6901 	dns_name_t *name;
6902 
6903 	REQUIRE(VALID_RBTDB(rbtdb));
6904 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
6905 
6906 	if (rbtdb->common.methods == &zone_methods) {
6907 		/*
6908 		 * SOA records are only allowed at top of zone.
6909 		 */
6910 		if (rdataset->type == dns_rdatatype_soa &&
6911 		    node != rbtdb->origin_node)
6912 		{
6913 			return (DNS_R_NOTZONETOP);
6914 		}
6915 		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6916 		REQUIRE(((rbtnode->nsec == DNS_RBT_NSEC_NSEC3 &&
6917 			  (rdataset->type == dns_rdatatype_nsec3 ||
6918 			   rdataset->covers == dns_rdatatype_nsec3)) ||
6919 			 (rbtnode->nsec != DNS_RBT_NSEC_NSEC3 &&
6920 			  rdataset->type != dns_rdatatype_nsec3 &&
6921 			  rdataset->covers != dns_rdatatype_nsec3)));
6922 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6923 	}
6924 
6925 	if (rbtversion == NULL) {
6926 		if (now == 0) {
6927 			isc_stdtime_get(&now);
6928 		}
6929 	} else {
6930 		now = 0;
6931 	}
6932 
6933 	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
6934 					    &region, sizeof(rdatasetheader_t));
6935 	if (result != ISC_R_SUCCESS) {
6936 		return (result);
6937 	}
6938 
6939 	name = dns_fixedname_initname(&fixed);
6940 	nodefullname(db, node, name);
6941 	dns_rdataset_getownercase(rdataset, name);
6942 
6943 	newheader = (rdatasetheader_t *)region.base;
6944 	init_rdataset(rbtdb, newheader);
6945 	setownercase(newheader, name);
6946 	set_ttl(rbtdb, newheader, rdataset->ttl + now);
6947 	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
6948 						rdataset->covers);
6949 	atomic_init(&newheader->attributes, 0);
6950 	if (rdataset->ttl == 0U) {
6951 		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_ZEROTTL);
6952 	}
6953 	newheader->noqname = NULL;
6954 	newheader->closest = NULL;
6955 	atomic_init(&newheader->count,
6956 		    atomic_fetch_add_relaxed(&init_count, 1));
6957 	newheader->trust = rdataset->trust;
6958 	newheader->last_used = now;
6959 	newheader->node = rbtnode;
6960 	if (rbtversion != NULL) {
6961 		newheader->serial = rbtversion->serial;
6962 		now = 0;
6963 
6964 		if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
6965 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
6966 			newheader->resign =
6967 				(isc_stdtime_t)(dns_time64_from32(
6968 							rdataset->resign) >>
6969 						1);
6970 			newheader->resign_lsb = rdataset->resign & 0x1;
6971 		} else {
6972 			newheader->resign = 0;
6973 			newheader->resign_lsb = 0;
6974 		}
6975 	} else {
6976 		newheader->serial = 1;
6977 		newheader->resign = 0;
6978 		newheader->resign_lsb = 0;
6979 		if ((rdataset->attributes & DNS_RDATASETATTR_PREFETCH) != 0) {
6980 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_PREFETCH);
6981 		}
6982 		if ((rdataset->attributes & DNS_RDATASETATTR_NEGATIVE) != 0) {
6983 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_NEGATIVE);
6984 		}
6985 		if ((rdataset->attributes & DNS_RDATASETATTR_NXDOMAIN) != 0) {
6986 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_NXDOMAIN);
6987 		}
6988 		if ((rdataset->attributes & DNS_RDATASETATTR_OPTOUT) != 0) {
6989 			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_OPTOUT);
6990 		}
6991 		if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) {
6992 			result = addnoqname(rbtdb, newheader, rdataset);
6993 			if (result != ISC_R_SUCCESS) {
6994 				free_rdataset(rbtdb, rbtdb->common.mctx,
6995 					      newheader);
6996 				return (result);
6997 			}
6998 		}
6999 		if ((rdataset->attributes & DNS_RDATASETATTR_CLOSEST) != 0) {
7000 			result = addclosest(rbtdb, newheader, rdataset);
7001 			if (result != ISC_R_SUCCESS) {
7002 				free_rdataset(rbtdb, rbtdb->common.mctx,
7003 					      newheader);
7004 				return (result);
7005 			}
7006 		}
7007 	}
7008 
7009 	/*
7010 	 * If we're adding a delegation type (e.g. NS or DNAME for a zone,
7011 	 * just DNAME for the cache), then we need to set the callback bit
7012 	 * on the node.
7013 	 */
7014 	if (delegating_type(rbtdb, rbtnode, rdataset->type)) {
7015 		delegating = true;
7016 	} else {
7017 		delegating = false;
7018 	}
7019 
7020 	/*
7021 	 * Add to the auxiliary NSEC tree if we're adding an NSEC record.
7022 	 */
7023 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7024 	if (rbtnode->nsec != DNS_RBT_NSEC_HAS_NSEC &&
7025 	    rdataset->type == dns_rdatatype_nsec)
7026 	{
7027 		newnsec = true;
7028 	} else {
7029 		newnsec = false;
7030 	}
7031 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7032 
7033 	/*
7034 	 * If we're adding a delegation type, adding to the auxiliary NSEC
7035 	 * tree, or the DB is a cache in an overmem state, hold an
7036 	 * exclusive lock on the tree.  In the latter case the lock does
7037 	 * not necessarily have to be acquired but it will help purge
7038 	 * ancient entries more effectively.
7039 	 */
7040 	if (IS_CACHE(rbtdb) && isc_mem_isovermem(rbtdb->common.mctx)) {
7041 		cache_is_overmem = true;
7042 	}
7043 	if (delegating || newnsec || cache_is_overmem) {
7044 		tree_locked = true;
7045 		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
7046 	}
7047 
7048 	if (cache_is_overmem) {
7049 		overmem_purge(rbtdb, rbtnode->locknum, rdataset_size(newheader),
7050 			      tree_locked);
7051 	}
7052 
7053 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7054 		  isc_rwlocktype_write);
7055 
7056 	if (rbtdb->rrsetstats != NULL) {
7057 		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_STATCOUNT);
7058 		update_rrsetstats(rbtdb, newheader->type,
7059 				  atomic_load_acquire(&newheader->attributes),
7060 				  true);
7061 	}
7062 
7063 	if (IS_CACHE(rbtdb)) {
7064 		if (tree_locked) {
7065 			cleanup_dead_nodes(rbtdb, rbtnode->locknum);
7066 		}
7067 
7068 		header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
7069 		if (header != NULL) {
7070 			dns_ttl_t rdh_ttl = header->rdh_ttl;
7071 
7072 			/* Only account for stale TTL if cache is not overmem */
7073 			if (!cache_is_overmem) {
7074 				rdh_ttl += rbtdb->serve_stale_ttl;
7075 			}
7076 
7077 			if (rdh_ttl < now - RBTDB_VIRTUAL) {
7078 				expire_header(rbtdb, header, tree_locked,
7079 					      expire_ttl);
7080 			}
7081 		}
7082 
7083 		/*
7084 		 * If we've been holding a write lock on the tree just for
7085 		 * cleaning, we can release it now.  However, we still need the
7086 		 * node lock.
7087 		 */
7088 		if (tree_locked && !delegating && !newnsec) {
7089 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
7090 			tree_locked = false;
7091 		}
7092 	}
7093 
7094 	result = ISC_R_SUCCESS;
7095 	if (newnsec) {
7096 		dns_rbtnode_t *nsecnode;
7097 
7098 		nsecnode = NULL;
7099 		result = dns_rbt_addnode(rbtdb->nsec, name, &nsecnode);
7100 		if (result == ISC_R_SUCCESS) {
7101 			nsecnode->nsec = DNS_RBT_NSEC_NSEC;
7102 			rbtnode->nsec = DNS_RBT_NSEC_HAS_NSEC;
7103 		} else if (result == ISC_R_EXISTS) {
7104 			rbtnode->nsec = DNS_RBT_NSEC_HAS_NSEC;
7105 			result = ISC_R_SUCCESS;
7106 		}
7107 	}
7108 
7109 	if (result == ISC_R_SUCCESS) {
7110 		result = add32(rbtdb, rbtnode, name, rbtversion, newheader,
7111 			       options, false, addedrdataset, now);
7112 	}
7113 	if (result == ISC_R_SUCCESS && delegating) {
7114 		rbtnode->find_callback = 1;
7115 	}
7116 
7117 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7118 		    isc_rwlocktype_write);
7119 
7120 	if (tree_locked) {
7121 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
7122 	}
7123 
7124 	/*
7125 	 * Update the zone's secure status.  If version is non-NULL
7126 	 * this is deferred until closeversion() is called.
7127 	 */
7128 	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
7129 		iszonesecure(db, version, rbtdb->origin_node);
7130 	}
7131 
7132 	return (result);
7133 }
7134 
/*
 * Remove the rdata in 'rdataset' from the rdataset of the same type at
 * 'node', in 'version' (which must not be NULL).
 *
 * A new header holding the remaining rdata is linked in front of the
 * existing chain for that type; if nothing would remain, a NONEXISTENT
 * header is linked instead and DNS_R_NXRRSET is returned.  If there is
 * no existing rdataset of that type the result is DNS_R_UNCHANGED, or
 * DNS_R_NOTEXACT when DNS_DBSUB_EXACT is set in 'options'.  With
 * DNS_DBSUB_EXACT a TTL mismatch also yields DNS_R_NOTEXACT.
 *
 * On ISC_R_SUCCESS 'newrdataset' (if not NULL) is bound to the new
 * header; on DNS_R_NXRRSET with DNS_DBSUB_WANTOLD it is bound to the
 * old one.
 */
static isc_result_t
subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
		 dns_rdataset_t *rdataset, unsigned int options,
		 dns_rdataset_t *newrdataset) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
	rbtdb_version_t *rbtversion = version;
	dns_fixedname_t fname;
	dns_name_t *nodename = dns_fixedname_initname(&fname);
	rdatasetheader_t *topheader, *topheader_prev, *header, *newheader;
	unsigned char *subresult;
	isc_region_t region;
	isc_result_t result;
	rbtdb_changed_t *changed;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(rbtversion != NULL && rbtversion->rbtdb == rbtdb);

	if (rbtdb->common.methods == &zone_methods) {
		/*
		 * NSEC3 data (and signatures covering it) belongs only on
		 * NSEC3 nodes, and nothing else belongs there.  The node's
		 * 'nsec' field is read under the tree lock.
		 */
		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
		REQUIRE(((rbtnode->nsec == DNS_RBT_NSEC_NSEC3 &&
			  (rdataset->type == dns_rdatatype_nsec3 ||
			   rdataset->covers == dns_rdatatype_nsec3)) ||
			 (rbtnode->nsec != DNS_RBT_NSEC_NSEC3 &&
			  rdataset->type != dns_rdatatype_nsec3 &&
			  rdataset->covers != dns_rdatatype_nsec3)));
		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	}

	nodefullname(db, node, nodename);

	/*
	 * Turn the rdata to be removed into a slab with a header in front,
	 * so it can be handed to dns_rdataslab_subtract() below.
	 */
	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
					    &region, sizeof(rdatasetheader_t));
	if (result != ISC_R_SUCCESS) {
		return (result);
	}
	newheader = (rdatasetheader_t *)region.base;
	init_rdataset(rbtdb, newheader);
	set_ttl(rbtdb, newheader, rdataset->ttl);
	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
						rdataset->covers);
	atomic_init(&newheader->attributes, 0);
	newheader->serial = rbtversion->serial;
	newheader->trust = 0;
	newheader->noqname = NULL;
	newheader->closest = NULL;
	atomic_init(&newheader->count,
		    atomic_fetch_add_relaxed(&init_count, 1));
	newheader->last_used = 0;
	newheader->node = rbtnode;
	if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
		newheader->resign =
			(isc_stdtime_t)(dns_time64_from32(rdataset->resign) >>
					1);
		newheader->resign_lsb = rdataset->resign & 0x1;
	} else {
		newheader->resign = 0;
		newheader->resign_lsb = 0;
	}

	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_write);

	changed = add_changed(rbtdb, rbtversion, rbtnode);
	if (changed == NULL) {
		free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
		NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
			    isc_rwlocktype_write);
		return (ISC_R_NOMEMORY);
	}

	/*
	 * Find the top header of the matching type, remembering its
	 * predecessor in the 'next' list so the new header can be spliced
	 * in at the same position.
	 */
	topheader_prev = NULL;
	for (topheader = rbtnode->data; topheader != NULL;
	     topheader = topheader->next)
	{
		if (topheader->type == newheader->type) {
			break;
		}
		topheader_prev = topheader;
	}
	/*
	 * If header isn't NULL, we've found the right type.  There may be
	 * IGNORE rdatasets between the top of the chain and the first real
	 * data.  We skip over them.
	 */
	header = topheader;
	while (header != NULL && IGNORE(header)) {
		header = header->down;
	}
	if (header != NULL && EXISTS(header)) {
		unsigned int flags = 0;
		subresult = NULL;
		result = ISC_R_SUCCESS;
		if ((options & DNS_DBSUB_EXACT) != 0) {
			flags |= DNS_RDATASLAB_EXACT;
			if (newheader->rdh_ttl != header->rdh_ttl) {
				result = DNS_R_NOTEXACT;
			}
		}
		if (result == ISC_R_SUCCESS) {
			result = dns_rdataslab_subtract(
				(unsigned char *)header,
				(unsigned char *)newheader,
				(unsigned int)(sizeof(*newheader)),
				rbtdb->common.mctx, rbtdb->common.rdclass,
				(dns_rdatatype_t)header->type, flags,
				&subresult);
		}
		if (result == ISC_R_SUCCESS) {
			/*
			 * Some rdata remains: the subtraction result
			 * replaces the temporary header built above.
			 */
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			newheader = (rdatasetheader_t *)subresult;
			init_rdataset(rbtdb, newheader);
			update_newheader(newheader, header);
			if (RESIGN(header)) {
				RDATASET_ATTR_SET(newheader,
						  RDATASET_ATTR_RESIGN);
				newheader->resign = header->resign;
				newheader->resign_lsb = header->resign_lsb;
				resign_insert(rbtdb, rbtnode->locknum,
					      newheader);
			}
			/*
			 * We have to set the serial since the rdataslab
			 * subtraction routine copies the reserved portion of
			 * header, not newheader.
			 */
			newheader->serial = rbtversion->serial;
			/*
			 * XXXJT: dns_rdataslab_subtract() copied the pointers
			 * to additional info.  We need to clear these fields
			 * to avoid having duplicated references.
			 *
			 * NOTE(review): no fields are cleared at this point;
			 * the statement below only updates the version's
			 * record/transfer-size accounting.  The comment
			 * above looks stale - confirm.
			 */
			update_recordsandxfrsize(true, rbtversion, newheader,
						 nodename->length);
		} else if (result == DNS_R_NXRRSET) {
			/*
			 * This subtraction would remove all of the rdata;
			 * add a nonexistent header instead.
			 */
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
			if (newheader == NULL) {
				result = ISC_R_NOMEMORY;
				goto unlock;
			}
			init_rdataset(rbtdb, newheader);
			set_ttl(rbtdb, newheader, 0);
			newheader->type = topheader->type;
			atomic_init(&newheader->attributes,
				    RDATASET_ATTR_NONEXISTENT);
			newheader->trust = 0;
			newheader->serial = rbtversion->serial;
			newheader->noqname = NULL;
			newheader->closest = NULL;
			atomic_init(&newheader->count, 0);
			newheader->node = rbtnode;
			newheader->resign = 0;
			newheader->resign_lsb = 0;
			newheader->last_used = 0;
		} else {
			/*
			 * DNS_R_NOTEXACT from the TTL check, or any other
			 * failure of the subtraction: nothing is changed.
			 */
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			goto unlock;
		}

		/*
		 * If we're here, we want to link newheader in front of
		 * topheader.
		 */
		INSIST(rbtversion->serial >= topheader->serial);
		/* The old header's records no longer count in this version. */
		update_recordsandxfrsize(false, rbtversion, header,
					 nodename->length);
		if (topheader_prev != NULL) {
			topheader_prev->next = newheader;
		} else {
			rbtnode->data = newheader;
		}
		newheader->next = topheader->next;
		newheader->down = topheader;
		/* The superseded top header's 'next' is pointed at its replacement. */
		topheader->next = newheader;
		rbtnode->dirty = 1;
		changed->dirty = true;
		resign_delete(rbtdb, rbtversion, header);
	} else {
		/*
		 * The rdataset doesn't exist, so we don't need to do anything
		 * to satisfy the deletion request.
		 */
		free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
		if ((options & DNS_DBSUB_EXACT) != 0) {
			result = DNS_R_NOTEXACT;
		} else {
			result = DNS_R_UNCHANGED;
		}
	}

	if (result == ISC_R_SUCCESS && newrdataset != NULL) {
		bind_rdataset(rbtdb, rbtnode, newheader, 0,
			      isc_rwlocktype_write, newrdataset);
	}

	/*
	 * Everything was removed: hand back the old rdataset if the caller
	 * asked for it.
	 */
	if (result == DNS_R_NXRRSET && newrdataset != NULL &&
	    (options & DNS_DBSUB_WANTOLD) != 0)
	{
		bind_rdataset(rbtdb, rbtnode, header, 0, isc_rwlocktype_write,
			      newrdataset);
	}

unlock:
	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_write);

	/*
	 * Update the zone's secure status.  If version is non-NULL
	 * this is deferred until closeversion() is called.
	 *
	 * NOTE(review): the REQUIRE at the top of this function rejects a
	 * NULL version, so this branch appears unreachable here.
	 */
	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
		version = rbtdb->current_version;
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
		iszonesecure(db, version, rbtdb->origin_node);
	}

	return (result);
}
7360 
7361 static isc_result_t
deleterdataset(dns_db_t * db,dns_dbnode_t * node,dns_dbversion_t * version,dns_rdatatype_t type,dns_rdatatype_t covers)7362 deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
7363 	       dns_rdatatype_t type, dns_rdatatype_t covers) {
7364 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
7365 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
7366 	rbtdb_version_t *rbtversion = version;
7367 	dns_fixedname_t fname;
7368 	dns_name_t *nodename = dns_fixedname_initname(&fname);
7369 	isc_result_t result;
7370 	rdatasetheader_t *newheader;
7371 
7372 	REQUIRE(VALID_RBTDB(rbtdb));
7373 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
7374 
7375 	if (type == dns_rdatatype_any) {
7376 		return (ISC_R_NOTIMPLEMENTED);
7377 	}
7378 	if (type == dns_rdatatype_rrsig && covers == 0) {
7379 		return (ISC_R_NOTIMPLEMENTED);
7380 	}
7381 
7382 	newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
7383 	if (newheader == NULL) {
7384 		return (ISC_R_NOMEMORY);
7385 	}
7386 	init_rdataset(rbtdb, newheader);
7387 	set_ttl(rbtdb, newheader, 0);
7388 	newheader->type = RBTDB_RDATATYPE_VALUE(type, covers);
7389 	atomic_init(&newheader->attributes, RDATASET_ATTR_NONEXISTENT);
7390 	newheader->trust = 0;
7391 	newheader->noqname = NULL;
7392 	newheader->closest = NULL;
7393 	if (rbtversion != NULL) {
7394 		newheader->serial = rbtversion->serial;
7395 	} else {
7396 		newheader->serial = 0;
7397 	}
7398 	atomic_init(&newheader->count, 0);
7399 	newheader->last_used = 0;
7400 	newheader->node = rbtnode;
7401 
7402 	nodefullname(db, node, nodename);
7403 
7404 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7405 		  isc_rwlocktype_write);
7406 	result = add32(rbtdb, rbtnode, nodename, rbtversion, newheader,
7407 		       DNS_DBADD_FORCE, false, NULL, 0);
7408 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
7409 		    isc_rwlocktype_write);
7410 
7411 	/*
7412 	 * Update the zone's secure status.  If version is non-NULL
7413 	 * this is deferred until closeversion() is called.
7414 	 */
7415 	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
7416 		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
7417 		version = rbtdb->current_version;
7418 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
7419 		iszonesecure(db, version, rbtdb->origin_node);
7420 	}
7421 
7422 	return (result);
7423 }
7424 
7425 /*
7426  * load a non-NSEC3 node in the main tree and optionally to the auxiliary NSEC
7427  */
7428 static isc_result_t
loadnode(dns_rbtdb_t * rbtdb,const dns_name_t * name,dns_rbtnode_t ** nodep,bool hasnsec)7429 loadnode(dns_rbtdb_t *rbtdb, const dns_name_t *name, dns_rbtnode_t **nodep,
7430 	 bool hasnsec) {
7431 	isc_result_t noderesult, nsecresult, tmpresult;
7432 	dns_rbtnode_t *nsecnode = NULL, *node = NULL;
7433 
7434 	noderesult = dns_rbt_addnode(rbtdb->tree, name, &node);
7435 	if (!hasnsec) {
7436 		goto done;
7437 	}
7438 	if (noderesult == ISC_R_EXISTS) {
7439 		/*
7440 		 * Add a node to the auxiliary NSEC tree for an old node
7441 		 * just now getting an NSEC record.
7442 		 */
7443 		if (node->nsec == DNS_RBT_NSEC_HAS_NSEC) {
7444 			goto done;
7445 		}
7446 	} else if (noderesult != ISC_R_SUCCESS) {
7447 		goto done;
7448 	}
7449 
7450 	/*
7451 	 * Build the auxiliary tree for NSECs as we go.
7452 	 * This tree speeds searches for closest NSECs that would otherwise
7453 	 * need to examine many irrelevant nodes in large TLDs.
7454 	 *
7455 	 * Add nodes to the auxiliary tree after corresponding nodes have
7456 	 * been added to the main tree.
7457 	 */
7458 	nsecresult = dns_rbt_addnode(rbtdb->nsec, name, &nsecnode);
7459 	if (nsecresult == ISC_R_SUCCESS) {
7460 		nsecnode->nsec = DNS_RBT_NSEC_NSEC;
7461 		node->nsec = DNS_RBT_NSEC_HAS_NSEC;
7462 		goto done;
7463 	}
7464 
7465 	if (nsecresult == ISC_R_EXISTS) {
7466 #if 1 /* 0 */
7467 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
7468 			      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
7469 			      "addnode: NSEC node already exists");
7470 #endif /* if 1 */
7471 		node->nsec = DNS_RBT_NSEC_HAS_NSEC;
7472 		goto done;
7473 	}
7474 
7475 	if (noderesult == ISC_R_SUCCESS) {
7476 		/*
7477 		 * Remove the node we just added above.
7478 		 */
7479 		tmpresult = dns_rbt_deletenode(rbtdb->tree, node, false);
7480 		if (tmpresult != ISC_R_SUCCESS) {
7481 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
7482 				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
7483 				      "loading_addrdataset: "
7484 				      "dns_rbt_deletenode: %s after "
7485 				      "dns_rbt_addnode(NSEC): %s",
7486 				      isc_result_totext(tmpresult),
7487 				      isc_result_totext(noderesult));
7488 		}
7489 	}
7490 
7491 	/*
7492 	 * Set the error condition to be returned.
7493 	 */
7494 	noderesult = nsecresult;
7495 
7496 done:
7497 	if (noderesult == ISC_R_SUCCESS || noderesult == ISC_R_EXISTS) {
7498 		*nodep = node;
7499 	}
7500 
7501 	return (noderesult);
7502 }
7503 
7504 static isc_result_t
loading_addrdataset(void * arg,const dns_name_t * name,dns_rdataset_t * rdataset)7505 loading_addrdataset(void *arg, const dns_name_t *name,
7506 		    dns_rdataset_t *rdataset) {
7507 	rbtdb_load_t *loadctx = arg;
7508 	dns_rbtdb_t *rbtdb = loadctx->rbtdb;
7509 	dns_rbtnode_t *node;
7510 	isc_result_t result;
7511 	isc_region_t region;
7512 	rdatasetheader_t *newheader;
7513 
7514 	REQUIRE(rdataset->rdclass == rbtdb->common.rdclass);
7515 
7516 	/*
7517 	 * SOA records are only allowed at top of zone.
7518 	 */
7519 	if (rdataset->type == dns_rdatatype_soa && !IS_CACHE(rbtdb) &&
7520 	    !dns_name_equal(name, &rbtdb->common.origin))
7521 	{
7522 		return (DNS_R_NOTZONETOP);
7523 	}
7524 
7525 	if (rdataset->type != dns_rdatatype_nsec3 &&
7526 	    rdataset->covers != dns_rdatatype_nsec3)
7527 	{
7528 		add_empty_wildcards(rbtdb, name, false);
7529 	}
7530 
7531 	if (dns_name_iswildcard(name)) {
7532 		/*
7533 		 * NS record owners cannot legally be wild cards.
7534 		 */
7535 		if (rdataset->type == dns_rdatatype_ns) {
7536 			return (DNS_R_INVALIDNS);
7537 		}
7538 		/*
7539 		 * NSEC3 record owners cannot legally be wild cards.
7540 		 */
7541 		if (rdataset->type == dns_rdatatype_nsec3) {
7542 			return (DNS_R_INVALIDNSEC3);
7543 		}
7544 		result = add_wildcard_magic(rbtdb, name, false);
7545 		if (result != ISC_R_SUCCESS) {
7546 			return (result);
7547 		}
7548 	}
7549 
7550 	node = NULL;
7551 	if (rdataset->type == dns_rdatatype_nsec3 ||
7552 	    rdataset->covers == dns_rdatatype_nsec3)
7553 	{
7554 		result = dns_rbt_addnode(rbtdb->nsec3, name, &node);
7555 		if (result == ISC_R_SUCCESS) {
7556 			node->nsec = DNS_RBT_NSEC_NSEC3;
7557 		}
7558 	} else if (rdataset->type == dns_rdatatype_nsec) {
7559 		result = loadnode(rbtdb, name, &node, true);
7560 	} else {
7561 		result = loadnode(rbtdb, name, &node, false);
7562 	}
7563 	if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
7564 		return (result);
7565 	}
7566 	if (result == ISC_R_SUCCESS) {
7567 		node->locknum = node->hashval % rbtdb->node_lock_count;
7568 	}
7569 
7570 	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
7571 					    &region, sizeof(rdatasetheader_t));
7572 	if (result != ISC_R_SUCCESS) {
7573 		return (result);
7574 	}
7575 	newheader = (rdatasetheader_t *)region.base;
7576 	init_rdataset(rbtdb, newheader);
7577 	set_ttl(rbtdb, newheader, rdataset->ttl + loadctx->now); /* XXX overflow
7578 								  * check */
7579 	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
7580 						rdataset->covers);
7581 	atomic_init(&newheader->attributes, 0);
7582 	newheader->trust = rdataset->trust;
7583 	newheader->serial = 1;
7584 	newheader->noqname = NULL;
7585 	newheader->closest = NULL;
7586 	atomic_init(&newheader->count,
7587 		    atomic_fetch_add_relaxed(&init_count, 1));
7588 	newheader->last_used = 0;
7589 	newheader->node = node;
7590 	setownercase(newheader, name);
7591 
7592 	if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
7593 		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
7594 		newheader->resign =
7595 			(isc_stdtime_t)(dns_time64_from32(rdataset->resign) >>
7596 					1);
7597 		newheader->resign_lsb = rdataset->resign & 0x1;
7598 	} else {
7599 		newheader->resign = 0;
7600 		newheader->resign_lsb = 0;
7601 	}
7602 
7603 	NODE_LOCK(&rbtdb->node_locks[node->locknum].lock, isc_rwlocktype_write);
7604 	result = add32(rbtdb, node, name, rbtdb->current_version, newheader,
7605 		       DNS_DBADD_MERGE, true, NULL, 0);
7606 	NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
7607 		    isc_rwlocktype_write);
7608 
7609 	if (result == ISC_R_SUCCESS &&
7610 	    delegating_type(rbtdb, node, rdataset->type))
7611 	{
7612 		node->find_callback = 1;
7613 	} else if (result == DNS_R_UNCHANGED) {
7614 		result = ISC_R_SUCCESS;
7615 	}
7616 
7617 	return (result);
7618 }
7619 
/*
 * Data-fixing callback handed to dns_rbt_deserialize_tree(): walk the
 * chain of rdataset headers hanging off 'rbtnode', fold each header and
 * its slab into '*crc', and rewrite the per-header fields for in-memory
 * use.
 *
 * 'base' and 'filesize' describe the mapped file image; 'arg' is the
 * dns_rbtdb_t being loaded.  Each header's 'next' field is checked
 * against the position computed from the current header (its offset
 * from 'base' plus its aligned size) and then replaced by a real
 * pointer.
 *
 * Returns ISC_R_INVALIDFILE if a 'next' value does not match that
 * computed position or the resulting pointer lies outside
 * ['base', 'base' + 'filesize'], ISC_R_SUCCESS otherwise.
 */
static isc_result_t
rbt_datafixer(dns_rbtnode_t *rbtnode, void *base, size_t filesize, void *arg,
	      uint64_t *crc) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)arg;
	rdatasetheader_t *header;
	unsigned char *limit = ((unsigned char *)base) + filesize;

	REQUIRE(rbtnode != NULL);
	REQUIRE(VALID_RBTDB(rbtdb));

	for (header = rbtnode->data; header != NULL; header = header->next) {
		unsigned char *p = (unsigned char *)header;
		size_t size = dns_rdataslab_size(p, sizeof(*header));
		/*
		 * Checksum the header and slab before any field below is
		 * modified.
		 */
		isc_crc64_update(crc, p, size);
#ifdef DEBUG
		hexdump("hashing header", p, sizeof(rdatasetheader_t));
		hexdump("hashing slab", p + sizeof(rdatasetheader_t),
			size - sizeof(rdatasetheader_t));
#endif /* ifdef DEBUG */
		header->serial = 1;
		header->is_mmapped = 1;
		header->node = rbtnode;
		header->node_is_relative = 0;

		/*
		 * Headers flagged for re-signing with a non-zero resign
		 * time go into the heap of the node's lock bucket.
		 */
		if (RESIGN(header) &&
		    (header->resign != 0 || header->resign_lsb != 0))
		{
			int idx = header->node->locknum;
			isc_heap_insert(rbtdb->heaps[idx], header);
		}

		if (header->next != NULL) {
			/*
			 * 'next' is compared, as an integer, with this
			 * header's offset from 'base' plus its aligned
			 * size; anything else is rejected.
			 */
			size_t cooked = dns_rbt_serialize_align(size);
			if ((uintptr_t)header->next !=
			    (p - (unsigned char *)base) + cooked)
			{
				return (ISC_R_INVALIDFILE);
			}
			header->next = (rdatasetheader_t *)(p + cooked);
			header->next_is_relative = 0;
			/*
			 * NOTE(review): a 'next' exactly equal to 'limit'
			 * passes this check and is dereferenced on the
			 * following iteration - confirm whether the
			 * comparison should also require room for a
			 * header.
			 */
			if ((header->next < (rdatasetheader_t *)base) ||
			    (header->next > (rdatasetheader_t *)limit))
			{
				return (ISC_R_INVALIDFILE);
			}
		}

		update_recordsandxfrsize(true, rbtdb->current_version, header,
					 rbtnode->fullnamelen);
	}

	/* We're done deserializing; clear fullnamelen */
	rbtnode->fullnamelen = 0;

	return (ISC_R_SUCCESS);
}
7676 
7677 /*
7678  * Load the RBT database from the image in 'f'
7679  */
7680 static isc_result_t
deserialize(void * arg,FILE * f,off_t offset)7681 deserialize(void *arg, FILE *f, off_t offset) {
7682 	isc_result_t result;
7683 	rbtdb_load_t *loadctx = arg;
7684 	dns_rbtdb_t *rbtdb = loadctx->rbtdb;
7685 	rbtdb_file_header_t *header;
7686 	int fd;
7687 	off_t filesize = 0;
7688 	char *base;
7689 	dns_rbt_t *tree = NULL, *nsec = NULL, *nsec3 = NULL;
7690 	int protect, flags;
7691 	dns_rbtnode_t *origin_node = NULL;
7692 
7693 	REQUIRE(VALID_RBTDB(rbtdb));
7694 
7695 	/*
7696 	 * TODO CKB: since this is read-write (had to be to add nodes later)
7697 	 * we will need to lock the file or the nodes in it before modifying
7698 	 * the nodes in the file.
7699 	 */
7700 
7701 	/* Map in the whole file in one go */
7702 	fd = fileno(f);
7703 	isc_file_getsizefd(fd, &filesize);
7704 	protect = PROT_READ | PROT_WRITE;
7705 	flags = MAP_PRIVATE;
7706 #ifdef MAP_FILE
7707 	flags |= MAP_FILE;
7708 #endif /* ifdef MAP_FILE */
7709 
7710 	base = isc_file_mmap(NULL, filesize, protect, flags, fd, 0);
7711 	if (base == NULL || base == MAP_FAILED) {
7712 		return (ISC_R_FAILURE);
7713 	}
7714 
7715 	header = (rbtdb_file_header_t *)(base + offset);
7716 	if (!match_header_version(header)) {
7717 		result = ISC_R_INVALIDFILE;
7718 		goto cleanup;
7719 	}
7720 
7721 	if (header->tree != 0) {
7722 		result = dns_rbt_deserialize_tree(
7723 			base, filesize, (off_t)header->tree, rbtdb->common.mctx,
7724 			delete_callback, rbtdb, rbt_datafixer, rbtdb, NULL,
7725 			&tree);
7726 		if (result != ISC_R_SUCCESS) {
7727 			goto cleanup;
7728 		}
7729 
7730 		result = dns_rbt_findnode(tree, &rbtdb->common.origin, NULL,
7731 					  &origin_node, NULL,
7732 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
7733 		if (result != ISC_R_SUCCESS) {
7734 			goto cleanup;
7735 		}
7736 	}
7737 
7738 	if (header->nsec != 0) {
7739 		result = dns_rbt_deserialize_tree(
7740 			base, filesize, (off_t)header->nsec, rbtdb->common.mctx,
7741 			delete_callback, rbtdb, rbt_datafixer, rbtdb, NULL,
7742 			&nsec);
7743 		if (result != ISC_R_SUCCESS) {
7744 			goto cleanup;
7745 		}
7746 	}
7747 
7748 	if (header->nsec3 != 0) {
7749 		result = dns_rbt_deserialize_tree(
7750 			base, filesize, (off_t)header->nsec3,
7751 			rbtdb->common.mctx, delete_callback, rbtdb,
7752 			rbt_datafixer, rbtdb, NULL, &nsec3);
7753 		if (result != ISC_R_SUCCESS) {
7754 			goto cleanup;
7755 		}
7756 	}
7757 
7758 	/*
	 * We have successfully loaded all of the rbt trees; now update
	 * rbtdb to use them.
7761 	 */
7762 
7763 	rbtdb->mmap_location = base;
7764 	rbtdb->mmap_size = (size_t)filesize;
7765 
7766 	if (tree != NULL) {
7767 		dns_rbt_destroy(&rbtdb->tree);
7768 		rbtdb->tree = tree;
7769 		rbtdb->origin_node = origin_node;
7770 	}
7771 
7772 	if (nsec != NULL) {
7773 		dns_rbt_destroy(&rbtdb->nsec);
7774 		rbtdb->nsec = nsec;
7775 	}
7776 
7777 	if (nsec3 != NULL) {
7778 		dns_rbt_destroy(&rbtdb->nsec3);
7779 		rbtdb->nsec3 = nsec3;
7780 	}
7781 
7782 	return (ISC_R_SUCCESS);
7783 
7784 cleanup:
7785 	if (tree != NULL) {
7786 		dns_rbt_destroy(&tree);
7787 	}
7788 	if (nsec != NULL) {
7789 		dns_rbt_destroy(&nsec);
7790 	}
7791 	if (nsec3 != NULL) {
7792 		dns_rbt_destroy(&nsec3);
7793 	}
7794 	isc_file_munmap(base, (size_t)filesize);
7795 	return (result);
7796 }
7797 
7798 static isc_result_t
beginload(dns_db_t * db,dns_rdatacallbacks_t * callbacks)7799 beginload(dns_db_t *db, dns_rdatacallbacks_t *callbacks) {
7800 	rbtdb_load_t *loadctx;
7801 	dns_rbtdb_t *rbtdb;
7802 	rbtdb = (dns_rbtdb_t *)db;
7803 
7804 	REQUIRE(DNS_CALLBACK_VALID(callbacks));
7805 	REQUIRE(VALID_RBTDB(rbtdb));
7806 
7807 	loadctx = isc_mem_get(rbtdb->common.mctx, sizeof(*loadctx));
7808 
7809 	loadctx->rbtdb = rbtdb;
7810 	if (IS_CACHE(rbtdb)) {
7811 		isc_stdtime_get(&loadctx->now);
7812 	} else {
7813 		loadctx->now = 0;
7814 	}
7815 
7816 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
7817 
7818 	REQUIRE((rbtdb->attributes &
7819 		 (RBTDB_ATTR_LOADED | RBTDB_ATTR_LOADING)) == 0);
7820 	rbtdb->attributes |= RBTDB_ATTR_LOADING;
7821 
7822 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7823 
7824 	callbacks->add = loading_addrdataset;
7825 	callbacks->add_private = loadctx;
7826 	callbacks->deserialize = deserialize;
7827 	callbacks->deserialize_private = loadctx;
7828 
7829 	return (ISC_R_SUCCESS);
7830 }
7831 
7832 static isc_result_t
endload(dns_db_t * db,dns_rdatacallbacks_t * callbacks)7833 endload(dns_db_t *db, dns_rdatacallbacks_t *callbacks) {
7834 	rbtdb_load_t *loadctx;
7835 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
7836 
7837 	REQUIRE(VALID_RBTDB(rbtdb));
7838 	REQUIRE(DNS_CALLBACK_VALID(callbacks));
7839 	loadctx = callbacks->add_private;
7840 	REQUIRE(loadctx != NULL);
7841 	REQUIRE(loadctx->rbtdb == rbtdb);
7842 
7843 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
7844 
7845 	REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADING) != 0);
7846 	REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADED) == 0);
7847 
7848 	rbtdb->attributes &= ~RBTDB_ATTR_LOADING;
7849 	rbtdb->attributes |= RBTDB_ATTR_LOADED;
7850 
7851 	/*
7852 	 * If there's a KEY rdataset at the zone origin containing a
7853 	 * zone key, we consider the zone secure.
7854 	 */
7855 	if (!IS_CACHE(rbtdb) && rbtdb->origin_node != NULL) {
7856 		dns_dbversion_t *version = rbtdb->current_version;
7857 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7858 		iszonesecure(db, version, rbtdb->origin_node);
7859 	} else {
7860 		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
7861 	}
7862 
7863 	callbacks->add = NULL;
7864 	callbacks->add_private = NULL;
7865 	callbacks->deserialize = NULL;
7866 	callbacks->deserialize_private = NULL;
7867 
7868 	isc_mem_put(rbtdb->common.mctx, loadctx, sizeof(*loadctx));
7869 
7870 	return (ISC_R_SUCCESS);
7871 }
7872 
7873 /*
7874  * helper function to handle writing out the rdataset data pointed to
7875  * by the void *data pointer in the dns_rbtnode
7876  */
7877 static isc_result_t
rbt_datawriter(FILE * rbtfile,unsigned char * data,void * arg,uint64_t * crc)7878 rbt_datawriter(FILE *rbtfile, unsigned char *data, void *arg, uint64_t *crc) {
7879 	rbtdb_version_t *version = (rbtdb_version_t *)arg;
7880 	rbtdb_serial_t serial;
7881 	rdatasetheader_t newheader;
7882 	rdatasetheader_t *header = (rdatasetheader_t *)data, *next;
7883 	off_t where;
7884 	size_t cooked, size;
7885 	unsigned char *p;
7886 	isc_result_t result = ISC_R_SUCCESS;
7887 	char pad[sizeof(char *)];
7888 	uintptr_t off;
7889 
7890 	REQUIRE(rbtfile != NULL);
7891 	REQUIRE(data != NULL);
7892 	REQUIRE(version != NULL);
7893 
7894 	serial = version->serial;
7895 
7896 	for (; header != NULL; header = next) {
7897 		next = header->next;
7898 		do {
7899 			if (header->serial <= serial && !IGNORE(header)) {
7900 				if (NONEXISTENT(header)) {
7901 					header = NULL;
7902 				}
7903 				break;
7904 			} else {
7905 				header = header->down;
7906 			}
7907 		} while (header != NULL);
7908 
7909 		if (header == NULL) {
7910 			continue;
7911 		}
7912 
7913 		CHECK(isc_stdio_tell(rbtfile, &where));
7914 		size = dns_rdataslab_size((unsigned char *)header,
7915 					  sizeof(rdatasetheader_t));
7916 
7917 		p = (unsigned char *)header;
7918 		memmove(&newheader, p, sizeof(rdatasetheader_t));
7919 		newheader.down = NULL;
7920 		newheader.next = NULL;
7921 		off = where;
7922 		if ((off_t)off != where) {
7923 			return (ISC_R_RANGE);
7924 		}
7925 		newheader.node = (dns_rbtnode_t *)off;
7926 		newheader.node_is_relative = 1;
7927 		newheader.serial = 1;
7928 
7929 		/*
7930 		 * Round size up to the next pointer sized offset so it
7931 		 * will be properly aligned when read back in.
7932 		 */
7933 		cooked = dns_rbt_serialize_align(size);
7934 		if (next != NULL) {
7935 			newheader.next = (rdatasetheader_t *)(off + cooked);
7936 			newheader.next_is_relative = 1;
7937 		}
7938 
7939 #ifdef DEBUG
7940 		hexdump("writing header", (unsigned char *)&newheader,
7941 			sizeof(rdatasetheader_t));
7942 		hexdump("writing slab", p + sizeof(rdatasetheader_t),
7943 			size - sizeof(rdatasetheader_t));
7944 #endif /* ifdef DEBUG */
7945 		isc_crc64_update(crc, (unsigned char *)&newheader,
7946 				 sizeof(rdatasetheader_t));
7947 		CHECK(isc_stdio_write(&newheader, sizeof(rdatasetheader_t), 1,
7948 				      rbtfile, NULL));
7949 
7950 		isc_crc64_update(crc, p + sizeof(rdatasetheader_t),
7951 				 size - sizeof(rdatasetheader_t));
7952 		CHECK(isc_stdio_write(p + sizeof(rdatasetheader_t),
7953 				      size - sizeof(rdatasetheader_t), 1,
7954 				      rbtfile, NULL));
7955 		/*
7956 		 * Pad to force alignment.
7957 		 */
7958 		if (size != (size_t)cooked) {
7959 			memset(pad, 0, sizeof(pad));
7960 			CHECK(isc_stdio_write(pad, cooked - size, 1, rbtfile,
7961 					      NULL));
7962 		}
7963 	}
7964 
7965 failure:
7966 	return (result);
7967 }
7968 
7969 /*
7970  * Write out a zeroed header as a placeholder.  Doing this ensures
7971  * that the file will not read while it is partially written, should
7972  * writing fail or be interrupted.
7973  */
7974 static isc_result_t
rbtdb_zero_header(FILE * rbtfile)7975 rbtdb_zero_header(FILE *rbtfile) {
7976 	char buffer[RBTDB_HEADER_LENGTH];
7977 	isc_result_t result;
7978 
7979 	memset(buffer, 0, RBTDB_HEADER_LENGTH);
7980 	result = isc_stdio_write(buffer, 1, RBTDB_HEADER_LENGTH, rbtfile, NULL);
7981 	fflush(rbtfile);
7982 
7983 	return (result);
7984 }
7985 
7986 static isc_once_t once = ISC_ONCE_INIT;
7987 
7988 static void
init_file_version(void)7989 init_file_version(void) {
7990 	int n;
7991 
7992 	memset(FILE_VERSION, 0, sizeof(FILE_VERSION));
7993 	n = snprintf(FILE_VERSION, sizeof(FILE_VERSION), "RBTDB Image %s %s",
7994 		     dns_major, dns_mapapi);
7995 	INSIST(n > 0 && (unsigned int)n < sizeof(FILE_VERSION));
7996 }
7997 
7998 /*
7999  * Write the file header out, recording the locations of the three
8000  * RBT's used in the rbtdb: tree, nsec, and nsec3, and including NodeDump
8001  * version information and any information stored in the rbtdb object
8002  * itself that should be stored here.
8003  */
8004 static isc_result_t
rbtdb_write_header(FILE * rbtfile,off_t tree_location,off_t nsec_location,off_t nsec3_location)8005 rbtdb_write_header(FILE *rbtfile, off_t tree_location, off_t nsec_location,
8006 		   off_t nsec3_location) {
8007 	rbtdb_file_header_t header;
8008 	isc_result_t result;
8009 
8010 	RUNTIME_CHECK(isc_once_do(&once, init_file_version) == ISC_R_SUCCESS);
8011 
8012 	memset(&header, 0, sizeof(rbtdb_file_header_t));
8013 	memmove(header.version1, FILE_VERSION, sizeof(header.version1));
8014 	memmove(header.version2, FILE_VERSION, sizeof(header.version2));
8015 	header.ptrsize = (uint32_t)sizeof(void *);
8016 	header.bigendian = (1 == htonl(1)) ? 1 : 0;
8017 	header.tree = (uint64_t)tree_location;
8018 	header.nsec = (uint64_t)nsec_location;
8019 	header.nsec3 = (uint64_t)nsec3_location;
8020 	result = isc_stdio_write(&header, 1, sizeof(rbtdb_file_header_t),
8021 				 rbtfile, NULL);
8022 	fflush(rbtfile);
8023 
8024 	return (result);
8025 }
8026 
8027 static bool
match_header_version(rbtdb_file_header_t * header)8028 match_header_version(rbtdb_file_header_t *header) {
8029 	RUNTIME_CHECK(isc_once_do(&once, init_file_version) == ISC_R_SUCCESS);
8030 
8031 	if (memcmp(header->version1, FILE_VERSION, sizeof(header->version1)) !=
8032 		    0 ||
8033 	    memcmp(header->version2, FILE_VERSION, sizeof(header->version1)) !=
8034 		    0)
8035 	{
8036 		return (false);
8037 	}
8038 
8039 	return (true);
8040 }
8041 
8042 static isc_result_t
serialize(dns_db_t * db,dns_dbversion_t * ver,FILE * rbtfile)8043 serialize(dns_db_t *db, dns_dbversion_t *ver, FILE *rbtfile) {
8044 	rbtdb_version_t *version = (rbtdb_version_t *)ver;
8045 	dns_rbtdb_t *rbtdb;
8046 	isc_result_t result;
8047 	off_t tree_location, nsec_location, nsec3_location, header_location;
8048 
8049 	rbtdb = (dns_rbtdb_t *)db;
8050 
8051 	REQUIRE(VALID_RBTDB(rbtdb));
8052 	REQUIRE(rbtfile != NULL);
8053 
8054 	/* Ensure we're writing to a plain file */
8055 	CHECK(isc_file_isplainfilefd(fileno(rbtfile)));
8056 
8057 	/*
8058 	 * first, write out a zeroed header to store rbtdb information
8059 	 *
8060 	 * then for each of the three trees, store the current position
8061 	 * in the file and call dns_rbt_serialize_tree
8062 	 *
8063 	 * finally, write out the rbtdb header, storing the locations of the
8064 	 * rbtheaders
8065 	 *
8066 	 * NOTE: need to do something better with the return codes, &= will
8067 	 * not work.
8068 	 */
8069 	CHECK(isc_stdio_tell(rbtfile, &header_location));
8070 	CHECK(rbtdb_zero_header(rbtfile));
8071 	CHECK(dns_rbt_serialize_tree(rbtfile, rbtdb->tree, rbt_datawriter,
8072 				     version, &tree_location));
8073 	CHECK(dns_rbt_serialize_tree(rbtfile, rbtdb->nsec, rbt_datawriter,
8074 				     version, &nsec_location));
8075 	CHECK(dns_rbt_serialize_tree(rbtfile, rbtdb->nsec3, rbt_datawriter,
8076 				     version, &nsec3_location));
8077 
8078 	CHECK(isc_stdio_seek(rbtfile, header_location, SEEK_SET));
8079 	CHECK(rbtdb_write_header(rbtfile, tree_location, nsec_location,
8080 				 nsec3_location));
8081 failure:
8082 	return (result);
8083 }
8084 
8085 static isc_result_t
dump(dns_db_t * db,dns_dbversion_t * version,const char * filename,dns_masterformat_t masterformat)8086 dump(dns_db_t *db, dns_dbversion_t *version, const char *filename,
8087      dns_masterformat_t masterformat) {
8088 	dns_rbtdb_t *rbtdb;
8089 	rbtdb_version_t *rbtversion = version;
8090 
8091 	rbtdb = (dns_rbtdb_t *)db;
8092 
8093 	REQUIRE(VALID_RBTDB(rbtdb));
8094 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
8095 
8096 	return (dns_master_dump(rbtdb->common.mctx, db, version,
8097 				&dns_master_style_default, filename,
8098 				masterformat, NULL));
8099 }
8100 
8101 static void
delete_callback(void * data,void * arg)8102 delete_callback(void *data, void *arg) {
8103 	dns_rbtdb_t *rbtdb = arg;
8104 	rdatasetheader_t *current, *next;
8105 	unsigned int locknum;
8106 
8107 	current = data;
8108 	locknum = current->node->locknum;
8109 	NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
8110 	while (current != NULL) {
8111 		next = current->next;
8112 		free_rdataset(rbtdb, rbtdb->common.mctx, current);
8113 		current = next;
8114 	}
8115 	NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
8116 }
8117 
8118 static bool
issecure(dns_db_t * db)8119 issecure(dns_db_t *db) {
8120 	dns_rbtdb_t *rbtdb;
8121 	bool secure;
8122 
8123 	rbtdb = (dns_rbtdb_t *)db;
8124 
8125 	REQUIRE(VALID_RBTDB(rbtdb));
8126 
8127 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
8128 	secure = (rbtdb->current_version->secure == dns_db_secure);
8129 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
8130 
8131 	return (secure);
8132 }
8133 
8134 static bool
isdnssec(dns_db_t * db)8135 isdnssec(dns_db_t *db) {
8136 	dns_rbtdb_t *rbtdb;
8137 	bool dnssec;
8138 
8139 	rbtdb = (dns_rbtdb_t *)db;
8140 
8141 	REQUIRE(VALID_RBTDB(rbtdb));
8142 
8143 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
8144 	dnssec = (rbtdb->current_version->secure != dns_db_insecure);
8145 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
8146 
8147 	return (dnssec);
8148 }
8149 
8150 static unsigned int
nodecount(dns_db_t * db)8151 nodecount(dns_db_t *db) {
8152 	dns_rbtdb_t *rbtdb;
8153 	unsigned int count;
8154 
8155 	rbtdb = (dns_rbtdb_t *)db;
8156 
8157 	REQUIRE(VALID_RBTDB(rbtdb));
8158 
8159 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8160 	count = dns_rbt_nodecount(rbtdb->tree);
8161 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8162 
8163 	return (count);
8164 }
8165 
8166 static size_t
hashsize(dns_db_t * db)8167 hashsize(dns_db_t *db) {
8168 	dns_rbtdb_t *rbtdb;
8169 	size_t size;
8170 
8171 	rbtdb = (dns_rbtdb_t *)db;
8172 
8173 	REQUIRE(VALID_RBTDB(rbtdb));
8174 
8175 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8176 	size = dns_rbt_hashsize(rbtdb->tree);
8177 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8178 
8179 	return (size);
8180 }
8181 
8182 static isc_result_t
adjusthashsize(dns_db_t * db,size_t size)8183 adjusthashsize(dns_db_t *db, size_t size) {
8184 	isc_result_t result;
8185 	dns_rbtdb_t *rbtdb;
8186 
8187 	rbtdb = (dns_rbtdb_t *)db;
8188 
8189 	REQUIRE(VALID_RBTDB(rbtdb));
8190 
8191 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8192 	result = dns_rbt_adjusthashsize(rbtdb->tree, size);
8193 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8194 
8195 	return (result);
8196 }
8197 
8198 static void
settask(dns_db_t * db,isc_task_t * task)8199 settask(dns_db_t *db, isc_task_t *task) {
8200 	dns_rbtdb_t *rbtdb;
8201 
8202 	rbtdb = (dns_rbtdb_t *)db;
8203 
8204 	REQUIRE(VALID_RBTDB(rbtdb));
8205 
8206 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
8207 	if (rbtdb->task != NULL) {
8208 		isc_task_detach(&rbtdb->task);
8209 	}
8210 	if (task != NULL) {
8211 		isc_task_attach(task, &rbtdb->task);
8212 	}
8213 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
8214 }
8215 
8216 static bool
ispersistent(dns_db_t * db)8217 ispersistent(dns_db_t *db) {
8218 	UNUSED(db);
8219 	return (false);
8220 }
8221 
8222 static isc_result_t
getoriginnode(dns_db_t * db,dns_dbnode_t ** nodep)8223 getoriginnode(dns_db_t *db, dns_dbnode_t **nodep) {
8224 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8225 	dns_rbtnode_t *onode;
8226 	isc_result_t result = ISC_R_SUCCESS;
8227 
8228 	REQUIRE(VALID_RBTDB(rbtdb));
8229 	REQUIRE(nodep != NULL && *nodep == NULL);
8230 
8231 	/* Note that the access to origin_node doesn't require a DB lock */
8232 	onode = (dns_rbtnode_t *)rbtdb->origin_node;
8233 	if (onode != NULL) {
8234 		new_reference(rbtdb, onode, isc_rwlocktype_none);
8235 		*nodep = rbtdb->origin_node;
8236 	} else {
8237 		INSIST(IS_CACHE(rbtdb));
8238 		result = ISC_R_NOTFOUND;
8239 	}
8240 
8241 	return (result);
8242 }
8243 
8244 static isc_result_t
getnsec3parameters(dns_db_t * db,dns_dbversion_t * version,dns_hash_t * hash,uint8_t * flags,uint16_t * iterations,unsigned char * salt,size_t * salt_length)8245 getnsec3parameters(dns_db_t *db, dns_dbversion_t *version, dns_hash_t *hash,
8246 		   uint8_t *flags, uint16_t *iterations, unsigned char *salt,
8247 		   size_t *salt_length) {
8248 	dns_rbtdb_t *rbtdb;
8249 	isc_result_t result = ISC_R_NOTFOUND;
8250 	rbtdb_version_t *rbtversion = version;
8251 
8252 	rbtdb = (dns_rbtdb_t *)db;
8253 
8254 	REQUIRE(VALID_RBTDB(rbtdb));
8255 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
8256 
8257 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
8258 	if (rbtversion == NULL) {
8259 		rbtversion = rbtdb->current_version;
8260 	}
8261 
8262 	if (rbtversion->havensec3) {
8263 		if (hash != NULL) {
8264 			*hash = rbtversion->hash;
8265 		}
8266 		if (salt != NULL && salt_length != NULL) {
8267 			REQUIRE(*salt_length >= rbtversion->salt_length);
8268 			memmove(salt, rbtversion->salt,
8269 				rbtversion->salt_length);
8270 		}
8271 		if (salt_length != NULL) {
8272 			*salt_length = rbtversion->salt_length;
8273 		}
8274 		if (iterations != NULL) {
8275 			*iterations = rbtversion->iterations;
8276 		}
8277 		if (flags != NULL) {
8278 			*flags = rbtversion->flags;
8279 		}
8280 		result = ISC_R_SUCCESS;
8281 	}
8282 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
8283 
8284 	return (result);
8285 }
8286 
8287 static isc_result_t
getsize(dns_db_t * db,dns_dbversion_t * version,uint64_t * records,uint64_t * xfrsize)8288 getsize(dns_db_t *db, dns_dbversion_t *version, uint64_t *records,
8289 	uint64_t *xfrsize) {
8290 	dns_rbtdb_t *rbtdb;
8291 	isc_result_t result = ISC_R_SUCCESS;
8292 	rbtdb_version_t *rbtversion = version;
8293 
8294 	rbtdb = (dns_rbtdb_t *)db;
8295 
8296 	REQUIRE(VALID_RBTDB(rbtdb));
8297 	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);
8298 
8299 	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
8300 	if (rbtversion == NULL) {
8301 		rbtversion = rbtdb->current_version;
8302 	}
8303 
8304 	RWLOCK(&rbtversion->rwlock, isc_rwlocktype_read);
8305 	if (records != NULL) {
8306 		*records = rbtversion->records;
8307 	}
8308 
8309 	if (xfrsize != NULL) {
8310 		*xfrsize = rbtversion->xfrsize;
8311 	}
8312 	RWUNLOCK(&rbtversion->rwlock, isc_rwlocktype_read);
8313 	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
8314 
8315 	return (result);
8316 }
8317 
8318 static isc_result_t
setsigningtime(dns_db_t * db,dns_rdataset_t * rdataset,isc_stdtime_t resign)8319 setsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, isc_stdtime_t resign) {
8320 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8321 	rdatasetheader_t *header, oldheader;
8322 
8323 	REQUIRE(VALID_RBTDB(rbtdb));
8324 	REQUIRE(!IS_CACHE(rbtdb));
8325 	REQUIRE(rdataset != NULL);
8326 
8327 	header = rdataset->private3;
8328 	header--;
8329 
8330 	NODE_LOCK(&rbtdb->node_locks[header->node->locknum].lock,
8331 		  isc_rwlocktype_write);
8332 
8333 	oldheader = *header;
8334 	/*
8335 	 * Only break the heap invariant (by adjusting resign and resign_lsb)
8336 	 * if we are going to be restoring it by calling isc_heap_increased
8337 	 * or isc_heap_decreased.
8338 	 */
8339 	if (resign != 0) {
8340 		header->resign = (isc_stdtime_t)(dns_time64_from32(resign) >>
8341 						 1);
8342 		header->resign_lsb = resign & 0x1;
8343 	}
8344 	if (header->heap_index != 0) {
8345 		INSIST(RESIGN(header));
8346 		if (resign == 0) {
8347 			isc_heap_delete(rbtdb->heaps[header->node->locknum],
8348 					header->heap_index);
8349 			header->heap_index = 0;
8350 		} else if (resign_sooner(header, &oldheader)) {
8351 			isc_heap_increased(rbtdb->heaps[header->node->locknum],
8352 					   header->heap_index);
8353 		} else if (resign_sooner(&oldheader, header)) {
8354 			isc_heap_decreased(rbtdb->heaps[header->node->locknum],
8355 					   header->heap_index);
8356 		}
8357 	} else if (resign != 0) {
8358 		RDATASET_ATTR_SET(header, RDATASET_ATTR_RESIGN);
8359 		resign_insert(rbtdb, header->node->locknum, header);
8360 	}
8361 	NODE_UNLOCK(&rbtdb->node_locks[header->node->locknum].lock,
8362 		    isc_rwlocktype_write);
8363 	return (ISC_R_SUCCESS);
8364 }
8365 
8366 static isc_result_t
getsigningtime(dns_db_t * db,dns_rdataset_t * rdataset,dns_name_t * foundname)8367 getsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, dns_name_t *foundname) {
8368 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8369 	rdatasetheader_t *header = NULL, *this;
8370 	unsigned int i;
8371 	isc_result_t result = ISC_R_NOTFOUND;
8372 	unsigned int locknum = 0;
8373 
8374 	REQUIRE(VALID_RBTDB(rbtdb));
8375 
8376 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8377 
8378 	for (i = 0; i < rbtdb->node_lock_count; i++) {
8379 		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_read);
8380 
8381 		/*
8382 		 * Find for the earliest signing time among all of the
8383 		 * heaps, each of which is covered by a different bucket
8384 		 * lock.
8385 		 */
8386 		this = isc_heap_element(rbtdb->heaps[i], 1);
8387 		if (this == NULL) {
8388 			/* Nothing found; unlock and try the next heap. */
8389 			NODE_UNLOCK(&rbtdb->node_locks[i].lock,
8390 				    isc_rwlocktype_read);
8391 			continue;
8392 		}
8393 
8394 		if (header == NULL) {
8395 			/*
8396 			 * Found a signing time: retain the bucket lock and
8397 			 * preserve the lock number so we can unlock it
8398 			 * later.
8399 			 */
8400 			header = this;
8401 			locknum = i;
8402 		} else if (resign_sooner(this, header)) {
8403 			/*
8404 			 * Found an earlier signing time; release the
8405 			 * previous bucket lock and retain this one instead.
8406 			 */
8407 			NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
8408 				    isc_rwlocktype_read);
8409 			header = this;
8410 			locknum = i;
8411 		} else {
8412 			/*
8413 			 * Earliest signing time in this heap isn't
8414 			 * an improvement; unlock and try the next heap.
8415 			 */
8416 			NODE_UNLOCK(&rbtdb->node_locks[i].lock,
8417 				    isc_rwlocktype_read);
8418 		}
8419 	}
8420 
8421 	if (header != NULL) {
8422 		/*
8423 		 * Found something; pass back the answer and unlock
8424 		 * the bucket.
8425 		 */
8426 		bind_rdataset(rbtdb, header->node, header, 0,
8427 			      isc_rwlocktype_read, rdataset);
8428 
8429 		if (foundname != NULL) {
8430 			dns_rbt_fullnamefromnode(header->node, foundname);
8431 		}
8432 
8433 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
8434 			    isc_rwlocktype_read);
8435 
8436 		result = ISC_R_SUCCESS;
8437 	}
8438 
8439 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8440 
8441 	return (result);
8442 }
8443 
8444 static void
resigned(dns_db_t * db,dns_rdataset_t * rdataset,dns_dbversion_t * version)8445 resigned(dns_db_t *db, dns_rdataset_t *rdataset, dns_dbversion_t *version) {
8446 	rbtdb_version_t *rbtversion = (rbtdb_version_t *)version;
8447 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8448 	dns_rbtnode_t *node;
8449 	rdatasetheader_t *header;
8450 
8451 	REQUIRE(VALID_RBTDB(rbtdb));
8452 	REQUIRE(rdataset != NULL);
8453 	REQUIRE(rdataset->methods == &rdataset_methods);
8454 	REQUIRE(rbtdb->future_version == rbtversion);
8455 	REQUIRE(rbtversion != NULL);
8456 	REQUIRE(rbtversion->writer);
8457 	REQUIRE(rbtversion->rbtdb == rbtdb);
8458 
8459 	node = rdataset->private2;
8460 	INSIST(node != NULL);
8461 	header = rdataset->private3;
8462 	INSIST(header != NULL);
8463 	header--;
8464 
8465 	if (header->heap_index == 0) {
8466 		return;
8467 	}
8468 
8469 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8470 	NODE_LOCK(&rbtdb->node_locks[node->locknum].lock, isc_rwlocktype_write);
8471 	/*
8472 	 * Delete from heap and save to re-signed list so that it can
8473 	 * be restored if we backout of this change.
8474 	 */
8475 	resign_delete(rbtdb, rbtversion, header);
8476 	NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
8477 		    isc_rwlocktype_write);
8478 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
8479 }
8480 
8481 static isc_result_t
setcachestats(dns_db_t * db,isc_stats_t * stats)8482 setcachestats(dns_db_t *db, isc_stats_t *stats) {
8483 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8484 
8485 	REQUIRE(VALID_RBTDB(rbtdb));
8486 	REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
8487 	REQUIRE(stats != NULL);
8488 
8489 	isc_stats_attach(stats, &rbtdb->cachestats);
8490 	return (ISC_R_SUCCESS);
8491 }
8492 
8493 static isc_result_t
setgluecachestats(dns_db_t * db,isc_stats_t * stats)8494 setgluecachestats(dns_db_t *db, isc_stats_t *stats) {
8495 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8496 
8497 	REQUIRE(VALID_RBTDB(rbtdb));
8498 	REQUIRE(!IS_CACHE(rbtdb) && !IS_STUB(rbtdb));
8499 	REQUIRE(stats != NULL);
8500 
8501 	isc_stats_attach(stats, &rbtdb->gluecachestats);
8502 	return (ISC_R_SUCCESS);
8503 }
8504 
8505 static dns_stats_t *
getrrsetstats(dns_db_t * db)8506 getrrsetstats(dns_db_t *db) {
8507 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8508 
8509 	REQUIRE(VALID_RBTDB(rbtdb));
8510 	REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
8511 
8512 	return (rbtdb->rrsetstats);
8513 }
8514 
8515 static isc_result_t
nodefullname(dns_db_t * db,dns_dbnode_t * node,dns_name_t * name)8516 nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name) {
8517 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8518 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
8519 	isc_result_t result;
8520 
8521 	REQUIRE(VALID_RBTDB(rbtdb));
8522 	REQUIRE(node != NULL);
8523 	REQUIRE(name != NULL);
8524 
8525 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8526 	result = dns_rbt_fullnamefromnode(rbtnode, name);
8527 	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
8528 
8529 	return (result);
8530 }
8531 
8532 static isc_result_t
setservestalettl(dns_db_t * db,dns_ttl_t ttl)8533 setservestalettl(dns_db_t *db, dns_ttl_t ttl) {
8534 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8535 
8536 	REQUIRE(VALID_RBTDB(rbtdb));
8537 	REQUIRE(IS_CACHE(rbtdb));
8538 
8539 	/* currently no bounds checking.  0 means disable. */
8540 	rbtdb->serve_stale_ttl = ttl;
8541 	return (ISC_R_SUCCESS);
8542 }
8543 
8544 static isc_result_t
getservestalettl(dns_db_t * db,dns_ttl_t * ttl)8545 getservestalettl(dns_db_t *db, dns_ttl_t *ttl) {
8546 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8547 
8548 	REQUIRE(VALID_RBTDB(rbtdb));
8549 	REQUIRE(IS_CACHE(rbtdb));
8550 
8551 	*ttl = rbtdb->serve_stale_ttl;
8552 	return (ISC_R_SUCCESS);
8553 }
8554 
8555 static isc_result_t
setservestalerefresh(dns_db_t * db,uint32_t interval)8556 setservestalerefresh(dns_db_t *db, uint32_t interval) {
8557 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8558 
8559 	REQUIRE(VALID_RBTDB(rbtdb));
8560 	REQUIRE(IS_CACHE(rbtdb));
8561 
8562 	/* currently no bounds checking.  0 means disable. */
8563 	rbtdb->serve_stale_refresh = interval;
8564 	return (ISC_R_SUCCESS);
8565 }
8566 
8567 static isc_result_t
getservestalerefresh(dns_db_t * db,uint32_t * interval)8568 getservestalerefresh(dns_db_t *db, uint32_t *interval) {
8569 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
8570 
8571 	REQUIRE(VALID_RBTDB(rbtdb));
8572 	REQUIRE(IS_CACHE(rbtdb));
8573 
8574 	*interval = rbtdb->serve_stale_refresh;
8575 	return (ISC_R_SUCCESS);
8576 }
8577 
8578 static dns_dbmethods_t zone_methods = { attach,
8579 					detach,
8580 					beginload,
8581 					endload,
8582 					serialize,
8583 					dump,
8584 					currentversion,
8585 					newversion,
8586 					attachversion,
8587 					closeversion,
8588 					findnode,
8589 					zone_find,
8590 					zone_findzonecut,
8591 					attachnode,
8592 					detachnode,
8593 					expirenode,
8594 					printnode,
8595 					createiterator,
8596 					zone_findrdataset,
8597 					allrdatasets,
8598 					addrdataset,
8599 					subtractrdataset,
8600 					deleterdataset,
8601 					issecure,
8602 					nodecount,
8603 					ispersistent,
8604 					overmem,
8605 					settask,
8606 					getoriginnode,
8607 					NULL, /* transfernode */
8608 					getnsec3parameters,
8609 					findnsec3node,
8610 					setsigningtime,
8611 					getsigningtime,
8612 					resigned,
8613 					isdnssec,
8614 					NULL, /* getrrsetstats */
8615 					NULL, /* rpz_attach */
8616 					NULL, /* rpz_ready */
8617 					NULL, /* findnodeext */
8618 					NULL, /* findext */
8619 					NULL, /* setcachestats */
8620 					hashsize,
8621 					nodefullname,
8622 					getsize,
8623 					NULL, /* setservestalettl */
8624 					NULL, /* getservestalettl */
8625 					NULL, /* setservestalerefresh */
8626 					NULL, /* getservestalerefresh */
8627 					setgluecachestats,
8628 					adjusthashsize };
8629 
8630 static dns_dbmethods_t cache_methods = { attach,
8631 					 detach,
8632 					 beginload,
8633 					 endload,
8634 					 NULL, /* serialize */
8635 					 dump,
8636 					 currentversion,
8637 					 newversion,
8638 					 attachversion,
8639 					 closeversion,
8640 					 findnode,
8641 					 cache_find,
8642 					 cache_findzonecut,
8643 					 attachnode,
8644 					 detachnode,
8645 					 expirenode,
8646 					 printnode,
8647 					 createiterator,
8648 					 cache_findrdataset,
8649 					 allrdatasets,
8650 					 addrdataset,
8651 					 subtractrdataset,
8652 					 deleterdataset,
8653 					 issecure,
8654 					 nodecount,
8655 					 ispersistent,
8656 					 overmem,
8657 					 settask,
8658 					 getoriginnode,
8659 					 NULL, /* transfernode */
8660 					 NULL, /* getnsec3parameters */
8661 					 NULL, /* findnsec3node */
8662 					 NULL, /* setsigningtime */
8663 					 NULL, /* getsigningtime */
8664 					 NULL, /* resigned */
8665 					 isdnssec,
8666 					 getrrsetstats,
8667 					 NULL, /* rpz_attach */
8668 					 NULL, /* rpz_ready */
8669 					 NULL, /* findnodeext */
8670 					 NULL, /* findext */
8671 					 setcachestats,
8672 					 hashsize,
8673 					 nodefullname,
8674 					 NULL, /* getsize */
8675 					 setservestalettl,
8676 					 getservestalettl,
8677 					 setservestalerefresh,
8678 					 getservestalerefresh,
8679 					 NULL,
8680 					 adjusthashsize };
8681 
8682 isc_result_t
dns_rbtdb_create(isc_mem_t * mctx,const dns_name_t * origin,dns_dbtype_t type,dns_rdataclass_t rdclass,unsigned int argc,char * argv[],void * driverarg,dns_db_t ** dbp)8683 dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
8684 		 dns_rdataclass_t rdclass, unsigned int argc, char *argv[],
8685 		 void *driverarg, dns_db_t **dbp) {
8686 	dns_rbtdb_t *rbtdb;
8687 	isc_result_t result;
8688 	int i;
8689 	dns_name_t name;
8690 	bool (*sooner)(void *, void *);
8691 	isc_mem_t *hmctx = mctx;
8692 
8693 	/* Keep the compiler happy. */
8694 	UNUSED(driverarg);
8695 
8696 	rbtdb = isc_mem_get(mctx, sizeof(*rbtdb));
8697 
8698 	/*
8699 	 * If argv[0] exists, it points to a memory context to use for heap
8700 	 */
8701 	if (argc != 0) {
8702 		hmctx = (isc_mem_t *)argv[0];
8703 	}
8704 
8705 	memset(rbtdb, '\0', sizeof(*rbtdb));
8706 	dns_name_init(&rbtdb->common.origin, NULL);
8707 	rbtdb->common.attributes = 0;
8708 	if (type == dns_dbtype_cache) {
8709 		rbtdb->common.methods = &cache_methods;
8710 		rbtdb->common.attributes |= DNS_DBATTR_CACHE;
8711 	} else if (type == dns_dbtype_stub) {
8712 		rbtdb->common.methods = &zone_methods;
8713 		rbtdb->common.attributes |= DNS_DBATTR_STUB;
8714 	} else {
8715 		rbtdb->common.methods = &zone_methods;
8716 	}
8717 	rbtdb->common.rdclass = rdclass;
8718 	rbtdb->common.mctx = NULL;
8719 
8720 	ISC_LIST_INIT(rbtdb->common.update_listeners);
8721 
8722 	RBTDB_INITLOCK(&rbtdb->lock);
8723 
8724 	isc_rwlock_init(&rbtdb->tree_lock, 0, 0);
8725 
8726 	/*
8727 	 * Initialize node_lock_count in a generic way to support future
8728 	 * extension which allows the user to specify this value on creation.
8729 	 * Note that when specified for a cache DB it must be larger than 1
8730 	 * as commented with the definition of DEFAULT_CACHE_NODE_LOCK_COUNT.
8731 	 */
8732 	if (rbtdb->node_lock_count == 0) {
8733 		if (IS_CACHE(rbtdb)) {
8734 			rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT;
8735 		} else {
8736 			rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT;
8737 		}
8738 	} else if (rbtdb->node_lock_count < 2 && IS_CACHE(rbtdb)) {
8739 		result = ISC_R_RANGE;
8740 		goto cleanup_tree_lock;
8741 	}
8742 	INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH));
8743 	rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count *
8744 						      sizeof(rbtdb_nodelock_t));
8745 
8746 	rbtdb->cachestats = NULL;
8747 	rbtdb->gluecachestats = NULL;
8748 
8749 	rbtdb->rrsetstats = NULL;
8750 	if (IS_CACHE(rbtdb)) {
8751 		result = dns_rdatasetstats_create(mctx, &rbtdb->rrsetstats);
8752 		if (result != ISC_R_SUCCESS) {
8753 			goto cleanup_node_locks;
8754 		}
8755 		rbtdb->rdatasets = isc_mem_get(
8756 			mctx,
8757 			rbtdb->node_lock_count * sizeof(rdatasetheaderlist_t));
8758 		for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8759 			ISC_LIST_INIT(rbtdb->rdatasets[i]);
8760 		}
8761 	} else {
8762 		rbtdb->rdatasets = NULL;
8763 	}
8764 
8765 	/*
8766 	 * Create the heaps.
8767 	 */
8768 	rbtdb->heaps = isc_mem_get(hmctx, rbtdb->node_lock_count *
8769 						  sizeof(isc_heap_t *));
8770 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8771 		rbtdb->heaps[i] = NULL;
8772 	}
8773 	sooner = IS_CACHE(rbtdb) ? ttl_sooner : resign_sooner;
8774 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8775 		isc_heap_create(hmctx, sooner, set_index, 0, &rbtdb->heaps[i]);
8776 	}
8777 
8778 	/*
8779 	 * Create deadnode lists.
8780 	 */
8781 	rbtdb->deadnodes = isc_mem_get(mctx, rbtdb->node_lock_count *
8782 						     sizeof(rbtnodelist_t));
8783 	for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
8784 		ISC_LIST_INIT(rbtdb->deadnodes[i]);
8785 	}
8786 
8787 	ISC_LIST_INIT(rbtdb->prunenodes);
8788 
8789 	rbtdb->active = rbtdb->node_lock_count;
8790 
8791 	for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
8792 		NODE_INITLOCK(&rbtdb->node_locks[i].lock);
8793 		isc_refcount_init(&rbtdb->node_locks[i].references, 0);
8794 		rbtdb->node_locks[i].exiting = false;
8795 	}
8796 
8797 	/*
8798 	 * Attach to the mctx.  The database will persist so long as there
8799 	 * are references to it, and attaching to the mctx ensures that our
8800 	 * mctx won't disappear out from under us.
8801 	 */
8802 	isc_mem_attach(mctx, &rbtdb->common.mctx);
8803 	isc_mem_attach(hmctx, &rbtdb->hmctx);
8804 
8805 	/*
8806 	 * Make a copy of the origin name.
8807 	 */
8808 	result = dns_name_dupwithoffsets(origin, mctx, &rbtdb->common.origin);
8809 	if (result != ISC_R_SUCCESS) {
8810 		free_rbtdb(rbtdb, false, NULL);
8811 		return (result);
8812 	}
8813 
8814 	/*
8815 	 * Make the Red-Black Trees.
8816 	 */
8817 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree);
8818 	if (result != ISC_R_SUCCESS) {
8819 		free_rbtdb(rbtdb, false, NULL);
8820 		return (result);
8821 	}
8822 
8823 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec);
8824 	if (result != ISC_R_SUCCESS) {
8825 		free_rbtdb(rbtdb, false, NULL);
8826 		return (result);
8827 	}
8828 
8829 	result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec3);
8830 	if (result != ISC_R_SUCCESS) {
8831 		free_rbtdb(rbtdb, false, NULL);
8832 		return (result);
8833 	}
8834 
8835 	/*
8836 	 * In order to set the node callback bit correctly in zone databases,
8837 	 * we need to know if the node has the origin name of the zone.
8838 	 * In loading_addrdataset() we could simply compare the new name
8839 	 * to the origin name, but this is expensive.  Also, we don't know the
8840 	 * node name in addrdataset(), so we need another way of knowing the
8841 	 * zone's top.
8842 	 *
8843 	 * We now explicitly create a node for the zone's origin, and then
8844 	 * we simply remember the node's address.  This is safe, because
8845 	 * the top-of-zone node can never be deleted, nor can its address
8846 	 * change.
8847 	 */
8848 	if (!IS_CACHE(rbtdb)) {
8849 		rbtdb->origin_node = NULL;
8850 		result = dns_rbt_addnode(rbtdb->tree, &rbtdb->common.origin,
8851 					 &rbtdb->origin_node);
8852 		if (result != ISC_R_SUCCESS) {
8853 			INSIST(result != ISC_R_EXISTS);
8854 			free_rbtdb(rbtdb, false, NULL);
8855 			return (result);
8856 		}
8857 		INSIST(rbtdb->origin_node != NULL);
8858 		rbtdb->origin_node->nsec = DNS_RBT_NSEC_NORMAL;
8859 		/*
8860 		 * We need to give the origin node the right locknum.
8861 		 */
8862 		dns_name_init(&name, NULL);
8863 		dns_rbt_namefromnode(rbtdb->origin_node, &name);
8864 		rbtdb->origin_node->locknum = rbtdb->origin_node->hashval %
8865 					      rbtdb->node_lock_count;
8866 		/*
8867 		 * Add an apex node to the NSEC3 tree so that NSEC3 searches
8868 		 * return partial matches when there is only a single NSEC3
8869 		 * record in the tree.
8870 		 */
8871 		rbtdb->nsec3_origin_node = NULL;
8872 		result = dns_rbt_addnode(rbtdb->nsec3, &rbtdb->common.origin,
8873 					 &rbtdb->nsec3_origin_node);
8874 		if (result != ISC_R_SUCCESS) {
8875 			INSIST(result != ISC_R_EXISTS);
8876 			free_rbtdb(rbtdb, false, NULL);
8877 			return (result);
8878 		}
8879 		rbtdb->nsec3_origin_node->nsec = DNS_RBT_NSEC_NSEC3;
8880 		/*
8881 		 * We need to give the nsec3 origin node the right locknum.
8882 		 */
8883 		dns_name_init(&name, NULL);
8884 		dns_rbt_namefromnode(rbtdb->nsec3_origin_node, &name);
8885 		rbtdb->nsec3_origin_node->locknum =
8886 			rbtdb->nsec3_origin_node->hashval %
8887 			rbtdb->node_lock_count;
8888 	}
8889 
8890 	/*
8891 	 * Misc. Initialization.
8892 	 */
8893 	isc_refcount_init(&rbtdb->references, 1);
8894 	rbtdb->attributes = 0;
8895 	rbtdb->task = NULL;
8896 	rbtdb->serve_stale_ttl = 0;
8897 
8898 	/*
8899 	 * Version Initialization.
8900 	 */
8901 	rbtdb->current_serial = 1;
8902 	rbtdb->least_serial = 1;
8903 	rbtdb->next_serial = 2;
8904 	rbtdb->current_version = allocate_version(mctx, 1, 1, false);
8905 	rbtdb->current_version->rbtdb = rbtdb;
8906 	rbtdb->current_version->secure = dns_db_insecure;
8907 	rbtdb->current_version->havensec3 = false;
8908 	rbtdb->current_version->flags = 0;
8909 	rbtdb->current_version->iterations = 0;
8910 	rbtdb->current_version->hash = 0;
8911 	rbtdb->current_version->salt_length = 0;
8912 	memset(rbtdb->current_version->salt, 0,
8913 	       sizeof(rbtdb->current_version->salt));
8914 	isc_rwlock_init(&rbtdb->current_version->rwlock, 0, 0);
8915 	rbtdb->current_version->records = 0;
8916 	rbtdb->current_version->xfrsize = 0;
8917 	rbtdb->future_version = NULL;
8918 	ISC_LIST_INIT(rbtdb->open_versions);
8919 	/*
8920 	 * Keep the current version in the open list so that list operation
8921 	 * won't happen in normal lookup operations.
8922 	 */
8923 	PREPEND(rbtdb->open_versions, rbtdb->current_version, link);
8924 
8925 	rbtdb->common.magic = DNS_DB_MAGIC;
8926 	rbtdb->common.impmagic = RBTDB_MAGIC;
8927 
8928 	*dbp = (dns_db_t *)rbtdb;
8929 
8930 	return (ISC_R_SUCCESS);
8931 
8932 cleanup_node_locks:
8933 	isc_mem_put(mctx, rbtdb->node_locks,
8934 		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
8935 
8936 cleanup_tree_lock:
8937 	isc_rwlock_destroy(&rbtdb->tree_lock);
8938 	RBTDB_DESTROYLOCK(&rbtdb->lock);
8939 	isc_mem_put(mctx, rbtdb, sizeof(*rbtdb));
8940 	return (result);
8941 }
8942 
8943 /*
8944  * Slabbed Rdataset Methods
8945  */
8946 
8947 static void
rdataset_disassociate(dns_rdataset_t * rdataset)8948 rdataset_disassociate(dns_rdataset_t *rdataset) {
8949 	dns_db_t *db = rdataset->private1;
8950 	dns_dbnode_t *node = rdataset->private2;
8951 
8952 	detachnode(db, &node);
8953 }
8954 
8955 static isc_result_t
rdataset_first(dns_rdataset_t * rdataset)8956 rdataset_first(dns_rdataset_t *rdataset) {
8957 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
8958 	unsigned int count;
8959 
8960 	count = raw[0] * 256 + raw[1];
8961 	if (count == 0) {
8962 		rdataset->private5 = NULL;
8963 		return (ISC_R_NOMORE);
8964 	}
8965 
8966 	if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) {
8967 		raw += DNS_RDATASET_COUNT;
8968 	}
8969 
8970 	raw += DNS_RDATASET_LENGTH;
8971 
8972 	/*
8973 	 * The privateuint4 field is the number of rdata beyond the
8974 	 * cursor position, so we decrement the total count by one
8975 	 * before storing it.
8976 	 *
8977 	 * If DNS_RDATASETATTR_LOADORDER is not set 'raw' points to the
8978 	 * first record.  If DNS_RDATASETATTR_LOADORDER is set 'raw' points
8979 	 * to the first entry in the offset table.
8980 	 */
8981 	count--;
8982 	rdataset->privateuint4 = count;
8983 	rdataset->private5 = raw;
8984 
8985 	return (ISC_R_SUCCESS);
8986 }
8987 
8988 static isc_result_t
rdataset_next(dns_rdataset_t * rdataset)8989 rdataset_next(dns_rdataset_t *rdataset) {
8990 	unsigned int count;
8991 	unsigned int length;
8992 	unsigned char *raw; /* RDATASLAB */
8993 
8994 	count = rdataset->privateuint4;
8995 	if (count == 0) {
8996 		return (ISC_R_NOMORE);
8997 	}
8998 	count--;
8999 	rdataset->privateuint4 = count;
9000 
9001 	/*
9002 	 * Skip forward one record (length + 4) or one offset (4).
9003 	 */
9004 	raw = rdataset->private5;
9005 #if DNS_RDATASET_FIXED
9006 	if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0)
9007 #endif /* DNS_RDATASET_FIXED */
9008 	{
9009 		length = raw[0] * 256 + raw[1];
9010 		raw += length;
9011 	}
9012 
9013 	rdataset->private5 = raw + DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
9014 
9015 	return (ISC_R_SUCCESS);
9016 }
9017 
9018 static void
rdataset_current(dns_rdataset_t * rdataset,dns_rdata_t * rdata)9019 rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) {
9020 	unsigned char *raw = rdataset->private5; /* RDATASLAB */
9021 	unsigned int length;
9022 	isc_region_t r;
9023 	unsigned int flags = 0;
9024 
9025 	REQUIRE(raw != NULL);
9026 
9027 	/*
9028 	 * Find the start of the record if not already in private5
9029 	 * then skip the length and order fields.
9030 	 */
9031 #if DNS_RDATASET_FIXED
9032 	if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) != 0) {
9033 		unsigned int offset;
9034 		offset = ((unsigned int)raw[0] << 24) +
9035 			 ((unsigned int)raw[1] << 16) +
9036 			 ((unsigned int)raw[2] << 8) + (unsigned int)raw[3];
9037 		raw = rdataset->private3;
9038 		raw += offset;
9039 	}
9040 #endif /* if DNS_RDATASET_FIXED */
9041 
9042 	length = raw[0] * 256 + raw[1];
9043 
9044 	raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
9045 
9046 	if (rdataset->type == dns_rdatatype_rrsig) {
9047 		if (*raw & DNS_RDATASLAB_OFFLINE) {
9048 			flags |= DNS_RDATA_OFFLINE;
9049 		}
9050 		length--;
9051 		raw++;
9052 	}
9053 	r.length = length;
9054 	r.base = raw;
9055 	dns_rdata_fromregion(rdata, rdataset->rdclass, rdataset->type, &r);
9056 	rdata->flags |= flags;
9057 }
9058 
9059 static void
rdataset_clone(dns_rdataset_t * source,dns_rdataset_t * target)9060 rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target) {
9061 	dns_db_t *db = source->private1;
9062 	dns_dbnode_t *node = source->private2;
9063 	dns_dbnode_t *cloned_node = NULL;
9064 
9065 	attachnode(db, node, &cloned_node);
9066 	INSIST(!ISC_LINK_LINKED(target, link));
9067 	*target = *source;
9068 	ISC_LINK_INIT(target, link);
9069 
9070 	/*
9071 	 * Reset iterator state.
9072 	 */
9073 	target->privateuint4 = 0;
9074 	target->private5 = NULL;
9075 }
9076 
9077 static unsigned int
rdataset_count(dns_rdataset_t * rdataset)9078 rdataset_count(dns_rdataset_t *rdataset) {
9079 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
9080 	unsigned int count;
9081 
9082 	count = raw[0] * 256 + raw[1];
9083 
9084 	return (count);
9085 }
9086 
9087 static isc_result_t
rdataset_getnoqname(dns_rdataset_t * rdataset,dns_name_t * name,dns_rdataset_t * nsec,dns_rdataset_t * nsecsig)9088 rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
9089 		    dns_rdataset_t *nsec, dns_rdataset_t *nsecsig) {
9090 	dns_db_t *db = rdataset->private1;
9091 	dns_dbnode_t *node = rdataset->private2;
9092 	dns_dbnode_t *cloned_node;
9093 	const struct noqname *noqname = rdataset->private6;
9094 
9095 	cloned_node = NULL;
9096 	attachnode(db, node, &cloned_node);
9097 	nsec->methods = &slab_methods;
9098 	nsec->rdclass = db->rdclass;
9099 	nsec->type = noqname->type;
9100 	nsec->covers = 0;
9101 	nsec->ttl = rdataset->ttl;
9102 	nsec->trust = rdataset->trust;
9103 	nsec->private1 = rdataset->private1;
9104 	nsec->private2 = rdataset->private2;
9105 	nsec->private3 = noqname->neg;
9106 	nsec->privateuint4 = 0;
9107 	nsec->private5 = NULL;
9108 	nsec->private6 = NULL;
9109 	nsec->private7 = NULL;
9110 
9111 	cloned_node = NULL;
9112 	attachnode(db, node, &cloned_node);
9113 	nsecsig->methods = &slab_methods;
9114 	nsecsig->rdclass = db->rdclass;
9115 	nsecsig->type = dns_rdatatype_rrsig;
9116 	nsecsig->covers = noqname->type;
9117 	nsecsig->ttl = rdataset->ttl;
9118 	nsecsig->trust = rdataset->trust;
9119 	nsecsig->private1 = rdataset->private1;
9120 	nsecsig->private2 = rdataset->private2;
9121 	nsecsig->private3 = noqname->negsig;
9122 	nsecsig->privateuint4 = 0;
9123 	nsecsig->private5 = NULL;
9124 	nsec->private6 = NULL;
9125 	nsec->private7 = NULL;
9126 
9127 	dns_name_clone(&noqname->name, name);
9128 
9129 	return (ISC_R_SUCCESS);
9130 }
9131 
9132 static isc_result_t
rdataset_getclosest(dns_rdataset_t * rdataset,dns_name_t * name,dns_rdataset_t * nsec,dns_rdataset_t * nsecsig)9133 rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name,
9134 		    dns_rdataset_t *nsec, dns_rdataset_t *nsecsig) {
9135 	dns_db_t *db = rdataset->private1;
9136 	dns_dbnode_t *node = rdataset->private2;
9137 	dns_dbnode_t *cloned_node;
9138 	const struct noqname *closest = rdataset->private7;
9139 
9140 	cloned_node = NULL;
9141 	attachnode(db, node, &cloned_node);
9142 	nsec->methods = &slab_methods;
9143 	nsec->rdclass = db->rdclass;
9144 	nsec->type = closest->type;
9145 	nsec->covers = 0;
9146 	nsec->ttl = rdataset->ttl;
9147 	nsec->trust = rdataset->trust;
9148 	nsec->private1 = rdataset->private1;
9149 	nsec->private2 = rdataset->private2;
9150 	nsec->private3 = closest->neg;
9151 	nsec->privateuint4 = 0;
9152 	nsec->private5 = NULL;
9153 	nsec->private6 = NULL;
9154 	nsec->private7 = NULL;
9155 
9156 	cloned_node = NULL;
9157 	attachnode(db, node, &cloned_node);
9158 	nsecsig->methods = &slab_methods;
9159 	nsecsig->rdclass = db->rdclass;
9160 	nsecsig->type = dns_rdatatype_rrsig;
9161 	nsecsig->covers = closest->type;
9162 	nsecsig->ttl = rdataset->ttl;
9163 	nsecsig->trust = rdataset->trust;
9164 	nsecsig->private1 = rdataset->private1;
9165 	nsecsig->private2 = rdataset->private2;
9166 	nsecsig->private3 = closest->negsig;
9167 	nsecsig->privateuint4 = 0;
9168 	nsecsig->private5 = NULL;
9169 	nsec->private6 = NULL;
9170 	nsec->private7 = NULL;
9171 
9172 	dns_name_clone(&closest->name, name);
9173 
9174 	return (ISC_R_SUCCESS);
9175 }
9176 
9177 static void
rdataset_settrust(dns_rdataset_t * rdataset,dns_trust_t trust)9178 rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust) {
9179 	dns_rbtdb_t *rbtdb = rdataset->private1;
9180 	dns_rbtnode_t *rbtnode = rdataset->private2;
9181 	rdatasetheader_t *header = rdataset->private3;
9182 
9183 	header--;
9184 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9185 		  isc_rwlocktype_write);
9186 	header->trust = rdataset->trust = trust;
9187 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9188 		    isc_rwlocktype_write);
9189 }
9190 
9191 static void
rdataset_expire(dns_rdataset_t * rdataset)9192 rdataset_expire(dns_rdataset_t *rdataset) {
9193 	dns_rbtdb_t *rbtdb = rdataset->private1;
9194 	dns_rbtnode_t *rbtnode = rdataset->private2;
9195 	rdatasetheader_t *header = rdataset->private3;
9196 
9197 	header--;
9198 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9199 		  isc_rwlocktype_write);
9200 	expire_header(rbtdb, header, false, expire_flush);
9201 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9202 		    isc_rwlocktype_write);
9203 }
9204 
9205 static void
rdataset_clearprefetch(dns_rdataset_t * rdataset)9206 rdataset_clearprefetch(dns_rdataset_t *rdataset) {
9207 	dns_rbtdb_t *rbtdb = rdataset->private1;
9208 	dns_rbtnode_t *rbtnode = rdataset->private2;
9209 	rdatasetheader_t *header = rdataset->private3;
9210 
9211 	header--;
9212 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9213 		  isc_rwlocktype_write);
9214 	RDATASET_ATTR_CLR(header, RDATASET_ATTR_PREFETCH);
9215 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9216 		    isc_rwlocktype_write);
9217 }
9218 
9219 /*
9220  * Rdataset Iterator Methods
9221  */
9222 
9223 static void
rdatasetiter_destroy(dns_rdatasetiter_t ** iteratorp)9224 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp) {
9225 	rbtdb_rdatasetiter_t *rbtiterator;
9226 
9227 	rbtiterator = (rbtdb_rdatasetiter_t *)(*iteratorp);
9228 
9229 	if (rbtiterator->common.version != NULL) {
9230 		closeversion(rbtiterator->common.db,
9231 			     &rbtiterator->common.version, false);
9232 	}
9233 	detachnode(rbtiterator->common.db, &rbtiterator->common.node);
9234 	isc_mem_put(rbtiterator->common.db->mctx, rbtiterator,
9235 		    sizeof(*rbtiterator));
9236 
9237 	*iteratorp = NULL;
9238 }
9239 
9240 static bool
iterator_active(dns_rbtdb_t * rbtdb,rbtdb_rdatasetiter_t * rbtiterator,rdatasetheader_t * header)9241 iterator_active(dns_rbtdb_t *rbtdb, rbtdb_rdatasetiter_t *rbtiterator,
9242 		rdatasetheader_t *header) {
9243 	dns_ttl_t stale_ttl = header->rdh_ttl + rbtdb->serve_stale_ttl;
9244 
9245 	/*
9246 	 * Is this a "this rdataset doesn't exist" record?
9247 	 */
9248 	if (NONEXISTENT(header)) {
9249 		return (false);
9250 	}
9251 
9252 	/*
9253 	 * If this is a zone or this header still active then return it.
9254 	 */
9255 	if (!IS_CACHE(rbtdb) || ACTIVE(header, rbtiterator->common.now)) {
9256 		return (true);
9257 	}
9258 
9259 	/*
9260 	 * If we are not returning stale records or the rdataset is
9261 	 * too old don't return it.
9262 	 */
9263 	if (!STALEOK(rbtiterator) || (rbtiterator->common.now > stale_ttl)) {
9264 		return (false);
9265 	}
9266 	return (true);
9267 }
9268 
9269 static isc_result_t
rdatasetiter_first(dns_rdatasetiter_t * iterator)9270 rdatasetiter_first(dns_rdatasetiter_t *iterator) {
9271 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
9272 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
9273 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
9274 	rbtdb_version_t *rbtversion = rbtiterator->common.version;
9275 	rdatasetheader_t *header, *top_next;
9276 	rbtdb_serial_t serial = IS_CACHE(rbtdb) ? 1 : rbtversion->serial;
9277 
9278 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9279 		  isc_rwlocktype_read);
9280 
9281 	for (header = rbtnode->data; header != NULL; header = top_next) {
9282 		top_next = header->next;
9283 		do {
9284 			if (EXPIREDOK(rbtiterator)) {
9285 				if (!NONEXISTENT(header)) {
9286 					break;
9287 				}
9288 				header = header->down;
9289 			} else if (header->serial <= serial && !IGNORE(header))
9290 			{
9291 				if (!iterator_active(rbtdb, rbtiterator,
9292 						     header))
9293 				{
9294 					header = NULL;
9295 				}
9296 				break;
9297 			} else {
9298 				header = header->down;
9299 			}
9300 		} while (header != NULL);
9301 		if (header != NULL) {
9302 			break;
9303 		}
9304 	}
9305 
9306 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9307 		    isc_rwlocktype_read);
9308 
9309 	rbtiterator->current = header;
9310 
9311 	if (header == NULL) {
9312 		return (ISC_R_NOMORE);
9313 	}
9314 
9315 	return (ISC_R_SUCCESS);
9316 }
9317 
9318 static isc_result_t
rdatasetiter_next(dns_rdatasetiter_t * iterator)9319 rdatasetiter_next(dns_rdatasetiter_t *iterator) {
9320 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
9321 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
9322 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
9323 	rbtdb_version_t *rbtversion = rbtiterator->common.version;
9324 	rdatasetheader_t *header, *top_next;
9325 	rbtdb_serial_t serial = IS_CACHE(rbtdb) ? 1 : rbtversion->serial;
9326 	rbtdb_rdatatype_t type, negtype;
9327 	dns_rdatatype_t rdtype, covers;
9328 	bool expiredok = EXPIREDOK(rbtiterator);
9329 
9330 	header = rbtiterator->current;
9331 	if (header == NULL) {
9332 		return (ISC_R_NOMORE);
9333 	}
9334 
9335 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9336 		  isc_rwlocktype_read);
9337 
9338 	type = header->type;
9339 	rdtype = RBTDB_RDATATYPE_BASE(header->type);
9340 	if (NEGATIVE(header)) {
9341 		covers = RBTDB_RDATATYPE_EXT(header->type);
9342 		negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
9343 	} else {
9344 		negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
9345 	}
9346 
9347 	/*
9348 	 * Find the start of the header chain for the next type
9349 	 * by walking back up the list.
9350 	 */
9351 	top_next = header->next;
9352 	while (top_next != NULL &&
9353 	       (top_next->type == type || top_next->type == negtype))
9354 	{
9355 		top_next = top_next->next;
9356 	}
9357 	if (expiredok) {
9358 		/*
9359 		 * Keep walking down the list if possible or
9360 		 * start the next type.
9361 		 */
9362 		header = header->down != NULL ? header->down : top_next;
9363 	} else {
9364 		header = top_next;
9365 	}
9366 	for (; header != NULL; header = top_next) {
9367 		top_next = header->next;
9368 		do {
9369 			if (expiredok) {
9370 				if (!NONEXISTENT(header)) {
9371 					break;
9372 				}
9373 				header = header->down;
9374 			} else if (header->serial <= serial && !IGNORE(header))
9375 			{
9376 				if (!iterator_active(rbtdb, rbtiterator,
9377 						     header))
9378 				{
9379 					header = NULL;
9380 				}
9381 				break;
9382 			} else {
9383 				header = header->down;
9384 			}
9385 		} while (header != NULL);
9386 		if (header != NULL) {
9387 			break;
9388 		}
9389 		/*
9390 		 * Find the start of the header chain for the next type
9391 		 * by walking back up the list.
9392 		 */
9393 		while (top_next != NULL &&
9394 		       (top_next->type == type || top_next->type == negtype))
9395 		{
9396 			top_next = top_next->next;
9397 		}
9398 	}
9399 
9400 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9401 		    isc_rwlocktype_read);
9402 
9403 	rbtiterator->current = header;
9404 
9405 	if (header == NULL) {
9406 		return (ISC_R_NOMORE);
9407 	}
9408 
9409 	return (ISC_R_SUCCESS);
9410 }
9411 
9412 static void
rdatasetiter_current(dns_rdatasetiter_t * iterator,dns_rdataset_t * rdataset)9413 rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset) {
9414 	rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
9415 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
9416 	dns_rbtnode_t *rbtnode = rbtiterator->common.node;
9417 	rdatasetheader_t *header;
9418 
9419 	header = rbtiterator->current;
9420 	REQUIRE(header != NULL);
9421 
9422 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9423 		  isc_rwlocktype_read);
9424 
9425 	bind_rdataset(rbtdb, rbtnode, header, rbtiterator->common.now,
9426 		      isc_rwlocktype_read, rdataset);
9427 
9428 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9429 		    isc_rwlocktype_read);
9430 }
9431 
9432 /*
9433  * Database Iterator Methods
9434  */
9435 
9436 static void
reference_iter_node(rbtdb_dbiterator_t * rbtdbiter)9437 reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
9438 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9439 	dns_rbtnode_t *node = rbtdbiter->node;
9440 
9441 	if (node == NULL) {
9442 		return;
9443 	}
9444 
9445 	INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none);
9446 	reactivate_node(rbtdb, node, rbtdbiter->tree_locked);
9447 }
9448 
9449 static void
dereference_iter_node(rbtdb_dbiterator_t * rbtdbiter)9450 dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
9451 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9452 	dns_rbtnode_t *node = rbtdbiter->node;
9453 	nodelock_t *lock;
9454 
9455 	if (node == NULL) {
9456 		return;
9457 	}
9458 
9459 	lock = &rbtdb->node_locks[node->locknum].lock;
9460 	NODE_LOCK(lock, isc_rwlocktype_read);
9461 	decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
9462 			    rbtdbiter->tree_locked, false);
9463 	NODE_UNLOCK(lock, isc_rwlocktype_read);
9464 
9465 	rbtdbiter->node = NULL;
9466 }
9467 
9468 static void
flush_deletions(rbtdb_dbiterator_t * rbtdbiter)9469 flush_deletions(rbtdb_dbiterator_t *rbtdbiter) {
9470 	dns_rbtnode_t *node;
9471 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9472 	bool was_read_locked = false;
9473 	nodelock_t *lock;
9474 	int i;
9475 
9476 	if (rbtdbiter->delcnt != 0) {
9477 		/*
9478 		 * Note that "%d node of %d in tree" can report things like
9479 		 * "flush_deletions: 59 nodes of 41 in tree".  This means
9480 		 * That some nodes appear on the deletions list more than
9481 		 * once.  Only the last occurrence will actually be deleted.
9482 		 */
9483 		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
9484 			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
9485 			      "flush_deletions: %d nodes of %d in tree",
9486 			      rbtdbiter->delcnt,
9487 			      dns_rbt_nodecount(rbtdb->tree));
9488 
9489 		if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
9490 			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9491 			was_read_locked = true;
9492 		}
9493 		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
9494 		rbtdbiter->tree_locked = isc_rwlocktype_write;
9495 
9496 		for (i = 0; i < rbtdbiter->delcnt; i++) {
9497 			node = rbtdbiter->deletions[i];
9498 			lock = &rbtdb->node_locks[node->locknum].lock;
9499 
9500 			NODE_LOCK(lock, isc_rwlocktype_read);
9501 			decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
9502 					    rbtdbiter->tree_locked, false);
9503 			NODE_UNLOCK(lock, isc_rwlocktype_read);
9504 		}
9505 
9506 		rbtdbiter->delcnt = 0;
9507 
9508 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
9509 		if (was_read_locked) {
9510 			RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9511 			rbtdbiter->tree_locked = isc_rwlocktype_read;
9512 		} else {
9513 			rbtdbiter->tree_locked = isc_rwlocktype_none;
9514 		}
9515 	}
9516 }
9517 
9518 static void
resume_iteration(rbtdb_dbiterator_t * rbtdbiter)9519 resume_iteration(rbtdb_dbiterator_t *rbtdbiter) {
9520 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9521 
9522 	REQUIRE(rbtdbiter->paused);
9523 	REQUIRE(rbtdbiter->tree_locked == isc_rwlocktype_none);
9524 
9525 	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9526 	rbtdbiter->tree_locked = isc_rwlocktype_read;
9527 
9528 	rbtdbiter->paused = false;
9529 }
9530 
9531 static void
dbiterator_destroy(dns_dbiterator_t ** iteratorp)9532 dbiterator_destroy(dns_dbiterator_t **iteratorp) {
9533 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)(*iteratorp);
9534 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
9535 	dns_db_t *db = NULL;
9536 
9537 	if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
9538 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9539 		rbtdbiter->tree_locked = isc_rwlocktype_none;
9540 	} else {
9541 		INSIST(rbtdbiter->tree_locked == isc_rwlocktype_none);
9542 	}
9543 
9544 	dereference_iter_node(rbtdbiter);
9545 
9546 	flush_deletions(rbtdbiter);
9547 
9548 	dns_db_attach(rbtdbiter->common.db, &db);
9549 	dns_db_detach(&rbtdbiter->common.db);
9550 
9551 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9552 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9553 	isc_mem_put(db->mctx, rbtdbiter, sizeof(*rbtdbiter));
9554 	dns_db_detach(&db);
9555 
9556 	*iteratorp = NULL;
9557 }
9558 
9559 static isc_result_t
dbiterator_first(dns_dbiterator_t * iterator)9560 dbiterator_first(dns_dbiterator_t *iterator) {
9561 	isc_result_t result;
9562 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9563 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9564 	dns_name_t *name, *origin;
9565 
9566 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9567 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9568 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9569 	    rbtdbiter->result != ISC_R_NOMORE)
9570 	{
9571 		return (rbtdbiter->result);
9572 	}
9573 
9574 	if (rbtdbiter->paused) {
9575 		resume_iteration(rbtdbiter);
9576 	}
9577 
9578 	dereference_iter_node(rbtdbiter);
9579 
9580 	name = dns_fixedname_name(&rbtdbiter->name);
9581 	origin = dns_fixedname_name(&rbtdbiter->origin);
9582 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9583 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9584 
9585 	if (rbtdbiter->nsec3only) {
9586 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9587 		result = dns_rbtnodechain_first(rbtdbiter->current,
9588 						rbtdb->nsec3, name, origin);
9589 	} else {
9590 		rbtdbiter->current = &rbtdbiter->chain;
9591 		result = dns_rbtnodechain_first(rbtdbiter->current, rbtdb->tree,
9592 						name, origin);
9593 		if (!rbtdbiter->nonsec3 && result == ISC_R_NOTFOUND) {
9594 			rbtdbiter->current = &rbtdbiter->nsec3chain;
9595 			result = dns_rbtnodechain_first(
9596 				rbtdbiter->current, rbtdb->nsec3, name, origin);
9597 		}
9598 	}
9599 	if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
9600 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9601 						  NULL, &rbtdbiter->node);
9602 		if (result == ISC_R_SUCCESS) {
9603 			rbtdbiter->new_origin = true;
9604 			reference_iter_node(rbtdbiter);
9605 		}
9606 	} else {
9607 		INSIST(result == ISC_R_NOTFOUND);
9608 		result = ISC_R_NOMORE; /* The tree is empty. */
9609 	}
9610 
9611 	rbtdbiter->result = result;
9612 
9613 	if (result != ISC_R_SUCCESS) {
9614 		ENSURE(!rbtdbiter->paused);
9615 	}
9616 
9617 	return (result);
9618 }
9619 
9620 static isc_result_t
dbiterator_last(dns_dbiterator_t * iterator)9621 dbiterator_last(dns_dbiterator_t *iterator) {
9622 	isc_result_t result;
9623 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9624 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9625 	dns_name_t *name, *origin;
9626 
9627 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9628 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9629 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9630 	    rbtdbiter->result != ISC_R_NOMORE)
9631 	{
9632 		return (rbtdbiter->result);
9633 	}
9634 
9635 	if (rbtdbiter->paused) {
9636 		resume_iteration(rbtdbiter);
9637 	}
9638 
9639 	dereference_iter_node(rbtdbiter);
9640 
9641 	name = dns_fixedname_name(&rbtdbiter->name);
9642 	origin = dns_fixedname_name(&rbtdbiter->origin);
9643 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9644 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9645 
9646 	result = ISC_R_NOTFOUND;
9647 	if (rbtdbiter->nsec3only && !rbtdbiter->nonsec3) {
9648 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9649 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->nsec3,
9650 					       name, origin);
9651 	}
9652 	if (!rbtdbiter->nsec3only && result == ISC_R_NOTFOUND) {
9653 		rbtdbiter->current = &rbtdbiter->chain;
9654 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree,
9655 					       name, origin);
9656 	}
9657 	if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
9658 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9659 						  NULL, &rbtdbiter->node);
9660 		if (result == ISC_R_SUCCESS) {
9661 			rbtdbiter->new_origin = true;
9662 			reference_iter_node(rbtdbiter);
9663 		}
9664 	} else {
9665 		INSIST(result == ISC_R_NOTFOUND);
9666 		result = ISC_R_NOMORE; /* The tree is empty. */
9667 	}
9668 
9669 	rbtdbiter->result = result;
9670 
9671 	return (result);
9672 }
9673 
9674 static isc_result_t
dbiterator_seek(dns_dbiterator_t * iterator,const dns_name_t * name)9675 dbiterator_seek(dns_dbiterator_t *iterator, const dns_name_t *name) {
9676 	isc_result_t result, tresult;
9677 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9678 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9679 	dns_name_t *iname, *origin;
9680 
9681 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9682 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9683 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9684 	    rbtdbiter->result != ISC_R_NOMORE)
9685 	{
9686 		return (rbtdbiter->result);
9687 	}
9688 
9689 	if (rbtdbiter->paused) {
9690 		resume_iteration(rbtdbiter);
9691 	}
9692 
9693 	dereference_iter_node(rbtdbiter);
9694 
9695 	iname = dns_fixedname_name(&rbtdbiter->name);
9696 	origin = dns_fixedname_name(&rbtdbiter->origin);
9697 	dns_rbtnodechain_reset(&rbtdbiter->chain);
9698 	dns_rbtnodechain_reset(&rbtdbiter->nsec3chain);
9699 
9700 	if (rbtdbiter->nsec3only) {
9701 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9702 		result = dns_rbt_findnode(rbtdb->nsec3, name, NULL,
9703 					  &rbtdbiter->node, rbtdbiter->current,
9704 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
9705 	} else if (rbtdbiter->nonsec3) {
9706 		rbtdbiter->current = &rbtdbiter->chain;
9707 		result = dns_rbt_findnode(rbtdb->tree, name, NULL,
9708 					  &rbtdbiter->node, rbtdbiter->current,
9709 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
9710 	} else {
9711 		/*
9712 		 * Stay on main chain if not found on either chain.
9713 		 */
9714 		rbtdbiter->current = &rbtdbiter->chain;
9715 		result = dns_rbt_findnode(rbtdb->tree, name, NULL,
9716 					  &rbtdbiter->node, rbtdbiter->current,
9717 					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
9718 		if (result == DNS_R_PARTIALMATCH) {
9719 			dns_rbtnode_t *node = NULL;
9720 			tresult = dns_rbt_findnode(
9721 				rbtdb->nsec3, name, NULL, &node,
9722 				&rbtdbiter->nsec3chain, DNS_RBTFIND_EMPTYDATA,
9723 				NULL, NULL);
9724 			if (tresult == ISC_R_SUCCESS) {
9725 				rbtdbiter->node = node;
9726 				rbtdbiter->current = &rbtdbiter->nsec3chain;
9727 				result = tresult;
9728 			}
9729 		}
9730 	}
9731 
9732 	if (result == ISC_R_SUCCESS || result == DNS_R_PARTIALMATCH) {
9733 		tresult = dns_rbtnodechain_current(rbtdbiter->current, iname,
9734 						   origin, NULL);
9735 		if (tresult == ISC_R_SUCCESS) {
9736 			rbtdbiter->new_origin = true;
9737 			reference_iter_node(rbtdbiter);
9738 		} else {
9739 			result = tresult;
9740 			rbtdbiter->node = NULL;
9741 		}
9742 	} else {
9743 		rbtdbiter->node = NULL;
9744 	}
9745 
9746 	rbtdbiter->result = (result == DNS_R_PARTIALMATCH) ? ISC_R_SUCCESS
9747 							   : result;
9748 
9749 	return (result);
9750 }
9751 
9752 static isc_result_t
dbiterator_prev(dns_dbiterator_t * iterator)9753 dbiterator_prev(dns_dbiterator_t *iterator) {
9754 	isc_result_t result;
9755 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9756 	dns_name_t *name, *origin;
9757 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9758 
9759 	REQUIRE(rbtdbiter->node != NULL);
9760 
9761 	if (rbtdbiter->result != ISC_R_SUCCESS) {
9762 		return (rbtdbiter->result);
9763 	}
9764 
9765 	if (rbtdbiter->paused) {
9766 		resume_iteration(rbtdbiter);
9767 	}
9768 
9769 	name = dns_fixedname_name(&rbtdbiter->name);
9770 	origin = dns_fixedname_name(&rbtdbiter->origin);
9771 	result = dns_rbtnodechain_prev(rbtdbiter->current, name, origin);
9772 	if (result == ISC_R_NOMORE && !rbtdbiter->nsec3only &&
9773 	    !rbtdbiter->nonsec3 && &rbtdbiter->nsec3chain == rbtdbiter->current)
9774 	{
9775 		rbtdbiter->current = &rbtdbiter->chain;
9776 		dns_rbtnodechain_reset(rbtdbiter->current);
9777 		result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree,
9778 					       name, origin);
9779 		if (result == ISC_R_NOTFOUND) {
9780 			result = ISC_R_NOMORE;
9781 		}
9782 	}
9783 
9784 	dereference_iter_node(rbtdbiter);
9785 
9786 	if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
9787 		rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN);
9788 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9789 						  NULL, &rbtdbiter->node);
9790 	}
9791 
9792 	if (result == ISC_R_SUCCESS) {
9793 		reference_iter_node(rbtdbiter);
9794 	}
9795 
9796 	rbtdbiter->result = result;
9797 
9798 	return (result);
9799 }
9800 
9801 static isc_result_t
dbiterator_next(dns_dbiterator_t * iterator)9802 dbiterator_next(dns_dbiterator_t *iterator) {
9803 	isc_result_t result;
9804 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9805 	dns_name_t *name, *origin;
9806 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9807 
9808 	REQUIRE(rbtdbiter->node != NULL);
9809 
9810 	if (rbtdbiter->result != ISC_R_SUCCESS) {
9811 		return (rbtdbiter->result);
9812 	}
9813 
9814 	if (rbtdbiter->paused) {
9815 		resume_iteration(rbtdbiter);
9816 	}
9817 
9818 	name = dns_fixedname_name(&rbtdbiter->name);
9819 	origin = dns_fixedname_name(&rbtdbiter->origin);
9820 	result = dns_rbtnodechain_next(rbtdbiter->current, name, origin);
9821 	if (result == ISC_R_NOMORE && !rbtdbiter->nsec3only &&
9822 	    !rbtdbiter->nonsec3 && &rbtdbiter->chain == rbtdbiter->current)
9823 	{
9824 		rbtdbiter->current = &rbtdbiter->nsec3chain;
9825 		dns_rbtnodechain_reset(rbtdbiter->current);
9826 		result = dns_rbtnodechain_first(rbtdbiter->current,
9827 						rbtdb->nsec3, name, origin);
9828 		if (result == ISC_R_NOTFOUND) {
9829 			result = ISC_R_NOMORE;
9830 		}
9831 	}
9832 
9833 	dereference_iter_node(rbtdbiter);
9834 
9835 	if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
9836 		rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN);
9837 		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
9838 						  NULL, &rbtdbiter->node);
9839 	}
9840 	if (result == ISC_R_SUCCESS) {
9841 		reference_iter_node(rbtdbiter);
9842 	}
9843 
9844 	rbtdbiter->result = result;
9845 
9846 	return (result);
9847 }
9848 
9849 static isc_result_t
dbiterator_current(dns_dbiterator_t * iterator,dns_dbnode_t ** nodep,dns_name_t * name)9850 dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
9851 		   dns_name_t *name) {
9852 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9853 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9854 	dns_rbtnode_t *node = rbtdbiter->node;
9855 	isc_result_t result;
9856 	dns_name_t *nodename = dns_fixedname_name(&rbtdbiter->name);
9857 	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
9858 
9859 	REQUIRE(rbtdbiter->result == ISC_R_SUCCESS);
9860 	REQUIRE(rbtdbiter->node != NULL);
9861 
9862 	if (rbtdbiter->paused) {
9863 		resume_iteration(rbtdbiter);
9864 	}
9865 
9866 	if (name != NULL) {
9867 		if (rbtdbiter->common.relative_names) {
9868 			origin = NULL;
9869 		}
9870 		result = dns_name_concatenate(nodename, origin, name, NULL);
9871 		if (result != ISC_R_SUCCESS) {
9872 			return (result);
9873 		}
9874 		if (rbtdbiter->common.relative_names && rbtdbiter->new_origin) {
9875 			result = DNS_R_NEWORIGIN;
9876 		}
9877 	} else {
9878 		result = ISC_R_SUCCESS;
9879 	}
9880 
9881 	new_reference(rbtdb, node, isc_rwlocktype_none);
9882 
9883 	*nodep = rbtdbiter->node;
9884 
9885 	if (iterator->cleaning && result == ISC_R_SUCCESS) {
9886 		isc_result_t expire_result;
9887 
9888 		/*
9889 		 * If the deletion array is full, flush it before trying
9890 		 * to expire the current node.  The current node can't
9891 		 * fully deleted while the iteration cursor is still on it.
9892 		 */
9893 		if (rbtdbiter->delcnt == DELETION_BATCH_MAX) {
9894 			flush_deletions(rbtdbiter);
9895 		}
9896 
9897 		expire_result = expirenode(iterator->db, *nodep, 0);
9898 
9899 		/*
9900 		 * expirenode() currently always returns success.
9901 		 */
9902 		if (expire_result == ISC_R_SUCCESS && node->down == NULL) {
9903 			rbtdbiter->deletions[rbtdbiter->delcnt++] = node;
9904 			isc_refcount_increment(&node->references);
9905 		}
9906 	}
9907 
9908 	return (result);
9909 }
9910 
9911 static isc_result_t
dbiterator_pause(dns_dbiterator_t * iterator)9912 dbiterator_pause(dns_dbiterator_t *iterator) {
9913 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
9914 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9915 
9916 	if (rbtdbiter->result != ISC_R_SUCCESS &&
9917 	    rbtdbiter->result != ISC_R_NOTFOUND &&
9918 	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
9919 	    rbtdbiter->result != ISC_R_NOMORE)
9920 	{
9921 		return (rbtdbiter->result);
9922 	}
9923 
9924 	if (rbtdbiter->paused) {
9925 		return (ISC_R_SUCCESS);
9926 	}
9927 
9928 	rbtdbiter->paused = true;
9929 
9930 	if (rbtdbiter->tree_locked != isc_rwlocktype_none) {
9931 		INSIST(rbtdbiter->tree_locked == isc_rwlocktype_read);
9932 		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
9933 		rbtdbiter->tree_locked = isc_rwlocktype_none;
9934 	}
9935 
9936 	flush_deletions(rbtdbiter);
9937 
9938 	return (ISC_R_SUCCESS);
9939 }
9940 
9941 static isc_result_t
dbiterator_origin(dns_dbiterator_t * iterator,dns_name_t * name)9942 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name) {
9943 	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
9944 	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
9945 
9946 	if (rbtdbiter->result != ISC_R_SUCCESS) {
9947 		return (rbtdbiter->result);
9948 	}
9949 
9950 	dns_name_copynf(origin, name);
9951 	return (ISC_R_SUCCESS);
9952 }
9953 
9954 static void
setownercase(rdatasetheader_t * header,const dns_name_t * name)9955 setownercase(rdatasetheader_t *header, const dns_name_t *name) {
9956 	unsigned int i;
9957 	bool fully_lower;
9958 
9959 	/*
9960 	 * We do not need to worry about label lengths as they are all
9961 	 * less than or equal to 63.
9962 	 */
9963 	memset(header->upper, 0, sizeof(header->upper));
9964 	fully_lower = true;
9965 	for (i = 0; i < name->length; i++) {
9966 		if (isupper(name->ndata[i])) {
9967 			header->upper[i / 8] |= 1 << (i % 8);
9968 			fully_lower = false;
9969 		}
9970 	}
9971 	RDATASET_ATTR_SET(header, RDATASET_ATTR_CASESET);
9972 	if (ISC_LIKELY(fully_lower)) {
9973 		RDATASET_ATTR_SET(header, RDATASET_ATTR_CASEFULLYLOWER);
9974 	}
9975 }
9976 
9977 static void
rdataset_setownercase(dns_rdataset_t * rdataset,const dns_name_t * name)9978 rdataset_setownercase(dns_rdataset_t *rdataset, const dns_name_t *name) {
9979 	dns_rbtdb_t *rbtdb = rdataset->private1;
9980 	dns_rbtnode_t *rbtnode = rdataset->private2;
9981 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
9982 	rdatasetheader_t *header;
9983 
9984 	header = (struct rdatasetheader *)(raw - sizeof(*header));
9985 
9986 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9987 		  isc_rwlocktype_write);
9988 	setownercase(header, name);
9989 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
9990 		    isc_rwlocktype_write);
9991 }
9992 
9993 static void
rdataset_getownercase(const dns_rdataset_t * rdataset,dns_name_t * name)9994 rdataset_getownercase(const dns_rdataset_t *rdataset, dns_name_t *name) {
9995 	dns_rbtdb_t *rbtdb = rdataset->private1;
9996 	dns_rbtnode_t *rbtnode = rdataset->private2;
9997 	unsigned char *raw = rdataset->private3; /* RDATASLAB */
9998 	rdatasetheader_t *header = NULL;
9999 	uint8_t mask = (1 << 7);
10000 	uint8_t bits = 0;
10001 
10002 	header = (struct rdatasetheader *)(raw - sizeof(*header));
10003 
10004 	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
10005 		  isc_rwlocktype_read);
10006 
10007 	if (!CASESET(header)) {
10008 		goto unlock;
10009 	}
10010 
10011 	if (ISC_LIKELY(CASEFULLYLOWER(header))) {
10012 		for (size_t i = 0; i < name->length; i++) {
10013 			name->ndata[i] = tolower(name->ndata[i]);
10014 		}
10015 	} else {
10016 		for (size_t i = 0; i < name->length; i++) {
10017 			if (mask == (1 << 7)) {
10018 				bits = header->upper[i / 8];
10019 				mask = 1;
10020 			} else {
10021 				mask <<= 1;
10022 			}
10023 
10024 			name->ndata[i] = ((bits & mask) != 0)
10025 						 ? toupper(name->ndata[i])
10026 						 : tolower(name->ndata[i]);
10027 		}
10028 	}
10029 
10030 unlock:
10031 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
10032 		    isc_rwlocktype_read);
10033 }
10034 
10035 struct rbtdb_glue {
10036 	struct rbtdb_glue *next;
10037 	dns_fixedname_t fixedname;
10038 	dns_rdataset_t rdataset_a;
10039 	dns_rdataset_t sigrdataset_a;
10040 	dns_rdataset_t rdataset_aaaa;
10041 	dns_rdataset_t sigrdataset_aaaa;
10042 };
10043 
10044 typedef struct {
10045 	rbtdb_glue_t *glue_list;
10046 	dns_rbtdb_t *rbtdb;
10047 	rbtdb_version_t *rbtversion;
10048 } rbtdb_glue_additionaldata_ctx_t;
10049 
10050 static void
free_gluelist(rbtdb_glue_t * glue_list,dns_rbtdb_t * rbtdb)10051 free_gluelist(rbtdb_glue_t *glue_list, dns_rbtdb_t *rbtdb) {
10052 	rbtdb_glue_t *cur, *cur_next;
10053 
10054 	if (glue_list == (void *)-1) {
10055 		return;
10056 	}
10057 
10058 	cur = glue_list;
10059 	while (cur != NULL) {
10060 		cur_next = cur->next;
10061 
10062 		if (dns_rdataset_isassociated(&cur->rdataset_a)) {
10063 			dns_rdataset_disassociate(&cur->rdataset_a);
10064 		}
10065 		if (dns_rdataset_isassociated(&cur->sigrdataset_a)) {
10066 			dns_rdataset_disassociate(&cur->sigrdataset_a);
10067 		}
10068 
10069 		if (dns_rdataset_isassociated(&cur->rdataset_aaaa)) {
10070 			dns_rdataset_disassociate(&cur->rdataset_aaaa);
10071 		}
10072 		if (dns_rdataset_isassociated(&cur->sigrdataset_aaaa)) {
10073 			dns_rdataset_disassociate(&cur->sigrdataset_aaaa);
10074 		}
10075 
10076 		dns_rdataset_invalidate(&cur->rdataset_a);
10077 		dns_rdataset_invalidate(&cur->sigrdataset_a);
10078 		dns_rdataset_invalidate(&cur->rdataset_aaaa);
10079 		dns_rdataset_invalidate(&cur->sigrdataset_aaaa);
10080 
10081 		isc_mem_put(rbtdb->common.mctx, cur, sizeof(*cur));
10082 		cur = cur_next;
10083 	}
10084 }
10085 
10086 static void
free_gluetable(rbtdb_version_t * version)10087 free_gluetable(rbtdb_version_t *version) {
10088 	dns_rbtdb_t *rbtdb;
10089 	size_t size, i;
10090 
10091 	RWLOCK(&version->glue_rwlock, isc_rwlocktype_write);
10092 
10093 	rbtdb = version->rbtdb;
10094 
10095 	for (i = 0; i < HASHSIZE(version->glue_table_bits); i++) {
10096 		rbtdb_glue_table_node_t *cur, *cur_next;
10097 
10098 		cur = version->glue_table[i];
10099 		while (cur != NULL) {
10100 			cur_next = cur->next;
10101 			/* isc_refcount_decrement(&cur->node->references); */
10102 			cur->node = NULL;
10103 			free_gluelist(cur->glue_list, rbtdb);
10104 			cur->glue_list = NULL;
10105 			isc_mem_put(rbtdb->common.mctx, cur, sizeof(*cur));
10106 			cur = cur_next;
10107 		}
10108 		version->glue_table[i] = NULL;
10109 	}
10110 
10111 	size = HASHSIZE(version->glue_table_bits) *
10112 	       sizeof(*version->glue_table);
10113 	isc_mem_put(rbtdb->common.mctx, version->glue_table, size);
10114 
10115 	RWUNLOCK(&version->glue_rwlock, isc_rwlocktype_write);
10116 }
10117 
10118 static uint32_t
rehash_bits(rbtdb_version_t * version,size_t newcount)10119 rehash_bits(rbtdb_version_t *version, size_t newcount) {
10120 	uint32_t oldbits = version->glue_table_bits;
10121 	uint32_t newbits = oldbits;
10122 
10123 	while (newcount >= HASHSIZE(newbits) &&
10124 	       newbits <= RBTDB_GLUE_TABLE_MAX_BITS)
10125 	{
10126 		newbits += 1;
10127 	}
10128 
10129 	return (newbits);
10130 }
10131 
10132 /*%
10133  * Write lock (version->glue_rwlock) must be held.
10134  */
10135 static void
rehash_gluetable(rbtdb_version_t * version)10136 rehash_gluetable(rbtdb_version_t *version) {
10137 	uint32_t oldbits, newbits;
10138 	size_t newsize, oldcount, i;
10139 	rbtdb_glue_table_node_t **oldtable;
10140 
10141 	oldbits = version->glue_table_bits;
10142 	oldcount = HASHSIZE(oldbits);
10143 	oldtable = version->glue_table;
10144 
10145 	newbits = rehash_bits(version, version->glue_table_nodecount);
10146 	newsize = HASHSIZE(newbits) * sizeof(version->glue_table[0]);
10147 
10148 	version->glue_table = isc_mem_get(version->rbtdb->common.mctx, newsize);
10149 	version->glue_table_bits = newbits;
10150 	memset(version->glue_table, 0, newsize);
10151 
10152 	for (i = 0; i < oldcount; i++) {
10153 		rbtdb_glue_table_node_t *gluenode;
10154 		rbtdb_glue_table_node_t *nextgluenode;
10155 		for (gluenode = oldtable[i]; gluenode != NULL;
10156 		     gluenode = nextgluenode)
10157 		{
10158 			uint32_t hash = isc_hash32(
10159 				&gluenode->node, sizeof(gluenode->node), true);
10160 			uint32_t idx = hash_32(hash, newbits);
10161 			nextgluenode = gluenode->next;
10162 			gluenode->next = version->glue_table[idx];
10163 			version->glue_table[idx] = gluenode;
10164 		}
10165 	}
10166 
10167 	isc_mem_put(version->rbtdb->common.mctx, oldtable,
10168 		    oldcount * sizeof(*version->glue_table));
10169 
10170 	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_ZONE,
10171 		      ISC_LOG_DEBUG(3),
10172 		      "rehash_gluetable(): "
10173 		      "resized glue table from %zu to "
10174 		      "%zu",
10175 		      oldcount, newsize / sizeof(version->glue_table[0]));
10176 }
10177 
10178 static void
maybe_rehash_gluetable(rbtdb_version_t * version)10179 maybe_rehash_gluetable(rbtdb_version_t *version) {
10180 	size_t overcommit = HASHSIZE(version->glue_table_bits) *
10181 			    RBTDB_GLUE_TABLE_OVERCOMMIT;
10182 	if (ISC_LIKELY(version->glue_table_nodecount < overcommit)) {
10183 		return;
10184 	}
10185 
10186 	rehash_gluetable(version);
10187 }
10188 
10189 static isc_result_t
glue_nsdname_cb(void * arg,const dns_name_t * name,dns_rdatatype_t qtype)10190 glue_nsdname_cb(void *arg, const dns_name_t *name, dns_rdatatype_t qtype) {
10191 	rbtdb_glue_additionaldata_ctx_t *ctx;
10192 	isc_result_t result;
10193 	dns_fixedname_t fixedname_a;
10194 	dns_name_t *name_a = NULL;
10195 	dns_rdataset_t rdataset_a, sigrdataset_a;
10196 	dns_rbtnode_t *node_a = NULL;
10197 	dns_fixedname_t fixedname_aaaa;
10198 	dns_name_t *name_aaaa = NULL;
10199 	dns_rdataset_t rdataset_aaaa, sigrdataset_aaaa;
10200 	dns_rbtnode_t *node_aaaa = NULL;
10201 	rbtdb_glue_t *glue = NULL;
10202 	dns_name_t *gluename = NULL;
10203 
10204 	/*
10205 	 * NS records want addresses in additional records.
10206 	 */
10207 	INSIST(qtype == dns_rdatatype_a);
10208 
10209 	ctx = (rbtdb_glue_additionaldata_ctx_t *)arg;
10210 
10211 	name_a = dns_fixedname_initname(&fixedname_a);
10212 	dns_rdataset_init(&rdataset_a);
10213 	dns_rdataset_init(&sigrdataset_a);
10214 
10215 	name_aaaa = dns_fixedname_initname(&fixedname_aaaa);
10216 	dns_rdataset_init(&rdataset_aaaa);
10217 	dns_rdataset_init(&sigrdataset_aaaa);
10218 
10219 	result = zone_find((dns_db_t *)ctx->rbtdb, name, ctx->rbtversion,
10220 			   dns_rdatatype_a, DNS_DBFIND_GLUEOK, 0,
10221 			   (dns_dbnode_t **)&node_a, name_a, &rdataset_a,
10222 			   &sigrdataset_a);
10223 	if (result == DNS_R_GLUE) {
10224 		glue = isc_mem_get(ctx->rbtdb->common.mctx, sizeof(*glue));
10225 
10226 		gluename = dns_fixedname_initname(&glue->fixedname);
10227 		dns_name_copynf(name_a, gluename);
10228 
10229 		dns_rdataset_init(&glue->rdataset_a);
10230 		dns_rdataset_init(&glue->sigrdataset_a);
10231 		dns_rdataset_init(&glue->rdataset_aaaa);
10232 		dns_rdataset_init(&glue->sigrdataset_aaaa);
10233 
10234 		dns_rdataset_clone(&rdataset_a, &glue->rdataset_a);
10235 		if (dns_rdataset_isassociated(&sigrdataset_a)) {
10236 			dns_rdataset_clone(&sigrdataset_a,
10237 					   &glue->sigrdataset_a);
10238 		}
10239 	}
10240 
10241 	result = zone_find((dns_db_t *)ctx->rbtdb, name, ctx->rbtversion,
10242 			   dns_rdatatype_aaaa, DNS_DBFIND_GLUEOK, 0,
10243 			   (dns_dbnode_t **)&node_aaaa, name_aaaa,
10244 			   &rdataset_aaaa, &sigrdataset_aaaa);
10245 	if (result == DNS_R_GLUE) {
10246 		if (glue == NULL) {
10247 			glue = isc_mem_get(ctx->rbtdb->common.mctx,
10248 					   sizeof(*glue));
10249 
10250 			gluename = dns_fixedname_initname(&glue->fixedname);
10251 			dns_name_copynf(name_aaaa, gluename);
10252 
10253 			dns_rdataset_init(&glue->rdataset_a);
10254 			dns_rdataset_init(&glue->sigrdataset_a);
10255 			dns_rdataset_init(&glue->rdataset_aaaa);
10256 			dns_rdataset_init(&glue->sigrdataset_aaaa);
10257 		} else {
10258 			INSIST(node_a == node_aaaa);
10259 			INSIST(dns_name_equal(name_a, name_aaaa));
10260 		}
10261 
10262 		dns_rdataset_clone(&rdataset_aaaa, &glue->rdataset_aaaa);
10263 		if (dns_rdataset_isassociated(&sigrdataset_aaaa)) {
10264 			dns_rdataset_clone(&sigrdataset_aaaa,
10265 					   &glue->sigrdataset_aaaa);
10266 		}
10267 	}
10268 
10269 	if (glue != NULL) {
10270 		glue->next = ctx->glue_list;
10271 		ctx->glue_list = glue;
10272 	}
10273 
10274 	result = ISC_R_SUCCESS;
10275 
10276 	if (dns_rdataset_isassociated(&rdataset_a)) {
10277 		rdataset_disassociate(&rdataset_a);
10278 	}
10279 	if (dns_rdataset_isassociated(&sigrdataset_a)) {
10280 		rdataset_disassociate(&sigrdataset_a);
10281 	}
10282 
10283 	if (dns_rdataset_isassociated(&rdataset_aaaa)) {
10284 		rdataset_disassociate(&rdataset_aaaa);
10285 	}
10286 	if (dns_rdataset_isassociated(&sigrdataset_aaaa)) {
10287 		rdataset_disassociate(&sigrdataset_aaaa);
10288 	}
10289 
10290 	if (node_a != NULL) {
10291 		detachnode((dns_db_t *)ctx->rbtdb, (dns_dbnode_t *)&node_a);
10292 	}
10293 	if (node_aaaa != NULL) {
10294 		detachnode((dns_db_t *)ctx->rbtdb, (dns_dbnode_t *)&node_aaaa);
10295 	}
10296 
10297 	return (result);
10298 }
10299 
10300 static isc_result_t
rdataset_addglue(dns_rdataset_t * rdataset,dns_dbversion_t * version,dns_message_t * msg)10301 rdataset_addglue(dns_rdataset_t *rdataset, dns_dbversion_t *version,
10302 		 dns_message_t *msg) {
10303 	dns_rbtdb_t *rbtdb = rdataset->private1;
10304 	dns_rbtnode_t *node = rdataset->private2;
10305 	rbtdb_version_t *rbtversion = version;
10306 	uint32_t idx;
10307 	rbtdb_glue_table_node_t *cur;
10308 	bool found = false;
10309 	bool restarted = false;
10310 	rbtdb_glue_t *ge;
10311 	rbtdb_glue_additionaldata_ctx_t ctx;
10312 	isc_result_t result;
10313 	uint64_t hash;
10314 
10315 	REQUIRE(rdataset->type == dns_rdatatype_ns);
10316 	REQUIRE(rbtdb == rbtversion->rbtdb);
10317 	REQUIRE(!IS_CACHE(rbtdb) && !IS_STUB(rbtdb));
10318 
10319 	/*
10320 	 * The glue table cache that forms a part of the DB version
10321 	 * structure is not explicitly bounded and there's no cache
10322 	 * cleaning. The zone data size itself is an implicit bound.
10323 	 *
10324 	 * The key into the glue hashtable is the node pointer. This is
10325 	 * because the glue hashtable is a property of the DB version,
10326 	 * and the glue is keyed for the ownername/NS tuple. We don't
10327 	 * bother with using an expensive dns_name_t comparison here as
10328 	 * the node pointer is a fixed value that won't change for a DB
10329 	 * version and can be compared directly.
10330 	 */
10331 	hash = isc_hash_function(&node, sizeof(node), true);
10332 
10333 restart:
10334 	/*
10335 	 * First, check if we have the additional entries already cached
10336 	 * in the glue table.
10337 	 */
10338 	RWLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_read);
10339 
10340 	idx = hash_32(hash, rbtversion->glue_table_bits);
10341 
10342 	for (cur = rbtversion->glue_table[idx]; cur != NULL; cur = cur->next) {
10343 		if (cur->node == node) {
10344 			break;
10345 		}
10346 	}
10347 
10348 	if (cur == NULL) {
10349 		goto no_glue;
10350 	}
10351 	/*
10352 	 * We found a cached result. Add it to the message and
10353 	 * return.
10354 	 */
10355 	found = true;
10356 	ge = cur->glue_list;
10357 
10358 	/*
10359 	 * (void *) -1 is a special value that means no glue is
10360 	 * present in the zone.
10361 	 */
10362 	if (ge == (void *)-1) {
10363 		if (!restarted && (rbtdb->gluecachestats != NULL)) {
10364 			isc_stats_increment(
10365 				rbtdb->gluecachestats,
10366 				dns_gluecachestatscounter_hits_absent);
10367 		}
10368 		goto no_glue;
10369 	} else {
10370 		if (!restarted && (rbtdb->gluecachestats != NULL)) {
10371 			isc_stats_increment(
10372 				rbtdb->gluecachestats,
10373 				dns_gluecachestatscounter_hits_present);
10374 		}
10375 	}
10376 
10377 	for (; ge != NULL; ge = ge->next) {
10378 		dns_name_t *name = NULL;
10379 		dns_rdataset_t *rdataset_a = NULL;
10380 		dns_rdataset_t *sigrdataset_a = NULL;
10381 		dns_rdataset_t *rdataset_aaaa = NULL;
10382 		dns_rdataset_t *sigrdataset_aaaa = NULL;
10383 		dns_name_t *gluename = dns_fixedname_name(&ge->fixedname);
10384 
10385 		result = dns_message_gettempname(msg, &name);
10386 		if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10387 			goto no_glue;
10388 		}
10389 
10390 		dns_name_copynf(gluename, name);
10391 
10392 		if (dns_rdataset_isassociated(&ge->rdataset_a)) {
10393 			result = dns_message_gettemprdataset(msg, &rdataset_a);
10394 			if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10395 				dns_message_puttempname(msg, &name);
10396 				goto no_glue;
10397 			}
10398 		}
10399 
10400 		if (dns_rdataset_isassociated(&ge->sigrdataset_a)) {
10401 			result = dns_message_gettemprdataset(msg,
10402 							     &sigrdataset_a);
10403 			if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10404 				if (rdataset_a != NULL) {
10405 					dns_message_puttemprdataset(
10406 						msg, &rdataset_a);
10407 				}
10408 				dns_message_puttempname(msg, &name);
10409 				goto no_glue;
10410 			}
10411 		}
10412 
10413 		if (dns_rdataset_isassociated(&ge->rdataset_aaaa)) {
10414 			result = dns_message_gettemprdataset(msg,
10415 							     &rdataset_aaaa);
10416 			if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10417 				dns_message_puttempname(msg, &name);
10418 				if (rdataset_a != NULL) {
10419 					dns_message_puttemprdataset(
10420 						msg, &rdataset_a);
10421 				}
10422 				if (sigrdataset_a != NULL) {
10423 					dns_message_puttemprdataset(
10424 						msg, &sigrdataset_a);
10425 				}
10426 				goto no_glue;
10427 			}
10428 		}
10429 
10430 		if (dns_rdataset_isassociated(&ge->sigrdataset_aaaa)) {
10431 			result = dns_message_gettemprdataset(msg,
10432 							     &sigrdataset_aaaa);
10433 			if (ISC_UNLIKELY(result != ISC_R_SUCCESS)) {
10434 				dns_message_puttempname(msg, &name);
10435 				if (rdataset_a != NULL) {
10436 					dns_message_puttemprdataset(
10437 						msg, &rdataset_a);
10438 				}
10439 				if (sigrdataset_a != NULL) {
10440 					dns_message_puttemprdataset(
10441 						msg, &sigrdataset_a);
10442 				}
10443 				if (rdataset_aaaa != NULL) {
10444 					dns_message_puttemprdataset(
10445 						msg, &rdataset_aaaa);
10446 				}
10447 				goto no_glue;
10448 			}
10449 		}
10450 
10451 		if (ISC_LIKELY(rdataset_a != NULL)) {
10452 			dns_rdataset_clone(&ge->rdataset_a, rdataset_a);
10453 			ISC_LIST_APPEND(name->list, rdataset_a, link);
10454 		}
10455 
10456 		if (sigrdataset_a != NULL) {
10457 			dns_rdataset_clone(&ge->sigrdataset_a, sigrdataset_a);
10458 			ISC_LIST_APPEND(name->list, sigrdataset_a, link);
10459 		}
10460 
10461 		if (rdataset_aaaa != NULL) {
10462 			dns_rdataset_clone(&ge->rdataset_aaaa, rdataset_aaaa);
10463 			ISC_LIST_APPEND(name->list, rdataset_aaaa, link);
10464 		}
10465 		if (sigrdataset_aaaa != NULL) {
10466 			dns_rdataset_clone(&ge->sigrdataset_aaaa,
10467 					   sigrdataset_aaaa);
10468 			ISC_LIST_APPEND(name->list, sigrdataset_aaaa, link);
10469 		}
10470 
10471 		dns_message_addname(msg, name, DNS_SECTION_ADDITIONAL);
10472 	}
10473 
10474 no_glue:
10475 	RWUNLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_read);
10476 
10477 	if (found) {
10478 		return (ISC_R_SUCCESS);
10479 	}
10480 
10481 	if (restarted) {
10482 		return (ISC_R_FAILURE);
10483 	}
10484 
10485 	/*
10486 	 * No cached glue was found in the table. Cache it and restart
10487 	 * this function.
10488 	 *
10489 	 * Due to the gap between the read lock and the write lock, it's
10490 	 * possible that we may cache a duplicate glue table entry, but
10491 	 * we don't care.
10492 	 */
10493 
10494 	ctx.glue_list = NULL;
10495 	ctx.rbtdb = rbtdb;
10496 	ctx.rbtversion = rbtversion;
10497 
10498 	RWLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_write);
10499 
10500 	maybe_rehash_gluetable(rbtversion);
10501 	idx = hash_32(hash, rbtversion->glue_table_bits);
10502 
10503 	(void)dns_rdataset_additionaldata(rdataset, glue_nsdname_cb, &ctx);
10504 
10505 	cur = isc_mem_get(rbtdb->common.mctx, sizeof(*cur));
10506 
10507 	/*
10508 	 * XXXMUKS: it looks like the dns_dbversion is not destroyed
10509 	 * when named is terminated by a keyboard break. This doesn't
10510 	 * cleanup the node reference and keeps the process dangling.
10511 	 */
10512 	/* isc_refcount_increment0(&node->references); */
10513 	cur->node = node;
10514 
10515 	if (ctx.glue_list == NULL) {
10516 		/*
10517 		 * No glue was found. Cache it so.
10518 		 */
10519 		cur->glue_list = (void *)-1;
10520 		if (rbtdb->gluecachestats != NULL) {
10521 			isc_stats_increment(
10522 				rbtdb->gluecachestats,
10523 				dns_gluecachestatscounter_inserts_absent);
10524 		}
10525 	} else {
10526 		cur->glue_list = ctx.glue_list;
10527 		if (rbtdb->gluecachestats != NULL) {
10528 			isc_stats_increment(
10529 				rbtdb->gluecachestats,
10530 				dns_gluecachestatscounter_inserts_present);
10531 		}
10532 	}
10533 
10534 	cur->next = rbtversion->glue_table[idx];
10535 	rbtversion->glue_table[idx] = cur;
10536 	rbtversion->glue_table_nodecount++;
10537 
10538 	RWUNLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_write);
10539 
10540 	restarted = true;
10541 	goto restart;
10542 
10543 	/* UNREACHABLE */
10544 }
10545 
10546 /*%
10547  * Routines for LRU-based cache management.
10548  */
10549 
10550 /*%
10551  * See if a given cache entry that is being reused needs to be updated
10552  * in the LRU-list.  From the LRU management point of view, this function is
10553  * expected to return true for almost all cases.  When used with threads,
10554  * however, this may cause a non-negligible performance penalty because a
10555  * writer lock will have to be acquired before updating the list.
10556  * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
10557  * function returns true if the entry has not been updated for some period of
10558  * time.  We differentiate the NS or glue address case and the others since
10559  * experiments have shown that the former tends to be accessed relatively
10560  * infrequently and the cost of cache miss is higher (e.g., a missing NS records
10561  * may cause external queries at a higher level zone, involving more
10562  * transactions).
10563  *
10564  * Caller must hold the node (read or write) lock.
10565  */
10566 static bool
need_headerupdate(rdatasetheader_t * header,isc_stdtime_t now)10567 need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now) {
10568 	if (RDATASET_ATTR_GET(header, (RDATASET_ATTR_NONEXISTENT |
10569 				       RDATASET_ATTR_ANCIENT |
10570 				       RDATASET_ATTR_ZEROTTL)) != 0)
10571 	{
10572 		return (false);
10573 	}
10574 
10575 #if DNS_RBTDB_LIMITLRUUPDATE
10576 	if (header->type == dns_rdatatype_ns ||
10577 	    (header->trust == dns_trust_glue &&
10578 	     (header->type == dns_rdatatype_a ||
10579 	      header->type == dns_rdatatype_aaaa)))
10580 	{
10581 		/*
10582 		 * Glue records are updated if at least DNS_RBTDB_LRUUPDATE_GLUE
10583 		 * seconds have passed since the previous update time.
10584 		 */
10585 		return (header->last_used + DNS_RBTDB_LRUUPDATE_GLUE <= now);
10586 	}
10587 
10588 	/*
10589 	 * Other records are updated if DNS_RBTDB_LRUUPDATE_REGULAR seconds
10590 	 * have passed.
10591 	 */
10592 	return (header->last_used + DNS_RBTDB_LRUUPDATE_REGULAR <= now);
10593 #else
10594 	UNUSED(now);
10595 
10596 	return (true);
10597 #endif /* if DNS_RBTDB_LIMITLRUUPDATE */
10598 }
10599 
10600 /*%
10601  * Update the timestamp of a given cache entry and move it to the head
10602  * of the corresponding LRU list.
10603  *
10604  * Caller must hold the node (write) lock.
10605  *
10606  * Note that the we do NOT touch the heap here, as the TTL has not changed.
10607  */
10608 static void
update_header(dns_rbtdb_t * rbtdb,rdatasetheader_t * header,isc_stdtime_t now)10609 update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now) {
10610 	INSIST(IS_CACHE(rbtdb));
10611 
10612 	/* To be checked: can we really assume this? XXXMLG */
10613 	INSIST(ISC_LINK_LINKED(header, link));
10614 
10615 	ISC_LIST_UNLINK(rbtdb->rdatasets[header->node->locknum], header, link);
10616 	header->last_used = now;
10617 	ISC_LIST_PREPEND(rbtdb->rdatasets[header->node->locknum], header, link);
10618 }
10619 
10620 static size_t
expire_lru_headers(dns_rbtdb_t * rbtdb,unsigned int locknum,size_t purgesize,bool tree_locked)10621 expire_lru_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, size_t purgesize,
10622 		   bool tree_locked) {
10623 	rdatasetheader_t *header, *header_prev;
10624 	size_t purged = 0;
10625 
10626 	for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
10627 	     header != NULL && purged <= purgesize; header = header_prev)
10628 	{
10629 		header_prev = ISC_LIST_PREV(header, link);
10630 		/*
10631 		 * Unlink the entry at this point to avoid checking it
10632 		 * again even if it's currently used someone else and
10633 		 * cannot be purged at this moment.  This entry won't be
10634 		 * referenced any more (so unlinking is safe) since the
10635 		 * TTL was reset to 0.
10636 		 */
10637 		ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header, link);
10638 		size_t header_size = rdataset_size(header);
10639 		expire_header(rbtdb, header, tree_locked, expire_lru);
10640 		purged += header_size;
10641 	}
10642 
10643 	return (purged);
10644 }
10645 
10646 /*%
10647  * Purge some stale (i.e. unused for some period - LRU based cleaning) cache
10648  * entries under the overmem condition.  To recover from this condition quickly,
10649  * we cleanup entries up to the size of newly added rdata (passed as purgesize).
10650  *
10651  * This process is triggered while adding a new entry, and we specifically avoid
10652  * purging entries in the same LRU bucket as the one to which the new entry will
10653  * belong.  Otherwise, we might purge entries of the same name of different RR
10654  * types while adding RRsets from a single response (consider the case where
10655  * we're adding A and AAAA glue records of the same NS name).
10656  */
10657 static void
overmem_purge(dns_rbtdb_t * rbtdb,unsigned int locknum_start,size_t purgesize,bool tree_locked)10658 overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, size_t purgesize,
10659 	      bool tree_locked) {
10660 	unsigned int locknum;
10661 	size_t purged = 0;
10662 
10663 	for (locknum = (locknum_start + 1) % rbtdb->node_lock_count;
10664 	     locknum != locknum_start && purged <= purgesize;
10665 	     locknum = (locknum + 1) % rbtdb->node_lock_count)
10666 	{
10667 		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
10668 			  isc_rwlocktype_write);
10669 
10670 		purged += expire_lru_headers(rbtdb, locknum, purgesize - purged,
10671 					     tree_locked);
10672 
10673 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
10674 			    isc_rwlocktype_write);
10675 	}
10676 }
10677 
10678 static void
expire_header(dns_rbtdb_t * rbtdb,rdatasetheader_t * header,bool tree_locked,expire_t reason)10679 expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked,
10680 	      expire_t reason) {
10681 	set_ttl(rbtdb, header, 0);
10682 	mark_header_ancient(rbtdb, header);
10683 
10684 	/*
10685 	 * Caller must hold the node (write) lock.
10686 	 */
10687 
10688 	if (isc_refcount_current(&header->node->references) == 0) {
10689 		/*
10690 		 * If no one else is using the node, we can clean it up now.
10691 		 * We first need to gain a new reference to the node to meet a
10692 		 * requirement of decrement_reference().
10693 		 */
10694 		new_reference(rbtdb, header->node, isc_rwlocktype_write);
10695 		decrement_reference(rbtdb, header->node, 0,
10696 				    isc_rwlocktype_write,
10697 				    tree_locked ? isc_rwlocktype_write
10698 						: isc_rwlocktype_none,
10699 				    false);
10700 
10701 		if (rbtdb->cachestats == NULL) {
10702 			return;
10703 		}
10704 
10705 		switch (reason) {
10706 		case expire_ttl:
10707 			isc_stats_increment(rbtdb->cachestats,
10708 					    dns_cachestatscounter_deletettl);
10709 			break;
10710 		case expire_lru:
10711 			isc_stats_increment(rbtdb->cachestats,
10712 					    dns_cachestatscounter_deletelru);
10713 			break;
10714 		default:
10715 			break;
10716 		}
10717 	}
10718 }
10719