xref: /netbsd-src/external/mpl/bind/dist/lib/dns/rbt-cachedb.c (revision bcda20f65a8566e103791ec395f7f499ef322704)
1 /*	$NetBSD: rbt-cachedb.c,v 1.2 2025/01/26 16:25:24 christos Exp $	*/
2 
3 /*
4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5  *
6  * SPDX-License-Identifier: MPL-2.0
7  *
8  * This Source Code Form is subject to the terms of the Mozilla Public
9  * License, v. 2.0. If a copy of the MPL was not distributed with this
10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11  *
12  * See the COPYRIGHT file distributed with this work for additional
13  * information regarding copyright ownership.
14  */
15 
16 /*! \file */
17 
18 #include <inttypes.h>
19 #include <stdbool.h>
20 #include <sys/mman.h>
21 
22 #include <isc/ascii.h>
23 #include <isc/async.h>
24 #include <isc/atomic.h>
25 #include <isc/crc64.h>
26 #include <isc/file.h>
27 #include <isc/hash.h>
28 #include <isc/hashmap.h>
29 #include <isc/heap.h>
30 #include <isc/hex.h>
31 #include <isc/loop.h>
32 #include <isc/mem.h>
33 #include <isc/mutex.h>
34 #include <isc/once.h>
35 #include <isc/random.h>
36 #include <isc/refcount.h>
37 #include <isc/result.h>
38 #include <isc/rwlock.h>
39 #include <isc/serial.h>
40 #include <isc/stdio.h>
41 #include <isc/string.h>
42 #include <isc/time.h>
43 #include <isc/urcu.h>
44 #include <isc/util.h>
45 
46 #include <dns/callbacks.h>
47 #include <dns/db.h>
48 #include <dns/dbiterator.h>
49 #include <dns/fixedname.h>
50 #include <dns/log.h>
51 #include <dns/masterdump.h>
52 #include <dns/nsec.h>
53 #include <dns/nsec3.h>
54 #include <dns/rbt.h>
55 #include <dns/rdata.h>
56 #include <dns/rdataset.h>
57 #include <dns/rdatasetiter.h>
58 #include <dns/rdataslab.h>
59 #include <dns/rdatastruct.h>
60 #include <dns/stats.h>
61 #include <dns/time.h>
62 #include <dns/view.h>
63 #include <dns/zone.h>
64 #include <dns/zonekey.h>
65 
66 #include "db_p.h"
67 #include "rbtdb_p.h"
68 
69 #define CHECK(op)                            \
70 	do {                                 \
71 		result = (op);               \
72 		if (result != ISC_R_SUCCESS) \
73 			goto failure;        \
74 	} while (0)
75 
76 /*%
77  * Whether to rate-limit updating the LRU to avoid possible thread contention.
78  * Updating LRU requires write locking, so we don't do it every time the
79  * record is touched - only after some time passes.
80  */
81 #ifndef DNS_RBTDB_LIMITLRUUPDATE
82 #define DNS_RBTDB_LIMITLRUUPDATE 1
83 #endif
84 
85 /*% Time after which we update LRU for glue records, 5 minutes */
86 #define DNS_RBTDB_LRUUPDATE_GLUE 300
87 /*% Time after which we update LRU for all other records, 10 minutes */
88 #define DNS_RBTDB_LRUUPDATE_REGULAR 600
89 
90 #define EXISTS(header)                                 \
91 	((atomic_load_acquire(&(header)->attributes) & \
92 	  DNS_SLABHEADERATTR_NONEXISTENT) == 0)
93 #define NONEXISTENT(header)                            \
94 	((atomic_load_acquire(&(header)->attributes) & \
95 	  DNS_SLABHEADERATTR_NONEXISTENT) != 0)
96 #define NXDOMAIN(header)                               \
97 	((atomic_load_acquire(&(header)->attributes) & \
98 	  DNS_SLABHEADERATTR_NXDOMAIN) != 0)
99 #define STALE(header)                                  \
100 	((atomic_load_acquire(&(header)->attributes) & \
101 	  DNS_SLABHEADERATTR_STALE) != 0)
102 #define NEGATIVE(header)                               \
103 	((atomic_load_acquire(&(header)->attributes) & \
104 	  DNS_SLABHEADERATTR_NEGATIVE) != 0)
105 #define ZEROTTL(header)                                \
106 	((atomic_load_acquire(&(header)->attributes) & \
107 	  DNS_SLABHEADERATTR_ZEROTTL) != 0)
108 #define ANCIENT(header)                                \
109 	((atomic_load_acquire(&(header)->attributes) & \
110 	  DNS_SLABHEADERATTR_ANCIENT) != 0)
111 #define STATCOUNT(header)                              \
112 	((atomic_load_acquire(&(header)->attributes) & \
113 	  DNS_SLABHEADERATTR_STATCOUNT) != 0)
114 
115 #define STALE_TTL(header, rbtdb) \
116 	(NXDOMAIN(header) ? 0 : rbtdb->common.serve_stale_ttl)
117 
118 #define ACTIVE(header, now) \
119 	(((header)->ttl > (now)) || ((header)->ttl == (now) && ZEROTTL(header)))
120 
121 #define KEEPSTALE(rbtdb) ((rbtdb)->common.serve_stale_ttl > 0)
122 
123 /*%
124  * Routines for LRU-based cache management.
125  */
126 
127 /*%
128  * See if a given cache entry that is being reused needs to be updated
129  * in the LRU-list.  From the LRU management point of view, this function is
130  * expected to return true for almost all cases.  When used with threads,
131  * however, this may cause a non-negligible performance penalty because a
132  * writer lock will have to be acquired before updating the list.
133  * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
134  * function returns true if the entry has not been updated for some period of
135  * time.  We differentiate the NS or glue address case and the others since
136  * experiments have shown that the former tends to be accessed relatively
137  * infrequently and the cost of cache miss is higher (e.g., a missing NS records
138  * may cause external queries at a higher level zone, involving more
139  * transactions).
140  *
141  * Caller must hold the node (read or write) lock.
142  */
143 static bool
144 need_headerupdate(dns_slabheader_t *header, isc_stdtime_t now) {
145 	if (DNS_SLABHEADER_GETATTR(header, (DNS_SLABHEADERATTR_NONEXISTENT |
146 					    DNS_SLABHEADERATTR_ANCIENT |
147 					    DNS_SLABHEADERATTR_ZEROTTL)) != 0)
148 	{
149 		return false;
150 	}
151 
152 #if DNS_RBTDB_LIMITLRUUPDATE
153 	if (header->type == dns_rdatatype_ns ||
154 	    (header->trust == dns_trust_glue &&
155 	     (header->type == dns_rdatatype_a ||
156 	      header->type == dns_rdatatype_aaaa)))
157 	{
158 		/*
159 		 * Glue records are updated if at least DNS_RBTDB_LRUUPDATE_GLUE
160 		 * seconds have passed since the previous update time.
161 		 */
162 		return header->last_used + DNS_RBTDB_LRUUPDATE_GLUE <= now;
163 	}
164 
165 	/*
166 	 * Other records are updated if DNS_RBTDB_LRUUPDATE_REGULAR seconds
167 	 * have passed.
168 	 */
169 	return header->last_used + DNS_RBTDB_LRUUPDATE_REGULAR <= now;
170 #else
171 	UNUSED(now);
172 
173 	return true;
174 #endif /* if DNS_RBTDB_LIMITLRUUPDATE */
175 }
176 
177 /*%
178  * Update the timestamp of a given cache entry and move it to the head
179  * of the corresponding LRU list.
180  *
181  * Caller must hold the node (write) lock.
182  *
183  * Note that the we do NOT touch the heap here, as the TTL has not changed.
184  */
185 static void
186 update_header(dns_rbtdb_t *rbtdb, dns_slabheader_t *header, isc_stdtime_t now) {
187 	INSIST(IS_CACHE(rbtdb));
188 
189 	/* To be checked: can we really assume this? XXXMLG */
190 	INSIST(ISC_LINK_LINKED(header, link));
191 
192 	ISC_LIST_UNLINK(rbtdb->lru[RBTDB_HEADERNODE(header)->locknum], header,
193 			link);
194 	header->last_used = now;
195 	ISC_LIST_PREPEND(rbtdb->lru[RBTDB_HEADERNODE(header)->locknum], header,
196 			 link);
197 }
198 
199 /*
200  * Locking
201  *
202  * If a routine is going to lock more than one lock in this module, then
203  * the locking must be done in the following order:
204  *
205  *      Tree Lock
206  *
207  *      Node Lock       (Only one from the set may be locked at one time by
208  *                       any caller)
209  *
210  *      Database Lock
211  *
212  * Failure to follow this hierarchy can result in deadlock.
213  */
214 
215 /*
216  * Deleting Nodes
217  *
218  * For zone databases the node for the origin of the zone MUST NOT be deleted.
219  */
220 
221 /*
222  * DB Routines
223  */
224 
225 static void
226 update_cachestats(dns_rbtdb_t *rbtdb, isc_result_t result) {
227 	INSIST(IS_CACHE(rbtdb));
228 
229 	if (rbtdb->cachestats == NULL) {
230 		return;
231 	}
232 
233 	switch (result) {
234 	case DNS_R_COVERINGNSEC:
235 		isc_stats_increment(rbtdb->cachestats,
236 				    dns_cachestatscounter_coveringnsec);
237 		FALLTHROUGH;
238 	case ISC_R_SUCCESS:
239 	case DNS_R_CNAME:
240 	case DNS_R_DNAME:
241 	case DNS_R_DELEGATION:
242 	case DNS_R_NCACHENXDOMAIN:
243 	case DNS_R_NCACHENXRRSET:
244 		isc_stats_increment(rbtdb->cachestats,
245 				    dns_cachestatscounter_hits);
246 		break;
247 	default:
248 		isc_stats_increment(rbtdb->cachestats,
249 				    dns_cachestatscounter_misses);
250 	}
251 }
252 
253 static void
254 clean_stale_headers(dns_slabheader_t *top) {
255 	dns_slabheader_t *d = NULL, *down_next = NULL;
256 
257 	for (d = top->down; d != NULL; d = down_next) {
258 		down_next = d->down;
259 		dns_slabheader_destroy(&d);
260 	}
261 	top->down = NULL;
262 }
263 
264 static isc_result_t
265 setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
266 		 dns_name_t *foundname, dns_rdataset_t *rdataset,
267 		 dns_rdataset_t *sigrdataset DNS__DB_FLARG) {
268 	dns_name_t *zcname = NULL;
269 	dns_typepair_t type;
270 	dns_rbtnode_t *node = NULL;
271 
272 	REQUIRE(search != NULL);
273 	REQUIRE(search->zonecut != NULL);
274 	REQUIRE(search->zonecut_header != NULL);
275 
276 	/*
277 	 * The caller MUST NOT be holding any node locks.
278 	 */
279 
280 	node = search->zonecut;
281 	type = search->zonecut_header->type;
282 
283 	/*
284 	 * If we have to set foundname, we do it before anything else.
285 	 * If we were to set foundname after we had set nodep or bound the
286 	 * rdataset, then we'd have to undo that work if dns_name_copy()
287 	 * failed.  By setting foundname first, there's nothing to undo if
288 	 * we have trouble.
289 	 */
290 	if (foundname != NULL && search->copy_name) {
291 		zcname = dns_fixedname_name(&search->zonecut_name);
292 		dns_name_copy(zcname, foundname);
293 	}
294 	if (nodep != NULL) {
295 		/*
296 		 * Note that we don't have to increment the node's reference
297 		 * count here because we're going to use the reference we
298 		 * already have in the search block.
299 		 */
300 		*nodep = node;
301 		search->need_cleanup = false;
302 	}
303 	if (rdataset != NULL) {
304 		isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
305 		NODE_RDLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
306 			    &nlocktype);
307 		dns__rbtdb_bindrdataset(search->rbtdb, node,
308 					search->zonecut_header, search->now,
309 					isc_rwlocktype_read,
310 					rdataset DNS__DB_FLARG_PASS);
311 		if (sigrdataset != NULL && search->zonecut_sigheader != NULL) {
312 			dns__rbtdb_bindrdataset(
313 				search->rbtdb, node, search->zonecut_sigheader,
314 				search->now, isc_rwlocktype_read,
315 				sigrdataset DNS__DB_FLARG_PASS);
316 		}
317 		NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
318 			    &nlocktype);
319 	}
320 
321 	if (type == dns_rdatatype_dname) {
322 		return DNS_R_DNAME;
323 	}
324 	return DNS_R_DELEGATION;
325 }
326 
327 static bool
328 check_stale_header(dns_rbtnode_t *node, dns_slabheader_t *header,
329 		   isc_rwlocktype_t *nlocktypep, isc_rwlock_t *lock,
330 		   rbtdb_search_t *search, dns_slabheader_t **header_prev) {
331 	if (!ACTIVE(header, search->now)) {
332 		dns_ttl_t stale = header->ttl +
333 				  STALE_TTL(header, search->rbtdb);
334 		/*
335 		 * If this data is in the stale window keep it and if
336 		 * DNS_DBFIND_STALEOK is not set we tell the caller to
337 		 * skip this record.  We skip the records with ZEROTTL
338 		 * (these records should not be cached anyway).
339 		 */
340 
341 		DNS_SLABHEADER_CLRATTR(header, DNS_SLABHEADERATTR_STALE_WINDOW);
342 		if (!ZEROTTL(header) && KEEPSTALE(search->rbtdb) &&
343 		    stale > search->now)
344 		{
345 			dns__rbtdb_mark(header, DNS_SLABHEADERATTR_STALE);
346 			*header_prev = header;
347 			/*
348 			 * If DNS_DBFIND_STALESTART is set then it means we
349 			 * failed to resolve the name during recursion, in
350 			 * this case we mark the time in which the refresh
351 			 * failed.
352 			 */
353 			if ((search->options & DNS_DBFIND_STALESTART) != 0) {
354 				atomic_store_release(
355 					&header->last_refresh_fail_ts,
356 					search->now);
357 			} else if ((search->options &
358 				    DNS_DBFIND_STALEENABLED) != 0 &&
359 				   search->now <
360 					   (atomic_load_acquire(
361 						    &header->last_refresh_fail_ts) +
362 					    search->rbtdb->serve_stale_refresh))
363 			{
364 				/*
365 				 * If we are within interval between last
366 				 * refresh failure time + 'stale-refresh-time',
367 				 * then don't skip this stale entry but use it
368 				 * instead.
369 				 */
370 				DNS_SLABHEADER_SETATTR(
371 					header,
372 					DNS_SLABHEADERATTR_STALE_WINDOW);
373 				return false;
374 			} else if ((search->options &
375 				    DNS_DBFIND_STALETIMEOUT) != 0)
376 			{
377 				/*
378 				 * We want stale RRset due to timeout, so we
379 				 * don't skip it.
380 				 */
381 				return false;
382 			}
383 			return (search->options & DNS_DBFIND_STALEOK) == 0;
384 		}
385 
386 		/*
387 		 * This rdataset is stale.  If no one else is using the
388 		 * node, we can clean it up right now, otherwise we mark
389 		 * it as ancient, and the node as dirty, so it will get
390 		 * cleaned up later.
391 		 */
392 		if ((header->ttl < search->now - RBTDB_VIRTUAL) &&
393 		    (*nlocktypep == isc_rwlocktype_write ||
394 		     NODE_TRYUPGRADE(lock, nlocktypep) == ISC_R_SUCCESS))
395 		{
396 			/*
397 			 * We update the node's status only when we can
398 			 * get write access; otherwise, we leave others
399 			 * to this work.  Periodical cleaning will
400 			 * eventually take the job as the last resort.
401 			 * We won't downgrade the lock, since other
402 			 * rdatasets are probably stale, too.
403 			 */
404 
405 			if (isc_refcount_current(&node->references) == 0) {
406 				/*
407 				 * header->down can be non-NULL if the
408 				 * refcount has just decremented to 0
409 				 * but dns__rbtdb_decref() has not
410 				 * performed clean_cache_node(), in
411 				 * which case we need to purge the stale
412 				 * headers first.
413 				 */
414 				clean_stale_headers(header);
415 				if (*header_prev != NULL) {
416 					(*header_prev)->next = header->next;
417 				} else {
418 					node->data = header->next;
419 				}
420 				dns_slabheader_destroy(&header);
421 			} else {
422 				dns__rbtdb_mark(header,
423 						DNS_SLABHEADERATTR_ANCIENT);
424 				RBTDB_HEADERNODE(header)->dirty = 1;
425 				*header_prev = header;
426 			}
427 		} else {
428 			*header_prev = header;
429 		}
430 		return true;
431 	}
432 	return false;
433 }
434 
435 static isc_result_t
436 cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name,
437 		       void *arg DNS__DB_FLARG) {
438 	rbtdb_search_t *search = arg;
439 	dns_slabheader_t *header = NULL;
440 	dns_slabheader_t *header_prev = NULL, *header_next = NULL;
441 	dns_slabheader_t *dname_header = NULL, *sigdname_header = NULL;
442 	isc_result_t result;
443 	isc_rwlock_t *lock = NULL;
444 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
445 
446 	REQUIRE(search->zonecut == NULL);
447 
448 	/*
449 	 * Keep compiler silent.
450 	 */
451 	UNUSED(name);
452 
453 	lock = &(search->rbtdb->node_locks[node->locknum].lock);
454 	NODE_RDLOCK(lock, &nlocktype);
455 
456 	/*
457 	 * Look for a DNAME or RRSIG DNAME rdataset.
458 	 */
459 	for (header = node->data; header != NULL; header = header_next) {
460 		header_next = header->next;
461 		if (check_stale_header(node, header, &nlocktype, lock, search,
462 				       &header_prev))
463 		{
464 			/* Do nothing. */
465 		} else if (header->type == dns_rdatatype_dname &&
466 			   EXISTS(header) && !ANCIENT(header))
467 		{
468 			dname_header = header;
469 			header_prev = header;
470 		} else if (header->type == DNS_SIGTYPE(dns_rdatatype_dname) &&
471 			   EXISTS(header) && !ANCIENT(header))
472 		{
473 			sigdname_header = header;
474 			header_prev = header;
475 		} else {
476 			header_prev = header;
477 		}
478 	}
479 
480 	if (dname_header != NULL &&
481 	    (!DNS_TRUST_PENDING(dname_header->trust) ||
482 	     (search->options & DNS_DBFIND_PENDINGOK) != 0))
483 	{
484 		/*
485 		 * We increment the reference count on node to ensure that
486 		 * search->zonecut_header will still be valid later.
487 		 */
488 		dns__rbtdb_newref(search->rbtdb, node,
489 				  nlocktype DNS__DB_FLARG_PASS);
490 		search->zonecut = node;
491 		search->zonecut_header = dname_header;
492 		search->zonecut_sigheader = sigdname_header;
493 		search->need_cleanup = true;
494 		result = DNS_R_PARTIALMATCH;
495 	} else {
496 		result = DNS_R_CONTINUE;
497 	}
498 
499 	NODE_UNLOCK(lock, &nlocktype);
500 
501 	return result;
502 }
503 
504 static isc_result_t
505 find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node,
506 		     dns_dbnode_t **nodep, dns_name_t *foundname,
507 		     dns_rdataset_t *rdataset,
508 		     dns_rdataset_t *sigrdataset DNS__DB_FLARG) {
509 	unsigned int i;
510 	isc_result_t result = ISC_R_NOTFOUND;
511 	dns_name_t name;
512 	dns_rbtdb_t *rbtdb = NULL;
513 	bool done;
514 
515 	/*
516 	 * Caller must be holding the tree lock.
517 	 */
518 
519 	rbtdb = search->rbtdb;
520 	i = search->chain.level_matches;
521 	done = false;
522 	do {
523 		dns_slabheader_t *header = NULL;
524 		dns_slabheader_t *header_prev = NULL, *header_next = NULL;
525 		dns_slabheader_t *found = NULL, *foundsig = NULL;
526 		isc_rwlock_t *lock = &rbtdb->node_locks[node->locknum].lock;
527 		isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
528 
529 		NODE_RDLOCK(lock, &nlocktype);
530 
531 		/*
532 		 * Look for NS and RRSIG NS rdatasets.
533 		 */
534 		for (header = node->data; header != NULL; header = header_next)
535 		{
536 			header_next = header->next;
537 			if (check_stale_header(node, header, &nlocktype, lock,
538 					       search, &header_prev))
539 			{
540 				/* Do nothing. */
541 			} else if (EXISTS(header) && !ANCIENT(header)) {
542 				/*
543 				 * We've found an extant rdataset.  See if
544 				 * we're interested in it.
545 				 */
546 				if (header->type == dns_rdatatype_ns) {
547 					found = header;
548 					if (foundsig != NULL) {
549 						break;
550 					}
551 				} else if (header->type ==
552 					   DNS_SIGTYPE(dns_rdatatype_ns))
553 				{
554 					foundsig = header;
555 					if (found != NULL) {
556 						break;
557 					}
558 				}
559 				header_prev = header;
560 			} else {
561 				header_prev = header;
562 			}
563 		}
564 
565 		if (found != NULL) {
566 			/*
567 			 * If we have to set foundname, we do it before
568 			 * anything else.  If we were to set foundname after
569 			 * we had set nodep or bound the rdataset, then we'd
570 			 * have to undo that work if dns_name_concatenate()
571 			 * failed.  By setting foundname first, there's
572 			 * nothing to undo if we have trouble.
573 			 */
574 			if (foundname != NULL) {
575 				dns_name_init(&name, NULL);
576 				dns_rbt_namefromnode(node, &name);
577 				dns_name_copy(&name, foundname);
578 				while (i > 0) {
579 					dns_rbtnode_t *level_node =
580 						search->chain.levels[--i];
581 					dns_name_init(&name, NULL);
582 					dns_rbt_namefromnode(level_node, &name);
583 					result = dns_name_concatenate(
584 						foundname, &name, foundname,
585 						NULL);
586 					if (result != ISC_R_SUCCESS) {
587 						if (nodep != NULL) {
588 							*nodep = NULL;
589 						}
590 						goto node_exit;
591 					}
592 				}
593 			}
594 			result = DNS_R_DELEGATION;
595 			if (nodep != NULL) {
596 				dns__rbtdb_newref(search->rbtdb, node,
597 						  nlocktype DNS__DB_FLARG_PASS);
598 				*nodep = node;
599 			}
600 			dns__rbtdb_bindrdataset(search->rbtdb, node, found,
601 						search->now, nlocktype,
602 						rdataset DNS__DB_FLARG_PASS);
603 			if (foundsig != NULL) {
604 				dns__rbtdb_bindrdataset(
605 					search->rbtdb, node, foundsig,
606 					search->now, nlocktype,
607 					sigrdataset DNS__DB_FLARG_PASS);
608 			}
609 			if (need_headerupdate(found, search->now) ||
610 			    (foundsig != NULL &&
611 			     need_headerupdate(foundsig, search->now)))
612 			{
613 				if (nlocktype != isc_rwlocktype_write) {
614 					NODE_FORCEUPGRADE(lock, &nlocktype);
615 					POST(nlocktype);
616 				}
617 				if (need_headerupdate(found, search->now)) {
618 					update_header(search->rbtdb, found,
619 						      search->now);
620 				}
621 				if (foundsig != NULL &&
622 				    need_headerupdate(foundsig, search->now))
623 				{
624 					update_header(search->rbtdb, foundsig,
625 						      search->now);
626 				}
627 			}
628 		}
629 
630 	node_exit:
631 		NODE_UNLOCK(lock, &nlocktype);
632 
633 		if (found == NULL && i > 0) {
634 			i--;
635 			node = search->chain.levels[i];
636 		} else {
637 			done = true;
638 		}
639 	} while (!done);
640 
641 	return result;
642 }
643 
644 /*
645  * Look for a potentially covering NSEC in the cache where `name`
646  * is known not to exist.  This uses the auxiliary NSEC tree to find
647  * the potential NSEC owner. If found, we update 'foundname', 'nodep',
648  * 'rdataset' and 'sigrdataset', and return DNS_R_COVERINGNSEC.
649  * Otherwise, return ISC_R_NOTFOUND.
650  */
651 static isc_result_t
652 find_coveringnsec(rbtdb_search_t *search, const dns_name_t *name,
653 		  dns_dbnode_t **nodep, isc_stdtime_t now,
654 		  dns_name_t *foundname, dns_rdataset_t *rdataset,
655 		  dns_rdataset_t *sigrdataset DNS__DB_FLARG) {
656 	dns_fixedname_t fprefix, forigin, ftarget, fixed;
657 	dns_name_t *prefix = NULL, *origin = NULL;
658 	dns_name_t *target = NULL, *fname = NULL;
659 	dns_rbtnode_t *node = NULL;
660 	dns_rbtnodechain_t chain;
661 	isc_result_t result;
662 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
663 	isc_rwlock_t *lock = NULL;
664 	dns_typepair_t matchtype, sigmatchtype;
665 	dns_slabheader_t *found = NULL, *foundsig = NULL;
666 	dns_slabheader_t *header = NULL;
667 	dns_slabheader_t *header_next = NULL, *header_prev = NULL;
668 
669 	/*
670 	 * Look for the node in the auxilary tree.
671 	 */
672 	dns_rbtnodechain_init(&chain);
673 	target = dns_fixedname_initname(&ftarget);
674 	result = dns_rbt_findnode(search->rbtdb->nsec, name, target, &node,
675 				  &chain, DNS_RBTFIND_EMPTYDATA, NULL, NULL);
676 	if (result != DNS_R_PARTIALMATCH) {
677 		dns_rbtnodechain_reset(&chain);
678 		return ISC_R_NOTFOUND;
679 	}
680 
681 	prefix = dns_fixedname_initname(&fprefix);
682 	origin = dns_fixedname_initname(&forigin);
683 	target = dns_fixedname_initname(&ftarget);
684 	fname = dns_fixedname_initname(&fixed);
685 
686 	matchtype = DNS_TYPEPAIR_VALUE(dns_rdatatype_nsec, 0);
687 	sigmatchtype = DNS_SIGTYPE(dns_rdatatype_nsec);
688 
689 	/*
690 	 * Extract predecessor from chain.
691 	 */
692 	result = dns_rbtnodechain_current(&chain, prefix, origin, NULL);
693 	dns_rbtnodechain_reset(&chain);
694 	if (result != ISC_R_SUCCESS && result != DNS_R_NEWORIGIN) {
695 		return ISC_R_NOTFOUND;
696 	}
697 
698 	result = dns_name_concatenate(prefix, origin, target, NULL);
699 	if (result != ISC_R_SUCCESS) {
700 		return ISC_R_NOTFOUND;
701 	}
702 
703 	/*
704 	 * Lookup the predecessor in the main tree.
705 	 */
706 	node = NULL;
707 	result = dns_rbt_findnode(search->rbtdb->tree, target, fname, &node,
708 				  NULL, DNS_RBTFIND_EMPTYDATA, NULL, NULL);
709 	if (result != ISC_R_SUCCESS) {
710 		return ISC_R_NOTFOUND;
711 	}
712 
713 	lock = &(search->rbtdb->node_locks[node->locknum].lock);
714 	NODE_RDLOCK(lock, &nlocktype);
715 	for (header = node->data; header != NULL; header = header_next) {
716 		header_next = header->next;
717 		if (check_stale_header(node, header, &nlocktype, lock, search,
718 				       &header_prev))
719 		{
720 			continue;
721 		}
722 		if (NONEXISTENT(header) || DNS_TYPEPAIR_TYPE(header->type) == 0)
723 		{
724 			header_prev = header;
725 			continue;
726 		}
727 		if (header->type == matchtype) {
728 			found = header;
729 			if (foundsig != NULL) {
730 				break;
731 			}
732 		} else if (header->type == sigmatchtype) {
733 			foundsig = header;
734 			if (found != NULL) {
735 				break;
736 			}
737 		}
738 		header_prev = header;
739 	}
740 	if (found != NULL) {
741 		dns__rbtdb_bindrdataset(search->rbtdb, node, found, now,
742 					nlocktype, rdataset DNS__DB_FLARG_PASS);
743 		if (foundsig != NULL) {
744 			dns__rbtdb_bindrdataset(search->rbtdb, node, foundsig,
745 						now, nlocktype,
746 						sigrdataset DNS__DB_FLARG_PASS);
747 		}
748 		dns__rbtdb_newref(search->rbtdb, node,
749 				  nlocktype DNS__DB_FLARG_PASS);
750 
751 		dns_name_copy(fname, foundname);
752 
753 		*nodep = node;
754 		result = DNS_R_COVERINGNSEC;
755 	} else {
756 		result = ISC_R_NOTFOUND;
757 	}
758 	NODE_UNLOCK(lock, &nlocktype);
759 	return result;
760 }
761 
762 static isc_result_t
763 cache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
764 	   dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
765 	   dns_dbnode_t **nodep, dns_name_t *foundname,
766 	   dns_rdataset_t *rdataset,
767 	   dns_rdataset_t *sigrdataset DNS__DB_FLARG) {
768 	dns_rbtnode_t *node = NULL;
769 	isc_result_t result;
770 	rbtdb_search_t search;
771 	bool cname_ok = true;
772 	bool found_noqname = false;
773 	bool all_negative = true;
774 	bool empty_node;
775 	isc_rwlock_t *lock = NULL;
776 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
777 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
778 	dns_slabheader_t *header = NULL;
779 	dns_slabheader_t *header_prev = NULL, *header_next = NULL;
780 	dns_slabheader_t *found = NULL, *nsheader = NULL;
781 	dns_slabheader_t *foundsig = NULL, *nssig = NULL, *cnamesig = NULL;
782 	dns_slabheader_t *update = NULL, *updatesig = NULL;
783 	dns_slabheader_t *nsecheader = NULL, *nsecsig = NULL;
784 	dns_typepair_t sigtype, negtype;
785 
786 	UNUSED(version);
787 
788 	REQUIRE(VALID_RBTDB((dns_rbtdb_t *)db));
789 	REQUIRE(version == NULL);
790 
791 	if (now == 0) {
792 		now = isc_stdtime_now();
793 	}
794 
795 	search = (rbtdb_search_t){
796 		.rbtdb = (dns_rbtdb_t *)db,
797 		.serial = 1,
798 		.options = options,
799 		.now = now,
800 	};
801 	dns_fixedname_init(&search.zonecut_name);
802 	dns_rbtnodechain_init(&search.chain);
803 
804 	TREE_RDLOCK(&search.rbtdb->tree_lock, &tlocktype);
805 
806 	/*
807 	 * Search down from the root of the tree.  If, while going down, we
808 	 * encounter a callback node, cache_zonecut_callback() will search the
809 	 * rdatasets at the zone cut for a DNAME rdataset.
810 	 */
811 	result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
812 				  &search.chain, DNS_RBTFIND_EMPTYDATA,
813 				  cache_zonecut_callback, &search);
814 
815 	if (result == DNS_R_PARTIALMATCH) {
816 		/*
817 		 * If dns_rbt_findnode discovered a covering DNAME skip
818 		 * looking for a covering NSEC.
819 		 */
820 		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0 &&
821 		    (search.zonecut_header == NULL ||
822 		     search.zonecut_header->type != dns_rdatatype_dname))
823 		{
824 			result = find_coveringnsec(
825 				&search, name, nodep, now, foundname, rdataset,
826 				sigrdataset DNS__DB_FLARG_PASS);
827 			if (result == DNS_R_COVERINGNSEC) {
828 				goto tree_exit;
829 			}
830 		}
831 		if (search.zonecut != NULL) {
832 			result = setup_delegation(
833 				&search, nodep, foundname, rdataset,
834 				sigrdataset DNS__DB_FLARG_PASS);
835 			goto tree_exit;
836 		} else {
837 		find_ns:
838 			result = find_deepest_zonecut(
839 				&search, node, nodep, foundname, rdataset,
840 				sigrdataset DNS__DB_FLARG_PASS);
841 			goto tree_exit;
842 		}
843 	} else if (result != ISC_R_SUCCESS) {
844 		goto tree_exit;
845 	}
846 
847 	/*
848 	 * Certain DNSSEC types are not subject to CNAME matching
849 	 * (RFC4035, section 2.5 and RFC3007).
850 	 *
851 	 * We don't check for RRSIG, because we don't store RRSIG records
852 	 * directly.
853 	 */
854 	if (type == dns_rdatatype_key || type == dns_rdatatype_nsec) {
855 		cname_ok = false;
856 	}
857 
858 	/*
859 	 * We now go looking for rdata...
860 	 */
861 
862 	lock = &(search.rbtdb->node_locks[node->locknum].lock);
863 	NODE_RDLOCK(lock, &nlocktype);
864 
865 	/*
866 	 * These pointers need to be reset here in case we did
867 	 * 'goto find_ns' from somewhere below.
868 	 */
869 	found = NULL;
870 	foundsig = NULL;
871 	sigtype = DNS_SIGTYPE(type);
872 	negtype = DNS_TYPEPAIR_VALUE(0, type);
873 	nsheader = NULL;
874 	nsecheader = NULL;
875 	nssig = NULL;
876 	nsecsig = NULL;
877 	cnamesig = NULL;
878 	empty_node = true;
879 	header_prev = NULL;
880 	for (header = node->data; header != NULL; header = header_next) {
881 		header_next = header->next;
882 		if (check_stale_header(node, header, &nlocktype, lock, &search,
883 				       &header_prev))
884 		{
885 			/* Do nothing. */
886 		} else if (EXISTS(header) && !ANCIENT(header)) {
887 			/*
888 			 * We now know that there is at least one active
889 			 * non-stale rdataset at this node.
890 			 */
891 			empty_node = false;
892 			if (header->noqname != NULL &&
893 			    header->trust == dns_trust_secure)
894 			{
895 				found_noqname = true;
896 			}
897 			if (!NEGATIVE(header)) {
898 				all_negative = false;
899 			}
900 
901 			/*
902 			 * If we found a type we were looking for, remember
903 			 * it.
904 			 */
905 			if (header->type == type ||
906 			    (type == dns_rdatatype_any &&
907 			     DNS_TYPEPAIR_TYPE(header->type) != 0) ||
908 			    (cname_ok && header->type == dns_rdatatype_cname))
909 			{
910 				/*
911 				 * We've found the answer.
912 				 */
913 				found = header;
914 				if (header->type == dns_rdatatype_cname &&
915 				    cname_ok)
916 				{
917 					/*
918 					 * If we've already got the
919 					 * CNAME RRSIG, use it.
920 					 */
921 					if (cnamesig != NULL) {
922 						foundsig = cnamesig;
923 					} else {
924 						sigtype = DNS_SIGTYPE(
925 							dns_rdatatype_cname);
926 					}
927 				}
928 			} else if (header->type == sigtype) {
929 				/*
930 				 * We've found the RRSIG rdataset for our
931 				 * target type.  Remember it.
932 				 */
933 				foundsig = header;
934 			} else if (header->type == RDATATYPE_NCACHEANY ||
935 				   header->type == negtype)
936 			{
937 				/*
938 				 * We've found a negative cache entry.
939 				 */
940 				found = header;
941 			} else if (header->type == dns_rdatatype_ns) {
942 				/*
943 				 * Remember a NS rdataset even if we're
944 				 * not specifically looking for it, because
945 				 * we might need it later.
946 				 */
947 				nsheader = header;
948 			} else if (header->type ==
949 				   DNS_SIGTYPE(dns_rdatatype_ns))
950 			{
951 				/*
952 				 * If we need the NS rdataset, we'll also
953 				 * need its signature.
954 				 */
955 				nssig = header;
956 			} else if (header->type == dns_rdatatype_nsec) {
957 				nsecheader = header;
958 			} else if (header->type ==
959 				   DNS_SIGTYPE(dns_rdatatype_nsec))
960 			{
961 				nsecsig = header;
962 			} else if (cname_ok &&
963 				   header->type ==
964 					   DNS_SIGTYPE(dns_rdatatype_cname))
965 			{
966 				/*
967 				 * If we get a CNAME match, we'll also need
968 				 * its signature.
969 				 */
970 				cnamesig = header;
971 			}
972 			header_prev = header;
973 		} else {
974 			header_prev = header;
975 		}
976 	}
977 
978 	if (empty_node) {
979 		/*
980 		 * We have an exact match for the name, but there are no
981 		 * extant rdatasets.  That means that this node doesn't
982 		 * meaningfully exist, and that we really have a partial match.
983 		 */
984 		NODE_UNLOCK(lock, &nlocktype);
985 		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0) {
986 			result = find_coveringnsec(
987 				&search, name, nodep, now, foundname, rdataset,
988 				sigrdataset DNS__DB_FLARG_PASS);
989 			if (result == DNS_R_COVERINGNSEC) {
990 				goto tree_exit;
991 			}
992 		}
993 		goto find_ns;
994 	}
995 
996 	/*
997 	 * If we didn't find what we were looking for...
998 	 */
999 	if (found == NULL ||
1000 	    (DNS_TRUST_ADDITIONAL(found->trust) &&
1001 	     ((options & DNS_DBFIND_ADDITIONALOK) == 0)) ||
1002 	    (found->trust == dns_trust_glue &&
1003 	     ((options & DNS_DBFIND_GLUEOK) == 0)) ||
1004 	    (DNS_TRUST_PENDING(found->trust) &&
1005 	     ((options & DNS_DBFIND_PENDINGOK) == 0)))
1006 	{
1007 		/*
1008 		 * Return covering NODATA NSEC record.
1009 		 */
1010 		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0 &&
1011 		    nsecheader != NULL)
1012 		{
1013 			if (nodep != NULL) {
1014 				dns__rbtdb_newref(search.rbtdb, node,
1015 						  nlocktype DNS__DB_FLARG_PASS);
1016 				*nodep = node;
1017 			}
1018 			dns__rbtdb_bindrdataset(search.rbtdb, node, nsecheader,
1019 						search.now, nlocktype,
1020 						rdataset DNS__DB_FLARG_PASS);
1021 			if (need_headerupdate(nsecheader, search.now)) {
1022 				update = nsecheader;
1023 			}
1024 			if (nsecsig != NULL) {
1025 				dns__rbtdb_bindrdataset(
1026 					search.rbtdb, node, nsecsig, search.now,
1027 					nlocktype,
1028 					sigrdataset DNS__DB_FLARG_PASS);
1029 				if (need_headerupdate(nsecsig, search.now)) {
1030 					updatesig = nsecsig;
1031 				}
1032 			}
1033 			result = DNS_R_COVERINGNSEC;
1034 			goto node_exit;
1035 		}
1036 
1037 		/*
1038 		 * This name was from a wild card.  Look for a covering NSEC.
1039 		 */
1040 		if (found == NULL && (found_noqname || all_negative) &&
1041 		    (search.options & DNS_DBFIND_COVERINGNSEC) != 0)
1042 		{
1043 			NODE_UNLOCK(lock, &nlocktype);
1044 			result = find_coveringnsec(
1045 				&search, name, nodep, now, foundname, rdataset,
1046 				sigrdataset DNS__DB_FLARG_PASS);
1047 			if (result == DNS_R_COVERINGNSEC) {
1048 				goto tree_exit;
1049 			}
1050 			goto find_ns;
1051 		}
1052 
1053 		/*
1054 		 * If there is an NS rdataset at this node, then this is the
1055 		 * deepest zone cut.
1056 		 */
1057 		if (nsheader != NULL) {
1058 			if (nodep != NULL) {
1059 				dns__rbtdb_newref(search.rbtdb, node,
1060 						  nlocktype DNS__DB_FLARG_PASS);
1061 				*nodep = node;
1062 			}
1063 			dns__rbtdb_bindrdataset(search.rbtdb, node, nsheader,
1064 						search.now, nlocktype,
1065 						rdataset DNS__DB_FLARG_PASS);
1066 			if (need_headerupdate(nsheader, search.now)) {
1067 				update = nsheader;
1068 			}
1069 			if (nssig != NULL) {
1070 				dns__rbtdb_bindrdataset(
1071 					search.rbtdb, node, nssig, search.now,
1072 					nlocktype,
1073 					sigrdataset DNS__DB_FLARG_PASS);
1074 				if (need_headerupdate(nssig, search.now)) {
1075 					updatesig = nssig;
1076 				}
1077 			}
1078 			result = DNS_R_DELEGATION;
1079 			goto node_exit;
1080 		}
1081 
1082 		/*
1083 		 * Go find the deepest zone cut.
1084 		 */
1085 		NODE_UNLOCK(lock, &nlocktype);
1086 		goto find_ns;
1087 	}
1088 
1089 	/*
1090 	 * We found what we were looking for, or we found a CNAME.
1091 	 */
1092 
1093 	if (nodep != NULL) {
1094 		dns__rbtdb_newref(search.rbtdb, node,
1095 				  nlocktype DNS__DB_FLARG_PASS);
1096 		*nodep = node;
1097 	}
1098 
1099 	if (NEGATIVE(found)) {
1100 		/*
1101 		 * We found a negative cache entry.
1102 		 */
1103 		if (NXDOMAIN(found)) {
1104 			result = DNS_R_NCACHENXDOMAIN;
1105 		} else {
1106 			result = DNS_R_NCACHENXRRSET;
1107 		}
1108 	} else if (type != found->type && type != dns_rdatatype_any &&
1109 		   found->type == dns_rdatatype_cname)
1110 	{
1111 		/*
1112 		 * We weren't doing an ANY query and we found a CNAME instead
1113 		 * of the type we were looking for, so we need to indicate
1114 		 * that result to the caller.
1115 		 */
1116 		result = DNS_R_CNAME;
1117 	} else {
1118 		/*
1119 		 * An ordinary successful query!
1120 		 */
1121 		result = ISC_R_SUCCESS;
1122 	}
1123 
1124 	if (type != dns_rdatatype_any || result == DNS_R_NCACHENXDOMAIN ||
1125 	    result == DNS_R_NCACHENXRRSET)
1126 	{
1127 		dns__rbtdb_bindrdataset(search.rbtdb, node, found, search.now,
1128 					nlocktype, rdataset DNS__DB_FLARG_PASS);
1129 		if (need_headerupdate(found, search.now)) {
1130 			update = found;
1131 		}
1132 		if (!NEGATIVE(found) && foundsig != NULL) {
1133 			dns__rbtdb_bindrdataset(search.rbtdb, node, foundsig,
1134 						search.now, nlocktype,
1135 						sigrdataset DNS__DB_FLARG_PASS);
1136 			if (need_headerupdate(foundsig, search.now)) {
1137 				updatesig = foundsig;
1138 			}
1139 		}
1140 	}
1141 
1142 node_exit:
1143 	if ((update != NULL || updatesig != NULL) &&
1144 	    nlocktype != isc_rwlocktype_write)
1145 	{
1146 		NODE_FORCEUPGRADE(lock, &nlocktype);
1147 		POST(nlocktype);
1148 	}
1149 	if (update != NULL && need_headerupdate(update, search.now)) {
1150 		update_header(search.rbtdb, update, search.now);
1151 	}
1152 	if (updatesig != NULL && need_headerupdate(updatesig, search.now)) {
1153 		update_header(search.rbtdb, updatesig, search.now);
1154 	}
1155 
1156 	NODE_UNLOCK(lock, &nlocktype);
1157 
1158 tree_exit:
1159 	TREE_UNLOCK(&search.rbtdb->tree_lock, &tlocktype);
1160 
1161 	/*
1162 	 * If we found a zonecut but aren't going to use it, we have to
1163 	 * let go of it.
1164 	 */
1165 	if (search.need_cleanup) {
1166 		node = search.zonecut;
1167 		INSIST(node != NULL);
1168 		lock = &(search.rbtdb->node_locks[node->locknum].lock);
1169 
1170 		NODE_RDLOCK(lock, &nlocktype);
1171 		dns__rbtdb_decref(search.rbtdb, node, 0, &nlocktype, &tlocktype,
1172 				  true, false DNS__DB_FLARG_PASS);
1173 		NODE_UNLOCK(lock, &nlocktype);
1174 		INSIST(tlocktype == isc_rwlocktype_none);
1175 	}
1176 
1177 	dns_rbtnodechain_reset(&search.chain);
1178 
1179 	update_cachestats(search.rbtdb, result);
1180 	return result;
1181 }
1182 
1183 static isc_result_t
1184 cache_findzonecut(dns_db_t *db, const dns_name_t *name, unsigned int options,
1185 		  isc_stdtime_t now, dns_dbnode_t **nodep,
1186 		  dns_name_t *foundname, dns_name_t *dcname,
1187 		  dns_rdataset_t *rdataset,
1188 		  dns_rdataset_t *sigrdataset DNS__DB_FLARG) {
1189 	dns_rbtnode_t *node = NULL;
1190 	isc_rwlock_t *lock = NULL;
1191 	isc_result_t result;
1192 	rbtdb_search_t search;
1193 	dns_slabheader_t *header = NULL;
1194 	dns_slabheader_t *header_prev = NULL, *header_next = NULL;
1195 	dns_slabheader_t *found = NULL, *foundsig = NULL;
1196 	unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA;
1197 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
1198 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
1199 	bool dcnull = (dcname == NULL);
1200 
1201 	REQUIRE(VALID_RBTDB((dns_rbtdb_t *)db));
1202 
1203 	if (now == 0) {
1204 		now = isc_stdtime_now();
1205 	}
1206 
1207 	search = (rbtdb_search_t){
1208 		.rbtdb = (dns_rbtdb_t *)db,
1209 		.serial = 1,
1210 		.options = options,
1211 		.now = now,
1212 	};
1213 	dns_fixedname_init(&search.zonecut_name);
1214 	dns_rbtnodechain_init(&search.chain);
1215 
1216 	if (dcnull) {
1217 		dcname = foundname;
1218 	}
1219 
1220 	if ((options & DNS_DBFIND_NOEXACT) != 0) {
1221 		rbtoptions |= DNS_RBTFIND_NOEXACT;
1222 	}
1223 
1224 	TREE_RDLOCK(&search.rbtdb->tree_lock, &tlocktype);
1225 
1226 	/*
1227 	 * Search down from the root of the tree.
1228 	 */
1229 	result = dns_rbt_findnode(search.rbtdb->tree, name, dcname, &node,
1230 				  &search.chain, rbtoptions, NULL, &search);
1231 
1232 	if (result == DNS_R_PARTIALMATCH) {
1233 		result = find_deepest_zonecut(&search, node, nodep, foundname,
1234 					      rdataset,
1235 					      sigrdataset DNS__DB_FLARG_PASS);
1236 		goto tree_exit;
1237 	} else if (result != ISC_R_SUCCESS) {
1238 		goto tree_exit;
1239 	} else if (!dcnull) {
1240 		dns_name_copy(dcname, foundname);
1241 	}
1242 
1243 	/*
1244 	 * We now go looking for an NS rdataset at the node.
1245 	 */
1246 
1247 	lock = &(search.rbtdb->node_locks[node->locknum].lock);
1248 	NODE_RDLOCK(lock, &nlocktype);
1249 
1250 	for (header = node->data; header != NULL; header = header_next) {
1251 		header_next = header->next;
1252 		if (check_stale_header(node, header, &nlocktype, lock, &search,
1253 				       &header_prev))
1254 		{
1255 			/*
1256 			 * The function dns_rbt_findnode found us the a matching
1257 			 * node for 'name' and stored the result in 'dcname'.
1258 			 * This is the deepest known zonecut in our database.
1259 			 * However, this node may be stale and if serve-stale
1260 			 * is not enabled (in other words 'stale-answer-enable'
1261 			 * is set to no), this node may not be used as a
1262 			 * zonecut we know about. If so, find the deepest
1263 			 * zonecut from this node up and return that instead.
1264 			 */
1265 			NODE_UNLOCK(lock, &nlocktype);
1266 			result = find_deepest_zonecut(
1267 				&search, node, nodep, foundname, rdataset,
1268 				sigrdataset DNS__DB_FLARG_PASS);
1269 			dns_name_copy(foundname, dcname);
1270 			goto tree_exit;
1271 		} else if (EXISTS(header) && !ANCIENT(header)) {
1272 			/*
1273 			 * If we found a type we were looking for, remember
1274 			 * it.
1275 			 */
1276 			if (header->type == dns_rdatatype_ns) {
1277 				/*
1278 				 * Remember a NS rdataset even if we're
1279 				 * not specifically looking for it, because
1280 				 * we might need it later.
1281 				 */
1282 				found = header;
1283 			} else if (header->type ==
1284 				   DNS_SIGTYPE(dns_rdatatype_ns))
1285 			{
1286 				/*
1287 				 * If we need the NS rdataset, we'll also
1288 				 * need its signature.
1289 				 */
1290 				foundsig = header;
1291 			}
1292 			header_prev = header;
1293 		} else {
1294 			header_prev = header;
1295 		}
1296 	}
1297 
1298 	if (found == NULL) {
1299 		/*
1300 		 * No NS records here.
1301 		 */
1302 		NODE_UNLOCK(lock, &nlocktype);
1303 		result = find_deepest_zonecut(&search, node, nodep, foundname,
1304 					      rdataset,
1305 					      sigrdataset DNS__DB_FLARG_PASS);
1306 		goto tree_exit;
1307 	}
1308 
1309 	if (nodep != NULL) {
1310 		dns__rbtdb_newref(search.rbtdb, node,
1311 				  nlocktype DNS__DB_FLARG_PASS);
1312 		*nodep = node;
1313 	}
1314 
1315 	dns__rbtdb_bindrdataset(search.rbtdb, node, found, search.now,
1316 				nlocktype, rdataset DNS__DB_FLARG_PASS);
1317 	if (foundsig != NULL) {
1318 		dns__rbtdb_bindrdataset(search.rbtdb, node, foundsig,
1319 					search.now, nlocktype,
1320 					sigrdataset DNS__DB_FLARG_PASS);
1321 	}
1322 
1323 	if (need_headerupdate(found, search.now) ||
1324 	    (foundsig != NULL && need_headerupdate(foundsig, search.now)))
1325 	{
1326 		if (nlocktype != isc_rwlocktype_write) {
1327 			NODE_FORCEUPGRADE(lock, &nlocktype);
1328 			POST(nlocktype);
1329 		}
1330 		if (need_headerupdate(found, search.now)) {
1331 			update_header(search.rbtdb, found, search.now);
1332 		}
1333 		if (foundsig != NULL && need_headerupdate(foundsig, search.now))
1334 		{
1335 			update_header(search.rbtdb, foundsig, search.now);
1336 		}
1337 	}
1338 
1339 	NODE_UNLOCK(lock, &nlocktype);
1340 
1341 tree_exit:
1342 	TREE_UNLOCK(&search.rbtdb->tree_lock, &tlocktype);
1343 
1344 	INSIST(!search.need_cleanup);
1345 
1346 	dns_rbtnodechain_reset(&search.chain);
1347 
1348 	if (result == DNS_R_DELEGATION) {
1349 		result = ISC_R_SUCCESS;
1350 	}
1351 
1352 	return result;
1353 }
1354 
1355 static isc_result_t
1356 cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
1357 		   dns_rdatatype_t type, dns_rdatatype_t covers,
1358 		   isc_stdtime_t now, dns_rdataset_t *rdataset,
1359 		   dns_rdataset_t *sigrdataset DNS__DB_FLARG) {
1360 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1361 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
1362 	dns_slabheader_t *header = NULL, *header_next = NULL;
1363 	dns_slabheader_t *found = NULL, *foundsig = NULL;
1364 	dns_typepair_t matchtype, sigmatchtype, negtype;
1365 	isc_result_t result;
1366 	isc_rwlock_t *lock = NULL;
1367 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
1368 
1369 	REQUIRE(VALID_RBTDB(rbtdb));
1370 	REQUIRE(type != dns_rdatatype_any);
1371 
1372 	UNUSED(version);
1373 
1374 	result = ISC_R_SUCCESS;
1375 
1376 	if (now == 0) {
1377 		now = isc_stdtime_now();
1378 	}
1379 
1380 	lock = &rbtdb->node_locks[rbtnode->locknum].lock;
1381 	NODE_RDLOCK(lock, &nlocktype);
1382 
1383 	matchtype = DNS_TYPEPAIR_VALUE(type, covers);
1384 	negtype = DNS_TYPEPAIR_VALUE(0, type);
1385 	if (covers == 0) {
1386 		sigmatchtype = DNS_SIGTYPE(type);
1387 	} else {
1388 		sigmatchtype = 0;
1389 	}
1390 
1391 	for (header = rbtnode->data; header != NULL; header = header_next) {
1392 		header_next = header->next;
1393 		if (!ACTIVE(header, now)) {
1394 			if ((header->ttl + STALE_TTL(header, rbtdb) <
1395 			     now - RBTDB_VIRTUAL) &&
1396 			    (nlocktype == isc_rwlocktype_write ||
1397 			     NODE_TRYUPGRADE(lock, &nlocktype) ==
1398 				     ISC_R_SUCCESS))
1399 			{
1400 				/*
1401 				 * We update the node's status only when we
1402 				 * can get write access.
1403 				 *
1404 				 * We don't check if refcurrent(rbtnode) == 0
1405 				 * and try to free like we do in cache_find(),
1406 				 * because refcurrent(rbtnode) must be
1407 				 * non-zero.  This is so because 'node' is an
1408 				 * argument to the function.
1409 				 */
1410 				dns__rbtdb_mark(header,
1411 						DNS_SLABHEADERATTR_ANCIENT);
1412 				RBTDB_HEADERNODE(header)->dirty = 1;
1413 			}
1414 		} else if (EXISTS(header) && !ANCIENT(header)) {
1415 			if (header->type == matchtype) {
1416 				found = header;
1417 			} else if (header->type == RDATATYPE_NCACHEANY ||
1418 				   header->type == negtype)
1419 			{
1420 				found = header;
1421 			} else if (header->type == sigmatchtype) {
1422 				foundsig = header;
1423 			}
1424 		}
1425 	}
1426 	if (found != NULL) {
1427 		dns__rbtdb_bindrdataset(rbtdb, rbtnode, found, now, nlocktype,
1428 					rdataset DNS__DB_FLARG_PASS);
1429 		if (!NEGATIVE(found) && foundsig != NULL) {
1430 			dns__rbtdb_bindrdataset(rbtdb, rbtnode, foundsig, now,
1431 						nlocktype,
1432 						sigrdataset DNS__DB_FLARG_PASS);
1433 		}
1434 	}
1435 
1436 	NODE_UNLOCK(lock, &nlocktype);
1437 
1438 	if (found == NULL) {
1439 		return ISC_R_NOTFOUND;
1440 	}
1441 
1442 	if (NEGATIVE(found)) {
1443 		/*
1444 		 * We found a negative cache entry.
1445 		 */
1446 		if (NXDOMAIN(found)) {
1447 			result = DNS_R_NCACHENXDOMAIN;
1448 		} else {
1449 			result = DNS_R_NCACHENXRRSET;
1450 		}
1451 	}
1452 
1453 	update_cachestats(rbtdb, result);
1454 
1455 	return result;
1456 }
1457 
1458 static size_t
1459 hashsize(dns_db_t *db) {
1460 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1461 	size_t size;
1462 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
1463 
1464 	REQUIRE(VALID_RBTDB(rbtdb));
1465 
1466 	TREE_RDLOCK(&rbtdb->tree_lock, &tlocktype);
1467 	size = dns_rbt_hashsize(rbtdb->tree);
1468 	TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
1469 
1470 	return size;
1471 }
1472 
1473 static isc_result_t
1474 setcachestats(dns_db_t *db, isc_stats_t *stats) {
1475 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1476 
1477 	REQUIRE(VALID_RBTDB(rbtdb));
1478 	REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
1479 	REQUIRE(stats != NULL);
1480 
1481 	isc_stats_attach(stats, &rbtdb->cachestats);
1482 	return ISC_R_SUCCESS;
1483 }
1484 
1485 static dns_stats_t *
1486 getrrsetstats(dns_db_t *db) {
1487 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1488 
1489 	REQUIRE(VALID_RBTDB(rbtdb));
1490 	REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
1491 
1492 	return rbtdb->rrsetstats;
1493 }
1494 
1495 static isc_result_t
1496 setservestalettl(dns_db_t *db, dns_ttl_t ttl) {
1497 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1498 
1499 	REQUIRE(VALID_RBTDB(rbtdb));
1500 	REQUIRE(IS_CACHE(rbtdb));
1501 
1502 	/* currently no bounds checking.  0 means disable. */
1503 	rbtdb->common.serve_stale_ttl = ttl;
1504 	return ISC_R_SUCCESS;
1505 }
1506 
1507 static isc_result_t
1508 getservestalettl(dns_db_t *db, dns_ttl_t *ttl) {
1509 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1510 
1511 	REQUIRE(VALID_RBTDB(rbtdb));
1512 	REQUIRE(IS_CACHE(rbtdb));
1513 
1514 	*ttl = rbtdb->common.serve_stale_ttl;
1515 	return ISC_R_SUCCESS;
1516 }
1517 
1518 static isc_result_t
1519 setservestalerefresh(dns_db_t *db, uint32_t interval) {
1520 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1521 
1522 	REQUIRE(VALID_RBTDB(rbtdb));
1523 	REQUIRE(IS_CACHE(rbtdb));
1524 
1525 	/* currently no bounds checking.  0 means disable. */
1526 	rbtdb->serve_stale_refresh = interval;
1527 	return ISC_R_SUCCESS;
1528 }
1529 
1530 static isc_result_t
1531 getservestalerefresh(dns_db_t *db, uint32_t *interval) {
1532 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1533 
1534 	REQUIRE(VALID_RBTDB(rbtdb));
1535 	REQUIRE(IS_CACHE(rbtdb));
1536 
1537 	*interval = rbtdb->serve_stale_refresh;
1538 	return ISC_R_SUCCESS;
1539 }
1540 
1541 static void
1542 expiredata(dns_db_t *db, dns_dbnode_t *node, void *data) {
1543 	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1544 	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
1545 	dns_slabheader_t *header = data;
1546 	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
1547 	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
1548 
1549 	NODE_WRLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
1550 	dns__cacherbt_expireheader(header, &tlocktype,
1551 				   dns_expire_flush DNS__DB_FILELINE);
1552 	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, &nlocktype);
1553 	INSIST(tlocktype == isc_rwlocktype_none);
1554 }
1555 
1556 dns_dbmethods_t dns__rbtdb_cachemethods = {
1557 	.destroy = dns__rbtdb_destroy,
1558 	.currentversion = dns__rbtdb_currentversion,
1559 	.newversion = dns__rbtdb_newversion,
1560 	.attachversion = dns__rbtdb_attachversion,
1561 	.closeversion = dns__rbtdb_closeversion,
1562 	.findnode = dns__rbtdb_findnode,
1563 	.find = cache_find,
1564 	.findzonecut = cache_findzonecut,
1565 	.attachnode = dns__rbtdb_attachnode,
1566 	.detachnode = dns__rbtdb_detachnode,
1567 	.createiterator = dns__rbtdb_createiterator,
1568 	.findrdataset = cache_findrdataset,
1569 	.allrdatasets = dns__rbtdb_allrdatasets,
1570 	.addrdataset = dns__rbtdb_addrdataset,
1571 	.subtractrdataset = dns__rbtdb_subtractrdataset,
1572 	.deleterdataset = dns__rbtdb_deleterdataset,
1573 	.nodecount = dns__rbtdb_nodecount,
1574 	.setloop = dns__rbtdb_setloop,
1575 	.getoriginnode = dns__rbtdb_getoriginnode,
1576 	.getrrsetstats = getrrsetstats,
1577 	.setcachestats = setcachestats,
1578 	.hashsize = hashsize,
1579 	.setservestalettl = setservestalettl,
1580 	.getservestalettl = getservestalettl,
1581 	.setservestalerefresh = setservestalerefresh,
1582 	.getservestalerefresh = getservestalerefresh,
1583 	.locknode = dns__rbtdb_locknode,
1584 	.unlocknode = dns__rbtdb_unlocknode,
1585 	.expiredata = expiredata,
1586 	.deletedata = dns__rbtdb_deletedata,
1587 	.setmaxrrperset = dns__rbtdb_setmaxrrperset,
1588 	.setmaxtypepername = dns__rbtdb_setmaxtypepername,
1589 };
1590 
1591 /*
1592  * Caller must hold the node (write) lock.
1593  */
1594 void
1595 dns__cacherbt_expireheader(dns_slabheader_t *header,
1596 			   isc_rwlocktype_t *tlocktypep,
1597 			   dns_expire_t reason DNS__DB_FLARG) {
1598 	dns__rbtdb_setttl(header, 0);
1599 	dns__rbtdb_mark(header, DNS_SLABHEADERATTR_ANCIENT);
1600 	RBTDB_HEADERNODE(header)->dirty = 1;
1601 
1602 	if (isc_refcount_current(&RBTDB_HEADERNODE(header)->references) == 0) {
1603 		isc_rwlocktype_t nlocktype = isc_rwlocktype_write;
1604 		dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)header->db;
1605 
1606 		/*
1607 		 * If no one else is using the node, we can clean it up now.
1608 		 * We first need to gain a new reference to the node to meet a
1609 		 * requirement of dns__rbtdb_decref().
1610 		 */
1611 		dns__rbtdb_newref(rbtdb, RBTDB_HEADERNODE(header),
1612 				  nlocktype DNS__DB_FLARG_PASS);
1613 		dns__rbtdb_decref(rbtdb, RBTDB_HEADERNODE(header), 0,
1614 				  &nlocktype, tlocktypep, true,
1615 				  false DNS__DB_FLARG_PASS);
1616 
1617 		if (rbtdb->cachestats == NULL) {
1618 			return;
1619 		}
1620 
1621 		switch (reason) {
1622 		case dns_expire_ttl:
1623 			isc_stats_increment(rbtdb->cachestats,
1624 					    dns_cachestatscounter_deletettl);
1625 			break;
1626 		case dns_expire_lru:
1627 			isc_stats_increment(rbtdb->cachestats,
1628 					    dns_cachestatscounter_deletelru);
1629 			break;
1630 		default:
1631 			break;
1632 		}
1633 	}
1634 }
1635 
1636 static size_t
1637 rdataset_size(dns_slabheader_t *header) {
1638 	if (!NONEXISTENT(header)) {
1639 		return dns_rdataslab_size((unsigned char *)header,
1640 					  sizeof(*header));
1641 	}
1642 
1643 	return sizeof(*header);
1644 }
1645 
1646 static size_t
1647 expire_lru_headers(dns_rbtdb_t *rbtdb, unsigned int locknum,
1648 		   isc_rwlocktype_t *tlocktypep,
1649 		   size_t purgesize DNS__DB_FLARG) {
1650 	dns_slabheader_t *header = NULL;
1651 	size_t purged = 0;
1652 
1653 	for (header = ISC_LIST_TAIL(rbtdb->lru[locknum]);
1654 	     header != NULL && header->last_used <= rbtdb->last_used &&
1655 	     purged <= purgesize;
1656 	     header = ISC_LIST_TAIL(rbtdb->lru[locknum]))
1657 	{
1658 		size_t header_size = rdataset_size(header);
1659 
1660 		/*
1661 		 * Unlink the entry at this point to avoid checking it
1662 		 * again even if it's currently used someone else and
1663 		 * cannot be purged at this moment.  This entry won't be
1664 		 * referenced any more (so unlinking is safe) since the
1665 		 * TTL will be reset to 0.
1666 		 */
1667 		ISC_LIST_UNLINK(rbtdb->lru[locknum], header, link);
1668 		dns__cacherbt_expireheader(header, tlocktypep,
1669 					   dns_expire_lru DNS__DB_FLARG_PASS);
1670 		purged += header_size;
1671 	}
1672 
1673 	return purged;
1674 }
1675 
1676 /*%
1677  * Purge some expired and/or stale (i.e. unused for some period) cache entries
1678  * due to an overmem condition.  To recover from this condition quickly,
1679  * we clean up entries up to the size of newly added rdata that triggered
1680  * the overmem; this is accessible via newheader.
1681  *
1682  * The LRU lists tails are processed in LRU order to the nearest second.
1683  *
1684  * A write lock on the tree must be held.
1685  */
1686 void
1687 dns__cacherbt_overmem(dns_rbtdb_t *rbtdb, dns_slabheader_t *newheader,
1688 		      isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
1689 	uint32_t locknum_start = rbtdb->lru_sweep++ % rbtdb->node_lock_count;
1690 	uint32_t locknum = locknum_start;
1691 	/* Size of added data, possible node and possible ENT node. */
1692 	size_t purgesize =
1693 		rdataset_size(newheader) +
1694 		2 * dns__rbtnode_getsize(RBTDB_HEADERNODE(newheader));
1695 	size_t purged = 0;
1696 	isc_stdtime_t min_last_used = 0;
1697 	size_t max_passes = 8;
1698 
1699 again:
1700 	do {
1701 		isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
1702 		NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
1703 
1704 		purged += expire_lru_headers(rbtdb, locknum, tlocktypep,
1705 					     purgesize -
1706 						     purged DNS__DB_FLARG_PASS);
1707 
1708 		/*
1709 		 * Work out the oldest remaining last_used values of the list
1710 		 * tails as we walk across the array of lru lists.
1711 		 */
1712 		dns_slabheader_t *header = ISC_LIST_TAIL(rbtdb->lru[locknum]);
1713 		if (header != NULL &&
1714 		    (min_last_used == 0 || header->last_used < min_last_used))
1715 		{
1716 			min_last_used = header->last_used;
1717 		}
1718 		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
1719 		locknum = (locknum + 1) % rbtdb->node_lock_count;
1720 	} while (locknum != locknum_start && purged <= purgesize);
1721 
1722 	/*
1723 	 * Update rbtdb->last_used if we have walked all the list tails and have
1724 	 * not freed the required amount of memory.
1725 	 */
1726 	if (purged < purgesize) {
1727 		if (min_last_used != 0) {
1728 			rbtdb->last_used = min_last_used;
1729 			if (max_passes-- > 0) {
1730 				goto again;
1731 			}
1732 		}
1733 	}
1734 }
1735