xref: /openbsd-src/usr.sbin/nsd/server.c (revision 1ad61ae0a79a724d2d3ec69e69c8e1d1ff6b53a0)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <limits.h>
15 #include <sys/socket.h>
16 #include <sys/uio.h>
17 #include <sys/wait.h>
18 
19 #include <netinet/in.h>
20 #ifdef USE_TCP_FASTOPEN
21   #include <netinet/tcp.h>
22 #endif
23 #include <arpa/inet.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <signal.h>
36 #include <netdb.h>
37 #include <poll.h>
38 #ifdef HAVE_SYS_RANDOM_H
39 #include <sys/random.h>
40 #endif
41 #ifndef SHUT_WR
42 #define SHUT_WR 1
43 #endif
44 #ifdef HAVE_MMAP
45 #include <sys/mman.h>
46 #endif /* HAVE_MMAP */
47 #ifdef HAVE_OPENSSL_RAND_H
48 #include <openssl/rand.h>
49 #endif
50 #ifdef HAVE_OPENSSL_SSL_H
51 #include <openssl/ssl.h>
52 #endif
53 #ifdef HAVE_OPENSSL_ERR_H
54 #include <openssl/err.h>
55 #endif
56 #ifdef HAVE_OPENSSL_OCSP_H
57 #include <openssl/ocsp.h>
58 #endif
59 #ifndef USE_MINI_EVENT
60 #  ifdef HAVE_EVENT_H
61 #    include <event.h>
62 #  else
63 #    include <event2/event.h>
64 #    include "event2/event_struct.h"
65 #    include "event2/event_compat.h"
66 #  endif
67 #else
68 #  include "mini_event.h"
69 #endif
70 
71 #include "axfr.h"
72 #include "namedb.h"
73 #include "netio.h"
74 #include "xfrd.h"
75 #include "xfrd-tcp.h"
76 #include "xfrd-disk.h"
77 #include "difffile.h"
78 #include "nsec3.h"
79 #include "ipc.h"
80 #include "udb.h"
81 #include "remote.h"
82 #include "lookup3.h"
83 #include "rrl.h"
84 #include "ixfr.h"
85 #ifdef USE_DNSTAP
86 #include "dnstap/dnstap_collector.h"
87 #endif
88 #include "verify.h"
89 
90 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
91 
92 #ifdef USE_DNSTAP
93 /*
94  * log_addr() - the function to print sockaddr_in/sockaddr_in6 structures content
95  * just like its done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
96  */
static void
log_addr(const char* descr,
#ifdef INET6
	struct sockaddr_storage* addr
#else
	struct sockaddr_in* addr
#endif
	)
{
	/* 64 bytes is enough for the longest textual IPv6 address
	 * (INET6_ADDRSTRLEN is 46) */
	char str_buf[64];
	/* only emitted at the highest debug verbosity */
	if(verbosity < 6)
		return;
	if(
#ifdef INET6
		addr->ss_family == AF_INET
#else
		addr->sin_family == AF_INET
#endif
		) {
		struct sockaddr_in* s = (struct sockaddr_in*)addr;
		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
#ifdef INET6
	} else {
		/* note: without INET6 compiled in, non-AF_INET addresses
		 * are silently not logged */
		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
#endif
	}
}
127 #endif /* USE_DNSTAP */
128 
129 #ifdef USE_TCP_FASTOPEN
130   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
131   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
132 #endif
133 
134 /*
135  * Data for the UDP handlers.
136  */
struct udp_handler_data
{
	struct nsd        *nsd;    /* global server state */
	struct nsd_socket *socket; /* the UDP socket served by this handler */
	struct event       event;  /* event registered for this socket */
};
143 
struct tcp_accept_handler_data {
	struct nsd        *nsd;    /* global server state */
	struct nsd_socket *socket; /* the listening TCP socket */
	int                event_added; /* nonzero when event is registered */
	struct event       event;  /* accept event for the listening socket */
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int                tls_accept;
#endif
};
154 
155 /*
156  * These globals are used to enable the TCP accept handlers
157  * when the number of TCP connection drops below the maximum
158  * number of TCP connections.
159  */
160 static size_t tcp_accept_handler_count;
161 static struct tcp_accept_handler_data *tcp_accept_handlers;
162 
163 static struct event slowaccept_event;
164 static int slowaccept;
165 
166 #ifdef HAVE_SSL
167 static unsigned char *ocspdata = NULL;
168 static long ocspdata_len = 0;
169 #endif
170 
171 #ifdef NONBLOCKING_IS_BROKEN
172 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
173    read multiple times from a socket when reported ready by select. */
174 # define NUM_RECV_PER_SELECT (1)
175 #else /* !NONBLOCKING_IS_BROKEN */
176 # define NUM_RECV_PER_SELECT (100)
177 #endif /* NONBLOCKING_IS_BROKEN */
178 
#ifndef HAVE_MMSGHDR
/* Fallback definition of struct mmsghdr for platforms without
 * recvmmsg/sendmmsg; only the layout is needed by the code below. */
struct mmsghdr {
	struct msghdr msg_hdr; /* message header for one datagram */
	unsigned int  msg_len; /* bytes transferred for this datagram */
};
#endif
185 
186 static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
187 static struct iovec iovecs[NUM_RECV_PER_SELECT];
188 static struct query *queries[NUM_RECV_PER_SELECT];
189 
190 /*
191  * Data for the TCP connection handlers.
192  *
193  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
194  * blocking the entire server on a slow TCP connection, but does make
195  * reading from and writing to the socket more complicated.
196  *
197  * Basically, whenever a read/write would block (indicated by the
198  * EAGAIN errno variable) we remember the position we were reading
199  * from/writing to and return from the TCP reading/writing event
200  * handler.  When the socket becomes readable/writable again we
201  * continue from the same position.
202  */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef USE_DNSTAP
	/* the socket of the accept socket to find proper service (local) address the socket is bound to. */
	struct nsd_socket *socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	/*
	 * TLS object.  Presumably NULL for plain TCP connections —
	 * confirm against the accept path elsewhere in this file.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.  The read/write values record which
	 * direction the handshake is waiting on; the *_event variants
	 * appear to track that the registered event was switched to
	 * that direction (see the handshake handler for details).
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
277 /* global that is the list of active tcp channels */
278 static struct tcp_handler_data *tcp_active_list = NULL;
279 
280 /*
281  * Handle incoming queries on the UDP server sockets.
282  */
283 static void handle_udp(int fd, short event, void* arg);
284 
285 /*
286  * Handle incoming connections on the TCP sockets.  These handlers
287  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
288  * connection) but are disabled when the number of current TCP
289  * connections is equal to the maximum number of TCP connections.
290  * Disabling is done by changing the handler to wait for the
291  * NETIO_EVENT_NONE type.  This is done using the function
292  * configure_tcp_accept_handlers.
293  */
294 static void handle_tcp_accept(int fd, short event, void* arg);
295 
296 /*
297  * Handle incoming queries on a TCP connection.  The TCP connections
298  * are configured to be non-blocking and the handler may be called
299  * multiple times before a complete query is received.
300  */
301 static void handle_tcp_reading(int fd, short event, void* arg);
302 
303 /*
304  * Handle outgoing responses on a TCP connection.  The TCP connections
305  * are configured to be non-blocking and the handler may be called
306  * multiple times before a complete response is sent.
307  */
308 static void handle_tcp_writing(int fd, short event, void* arg);
309 
310 #ifdef HAVE_SSL
311 /* Create SSL object and associate fd */
312 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
313 /*
314  * Handle TLS handshake. May be called multiple times if incomplete.
315  */
316 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
317 
318 /*
319  * Handle incoming queries on a TLS over TCP connection.  The TLS
320  * connections are configured to be non-blocking and the handler may
321  * be called multiple times before a complete query is received.
322  */
323 static void handle_tls_reading(int fd, short event, void* arg);
324 
325 /*
326  * Handle outgoing responses on a TLS over TCP connection.  The TLS
327  * connections are configured to be non-blocking and the handler may
328  * be called multiple times before a complete response is sent.
329  */
330 static void handle_tls_writing(int fd, short event, void* arg);
331 #endif
332 
333 /*
334  * Send all children the quit nonblocking, then close pipe.
335  */
336 static void send_children_quit(struct nsd* nsd);
337 /* same, for shutdown time, waits for child to exit to avoid restart issues */
338 static void send_children_quit_and_wait(struct nsd* nsd);
339 
340 /* set childrens flags to send NSD_STATS to them */
341 #ifdef BIND8_STATS
342 static void set_children_stats(struct nsd* nsd);
343 #endif /* BIND8_STATS */
344 
345 /*
346  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
347  */
348 static void configure_handler_event_types(short event_types);
349 
350 static uint16_t *compressed_dname_offsets = 0;
351 static uint32_t compression_table_capacity = 0;
352 static uint32_t compression_table_size = 0;
353 static domain_type* compressed_dnames[MAXRRSPP];
354 
355 #ifdef USE_TCP_FASTOPEN
356 /* Checks to see if the kernel value must be manually changed in order for
357    TCP Fast Open to support server mode */
358 static void report_tcp_fastopen_config() {
359 
360 	int tcp_fastopen_fp;
361 	uint8_t tcp_fastopen_value;
362 
363 	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
364 		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
365 	}
366 	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
367 		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
368 		close(tcp_fastopen_fp);
369 	}
370 	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
371 		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
372 		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
373 		log_msg(LOG_WARNING, "To enable TFO use the command:");
374 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
375 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
376 		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
377 		close(tcp_fastopen_fp);
378 	}
379 	close(tcp_fastopen_fp);
380 }
381 #endif
382 
383 /*
384  * Remove the specified pid from the list of child pids.  Returns -1 if
385  * the pid is not in the list, child_num otherwise.  The field is set to 0.
386  */
387 static int
388 delete_child_pid(struct nsd *nsd, pid_t pid)
389 {
390 	size_t i;
391 	for (i = 0; i < nsd->child_count; ++i) {
392 		if (nsd->children[i].pid == pid) {
393 			nsd->children[i].pid = 0;
394 			if(!nsd->children[i].need_to_exit) {
395 				if(nsd->children[i].child_fd != -1)
396 					close(nsd->children[i].child_fd);
397 				nsd->children[i].child_fd = -1;
398 				if(nsd->children[i].handler)
399 					nsd->children[i].handler->fd = -1;
400 			}
401 			return i;
402 		}
403 	}
404 	return -1;
405 }
406 
407 /*
408  * Restart child servers if necessary.
409  */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2]; /* IPC socketpair: sv[0] kept by parent, sv[1] by child */

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		/* pid <= 0 means this slot has no live child process */
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				/* parent keeps child_fd, closes the child's end */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				/* first start of this slot: create the netio
				 * handler that services IPC from the child */
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				/* OpenBSD: restrict the serve-child to the
				 * minimal promises it needs */
				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge");
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				/* child closes the parent's ends of the IPC channels */
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACH */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}
512 
513 #ifdef BIND8_STATS
514 static void set_bind8_alarm(struct nsd* nsd)
515 {
516 	/* resync so that the next alarm is on the next whole minute */
517 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
518 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
519 }
520 #endif
521 
522 /* set zone stat ids for zones initially read in */
523 static void
524 zonestatid_tree_set(struct nsd* nsd)
525 {
526 	struct radnode* n;
527 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
528 		zone_type* zone = (zone_type*)n->elem;
529 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
530 	}
531 }
532 
533 #ifdef USE_ZONE_STATS
534 void
535 server_zonestat_alloc(struct nsd* nsd)
536 {
537 	size_t num = (nsd->options->zonestatnames->count==0?1:
538 			nsd->options->zonestatnames->count);
539 	size_t sz = sizeof(struct nsdst)*num;
540 	char tmpfile[256];
541 	uint8_t z = 0;
542 
543 	/* file names */
544 	nsd->zonestatfname[0] = 0;
545 	nsd->zonestatfname[1] = 0;
546 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
547 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
548 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
549 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
550 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
551 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
552 
553 	/* file descriptors */
554 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
555 	if(nsd->zonestatfd[0] == -1) {
556 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
557 			strerror(errno));
558 		exit(1);
559 	}
560 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
561 	if(nsd->zonestatfd[0] == -1) {
562 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
563 			strerror(errno));
564 		close(nsd->zonestatfd[0]);
565 		unlink(nsd->zonestatfname[0]);
566 		exit(1);
567 	}
568 
569 #ifdef HAVE_MMAP
570 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
571 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
572 			strerror(errno));
573 		exit(1);
574 	}
575 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
576 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
577 			nsd->zonestatfname[0], strerror(errno));
578 		exit(1);
579 	}
580 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
581 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
582 			strerror(errno));
583 		exit(1);
584 	}
585 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
586 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
587 			nsd->zonestatfname[1], strerror(errno));
588 		exit(1);
589 	}
590 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
591 		MAP_SHARED, nsd->zonestatfd[0], 0);
592 	if(nsd->zonestat[0] == MAP_FAILED) {
593 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
594 		unlink(nsd->zonestatfname[0]);
595 		unlink(nsd->zonestatfname[1]);
596 		exit(1);
597 	}
598 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
599 		MAP_SHARED, nsd->zonestatfd[1], 0);
600 	if(nsd->zonestat[1] == MAP_FAILED) {
601 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
602 		unlink(nsd->zonestatfname[0]);
603 		unlink(nsd->zonestatfname[1]);
604 		exit(1);
605 	}
606 	memset(nsd->zonestat[0], 0, sz);
607 	memset(nsd->zonestat[1], 0, sz);
608 	nsd->zonestatsize[0] = num;
609 	nsd->zonestatsize[1] = num;
610 	nsd->zonestatdesired = num;
611 	nsd->zonestatsizenow = num;
612 	nsd->zonestatnow = nsd->zonestat[0];
613 #endif /* HAVE_MMAP */
614 }
615 
/* Remap the zone statistics array at index idx to the new size sz (in
 * bytes).  Uses mremap where available; otherwise syncs, unmaps and
 * maps the backing file again.  Exits on failure. */
void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !HAVE MREMAP */
	/* flush pending writes before unmapping; failures are logged
	 * but not fatal */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP */
#endif /* HAVE_MMAP */
}
644 
645 /* realloc the zonestat array for the one that is not currently in use,
646  * to match the desired new size of the array (if applicable) */
/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	/* nothing to do when the inactive array already has the
	 * desired size */
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	/* extend the backing file by writing a byte at the new end */
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}
679 
680 /* switchover to use the other array for the new children, that
681  * briefly coexist with the old children.  And we want to avoid them
682  * both writing to the same statistics arrays. */
683 void
684 server_zonestat_switch(struct nsd* nsd)
685 {
686 	if(nsd->zonestatnow == nsd->zonestat[0]) {
687 		nsd->zonestatnow = nsd->zonestat[1];
688 		nsd->zonestatsizenow = nsd->zonestatsize[1];
689 	} else {
690 		nsd->zonestatnow = nsd->zonestat[0];
691 		nsd->zonestatsizenow = nsd->zonestatsize[0];
692 	}
693 }
694 #endif /* USE_ZONE_STATS */
695 
696 static void
697 cleanup_dname_compression_tables(void *ptr)
698 {
699 	free(ptr);
700 	compressed_dname_offsets = NULL;
701 	compression_table_capacity = 0;
702 }
703 
/* (Re)allocate the dname compression offset table so it holds an offset
 * for every domain in the database plus the reserved extra numbers,
 * then zero it and set slot 0 to the query name's offset. */
static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			/* detach the old cleanup before freeing, so the
			 * region never frees a dangling pointer */
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}
726 
727 static int
728 set_cloexec(struct nsd_socket *sock)
729 {
730 	assert(sock != NULL);
731 
732 	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
733 		const char *socktype =
734 			sock->addr.ai_family == SOCK_DGRAM ? "udp" : "tcp";
735 		log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s",
736 			socktype, strerror(errno));
737 		return -1;
738 	}
739 
740 	return 1;
741 }
742 
/* Enable load-balanced port reuse so several server processes can bind
 * the same address and port.  Returns 1 on success, 0 when the option
 * is not compiled in, -1 on error. */
static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
	int on = 1;
#ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT on Linux. This is what the users want with the config
	 * option in nsd.conf; if we actually need local address and port reuse
	 * they'll also need to have SO_REUSEPORT set for them, assume it was
	 * _LB they want.
	 */
	int opt = SO_REUSEPORT_LB;
	static const char optname[] = "SO_REUSEPORT_LB";
#else /* !SO_REUSEPORT_LB */
	int opt = SO_REUSEPORT;
	static const char optname[] = "SO_REUSEPORT";
#endif /* SO_REUSEPORT_LB */

	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
		return 1;
	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
		/* ENOPROTOOPT (kernel lacks the option) is only logged at
		 * verbosity >= 3 */
		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
			optname, strerror(errno));
	}
	return -1;
#else
	(void)sock;
#endif /* SO_REUSEPORT */

	return 0;
}
775 
776 static int
777 set_reuseaddr(struct nsd_socket *sock)
778 {
779 #ifdef SO_REUSEADDR
780 	int on = 1;
781 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
782 		return 1;
783 	}
784 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
785 		strerror(errno));
786 	return -1;
787 #endif /* SO_REUSEADDR */
788 	return 0;
789 }
790 
/* Set the socket receive buffer to rcv bytes.  Prefers SO_RCVBUFFORCE
 * (Linux; privileged processes may exceed the normal limit) and falls
 * back to SO_RCVBUF.  Returns 1 on success, 0 when not permitted or
 * not compiled in, -1 on error. */
static int
set_rcvbuf(struct nsd_socket *sock, int rcv)
{
#ifdef SO_RCVBUF
#ifdef SO_RCVBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0; /* lacking privilege or memory is not a hard error */
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_RCVBUFFORCE */
	if (0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_RCVBUFFORCE */
#endif /* SO_RCVBUF */

	return 0;
}
824 
/* Set the socket send buffer to snd bytes.  Prefers SO_SNDBUFFORCE
 * (Linux; privileged processes may exceed the normal limit) and falls
 * back to SO_SNDBUF.  Returns 1 on success, 0 when not permitted or
 * not compiled in, -1 on error. */
static int
set_sndbuf(struct nsd_socket *sock, int snd)
{
#ifdef SO_SNDBUF
#ifdef SO_SNDBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0; /* lacking privilege or memory is not a hard error */
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_SNDBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_SNDBUFFORCE */
#endif /* SO_SNDBUF */

	return 0;
}
858 
859 static int
860 set_nonblock(struct nsd_socket *sock)
861 {
862 	const char *socktype =
863 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
864 
865 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
866 		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
867 			socktype, strerror(errno));
868 		return -1;
869 	}
870 
871 	return 1;
872 }
873 
874 #ifdef INET6
/* Restrict an IPv6 socket to IPv6 traffic only, so separate IPv4
 * sockets can be bound alongside it.  Returns 1 on success, 0 when
 * IPV6_V6ONLY is not compiled in, -1 on error. */
static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef IPV6_V6ONLY
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_V6ONLY */

	return 0;
}
898 #endif /* INET6 */
899 
900 #ifdef INET6
/* Keep IPv6 UDP responses within the minimum IPv6 MTU (1280) to avoid
 * fragmentation problems on the path.  Returns 1 on success, 0 when
 * neither option is compiled in, -1 on error. */
static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network. Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
	 * to the MIN MTU to get the same.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_USE_MIN_MTU || IPV6_MTU */

	return 0;
}
937 #endif /* INET6 */
938 
/* Disable path-MTU discovery on an IPv4 socket so responses are not
 * sent with DF set based on possibly forged PMTU information.  Tries
 * IP_PMTUDISC_OMIT, then IP_PMTUDISC_DONT, then IP_DONTFRAG, depending
 * on platform.  Returns 1 on success, 0 when no mechanism is compiled
 * in, -1 when all attempts fail. */
static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed if
	 * and only if the packet size exceeds the outgoing interface MTU or
	 * the packet encounters smaller MTU link in network. This mitigates
	 * DNS fragmentation attacks by preventing forged PMTU information.
	 * FreeBSD already has same semantics without setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	/* BSD-style: clear the don't-fragment flag directly */
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}
995 
996 static int
997 set_ip_freebind(struct nsd_socket *sock)
998 {
999 #ifdef IP_FREEBIND
1000 	int on = 1;
1001 	const char *socktype =
1002 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1003 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
1004 	{
1005 		return 1;
1006 	}
1007 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
1008 		socktype, strerror(errno));
1009 	return -1;
1010 #else
1011 	(void)sock;
1012 #endif /* IP_FREEBIND */
1013 
1014 	return 0;
1015 }
1016 
/* Enable non-local ("transparent") binding on the socket, using whatever
 * option this platform provides.  Returns 1 on success, -1 if setsockopt
 * fails, and 0 when no suitable socket option exists at compile time. */
static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platform manage this setting
	differently for different address families (IPv4 vs IPv6).
	This scandalous preprocessor blob below abstracts such variability
	in the way which leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
// as of 2020-01, Linux does not support this on IPv6 programmatically
#elif defined(SO_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
#elif defined(IP_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	/* No platform support at all: silence unused-parameter warning. */
	(void)sock;
#else
	/* Default the IPv6 variants to the common/IPv4 values when the
	 * platform does not distinguish address families. */
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
#	endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
	return -1;
#endif

	return 0;
}
1082 
1083 static int
1084 set_tcp_maxseg(struct nsd_socket *sock, int mss)
1085 {
1086 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
1087 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
1088 		return 1;
1089 	}
1090 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
1091 		strerror(errno));
1092 	return -1;
1093 #else
1094 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
1095 #endif
1096 	return 0;
1097 }
1098 
#ifdef USE_TCP_FASTOPEN
/* Enable TCP Fast Open (RFC 7413) on the listening socket.
 * Returns 1 on success, 0 when the kernel lacks the option
 * (ENOPROTOOPT), -1 on any other failure. */
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. Limit is
	 * a defense against IP spoofing attacks as suggested in RFC7413.
	 */
	int qlen;
	int err;

#ifdef __APPLE__
	/* macOS X implementation only supports qlen of 1 via this call. The
	 * actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	/* Save errno now: the log_msg() calls below may clobber it before
	 * the final ENOPROTOOPT check. */
	err = errno;
	if (err == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
				 "; this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(err));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity enabled for debugging
	 */
	} else if(err != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(err));
	}

	return (err == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */
1142 
static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	/* Restrict the socket to the configured network device.
	 * Returns 1 on success, -1 on failure, 0 if unsupported. */
	int rc = setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device));

	if(rc == -1) {
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}
	return 1;
#else
	(void)sock;
	return 0;
#endif
}
1161 
static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	/* Attach the socket to the configured routing table (FIB).
	 * Returns 1 on success, -1 on failure, 0 if unsupported. */
	int rc = setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
	              (const void *)&sock->fib, sizeof(sock->fib));

	if(rc == -1) {
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
		                 "SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}
	return 1;
#else
	(void)sock;
	return 0;
#endif
}
1180 
1181 static int
1182 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1183 {
1184 	int rcv = 1*1024*1024, snd = 1*1024*1024;
1185 
1186 	if(-1 == (sock->s = socket(
1187 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1188 	{
1189 #ifdef INET6
1190 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1191 		   (sock->addr.ai_family == AF_INET6) &&
1192 		   (errno == EAFNOSUPPORT))
1193 		{
1194 			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1195 				"not supported");
1196 			return 0;
1197 		}
1198 #endif
1199 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1200 		return -1;
1201 	}
1202 
1203 	set_cloexec(sock);
1204 
1205 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1206 		*reuseport_works = (set_reuseport(sock) == 1);
1207 
1208 	if(nsd->options->receive_buffer_size > 0)
1209 		rcv = nsd->options->receive_buffer_size;
1210 	if(set_rcvbuf(sock, rcv) == -1)
1211 		return -1;
1212 
1213 	if(nsd->options->send_buffer_size > 0)
1214 		snd = nsd->options->send_buffer_size;
1215 	if(set_sndbuf(sock, snd) == -1)
1216 		return -1;
1217 #ifdef INET6
1218 	if(sock->addr.ai_family == AF_INET6) {
1219 		if(set_ipv6_v6only(sock) == -1 ||
1220 		   set_ipv6_use_min_mtu(sock) == -1)
1221 			return -1;
1222 	} else
1223 #endif /* INET6 */
1224 	if(sock->addr.ai_family == AF_INET) {
1225 		if(set_ipv4_no_pmtu_disc(sock) == -1)
1226 			return -1;
1227 	}
1228 
1229 	/* Set socket to non-blocking. Otherwise, on operating systems
1230 	 * with thundering herd problems, the UDP recv could block
1231 	 * after select returns readable.
1232 	 */
1233 	set_nonblock(sock);
1234 
1235 	if(nsd->options->ip_freebind)
1236 		(void)set_ip_freebind(sock);
1237 	if(nsd->options->ip_transparent)
1238 		(void)set_ip_transparent(sock);
1239 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1240 		return -1;
1241 	if(sock->fib != -1 && set_setfib(sock) == -1)
1242 		return -1;
1243 
1244 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1245 		char buf[256];
1246 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1247 		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1248 			buf, strerror(errno));
1249 		return -1;
1250 	}
1251 
1252 	return 1;
1253 }
1254 
1255 static int
1256 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1257 {
1258 #ifdef USE_TCP_FASTOPEN
1259 	report_tcp_fastopen_config();
1260 #endif
1261 
1262 	(void)reuseport_works;
1263 
1264 	if(-1 == (sock->s = socket(
1265 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1266 	{
1267 #ifdef INET6
1268 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1269 		   (sock->addr.ai_family == AF_INET6) &&
1270 		   (errno == EAFNOSUPPORT))
1271 		{
1272 			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1273 			                     "not supported");
1274 			return 0;
1275 		}
1276 #endif /* INET6 */
1277 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1278 		return -1;
1279 	}
1280 
1281 	set_cloexec(sock);
1282 
1283 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1284 		*reuseport_works = (set_reuseport(sock) == 1);
1285 
1286 	(void)set_reuseaddr(sock);
1287 
1288 #ifdef INET6
1289 	if(sock->addr.ai_family == AF_INET6) {
1290 		if (set_ipv6_v6only(sock) == -1 ||
1291 		    set_ipv6_use_min_mtu(sock) == -1)
1292 			return -1;
1293 	}
1294 #endif
1295 
1296 	if(nsd->tcp_mss > 0)
1297 		set_tcp_maxseg(sock, nsd->tcp_mss);
1298 	/* (StevensUNP p463), if TCP listening socket is blocking, then
1299 	   it may block in accept, even if select() says readable. */
1300 	(void)set_nonblock(sock);
1301 	if(nsd->options->ip_freebind)
1302 		(void)set_ip_freebind(sock);
1303 	if(nsd->options->ip_transparent)
1304 		(void)set_ip_transparent(sock);
1305 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1306 		return -1;
1307 	if(sock->fib != -1 && set_setfib(sock) == -1)
1308 		return -1;
1309 
1310 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1311 		char buf[256];
1312 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1313 		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1314 			buf, strerror(errno));
1315 		return -1;
1316 	}
1317 
1318 #ifdef USE_TCP_FASTOPEN
1319 	(void)set_tcp_fastopen(sock);
1320 #endif
1321 
1322 	if(listen(sock->s, TCP_BACKLOG) == -1) {
1323 		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1324 		return -1;
1325 	}
1326 
1327 	return 1;
1328 }
1329 
1330 /*
1331  * Initialize the server, reuseport, create and bind the sockets.
1332  */
1333 int
1334 server_init(struct nsd *nsd)
1335 {
1336 	size_t i;
1337 	int reuseport = 1; /* Determine if REUSEPORT works. */
1338 
1339 	/* open server interface ports */
1340 	for(i = 0; i < nsd->ifs; i++) {
1341 		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1342 		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1343 		{
1344 			return -1;
1345 		}
1346 	}
1347 
1348 	if(nsd->reuseport && reuseport) {
1349 		size_t ifs = nsd->ifs * nsd->reuseport;
1350 
1351 		/* increase the size of the interface arrays, there are going
1352 		 * to be separate interface file descriptors for every server
1353 		 * instance */
1354 		region_remove_cleanup(nsd->region, free, nsd->udp);
1355 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1356 
1357 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1358 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1359 		region_add_cleanup(nsd->region, free, nsd->udp);
1360 		region_add_cleanup(nsd->region, free, nsd->tcp);
1361 		if(ifs > nsd->ifs) {
1362 			memset(&nsd->udp[nsd->ifs], 0,
1363 				(ifs-nsd->ifs)*sizeof(*nsd->udp));
1364 			memset(&nsd->tcp[nsd->ifs], 0,
1365 				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
1366 		}
1367 
1368 		for(i = nsd->ifs; i < ifs; i++) {
1369 			nsd->udp[i] = nsd->udp[i%nsd->ifs];
1370 			nsd->udp[i].s = -1;
1371 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1372 				return -1;
1373 			}
1374 			/* Turn off REUSEPORT for TCP by copying the socket
1375 			 * file descriptor.
1376 			 * This means we should not close TCP used by
1377 			 * other servers in reuseport enabled mode, in
1378 			 * server_child().
1379 			 */
1380 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1381 		}
1382 
1383 		nsd->ifs = ifs;
1384 	} else {
1385 		nsd->reuseport = 0;
1386 	}
1387 
1388 	/* open server interface ports for verifiers */
1389 	for(i = 0; i < nsd->verify_ifs; i++) {
1390 		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
1391 		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
1392 		{
1393 			return -1;
1394 		}
1395 	}
1396 
1397 	return 0;
1398 }
1399 
1400 /*
1401  * Prepare the server for take off.
1402  *
1403  */
1404 int
1405 server_prepare(struct nsd *nsd)
1406 {
1407 #ifdef RATELIMIT
1408 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
1409 #ifdef HAVE_GETRANDOM
1410 	uint32_t v;
1411 	if(getrandom(&v, sizeof(v), 0) == -1) {
1412 		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
1413 		exit(1);
1414 	}
1415 	hash_set_raninit(v);
1416 #elif defined(HAVE_ARC4RANDOM)
1417 	hash_set_raninit(arc4random());
1418 #else
1419 	uint32_t v = getpid() ^ time(NULL);
1420 	srandom((unsigned long)v);
1421 #  ifdef HAVE_SSL
1422 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1423 		hash_set_raninit(v);
1424 	else
1425 #  endif
1426 		hash_set_raninit(random());
1427 #endif
1428 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1429 		nsd->options->rrl_ratelimit,
1430 		nsd->options->rrl_whitelist_ratelimit,
1431 		nsd->options->rrl_slip,
1432 		nsd->options->rrl_ipv4_prefix_length,
1433 		nsd->options->rrl_ipv6_prefix_length);
1434 #endif /* RATELIMIT */
1435 
1436 	/* Open the database... */
1437 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
1438 		log_msg(LOG_ERR, "unable to open the database %s: %s",
1439 			nsd->dbfile, strerror(errno));
1440 		unlink(nsd->task[0]->fname);
1441 		unlink(nsd->task[1]->fname);
1442 #ifdef USE_ZONE_STATS
1443 		unlink(nsd->zonestatfname[0]);
1444 		unlink(nsd->zonestatfname[1]);
1445 #endif
1446 		xfrd_del_tempdir(nsd);
1447 		return -1;
1448 	}
1449 	/* check if zone files have been modified */
1450 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1451 	 * for all zones */
1452 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
1453 		nsd->options->database[0] == 0))
1454 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1455 	zonestatid_tree_set(nsd);
1456 
1457 	compression_table_capacity = 0;
1458 	initialize_dname_compression_tables(nsd);
1459 
1460 #ifdef	BIND8_STATS
1461 	/* Initialize times... */
1462 	time(&nsd->st.boot);
1463 	set_bind8_alarm(nsd);
1464 #endif /* BIND8_STATS */
1465 
1466 	return 0;
1467 }
1468 
1469 /*
1470  * Fork the required number of servers.
1471  */
1472 static int
1473 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1474 	int* xfrd_sock_p)
1475 {
1476 	size_t i;
1477 
1478 	/* Start all child servers initially.  */
1479 	for (i = 0; i < nsd->child_count; ++i) {
1480 		nsd->children[i].pid = 0;
1481 	}
1482 
1483 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1484 }
1485 
1486 static void
1487 server_close_socket(struct nsd_socket *sock)
1488 {
1489 	if(sock->s != -1) {
1490 		close(sock->s);
1491 		sock->s = -1;
1492 	}
1493 }
1494 
1495 void
1496 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1497 {
1498 	size_t i;
1499 
1500 	/* Close all the sockets... */
1501 	for (i = 0; i < n; ++i) {
1502 		server_close_socket(&sockets[i]);
1503 	}
1504 }
1505 
1506 /*
1507  * Close the sockets, shutdown the server and exit.
1508  * Does not return.
1509  */
1510 void
1511 server_shutdown(struct nsd *nsd)
1512 {
1513 	size_t i;
1514 
1515 	server_close_all_sockets(nsd->udp, nsd->ifs);
1516 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1517 	/* CHILD: close command channel to parent */
1518 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1519 	{
1520 		close(nsd->this_child->parent_fd);
1521 		nsd->this_child->parent_fd = -1;
1522 	}
1523 	/* SERVER: close command channels to children */
1524 	if(!nsd->this_child)
1525 	{
1526 		for(i=0; i < nsd->child_count; ++i)
1527 			if(nsd->children[i].child_fd != -1)
1528 			{
1529 				close(nsd->children[i].child_fd);
1530 				nsd->children[i].child_fd = -1;
1531 			}
1532 	}
1533 
1534 	tsig_finalize();
1535 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1536 #ifdef HAVE_SSL
1537 	if (nsd->tls_ctx)
1538 		SSL_CTX_free(nsd->tls_ctx);
1539 #endif
1540 
1541 #ifdef MEMCLEAN /* OS collects memory pages */
1542 #ifdef RATELIMIT
1543 	rrl_mmap_deinit_keep_mmap();
1544 #endif
1545 #ifdef USE_DNSTAP
1546 	dt_collector_destroy(nsd->dt_collector, nsd);
1547 #endif
1548 	udb_base_free_keep_mmap(nsd->task[0]);
1549 	udb_base_free_keep_mmap(nsd->task[1]);
1550 	namedb_free_ixfr(nsd->db);
1551 	namedb_close_udb(nsd->db); /* keeps mmap */
1552 	namedb_close(nsd->db);
1553 	nsd_options_destroy(nsd->options);
1554 	region_destroy(nsd->region);
1555 #endif
1556 	log_finalize();
1557 	exit(0);
1558 }
1559 
/* Create the two task udb files used to exchange work with xfrd and
 * allocate the xfrd IPC listener structure.  Exits the process if a
 * task file cannot be created. */
void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
		/* cannot continue without task file 0: clean up and exit */
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		/* task file 1 failed: also remove the first task file */
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1; /* no connection to xfrd yet */
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}
1602 
1603 
/* Fork off the xfrd process.  Note the direction: the original process
 * takes the 'default' branch and calls xfrd_init() (which does not
 * return), while the forked child keeps sockets[0] as xfrd_listener->fd
 * and continues as the server.  del_db recreates xfrd's taskdb (it may
 * be corrupt after a crash); reload_active is passed through to xfrd. */
void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		/* this process becomes xfrd; xfrd_init does not return */
		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* ENOTREACH */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}
1675 
1676 /** add all soainfo to taskdb */
1677 static void
1678 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1679 {
1680 	struct radnode* n;
1681 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1682 	/* add all SOA INFO to mytask */
1683 	udb_ptr_init(&task_last, taskudb);
1684 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1685 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1686 	}
1687 	udb_ptr_unlink(&task_last, taskudb);
1688 }
1689 
void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 *   parent fills one taskdb with soas, xfrd fills other with expires.
	 *   then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
	 *   expire notifications can be sent back via a normal reload later
	 *   (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			/* also reached via goto below when a shutdown signal
			 * arrives while waiting for xfrd's start signal */
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
			daemon_remote_close(nsd->rc);
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			/* ENOTREACH */
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	/* tell xfrd which pid is the reload process */
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task works (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		/* walk the linked list of tasks xfrd filled in */
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}
1780 
1781 #ifdef HAVE_SSL
1782 static void
1783 log_crypto_from_err(const char* str, unsigned long err)
1784 {
1785 	/* error:[error code]:[library name]:[function name]:[reason string] */
1786 	char buf[128];
1787 	unsigned long e;
1788 	ERR_error_string_n(err, buf, sizeof(buf));
1789 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1790 	while( (e=ERR_get_error()) ) {
1791 		ERR_error_string_n(e, buf, sizeof(buf));
1792 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1793 	}
1794 }
1795 
/* Log the most recent OpenSSL error (and the rest of the error queue)
 * prefixed with the given message. */
void
log_crypto_err(const char* str)
{
	log_crypto_from_err(str, ERR_get_error());
}
1801 
/** true if the ssl handshake error has to be squelched from the logs */
static int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	/* this is very specific, we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	/* squelch common noise: plaintext HTTP(S) probes, wrong/old TLS
	 * versions, bad client certs, and no shared cipher */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
#  ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
#  endif
#endif
		)
		return 1;
	return 0;
}
1828 
/* One-time OpenSSL library initialization: load error strings and
 * algorithms (API varies by OpenSSL version) and make sure the PRNG
 * is seeded. */
void
perform_openssl_init(void)
{
	/* init SSL library */
#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
	ERR_load_crypto_strings();
#endif
#if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
	ERR_load_SSL_strings();
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
	OpenSSL_add_all_algorithms();
#else
	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
		| OPENSSL_INIT_ADD_ALL_DIGESTS
		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
	(void)SSL_library_init();
#else
	OPENSSL_init_ssl(0, NULL);
#endif

	if(!RAND_status()) {
		/* try to seed it */
		/* weak last-resort seed derived from time and pid; only
		 * used when the PRNG has no entropy at all */
		unsigned char buf[256];
		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
		size_t i;
		v = seed;
		for(i=0; i<256/sizeof(v); i++) {
			memmove(buf+i*sizeof(v), &v, sizeof(v));
			v = v*seed + (unsigned int)i;
		}
		RAND_seed(buf, 256);
		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
	}
}
1866 
1867 static int
1868 get_ocsp(char *filename, unsigned char **ocsp)
1869 {
1870 	BIO *bio;
1871 	OCSP_RESPONSE *response;
1872 	int len = -1;
1873 	unsigned char *p, *buf;
1874 	assert(filename);
1875 
1876 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1877 		log_crypto_err("get_ocsp: BIO_new_file failed");
1878 		return -1;
1879 	}
1880 
1881 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1882 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1883 		BIO_free(bio);
1884 		return -1;
1885 	}
1886 
1887 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1888 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1889 		OCSP_RESPONSE_free(response);
1890 		BIO_free(bio);
1891 		return -1;
1892 	}
1893 
1894 	if ((buf = malloc((size_t) len)) == NULL) {
1895 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1896 		OCSP_RESPONSE_free(response);
1897 		BIO_free(bio);
1898 		return -1;
1899 	}
1900 
1901 	p = buf;
1902 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1903 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1904 		free(buf);
1905 		OCSP_RESPONSE_free(response);
1906 		BIO_free(bio);
1907 		return -1;
1908 	}
1909 
1910 	OCSP_RESPONSE_free(response);
1911 	BIO_free(bio);
1912 
1913 	*ocsp = buf;
1914 	return len;
1915 }
1916 
/* further setup ssl ctx after the keys are loaded */
/* Enables ECDHE key exchange: automatically on newer OpenSSL, or by
 * explicitly installing the P-256 curve on older versions. */
static void
listen_sslctx_setup_2(void* ctxt)
{
	SSL_CTX* ctx = (SSL_CTX*)ctxt;
	(void)ctx;
#if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
		/* ENOTREACH */
		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
	}
#elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
	if(1) {
		/* fallback for OpenSSL without ecdh_auto: pin P-256 */
		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
		if (!ecdh) {
			log_crypto_err("could not find p256, not enabling ECDHE");
		} else {
			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
			}
			EC_KEY_free (ecdh);
		}
	}
#endif
}
1943 static int
1944 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1945 {
1946 	if(ocspdata) {
1947 		unsigned char *p;
1948 		if ((p=malloc(ocspdata_len)) == NULL) {
1949 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1950 			return SSL_TLSEXT_ERR_NOACK;
1951 		}
1952 		memcpy(p, ocspdata, ocspdata_len);
1953 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1954 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1955 			free(p);
1956 			return SSL_TLSEXT_ERR_NOACK;
1957 		}
1958 		return SSL_TLSEXT_ERR_OK;
1959 	} else {
1960 		return SSL_TLSEXT_ERR_NOACK;
1961 	}
1962 }
1963 
1964 SSL_CTX*
1965 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1966 {
1967 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1968 	if(!ctx) {
1969 		log_crypto_err("could not SSL_CTX_new");
1970 		return NULL;
1971 	}
1972 	/* no SSLv2, SSLv3 because has defects */
1973 #if SSL_OP_NO_SSLv2 != 0
1974 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1975 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1976 		SSL_CTX_free(ctx);
1977 		return NULL;
1978 	}
1979 #endif
1980 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1981 		!= SSL_OP_NO_SSLv3){
1982 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1983 		SSL_CTX_free(ctx);
1984 		return 0;
1985 	}
1986 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1987 	/* if we have tls 1.1 disable 1.0 */
1988 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1989 		!= SSL_OP_NO_TLSv1){
1990 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1991 		SSL_CTX_free(ctx);
1992 		return 0;
1993 	}
1994 #endif
1995 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1996 	/* if we have tls 1.2 disable 1.1 */
1997 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1998 		!= SSL_OP_NO_TLSv1_1){
1999 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
2000 		SSL_CTX_free(ctx);
2001 		return 0;
2002 	}
2003 #endif
2004 #if defined(SSL_OP_NO_RENEGOTIATION)
2005 	/* disable client renegotiation */
2006 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
2007 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
2008 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
2009 		SSL_CTX_free(ctx);
2010 		return 0;
2011 	}
2012 #endif
2013 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2014 	/* if we detect system-wide crypto policies, use those */
2015 	if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) {
2016 		/* if we have sha256, set the cipher list to have no known vulns */
2017 		if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2018 			log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2019 	}
2020 #endif
2021 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2022 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2023 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
2024 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2025 		SSL_CTX_free(ctx);
2026 		return 0;
2027 	}
2028 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2029 	SSL_CTX_set_security_level(ctx, 0);
2030 #endif
2031 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2032 		log_msg(LOG_ERR, "error for cert file: %s", pem);
2033 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2034 		SSL_CTX_free(ctx);
2035 		return NULL;
2036 	}
2037 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2038 		log_msg(LOG_ERR, "error for private key file: %s", key);
2039 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2040 		SSL_CTX_free(ctx);
2041 		return NULL;
2042 	}
2043 	if(!SSL_CTX_check_private_key(ctx)) {
2044 		log_msg(LOG_ERR, "error for key file: %s", key);
2045 		log_crypto_err("Error in SSL_CTX check_private_key");
2046 		SSL_CTX_free(ctx);
2047 		return NULL;
2048 	}
2049 	listen_sslctx_setup_2(ctx);
2050 	if(verifypem && verifypem[0]) {
2051 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2052 			log_crypto_err("Error in SSL_CTX verify locations");
2053 			SSL_CTX_free(ctx);
2054 			return NULL;
2055 		}
2056 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2057 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2058 	}
2059 	return ctx;
2060 }
2061 
2062 SSL_CTX*
2063 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2064 {
2065 	char *key, *pem;
2066 	SSL_CTX *ctx;
2067 
2068 	key = nsd->options->tls_service_key;
2069 	pem = nsd->options->tls_service_pem;
2070 	if(!key || key[0] == 0) {
2071 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2072 		return NULL;
2073 	}
2074 	if(!pem || pem[0] == 0) {
2075 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2076 		return NULL;
2077 	}
2078 
2079 	/* NOTE:This mimics the existing code in Unbound 1.5.1 by supporting SSL but
2080 	 * raft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2*/
2081 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2082 	if(!ctx) {
2083 		log_msg(LOG_ERR, "could not setup server TLS context");
2084 		return NULL;
2085 	}
2086 	if(ocspfile && ocspfile[0]) {
2087 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2088 			log_crypto_err("Error reading OCSPfile");
2089 			SSL_CTX_free(ctx);
2090 			return NULL;
2091 		} else {
2092 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2093 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2094 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2095 				SSL_CTX_free(ctx);
2096 				return NULL;
2097 			}
2098 		}
2099 	}
2100 	return ctx;
2101 }
2102 
/* check if tcp_handler_accept_data created for TLS dedicated port */
/*
 * Return 1 when the socket address uses the configured TLS port,
 * 0 otherwise.  Handles IPv4 always, IPv6 when sockaddr_in6 exists.
 *
 * Fix: the preprocessor guard was inverted (#ifndef): the IPv6 branch
 * was only compiled on systems WITHOUT struct sockaddr_in6, so the
 * TLS port was never detected for IPv6 sockets (and the branch could
 * not even compile where the guard was true).  Use #ifdef.
 */
int
using_tls_port(struct sockaddr* addr, const char* tls_port)
{
	in_port_t port = 0;

	if (addr->sa_family == AF_INET)
		port = ((struct sockaddr_in*)addr)->sin_port;
#ifdef HAVE_STRUCT_SOCKADDR_IN6
	else
		port = ((struct sockaddr_in6*)addr)->sin6_port;
#endif /* HAVE_STRUCT_SOCKADDR_IN6 */
	if (atoi(tls_port) == ntohs(port))
		return 1;

	return 0;
}
2120 #endif
2121 
2122 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
2123 ssize_t
2124 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2125 {
2126 	uint8_t* buf = (uint8_t*) p;
2127 	ssize_t total = 0;
2128 	struct pollfd fd;
2129 	memset(&fd, 0, sizeof(fd));
2130 	fd.fd = s;
2131 	fd.events = POLLIN;
2132 
2133 	while( total < sz) {
2134 		ssize_t ret;
2135 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2136 		if(ret == -1) {
2137 			if(errno == EAGAIN)
2138 				/* blocking read */
2139 				continue;
2140 			if(errno == EINTR) {
2141 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2142 					return -1;
2143 				/* other signals can be handled later */
2144 				continue;
2145 			}
2146 			/* some error */
2147 			return -1;
2148 		}
2149 		if(ret == 0) {
2150 			/* operation timed out */
2151 			return -2;
2152 		}
2153 		ret = read(s, buf+total, sz-total);
2154 		if(ret == -1) {
2155 			if(errno == EAGAIN)
2156 				/* blocking read */
2157 				continue;
2158 			if(errno == EINTR) {
2159 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2160 					return -1;
2161 				/* other signals can be handled later */
2162 				continue;
2163 			}
2164 			/* some error */
2165 			return -1;
2166 		}
2167 		if(ret == 0) {
2168 			/* closed connection! */
2169 			return 0;
2170 		}
2171 		total += ret;
2172 	}
2173 	return total;
2174 }
2175 
/*
 * Walk the task list handed over by xfrd in the reload's task udb and
 * process each task in order, appending results after *last_task.
 * Between tasks, the command socket from the old main process is
 * polled (non-blocking); on NSD_QUIT the remaining transfer files are
 * unlinked and the process exits.
 */
static void
reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
{
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	udb_ptr t, next;
	udb_base* u = nsd->task[nsd->mytask];
	udb_ptr_init(&next, u);
	/* the list head is stored in the udb userdata; clear it so the
	 * list is detached while we consume it */
	udb_ptr_new(&t, u, udb_base_get_userdata(u));
	udb_base_set_userdata(u, 0);
	while(!udb_ptr_is_null(&t)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
		udb_rptr_zero(&TASKLIST(&t)->next, u);

		/* process task t */
		/* append results for task t and update last_task */
		task_process_in_reload(nsd, u, last_task, &t);

		/* go to next */
		udb_ptr_set_ptr(&t, u, &next);

		/* if the parent has quit, we must quit too, poll the fd for cmds */
		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
			if(cmd == NSD_QUIT) {
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
				/* sync to disk (if needed) */
				udb_base_sync(nsd->db->udb, 0);
				/* unlink files of remainder of tasks */
				while(!udb_ptr_is_null(&t)) {
					if(TASKLIST(&t)->task_type == task_apply_xfr) {
						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
					}
					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
				}
				udb_ptr_unlink(&t, u);
				udb_ptr_unlink(&next, u);
				exit(0);
			}
		}

	}
	udb_ptr_unlink(&t, u);
	udb_ptr_unlink(&next, u);
}
2221 
2222 #ifdef BIND8_STATS
2223 static void
2224 parent_send_stats(struct nsd* nsd, int cmdfd)
2225 {
2226 	size_t i;
2227 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2228 		log_msg(LOG_ERR, "could not write stats to reload");
2229 		return;
2230 	}
2231 	for(i=0; i<nsd->child_count; i++)
2232 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2233 			sizeof(stc_type))) {
2234 			log_msg(LOG_ERR, "could not write stats to reload");
2235 			return;
2236 		}
2237 }
2238 
2239 static void
2240 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2241 {
2242 	struct nsdst s;
2243 	stc_type* p;
2244 	size_t i;
2245 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2246 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2247 		log_msg(LOG_ERR, "could not read stats from oldpar");
2248 		return;
2249 	}
2250 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2251 	s.db_mem = region_get_mem(nsd->db->region);
2252 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2253 		nsd->child_count);
2254 	if(!p) return;
2255 	for(i=0; i<nsd->child_count; i++) {
2256 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2257 			sizeof(stc_type))
2258 			return;
2259 	}
2260 }
2261 #endif /* BIND8_STATS */
2262 
2263 void server_verify(struct nsd *nsd, int cmdsocket);
2264 
2265 /*
2266  * Reload the database, stop parent, re-fork children and continue.
2267  * as server_main.
2268  */
2269 static void
2270 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2271 	int cmdsocket)
2272 {
2273 	pid_t mypid;
2274 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2275 	int ret;
2276 	udb_ptr last_task;
2277 	struct sigaction old_sigchld, ign_sigchld;
2278 	struct radnode* node;
2279 	zone_type* zone;
2280 	enum soainfo_hint hint;
2281 	/* ignore SIGCHLD from the previous server_main that used this pid */
2282 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2283 	ign_sigchld.sa_handler = SIG_IGN;
2284 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2285 
2286 #ifdef HAVE_SETPROCTITLE
2287 	setproctitle("main");
2288 #endif
2289 #ifdef HAVE_CPUSET_T
2290 	if(nsd->use_cpu_affinity) {
2291 		set_cpu_affinity(nsd->cpuset);
2292 	}
2293 #endif
2294 
2295 	/* see what tasks we got from xfrd */
2296 	task_remap(nsd->task[nsd->mytask]);
2297 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2298 	udb_compact_inhibited(nsd->db->udb, 1);
2299 	reload_process_tasks(nsd, &last_task, cmdsocket);
2300 	udb_compact_inhibited(nsd->db->udb, 0);
2301 	udb_compact(nsd->db->udb);
2302 
2303 #ifndef NDEBUG
2304 	if(nsd_debug_level >= 1)
2305 		region_log_stats(nsd->db->region);
2306 #endif /* NDEBUG */
2307 	/* sync to disk (if needed) */
2308 	udb_base_sync(nsd->db->udb, 0);
2309 
2310 	initialize_dname_compression_tables(nsd);
2311 
2312 #ifdef BIND8_STATS
2313 	/* Restart dumping stats if required.  */
2314 	time(&nsd->st.boot);
2315 	set_bind8_alarm(nsd);
2316 #endif
2317 #ifdef USE_ZONE_STATS
2318 	server_zonestat_realloc(nsd); /* realloc for new children */
2319 	server_zonestat_switch(nsd);
2320 #endif
2321 
2322 	if(nsd->options->verify_enable) {
2323 #ifdef RATELIMIT
2324 		/* allocate resources for rate limiting. use a slot that is guaranteed
2325 		   not mapped to a file so no persistent data is overwritten */
2326 		rrl_init(nsd->child_count + 1);
2327 #endif
2328 
2329 		/* spin-up server and execute verifiers for each zone */
2330 		server_verify(nsd, cmdsocket);
2331 #ifdef RATELIMIT
2332 		/* deallocate rate limiting resources */
2333 		rrl_deinit(nsd->child_count + 1);
2334 #endif
2335 	}
2336 
2337 	for(node = radix_first(nsd->db->zonetree);
2338 	    node != NULL;
2339 	    node = radix_next(node))
2340 	{
2341 		zone = (zone_type *)node->elem;
2342 		if(zone->is_updated) {
2343 			if(zone->is_bad) {
2344 				nsd->mode = NSD_RELOAD_FAILED;
2345 				hint = soainfo_bad;
2346 			} else {
2347 				hint = soainfo_ok;
2348 			}
2349 			/* update(s), verified or not, possibly with subsequent
2350 			   skipped update(s). skipped update(s) are picked up
2351 			   by failed update check in xfrd */
2352 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2353 			                 zone, hint);
2354 		} else if(zone->is_skipped) {
2355 			/* corrupt or inconsistent update without preceding
2356 			   update(s), communicate soainfo_gone */
2357 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2358 			                 zone, soainfo_gone);
2359 		}
2360 		zone->is_updated = 0;
2361 		zone->is_skipped = 0;
2362 	}
2363 
2364 	if(nsd->mode == NSD_RELOAD_FAILED) {
2365 		exit(NSD_RELOAD_FAILED);
2366 	}
2367 
2368 	/* listen for the signals of failed children again */
2369 	sigaction(SIGCHLD, &old_sigchld, NULL);
2370 #ifdef USE_DNSTAP
2371 	if (nsd->dt_collector) {
2372 		int *swap_fd_send;
2373 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2374 		/* Swap fd_send with fd_swap so old serve child and new serve
2375 		 * childs will not write to the same pipe ends simultaneously */
2376 		swap_fd_send = nsd->dt_collector_fd_send;
2377 		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2378 		nsd->dt_collector_fd_swap = swap_fd_send;
2379 
2380 	}
2381 #endif
2382 	/* Start new child processes */
2383 	if (server_start_children(nsd, server_region, netio, &nsd->
2384 		xfrd_listener->fd) != 0) {
2385 		send_children_quit(nsd);
2386 		exit(1);
2387 	}
2388 
2389 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2390 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2391 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2392 		if(cmd == NSD_QUIT) {
2393 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2394 			send_children_quit(nsd);
2395 			exit(0);
2396 		}
2397 	}
2398 
2399 	/* Send quit command to parent: blocking, wait for receipt. */
2400 	do {
2401 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2402 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2403 		{
2404 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2405 				strerror(errno));
2406 		}
2407 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2408 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2409 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2410 			RELOAD_SYNC_TIMEOUT);
2411 		if(ret == -2) {
2412 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2413 		}
2414 	} while (ret == -2);
2415 	if(ret == -1) {
2416 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2417 			strerror(errno));
2418 	}
2419 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2420 	if(cmd == NSD_QUIT) {
2421 		/* small race condition possible here, parent got quit cmd. */
2422 		send_children_quit(nsd);
2423 		exit(1);
2424 	}
2425 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2426 #ifdef BIND8_STATS
2427 	reload_do_stats(cmdsocket, nsd, &last_task);
2428 #endif
2429 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2430 	task_process_sync(nsd->task[nsd->mytask]);
2431 #ifdef USE_ZONE_STATS
2432 	server_zonestat_realloc(nsd); /* realloc for next children */
2433 #endif
2434 
2435 	/* send soainfo to the xfrd process, signal it that reload is done,
2436 	 * it picks up the taskudb */
2437 	cmd = NSD_RELOAD_DONE;
2438 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2439 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2440 			strerror(errno));
2441 	}
2442 	mypid = getpid();
2443 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2444 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2445 			strerror(errno));
2446 	}
2447 
2448 	/* try to reopen file */
2449 	if (nsd->file_rotation_ok)
2450 		log_reopen(nsd->log_filename, 1);
2451 	/* exit reload, continue as new server_main */
2452 }
2453 
2454 /*
2455  * Get the mode depending on the signal hints that have been received.
2456  * Multiple signal hints can be received and will be handled in turn.
2457  */
2458 static sig_atomic_t
2459 server_signal_mode(struct nsd *nsd)
2460 {
2461 	if(nsd->signal_hint_quit) {
2462 		nsd->signal_hint_quit = 0;
2463 		return NSD_QUIT;
2464 	}
2465 	else if(nsd->signal_hint_shutdown) {
2466 		nsd->signal_hint_shutdown = 0;
2467 		return NSD_SHUTDOWN;
2468 	}
2469 	else if(nsd->signal_hint_child) {
2470 		nsd->signal_hint_child = 0;
2471 		return NSD_REAP_CHILDREN;
2472 	}
2473 	else if(nsd->signal_hint_reload) {
2474 		nsd->signal_hint_reload = 0;
2475 		return NSD_RELOAD;
2476 	}
2477 	else if(nsd->signal_hint_reload_hup) {
2478 		nsd->signal_hint_reload_hup = 0;
2479 		return NSD_RELOAD_REQ;
2480 	}
2481 	else if(nsd->signal_hint_stats) {
2482 		nsd->signal_hint_stats = 0;
2483 #ifdef BIND8_STATS
2484 		set_bind8_alarm(nsd);
2485 #endif
2486 		return NSD_STATS;
2487 	}
2488 	else if(nsd->signal_hint_statsusr) {
2489 		nsd->signal_hint_statsusr = 0;
2490 		return NSD_STATS;
2491 	}
2492 	return NSD_RUN;
2493 }
2494 
2495 /*
2496  * The main server simply waits for signals and child processes to
2497  * terminate.  Child processes are restarted as necessary.
2498  */
2499 void
2500 server_main(struct nsd *nsd)
2501 {
2502 	region_type *server_region = region_create(xalloc, free);
2503 	netio_type *netio = netio_create(server_region);
2504 	netio_handler_type reload_listener;
2505 	int reload_sockets[2] = {-1, -1};
2506 	struct timespec timeout_spec;
2507 	int status;
2508 	pid_t child_pid;
2509 	pid_t reload_pid = -1;
2510 	sig_atomic_t mode;
2511 
2512 	/* Ensure we are the main process */
2513 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2514 
2515 	/* Add listener for the XFRD process */
2516 	netio_add_handler(netio, nsd->xfrd_listener);
2517 
2518 	/* Start the child processes that handle incoming queries */
2519 	if (server_start_children(nsd, server_region, netio,
2520 		&nsd->xfrd_listener->fd) != 0) {
2521 		send_children_quit(nsd);
2522 		exit(1);
2523 	}
2524 	reload_listener.fd = -1;
2525 
2526 	/* This_child MUST be 0, because this is the parent process */
2527 	assert(nsd->this_child == 0);
2528 
2529 	/* Run the server until we get a shutdown signal */
2530 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2531 		/* Did we receive a signal that changes our mode? */
2532 		if(mode == NSD_RUN) {
2533 			nsd->mode = mode = server_signal_mode(nsd);
2534 		}
2535 
2536 		switch (mode) {
2537 		case NSD_RUN:
2538 			/* see if any child processes terminated */
2539 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2540 				int is_child = delete_child_pid(nsd, child_pid);
2541 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2542 					if(nsd->children[is_child].child_fd == -1)
2543 						nsd->children[is_child].has_exited = 1;
2544 					parent_check_all_children_exited(nsd);
2545 				} else if(is_child != -1) {
2546 					log_msg(LOG_WARNING,
2547 					       "server %d died unexpectedly with status %d, restarting",
2548 					       (int) child_pid, status);
2549 					restart_child_servers(nsd, server_region, netio,
2550 						&nsd->xfrd_listener->fd);
2551 				} else if (child_pid == reload_pid) {
2552 					sig_atomic_t cmd = NSD_RELOAD_FAILED;
2553 					pid_t mypid;
2554 					log_msg(LOG_WARNING,
2555 					       "Reload process %d failed with status %d, continuing with old database",
2556 					       (int) child_pid, status);
2557 					reload_pid = -1;
2558 					if(reload_listener.fd != -1) close(reload_listener.fd);
2559 					netio_remove_handler(netio, &reload_listener);
2560 					reload_listener.fd = -1;
2561 					reload_listener.event_types = NETIO_EVENT_NONE;
2562 					task_process_sync(nsd->task[nsd->mytask]);
2563 					/* inform xfrd reload attempt ended */
2564 					if(!write_socket(nsd->xfrd_listener->fd,
2565 						&cmd, sizeof(cmd))) {
2566 						log_msg(LOG_ERR, "problems "
2567 						  "sending SOAEND to xfrd: %s",
2568 						  strerror(errno));
2569 					}
2570 					mypid = getpid();
2571 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2572 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2573 							strerror(errno));
2574 					}
2575 #ifdef USE_DNSTAP
2576 				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2577 					log_msg(LOG_WARNING,
2578 					       "dnstap-collector %d terminated with status %d",
2579 					       (int) child_pid, status);
2580 					if(nsd->dt_collector) {
2581 						dt_collector_close(nsd->dt_collector, nsd);
2582 						dt_collector_destroy(nsd->dt_collector, nsd);
2583 						nsd->dt_collector = NULL;
2584 					}
2585 					/* Only respawn a crashed (or exited)
2586 					 * dnstap-collector when not reloading,
2587 					 * to not induce a reload during a
2588 					 * reload (which would seriously
2589 					 * disrupt nsd procedures and lead to
2590 					 * unpredictable results)!
2591 					 *
2592 					 * This will *leave* a dnstap-collector
2593 					 * process terminated, but because
2594 					 * signalling of the reload process to
2595 					 * the main process to respawn in this
2596 					 * situation will be cumbersome, and
2597 					 * because this situation is so
2598 					 * specific (and therefore hopefully
2599 					 * extremely rare or non-existing at
2600 					 * all), plus the fact that we are left
2601 					 * with a perfectly function NSD
2602 					 * (besides not logging dnstap
2603 					 * messages), I consider it acceptable
2604 					 * to leave this unresolved.
2605 					 */
2606 					if(reload_pid == -1 && nsd->options->dnstap_enable) {
2607 						nsd->dt_collector = dt_collector_create(nsd);
2608 						dt_collector_start(nsd->dt_collector, nsd);
2609 						nsd->mode = NSD_RELOAD_REQ;
2610 					}
2611 #endif
2612 				} else if(status != 0) {
2613 					/* check for status, because we get
2614 					 * the old-servermain because reload
2615 					 * is the process-parent of old-main,
2616 					 * and we get older server-processes
2617 					 * that are exiting after a reload */
2618 					log_msg(LOG_WARNING,
2619 					       "process %d terminated with status %d",
2620 					       (int) child_pid, status);
2621 				}
2622 			}
2623 			if (child_pid == -1) {
2624 				if (errno == EINTR) {
2625 					continue;
2626 				}
2627 				if (errno != ECHILD)
2628 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2629 			}
2630 			if (nsd->mode != NSD_RUN)
2631 				break;
2632 
2633 			/* timeout to collect processes. In case no sigchild happens. */
2634 			timeout_spec.tv_sec = 60;
2635 			timeout_spec.tv_nsec = 0;
2636 
2637 			/* listen on ports, timeout for collecting terminated children */
2638 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2639 				if (errno != EINTR) {
2640 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2641 				}
2642 			}
2643 			if(nsd->restart_children) {
2644 				restart_child_servers(nsd, server_region, netio,
2645 					&nsd->xfrd_listener->fd);
2646 				nsd->restart_children = 0;
2647 			}
2648 			if(nsd->reload_failed) {
2649 				sig_atomic_t cmd = NSD_RELOAD_FAILED;
2650 				pid_t mypid;
2651 				nsd->reload_failed = 0;
2652 				log_msg(LOG_WARNING,
2653 				       "Reload process %d failed, continuing with old database",
2654 				       (int) reload_pid);
2655 				reload_pid = -1;
2656 				if(reload_listener.fd != -1) close(reload_listener.fd);
2657 				netio_remove_handler(netio, &reload_listener);
2658 				reload_listener.fd = -1;
2659 				reload_listener.event_types = NETIO_EVENT_NONE;
2660 				task_process_sync(nsd->task[nsd->mytask]);
2661 				/* inform xfrd reload attempt ended */
2662 				if(!write_socket(nsd->xfrd_listener->fd,
2663 					&cmd, sizeof(cmd))) {
2664 					log_msg(LOG_ERR, "problems "
2665 					  "sending SOAEND to xfrd: %s",
2666 					  strerror(errno));
2667 				}
2668 				mypid = getpid();
2669 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2670 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2671 						strerror(errno));
2672 				}
2673 			}
2674 
2675 			break;
2676 		case NSD_RELOAD_REQ: {
2677 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2678 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2679 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2680 				"main: ipc send reload_req to xfrd"));
2681 			if(!write_socket(nsd->xfrd_listener->fd,
2682 				&cmd, sizeof(cmd))) {
2683 				log_msg(LOG_ERR, "server_main: could not send "
2684 				"reload_req to xfrd: %s", strerror(errno));
2685 			}
2686 			nsd->mode = NSD_RUN;
2687 			} break;
2688 		case NSD_RELOAD:
2689 			/* Continue to run nsd after reload */
2690 			nsd->mode = NSD_RUN;
2691 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2692 			if (reload_pid != -1) {
2693 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2694 				       (int) reload_pid);
2695 				break;
2696 			}
2697 
2698 			/* switch the mytask to keep track of who owns task*/
2699 			nsd->mytask = 1 - nsd->mytask;
2700 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2701 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2702 				reload_pid = -1;
2703 				break;
2704 			}
2705 
2706 			/* Do actual reload */
2707 			reload_pid = fork();
2708 			switch (reload_pid) {
2709 			case -1:
2710 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2711 				break;
2712 			default:
2713 				/* PARENT */
2714 				close(reload_sockets[0]);
2715 				server_reload(nsd, server_region, netio,
2716 					reload_sockets[1]);
2717 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2718 				close(reload_sockets[1]);
2719 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2720 				/* drop stale xfrd ipc data */
2721 				((struct ipc_handler_conn_data*)nsd->
2722 					xfrd_listener->user_data)
2723 					->conn->is_reading = 0;
2724 				reload_pid = -1;
2725 				reload_listener.fd = -1;
2726 				reload_listener.event_types = NETIO_EVENT_NONE;
2727 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2728 				break;
2729 			case 0:
2730 				/* CHILD */
2731 				/* server_main keep running until NSD_QUIT_SYNC
2732 				 * received from reload. */
2733 				close(reload_sockets[1]);
2734 				reload_listener.fd = reload_sockets[0];
2735 				reload_listener.timeout = NULL;
2736 				reload_listener.user_data = nsd;
2737 				reload_listener.event_types = NETIO_EVENT_READ;
2738 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2739 				netio_add_handler(netio, &reload_listener);
2740 				reload_pid = getppid();
2741 				break;
2742 			}
2743 			break;
2744 		case NSD_QUIT_SYNC:
2745 			/* synchronisation of xfrd, parent and reload */
2746 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2747 				sig_atomic_t cmd = NSD_RELOAD;
2748 				/* stop xfrd ipc writes in progress */
2749 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2750 					"main: ipc send indication reload"));
2751 				if(!write_socket(nsd->xfrd_listener->fd,
2752 					&cmd, sizeof(cmd))) {
2753 					log_msg(LOG_ERR, "server_main: could not send reload "
2754 					"indication to xfrd: %s", strerror(errno));
2755 				}
2756 				/* wait for ACK from xfrd */
2757 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2758 				nsd->quit_sync_done = 1;
2759 			}
2760 			nsd->mode = NSD_RUN;
2761 			break;
2762 		case NSD_QUIT:
2763 			/* silent shutdown during reload */
2764 			if(reload_listener.fd != -1) {
2765 				/* acknowledge the quit, to sync reload that we will really quit now */
2766 				sig_atomic_t cmd = NSD_RELOAD;
2767 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2768 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2769 					log_msg(LOG_ERR, "server_main: "
2770 						"could not ack quit: %s", strerror(errno));
2771 				}
2772 #ifdef BIND8_STATS
2773 				parent_send_stats(nsd, reload_listener.fd);
2774 #endif /* BIND8_STATS */
2775 				close(reload_listener.fd);
2776 			}
2777 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2778 			/* only quit children after xfrd has acked */
2779 			send_children_quit(nsd);
2780 
2781 #ifdef MEMCLEAN /* OS collects memory pages */
2782 			region_destroy(server_region);
2783 #endif
2784 			server_shutdown(nsd);
2785 
2786 			/* ENOTREACH */
2787 			break;
2788 		case NSD_SHUTDOWN:
2789 			break;
2790 		case NSD_REAP_CHILDREN:
2791 			/* continue; wait for child in run loop */
2792 			nsd->mode = NSD_RUN;
2793 			break;
2794 		case NSD_STATS:
2795 #ifdef BIND8_STATS
2796 			set_children_stats(nsd);
2797 #endif
2798 			nsd->mode = NSD_RUN;
2799 			break;
2800 		default:
2801 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2802 			nsd->mode = NSD_RUN;
2803 			break;
2804 		}
2805 	}
2806 	log_msg(LOG_WARNING, "signal received, shutting down...");
2807 
2808 	/* close opened ports to avoid race with restart of nsd */
2809 	server_close_all_sockets(nsd->udp, nsd->ifs);
2810 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2811 	daemon_remote_close(nsd->rc);
2812 	send_children_quit_and_wait(nsd);
2813 
2814 	/* Unlink it if possible... */
2815 	unlinkpid(nsd->pidfile);
2816 	unlink(nsd->task[0]->fname);
2817 	unlink(nsd->task[1]->fname);
2818 #ifdef USE_ZONE_STATS
2819 	unlink(nsd->zonestatfname[0]);
2820 	unlink(nsd->zonestatfname[1]);
2821 #endif
2822 #ifdef USE_DNSTAP
2823 	dt_collector_close(nsd->dt_collector, nsd);
2824 #endif
2825 
2826 	if(reload_listener.fd != -1) {
2827 		sig_atomic_t cmd = NSD_QUIT;
2828 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2829 			"main: ipc send quit to reload-process"));
2830 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2831 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2832 				strerror(errno));
2833 		}
2834 		fsync(reload_listener.fd);
2835 		close(reload_listener.fd);
2836 		/* wait for reload to finish processing */
2837 		while(1) {
2838 			if(waitpid(reload_pid, NULL, 0) == -1) {
2839 				if(errno == EINTR) continue;
2840 				if(errno == ECHILD) break;
2841 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2842 					(int)reload_pid, strerror(errno));
2843 			}
2844 			break;
2845 		}
2846 	}
2847 	if(nsd->xfrd_listener->fd != -1) {
2848 		/* complete quit, stop xfrd */
2849 		sig_atomic_t cmd = NSD_QUIT;
2850 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2851 			"main: ipc send quit to xfrd"));
2852 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2853 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2854 				strerror(errno));
2855 		}
2856 		fsync(nsd->xfrd_listener->fd);
2857 		close(nsd->xfrd_listener->fd);
2858 		(void)kill(nsd->pid, SIGTERM);
2859 	}
2860 
2861 #ifdef MEMCLEAN /* OS collects memory pages */
2862 	region_destroy(server_region);
2863 #endif
2864 	/* write the nsd.db to disk, wait for it to complete */
2865 	udb_base_sync(nsd->db->udb, 1);
2866 	udb_base_close(nsd->db->udb);
2867 	server_shutdown(nsd);
2868 }
2869 
2870 static query_state_type
2871 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
2872 {
2873 	return query_process(query, nsd, now_p);
2874 }
2875 
2876 static query_state_type
2877 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
2878 {
2879 #ifdef RATELIMIT
2880 	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
2881 		if(query->edns.cookie_status != COOKIE_VALID
2882 		&& query->edns.cookie_status != COOKIE_VALID_REUSE
2883 		&& rrl_process_query(query))
2884 			return rrl_slip(query);
2885 		else	return QUERY_PROCESSED;
2886 	}
2887 	return QUERY_DISCARDED;
2888 #else
2889 	return query_process(query, nsd, now_p);
2890 #endif
2891 }
2892 
/*
 * Return the version string of the event library in use; the builtin
 * mini_event has no version, so an empty string is returned for it.
 */
const char*
nsd_event_vs(void)
{
#ifdef USE_MINI_EVENT
	return "";
#else
	return event_get_version();
#endif
}
2902 
2903 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2904 static const char* ub_ev_backend2str(int b)
2905 {
2906 	switch(b) {
2907 	case EVBACKEND_SELECT:	return "select";
2908 	case EVBACKEND_POLL:	return "poll";
2909 	case EVBACKEND_EPOLL:	return "epoll";
2910 	case EVBACKEND_KQUEUE:	return "kqueue";
2911 	case EVBACKEND_DEVPOLL: return "devpoll";
2912 	case EVBACKEND_PORT:	return "evport";
2913 	}
2914 	return "unknown";
2915 }
2916 #endif
2917 
/*
 * Return the name of the polling method selected by the event
 * library (e.g. "select", "epoll", "kqueue").  Creates a temporary
 * event base to query it; "?" when the library cannot report it.
 */
const char*
nsd_event_method(void)
{
#ifdef USE_MINI_EVENT
	return "select";
#else
	struct event_base* b = nsd_child_event_base();
	const char* m;
#  ifdef EV_FEATURE_BACKENDS
	/* libev: translate the backend flag to a name */
	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
#  elif defined(HAVE_EVENT_BASE_GET_METHOD)
	m = event_base_get_method(b);
#  else
	m = "?";
#  endif
#  ifdef MEMCLEAN
	event_base_free(b);
#  endif
	return m;
#endif
}
2939 
/*
 * Create the event base used by a server (child) process.  Depending
 * on the build configuration this is mini_event, libev or libevent.
 * Returns NULL on failure (per the underlying library's init call).
 */
struct event_base*
nsd_child_event_base(void)
{
	struct event_base* base;
#ifdef USE_MINI_EVENT
	/* mini_event's event_init takes storage for its time keeping */
	static time_t secs;
	static struct timeval now;
	base = event_init(&secs, &now);
#else
#  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
	/* libev */
	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
#  else
	/* libevent */
#    ifdef HAVE_EVENT_BASE_NEW
	base = event_base_new();
#    else
	base = event_init();
#    endif
#  endif
#endif
	return base;
}
2963 
2964 static void
2965 add_udp_handler(
2966 	struct nsd *nsd,
2967 	struct nsd_socket *sock,
2968 	struct udp_handler_data *data)
2969 {
2970 	struct event *handler = &data->event;
2971 
2972 	data->nsd = nsd;
2973 	data->socket = sock;
2974 
2975 	memset(handler, 0, sizeof(*handler));
2976 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2977 	if(event_base_set(nsd->event_base, handler) != 0)
2978 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2979 	if(event_add(handler, NULL) != 0)
2980 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2981 }
2982 
2983 void
2984 add_tcp_handler(
2985 	struct nsd *nsd,
2986 	struct nsd_socket *sock,
2987 	struct tcp_accept_handler_data *data)
2988 {
2989 	struct event *handler = &data->event;
2990 
2991 	data->nsd = nsd;
2992 	data->socket = sock;
2993 
2994 #ifdef HAVE_SSL
2995 	if (nsd->tls_ctx &&
2996 	    nsd->options->tls_port &&
2997 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2998 	{
2999 		data->tls_accept = 1;
3000 		if(verbosity >= 2) {
3001 			char buf[48];
3002 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
3003 			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
3004 		}
3005 	} else {
3006 		data->tls_accept = 0;
3007 	}
3008 #endif
3009 
3010 	memset(handler, 0, sizeof(*handler));
3011 	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
3012 	if(event_base_set(nsd->event_base, handler) != 0)
3013 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
3014 	if(event_add(handler, NULL) != 0)
3015 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
3016 	data->event_added = 1;
3017 }
3018 
3019 /*
3020  * Serve DNS request to verifiers (short-lived)
3021  */
void server_verify(struct nsd *nsd, int cmdsocket)
{
	/* Answer DNS queries from zone verifiers: start up to
	 * verifier_limit verifier processes and serve them over the
	 * verify_udp/verify_tcp sockets until verification completes or
	 * a command arrives on cmdsocket ends the event loop. */
	size_t size = 0;
	struct event cmd_event, signal_event, exit_event;
	struct zone *zone;

	assert(nsd != NULL);

	/* nothing to do when no zone needs verification */
	zone = verify_next_zone(nsd, NULL);
	if(zone == NULL)
		return;

	nsd->server_region = region_create(xalloc, free);
	nsd->event_base = nsd_child_event_base();

	nsd->next_zone_to_verify = zone;
	nsd->verifier_count = 0;
	nsd->verifier_limit = nsd->options->verifier_count;
	size = sizeof(struct verifier) * nsd->verifier_limit;
	/* pipe whose read end signals that a verifier exited
	 * (verify_handle_exit is registered on it below) */
	if(pipe(nsd->verifier_pipe) == -1) {
		log_msg(LOG_ERR, "verify: could not create pipe: %s",
				strerror(errno));
		goto fail_pipe;
	}
	fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC);
	fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC);
	nsd->verifiers = region_alloc_zero(nsd->server_region, size);

	/* initialize verifier slots; -1 marks pid/fd fields as unused */
	for(size_t i = 0; i < nsd->verifier_limit; i++) {
		nsd->verifiers[i].nsd = nsd;
		nsd->verifiers[i].zone = NULL;
		nsd->verifiers[i].pid = -1;
		nsd->verifiers[i].output_stream.fd = -1;
		nsd->verifiers[i].output_stream.priority = LOG_INFO;
		nsd->verifiers[i].error_stream.fd = -1;
		nsd->verifiers[i].error_stream.priority = LOG_ERR;
	}

	/* commands from the main process arrive on cmdsocket */
	event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd);
	if(event_base_set(nsd->event_base, &cmd_event) != 0 ||
	   event_add(&cmd_event, NULL) != 0)
	{
		log_msg(LOG_ERR, "verify: could not add command event");
		goto fail;
	}

	/* SIGCHLD: a verifier child process terminated */
	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd);
	if(event_base_set(nsd->event_base, &signal_event) != 0 ||
	   signal_add(&signal_event, NULL) != 0)
	{
		log_msg(LOG_ERR, "verify: could not add signal event");
		goto fail;
	}

	/* read side of the verifier pipe: handle a verifier's exit */
	event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd);
	if(event_base_set(nsd->event_base, &exit_event) != 0 ||
	   event_add(&exit_event, NULL) != 0)
  {
		log_msg(LOG_ERR, "verify: could not add exit event");
		goto fail;
	}

	/* set up the scatter/gather arrays for batched UDP receives,
	 * shared with the regular UDP service code (handle_udp) */
	memset(msgs, 0, sizeof(msgs));
	for (int i = 0; i < NUM_RECV_PER_SELECT; i++) {
		queries[i] = query_create(nsd->server_region,
			compressed_dname_offsets,
			compression_table_size, compressed_dnames);
		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
		iovecs[i].iov_base = buffer_begin(queries[i]->packet);
		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
		msgs[i].msg_hdr.msg_iov = &iovecs[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
		msgs[i].msg_hdr.msg_name = &queries[i]->addr;
		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
	}

	/* serve DNS over UDP on the verify interfaces */
	for (size_t i = 0; i < nsd->verify_ifs; i++) {
		struct udp_handler_data *data;
		data = region_alloc_zero(
			nsd->server_region, sizeof(*data));
		add_udp_handler(nsd, &nsd->verify_udp[i], data);
	}

	tcp_accept_handler_count = nsd->verify_ifs;
	tcp_accept_handlers = region_alloc_array(nsd->server_region,
		nsd->verify_ifs, sizeof(*tcp_accept_handlers));

	/* serve DNS over TCP on the verify interfaces */
	for (size_t i = 0; i < nsd->verify_ifs; i++) {
		struct tcp_accept_handler_data *data;
		data = &tcp_accept_handlers[i];
		memset(data, 0, sizeof(*data));
		add_tcp_handler(nsd, &nsd->verify_tcp[i], data);
	}

	/* start as many verifiers as allowed to run concurrently; the
	 * remaining zones are picked up as running verifiers finish */
	while(nsd->next_zone_to_verify != NULL &&
	      nsd->verifier_count < nsd->verifier_limit)
	{
		verify_zone(nsd, nsd->next_zone_to_verify);
		nsd->next_zone_to_verify
			= verify_next_zone(nsd, nsd->next_zone_to_verify);
	}

	/* short-lived main loop */
	event_base_dispatch(nsd->event_base);

	/* remove command and exit event handlers */
	event_del(&exit_event);
	event_del(&signal_event);
	event_del(&cmd_event);

	assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT);
	assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT);
fail:
	close(nsd->verifier_pipe[0]);
	close(nsd->verifier_pipe[1]);
fail_pipe:
	event_base_free(nsd->event_base);
	region_destroy(nsd->server_region);

	/* reset all verifier state so a later run starts clean */
	nsd->event_base = NULL;
	nsd->server_region = NULL;
	nsd->verifier_limit = 0;
	nsd->verifier_pipe[0] = -1;
	nsd->verifier_pipe[1] = -1;
	nsd->verifiers = NULL;
}
3148 
3149 /*
3150  * Serve DNS requests.
3151  */
void
server_child(struct nsd *nsd)
{
	/* Main body of a server child process: set up UDP/TCP handlers
	 * for this child's share of the interfaces and run the event
	 * loop until told to quit. */
	size_t i, from, numifs;
	region_type *server_region = region_create(xalloc, free);
	struct event_base* event_base = nsd_child_event_base();
	sig_atomic_t mode;

	if(!event_base) {
		log_msg(LOG_ERR, "nsd server could not create event base");
		exit(1);
	}
	nsd->event_base = event_base;
	nsd->server_region = server_region;

#ifdef RATELIMIT
	rrl_init(nsd->this_child->child_num);
#endif

	assert(nsd->server_kind != NSD_SERVER_MAIN);
	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));

#ifdef HAVE_SETPROCTITLE
	setproctitle("server %d", nsd->this_child->child_num + 1);
#endif
#ifdef HAVE_CPUSET_T
	if(nsd->use_cpu_affinity) {
		set_cpu_affinity(nsd->this_child->cpuset);
	}
#endif

	/* close socket kinds this child does not serve */
	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
		server_close_all_sockets(nsd->tcp, nsd->ifs);
	}
	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
		server_close_all_sockets(nsd->udp, nsd->ifs);
	}

	/* listen for commands from the parent process over the ipc fd */
	if (nsd->this_child->parent_fd != -1) {
		struct event *handler;
		struct ipc_handler_conn_data* user_data =
			(struct ipc_handler_conn_data*)region_alloc(
			server_region, sizeof(struct ipc_handler_conn_data));
		user_data->nsd = nsd;
		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);

		handler = (struct event*) region_alloc(
			server_region, sizeof(*handler));
		memset(handler, 0, sizeof(*handler));
		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
			EV_READ, child_handle_parent_command, user_data);
		if(event_base_set(event_base, handler) != 0)
			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
		if(event_add(handler, NULL) != 0)
			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
	}

	/* with reuseport, each child serves its own slice
	 * [from, from+numifs) of the interface list */
	if(nsd->reuseport) {
		numifs = nsd->ifs / nsd->reuseport;
		from = numifs * nsd->this_child->child_num;
		if(from+numifs > nsd->ifs) { /* should not happen */
			from = 0;
			numifs = nsd->ifs;
		}
	} else {
		from = 0;
		numifs = nsd->ifs;
	}

	if (nsd->server_kind & NSD_SERVER_UDP) {
		int child = nsd->this_child->child_num;
		/* set up scatter/gather arrays for batched UDP receives */
		memset(msgs, 0, sizeof(msgs));
		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
			queries[i] = query_create(server_region,
				compressed_dname_offsets,
				compression_table_size, compressed_dnames);
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
			msgs[i].msg_hdr.msg_iovlen  = 1;
			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
		}

		/* add handlers for this child's slice of UDP sockets */
		for (i = 0; i < nsd->ifs; i++) {
			int listen;
			struct udp_handler_data *data;

			listen = nsd_bitset_isset(nsd->udp[i].servers, child);

			if(i >= from && i < (from + numifs) && listen) {
				data = region_alloc_zero(
					nsd->server_region, sizeof(*data));
				add_udp_handler(nsd, &nsd->udp[i], data);
			} else {
				/* close sockets intended for other servers */
				server_close_socket(&nsd->udp[i]);
			}
		}
	}

	/*
	 * Keep track of all the TCP accept handlers so we can enable
	 * and disable them based on the current number of active TCP
	 * connections.
	 */
	if (nsd->server_kind & NSD_SERVER_TCP) {
		int child = nsd->this_child->child_num;
		tcp_accept_handler_count = numifs;
		tcp_accept_handlers = region_alloc_array(server_region,
			numifs, sizeof(*tcp_accept_handlers));

		/* add accept handlers for this child's slice of TCP sockets */
		for (i = 0; i < nsd->ifs; i++) {
			int listen;
			struct tcp_accept_handler_data *data;

			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);

			if(i >= from && i < (from + numifs) && listen) {
				data = &tcp_accept_handlers[i-from];
				memset(data, 0, sizeof(*data));
				add_tcp_handler(nsd, &nsd->tcp[i], data);
			} else {
				/* close sockets intended for other servers */
				/*
				 * uncomment this once tcp servers are no
				 * longer copied in the tcp fd copy line
				 * in server_init().
				server_close_socket(&nsd->tcp[i]);
				*/
				/* close sockets not meant for this server*/
				if(!listen)
					server_close_socket(&nsd->tcp[i]);
			}
		}
	} else {
		tcp_accept_handler_count = 0;
	}

	/* The main loop... */
	while ((mode = nsd->mode) != NSD_QUIT) {
		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);

		/* Do we need to do the statistics... */
		if (mode == NSD_STATS) {
#ifdef BIND8_STATS
			int p = nsd->st.period;
			nsd->st.period = 1; /* force stats printout */
			/* Dump the statistics */
			bind8_stats(nsd);
			nsd->st.period = p;
#else /* !BIND8_STATS */
			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
#endif /* BIND8_STATS */

			nsd->mode = NSD_RUN;
		}
		else if (mode == NSD_REAP_CHILDREN) {
			/* got signal, notify parent. parent reaps terminated children. */
			if (nsd->this_child->parent_fd != -1) {
				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
				if (write(nsd->this_child->parent_fd,
				    &parent_notify,
				    sizeof(parent_notify)) == -1)
				{
					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
						(int) nsd->this_child->pid, strerror(errno));
				}
			} else /* no parent, so reap 'em */
				while (waitpid(-1, NULL, WNOHANG) > 0) ;
			nsd->mode = NSD_RUN;
		}
		else if(mode == NSD_RUN) {
			/* Wait for a query... */
			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
				if (errno != EINTR) {
					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
					break;
				}
			}
		} else if(mode == NSD_QUIT) {
			/* ignore here, quit */
		} else {
			log_msg(LOG_ERR, "mode bad value %d, back to service.",
				(int)mode);
			nsd->mode = NSD_RUN;
		}
	}

	/* finish serving the TCP connections that are still open */
	service_remaining_tcp(nsd);
#ifdef	BIND8_STATS
	bind8_stats(nsd);
#endif /* BIND8_STATS */

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_deinit(nsd->this_child->child_num);
#endif
	event_base_free(event_base);
	region_destroy(server_region);
#endif
	server_shutdown(nsd);
}
3356 
3357 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3358 {
3359 	int* timed_out = (int*)arg;
3360         assert(event & EV_TIMEOUT); (void)event;
3361 	/* wake up the service tcp thread, note event is no longer
3362 	 * registered */
3363 	*timed_out = 1;
3364 }
3365 
/*
 * After the main serve loop ends, keep servicing the TCP connections
 * that are still open (accepting no new queries on them) until they
 * finish, a quit/shutdown command arrives, or the service times out.
 */
void
service_remaining_tcp(struct nsd* nsd)
{
	struct tcp_handler_data* p;
	struct event_base* event_base;
	/* check if it is needed */
	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
		return;
	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
#ifdef USE_DNSTAP
	/* remove dnstap collector, we cannot write there because the new
	 * child process is using the file descriptor, or the child
	 * process after that. */
	dt_collector_destroy(nsd->dt_collector, nsd);
	nsd->dt_collector = NULL;
#endif
	/* setup event base */
	event_base = nsd_child_event_base();
	if(!event_base) {
		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
		return;
	}
	/* register tcp connections */
	for(p = tcp_active_list; p != NULL; p = p->next) {
		struct timeval timeout;
		int fd = p->event.ev_fd;
		/* keep the read/write direction the connection was
		 * waiting on under the old event base */
#ifdef USE_MINI_EVENT
		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
#else
		short event = p->event.ev_events & (EV_READ|EV_WRITE);
#endif
		void (*fn)(int, short, void*);
#ifdef HAVE_SSL
		if(p->tls) {
			if((event&EV_READ))
				fn = handle_tls_reading;
			else	fn = handle_tls_writing;
		} else {
#endif
			if((event&EV_READ))
				fn = handle_tcp_reading;
			else	fn = handle_tcp_writing;
#ifdef HAVE_SSL
		}
#endif

		/* only finish queries in progress, accept no new ones */
		p->tcp_no_more_queries = 1;
		/* set timeout to 1/10 second */
		if(p->tcp_timeout > 100)
			p->tcp_timeout = 100;
		timeout.tv_sec = p->tcp_timeout / 1000;
		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
		/* re-register the event on the fresh event base */
		event_del(&p->event);
		memset(&p->event, 0, sizeof(p->event));
		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
			fn, p);
		if(event_base_set(event_base, &p->event) != 0)
			log_msg(LOG_ERR, "event base set failed");
		if(event_add(&p->event, &timeout) != 0)
			log_msg(LOG_ERR, "event add failed");
	}

	/* handle it */
	while(nsd->current_tcp_count > 0) {
		/* NOTE(review): mode_t here holds an nsd run mode value
		 * (NSD_QUIT, ...), not a file mode; presumably
		 * sig_atomic_t was intended -- confirm against the return
		 * type of server_signal_mode() */
		mode_t m = server_signal_mode(nsd);
		struct event timeout;
		struct timeval tv;
		int timed_out = 0;
		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
			m == NSD_REAP_CHILDREN) {
			/* quit */
			break;
		}
		/* timer */
		/* have to do something every second */
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		memset(&timeout, 0, sizeof(timeout));
		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
			&timed_out);
		if(event_base_set(event_base, &timeout) != 0)
			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
		if(event_add(&timeout, &tv) != 0)
			log_msg(LOG_ERR, "remaintcp timer: event_add failed");

		/* service loop */
		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
			if (errno != EINTR) {
				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
				break;
			}
		}
		if(!timed_out) {
			event_del(&timeout);
		} else {
			/* timed out, quit */
			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
			break;
		}
	}
#ifdef MEMCLEAN
	event_base_free(event_base);
#endif
	/* continue to quit after return */
}
3471 
3472 /* Implement recvmmsg and sendmmsg if the platform does not. These functions
3473  * are always used, even if nonblocking operations are broken, in which case
3474  * NUM_RECV_PER_SELECT is defined to 1 (one).
3475  */
3476 #if defined(HAVE_RECVMMSG)
3477 #define nsd_recvmmsg recvmmsg
3478 #else /* !HAVE_RECVMMSG */
3479 
3480 static int
3481 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3482              int flags, struct timespec *timeout)
3483 {
3484 	unsigned int vpos = 0;
3485 	ssize_t rcvd;
3486 
3487 	/* timeout is ignored, ensure caller does not expect it to work */
3488 	assert(timeout == NULL); (void)timeout;
3489 
3490 	while(vpos < vlen) {
3491 		rcvd = recvfrom(sockfd,
3492 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3493 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3494 		                flags,
3495 		                msgvec[vpos].msg_hdr.msg_name,
3496 		               &msgvec[vpos].msg_hdr.msg_namelen);
3497 		if(rcvd < 0) {
3498 			break;
3499 		} else {
3500 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3501 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3502 			vpos++;
3503 		}
3504 	}
3505 
3506 	if(vpos) {
3507 		/* error will be picked up next time */
3508 		return (int)vpos;
3509 	} else if(errno == 0) {
3510 		return 0;
3511 	} else if(errno == EAGAIN) {
3512 		return 0;
3513 	}
3514 
3515 	return -1;
3516 }
3517 #endif /* HAVE_RECVMMSG */
3518 
3519 #ifdef HAVE_SENDMMSG
3520 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3521 #else /* !HAVE_SENDMMSG */
3522 
3523 static int
3524 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3525 {
3526 	unsigned int vpos = 0;
3527 	ssize_t snd;
3528 
3529 	while(vpos < vlen) {
3530 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3531 		snd = sendto(sockfd,
3532 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3533 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3534 		             flags,
3535 		             msgvec[vpos].msg_hdr.msg_name,
3536 		             msgvec[vpos].msg_hdr.msg_namelen);
3537 		if(snd < 0) {
3538 			break;
3539 		} else {
3540 			msgvec[vpos].msg_len = (unsigned int)snd;
3541 			vpos++;
3542 		}
3543 	}
3544 
3545 	if(vpos) {
3546 		return (int)vpos;
3547 	} else if(errno == 0) {
3548 		return 0;
3549 	}
3550 
3551 	return -1;
3552 }
3553 #endif /* HAVE_SENDMMSG */
3554 
/* Return nonzero when the port number in addr is zero (only IPv4 and,
 * with INET6, IPv6 address families are recognized). */
static int
port_is_zero(
#ifdef INET6
        struct sockaddr_storage *addr
#else
        struct sockaddr_in *addr
#endif
	)
{
#ifdef INET6
	switch(addr->ss_family) {
	case AF_INET6:
		return ((struct sockaddr_in6 *)addr)->sin6_port == 0;
	case AF_INET:
		return ((struct sockaddr_in *)addr)->sin_port == 0;
	default:
		return 0;
	}
#else
	return addr->sin_family == AF_INET && addr->sin_port == 0;
#endif
}
3578 
/*
 * UDP read event callback: receive a batch of datagrams, process each
 * query, and transmit the answers in one sendmmsg batch.  Dropped
 * queries are swapped to the tail of the arrays so only answered
 * entries [0, recvcount) are sent.
 */
static void
handle_udp(int fd, short event, void* arg)
{
	struct udp_handler_data *data = (struct udp_handler_data *) arg;
	int received, sent, recvcount, i;
	struct query *q;
	uint32_t now = 0;

	if (!(event & EV_READ)) {
		return;
	}
	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	/* this printf strangely gave a performance increase on Linux */
	/* printf("recvcount %d \n", recvcount); */
	if (recvcount == -1) {
		if (errno != EAGAIN && errno != EINTR) {
			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
		}
		/* Simply no data available */
		return;
	}
	for (i = 0; i < recvcount; i++) {
	loopstart:
		received = msgs[i].msg_len;
		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
		q = queries[i];
		if (received == -1) {
			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
#if defined(HAVE_RECVMMSG)
				msgs[i].msg_hdr.msg_flags
#else
				errno
#endif
				));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
			goto swap_drop;
		}

		/* Account... */
#ifdef BIND8_STATS
		if (data->socket->addr.ai_family == AF_INET) {
			STATUP(data->nsd, qudp);
		} else if (data->socket->addr.ai_family == AF_INET6) {
			STATUP(data->nsd, qudp6);
		}
#endif

		buffer_skip(q->packet, received);
		buffer_flip(q->packet);
#ifdef USE_DNSTAP
		/*
		 * sending UDP-query with server address (local) and client address to dnstap process
		 */
		log_addr("query from client", &q->addr);
		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->addr, q->addrlen,
			q->tcp, q->packet);
#endif /* USE_DNSTAP */

		/* Process and answer the query... */
		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
				STATUP(data->nsd, nona);
				ZTATUP(data->nsd, q->zone, nona);
			}

#ifdef USE_ZONE_STATS
			if (data->socket->addr.ai_family == AF_INET) {
				ZTATUP(data->nsd, q->zone, qudp);
			} else if (data->socket->addr.ai_family == AF_INET6) {
				ZTATUP(data->nsd, q->zone, qudp6);
			}
#endif

			/* Add EDNS0 and TSIG info if necessary.  */
			query_add_optional(q, data->nsd, &now);

			/* answer is ready; point the iovec at its length */
			buffer_flip(q->packet);
			iovecs[i].iov_len = buffer_remaining(q->packet);
#ifdef BIND8_STATS
			/* Account the rcode & TC... */
			STATUP2(data->nsd, rcode, RCODE(q->packet));
			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
			if (TC(q->packet)) {
				STATUP(data->nsd, truncated);
				ZTATUP(data->nsd, q->zone, truncated);
			}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
			/*
			 * sending UDP-response with server address (local) and client address to dnstap process
			 */
			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
			log_addr("response to client", &q->addr);
			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
				&q->addr, q->addrlen, q->tcp, q->packet,
				q->zone);
#endif /* USE_DNSTAP */
		} else {
			/* discarded query: reset its buffers and compact */
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
		swap_drop:
			STATUP(data->nsd, dropped);
			ZTATUP(data->nsd, q->zone, dropped);
			if(i != recvcount-1) {
				/* swap with last and decrease recvcount */
				struct mmsghdr mtmp = msgs[i];
				struct iovec iotmp = iovecs[i];
				recvcount--;
				msgs[i] = msgs[recvcount];
				iovecs[i] = iovecs[recvcount];
				queries[i] = queries[recvcount];
				msgs[recvcount] = mtmp;
				iovecs[recvcount] = iotmp;
				queries[recvcount] = q;
				/* re-point iovs after the struct swaps */
				msgs[i].msg_hdr.msg_iov = &iovecs[i];
				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
				/* reprocess the entry swapped into slot i */
				goto loopstart;
			} else { recvcount --; }
		}
	}

	/* send until all are sent */
	i = 0;
	while(i<recvcount) {
		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
		if(sent == -1) {
			if(errno == ENOBUFS ||
#ifdef EWOULDBLOCK
				errno == EWOULDBLOCK ||
#endif
				errno == EAGAIN) {
				/* block to wait until send buffer avail */
				int flag, errstore;
				if((flag = fcntl(fd, F_GETFL)) == -1) {
					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
					flag = 0;
				}
				flag &= ~O_NONBLOCK;
				if(fcntl(fd, F_SETFL, flag) == -1)
					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
				errstore = errno;
				/* restore nonblocking mode for the socket */
				flag |= O_NONBLOCK;
				if(fcntl(fd, F_SETFL, flag) == -1)
					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
				if(sent != -1) {
					i += sent;
					continue;
				}
				errno = errstore;
			}
			if(errno == EINVAL) {
				/* skip the invalid argument entry,
				 * send the remaining packets in the list */
				if(!(port_is_zero((void*)&queries[i]->addr) &&
					verbosity < 3)) {
					const char* es = strerror(errno);
					char a[64];
					addrport2str((void*)&queries[i]->addr, a, sizeof(a));
					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
				}
				i += 1;
				continue;
			}
			/* don't log transient network full errors, unless
			 * on higher verbosity */
			if(!(errno == ENOBUFS && verbosity < 1) &&
#ifdef EWOULDBLOCK
			   errno != EWOULDBLOCK &&
#endif
			   errno != EAGAIN) {
				const char* es = strerror(errno);
				char a[64];
				addrport2str((void*)&queries[i]->addr, a, sizeof(a));
				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
			}
#ifdef BIND8_STATS
			data->nsd->st.txerr += recvcount-i;
#endif /* BIND8_STATS */
			break;
		}
		i += sent;
	}
	/* reset the buffers for the next receive batch */
	for(i=0; i<recvcount; i++) {
		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
	}
}
3776 
3777 #ifdef HAVE_SSL
3778 /*
3779  * Setup an event for the tcp handler.
3780  */
3781 static void
3782 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3783        int fd, short event)
3784 {
3785 	struct timeval timeout;
3786 	struct event_base* ev_base;
3787 
3788 	timeout.tv_sec = data->nsd->tcp_timeout;
3789 	timeout.tv_usec = 0L;
3790 
3791 	ev_base = data->event.ev_base;
3792 	event_del(&data->event);
3793 	memset(&data->event, 0, sizeof(data->event));
3794 	event_set(&data->event, fd, event, fn, data);
3795 	if(event_base_set(ev_base, &data->event) != 0)
3796 		log_msg(LOG_ERR, "event base set failed");
3797 	if(event_add(&data->event, &timeout) != 0)
3798 		log_msg(LOG_ERR, "event add failed");
3799 }
3800 #endif /* HAVE_SSL */
3801 
/*
 * Tear down one TCP connection: remove its event, shut down TLS if it
 * was active, close the socket, unlink it from tcp_active_list and
 * release its memory region.
 */
static void
cleanup_tcp_handler(struct tcp_handler_data* data)
{
	event_del(&data->event);
#ifdef HAVE_SSL
	if(data->tls) {
		SSL_shutdown(data->tls);
		SSL_free(data->tls);
		data->tls = NULL;
	}
#endif
	close(data->event.ev_fd);
	/* unlink from the doubly linked list of active connections */
	if(data->prev)
		data->prev->next = data->next;
	else	tcp_active_list = data->next;
	if(data->next)
		data->next->prev = data->prev;

	/*
	 * Enable the TCP accept handlers when the current number of
	 * TCP connections is about to drop below the maximum number
	 * of TCP connections.
	 */
	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
		configure_handler_event_types(EV_READ|EV_PERSIST);
		if(slowaccept) {
			/* stop the slow-accept backoff timer as well */
			event_del(&slowaccept_event);
			slowaccept = 0;
		}
	}
	--data->nsd->current_tcp_count;
	assert(data->nsd->current_tcp_count >= 0);

	region_destroy(data->region);
}
3837 
3838 static void
3839 handle_tcp_reading(int fd, short event, void* arg)
3840 {
3841 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3842 	ssize_t received;
3843 	struct event_base* ev_base;
3844 	struct timeval timeout;
3845 	uint32_t now = 0;
3846 
3847 	if ((event & EV_TIMEOUT)) {
3848 		/* Connection timed out.  */
3849 		cleanup_tcp_handler(data);
3850 		return;
3851 	}
3852 
3853 	if ((data->nsd->tcp_query_count > 0 &&
3854 		data->query_count >= data->nsd->tcp_query_count) ||
3855 		data->tcp_no_more_queries) {
3856 		/* No more queries allowed on this tcp connection. */
3857 		cleanup_tcp_handler(data);
3858 		return;
3859 	}
3860 
3861 	assert((event & EV_READ));
3862 
3863 	if (data->bytes_transmitted == 0) {
3864 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3865 	}
3866 
3867 	/*
3868 	 * Check if we received the leading packet length bytes yet.
3869 	 */
3870 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3871 		received = read(fd,
3872 				(char *) &data->query->tcplen
3873 				+ data->bytes_transmitted,
3874 				sizeof(uint16_t) - data->bytes_transmitted);
3875 		if (received == -1) {
3876 			if (errno == EAGAIN || errno == EINTR) {
3877 				/*
3878 				 * Read would block, wait until more
3879 				 * data is available.
3880 				 */
3881 				return;
3882 			} else {
3883 				char buf[48];
3884 				addr2str(&data->query->addr, buf, sizeof(buf));
3885 #ifdef ECONNRESET
3886 				if (verbosity >= 2 || errno != ECONNRESET)
3887 #endif /* ECONNRESET */
3888 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3889 				cleanup_tcp_handler(data);
3890 				return;
3891 			}
3892 		} else if (received == 0) {
3893 			/* EOF */
3894 			cleanup_tcp_handler(data);
3895 			return;
3896 		}
3897 
3898 		data->bytes_transmitted += received;
3899 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3900 			/*
3901 			 * Not done with the tcplen yet, wait for more
3902 			 * data to become available.
3903 			 */
3904 			return;
3905 		}
3906 
3907 		assert(data->bytes_transmitted == sizeof(uint16_t));
3908 
3909 		data->query->tcplen = ntohs(data->query->tcplen);
3910 
3911 		/*
3912 		 * Minimum query size is:
3913 		 *
3914 		 *     Size of the header (12)
3915 		 *   + Root domain name   (1)
3916 		 *   + Query class        (2)
3917 		 *   + Query type         (2)
3918 		 */
3919 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3920 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3921 			cleanup_tcp_handler(data);
3922 			return;
3923 		}
3924 
3925 		if (data->query->tcplen > data->query->maxlen) {
3926 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3927 			cleanup_tcp_handler(data);
3928 			return;
3929 		}
3930 
3931 		buffer_set_limit(data->query->packet, data->query->tcplen);
3932 	}
3933 
3934 	assert(buffer_remaining(data->query->packet) > 0);
3935 
3936 	/* Read the (remaining) query data.  */
3937 	received = read(fd,
3938 			buffer_current(data->query->packet),
3939 			buffer_remaining(data->query->packet));
3940 	if (received == -1) {
3941 		if (errno == EAGAIN || errno == EINTR) {
3942 			/*
3943 			 * Read would block, wait until more data is
3944 			 * available.
3945 			 */
3946 			return;
3947 		} else {
3948 			char buf[48];
3949 			addr2str(&data->query->addr, buf, sizeof(buf));
3950 #ifdef ECONNRESET
3951 			if (verbosity >= 2 || errno != ECONNRESET)
3952 #endif /* ECONNRESET */
3953 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3954 			cleanup_tcp_handler(data);
3955 			return;
3956 		}
3957 	} else if (received == 0) {
3958 		/* EOF */
3959 		cleanup_tcp_handler(data);
3960 		return;
3961 	}
3962 
3963 	data->bytes_transmitted += received;
3964 	buffer_skip(data->query->packet, received);
3965 	if (buffer_remaining(data->query->packet) > 0) {
3966 		/*
3967 		 * Message not yet complete, wait for more data to
3968 		 * become available.
3969 		 */
3970 		return;
3971 	}
3972 
3973 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3974 
3975 	/* Account... */
3976 #ifdef BIND8_STATS
3977 #ifndef INET6
3978 	STATUP(data->nsd, ctcp);
3979 #else
3980 	if (data->query->addr.ss_family == AF_INET) {
3981 		STATUP(data->nsd, ctcp);
3982 	} else if (data->query->addr.ss_family == AF_INET6) {
3983 		STATUP(data->nsd, ctcp6);
3984 	}
3985 #endif
3986 #endif /* BIND8_STATS */
3987 
3988 	/* We have a complete query, process it.  */
3989 
3990 	/* tcp-query-count: handle query counter ++ */
3991 	data->query_count++;
3992 
3993 	buffer_flip(data->query->packet);
3994 #ifdef USE_DNSTAP
3995 	/*
3996 	 * and send TCP-query with found address (local) and client address to dnstap process
3997 	 */
3998 	log_addr("query from client", &data->query->addr);
3999 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4000 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4001 		data->query->addrlen, data->query->tcp, data->query->packet);
4002 #endif /* USE_DNSTAP */
4003 	data->query_state = server_process_query(data->nsd, data->query, &now);
4004 	if (data->query_state == QUERY_DISCARDED) {
4005 		/* Drop the packet and the entire connection... */
4006 		STATUP(data->nsd, dropped);
4007 		ZTATUP(data->nsd, data->query->zone, dropped);
4008 		cleanup_tcp_handler(data);
4009 		return;
4010 	}
4011 
4012 #ifdef BIND8_STATS
4013 	if (RCODE(data->query->packet) == RCODE_OK
4014 	    && !AA(data->query->packet))
4015 	{
4016 		STATUP(data->nsd, nona);
4017 		ZTATUP(data->nsd, data->query->zone, nona);
4018 	}
4019 #endif /* BIND8_STATS */
4020 
4021 #ifdef USE_ZONE_STATS
4022 #ifndef INET6
4023 	ZTATUP(data->nsd, data->query->zone, ctcp);
4024 #else
4025 	if (data->query->addr.ss_family == AF_INET) {
4026 		ZTATUP(data->nsd, data->query->zone, ctcp);
4027 	} else if (data->query->addr.ss_family == AF_INET6) {
4028 		ZTATUP(data->nsd, data->query->zone, ctcp6);
4029 	}
4030 #endif
4031 #endif /* USE_ZONE_STATS */
4032 
4033 	query_add_optional(data->query, data->nsd, &now);
4034 
4035 	/* Switch to the tcp write handler.  */
4036 	buffer_flip(data->query->packet);
4037 	data->query->tcplen = buffer_remaining(data->query->packet);
4038 #ifdef BIND8_STATS
4039 	/* Account the rcode & TC... */
4040 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4041 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4042 	if (TC(data->query->packet)) {
4043 		STATUP(data->nsd, truncated);
4044 		ZTATUP(data->nsd, data->query->zone, truncated);
4045 	}
4046 #endif /* BIND8_STATS */
4047 #ifdef USE_DNSTAP
4048 	/*
4049 	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
4050 	 */
4051 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4052 	log_addr("response to client", &data->query->addr);
4053 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4054 		data->query->addrlen, data->query->tcp, data->query->packet,
4055 		data->query->zone);
4056 #endif /* USE_DNSTAP */
4057 	data->bytes_transmitted = 0;
4058 
4059 	timeout.tv_sec = data->tcp_timeout / 1000;
4060 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4061 
4062 	ev_base = data->event.ev_base;
4063 	event_del(&data->event);
4064 	memset(&data->event, 0, sizeof(data->event));
4065 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4066 		handle_tcp_reading, data);
4067 	if(event_base_set(ev_base, &data->event) != 0)
4068 		log_msg(LOG_ERR, "event base set tcpr failed");
4069 	if(event_add(&data->event, &timeout) != 0)
4070 		log_msg(LOG_ERR, "event add tcpr failed");
4071 	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
4072 	handle_tcp_writing(fd, EV_WRITE, data);
4073 }
4074 
4075 static void
4076 handle_tcp_writing(int fd, short event, void* arg)
4077 {
4078 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4079 	ssize_t sent;
4080 	struct query *q = data->query;
4081 	struct timeval timeout;
4082 	struct event_base* ev_base;
4083 	uint32_t now = 0;
4084 
4085 	if ((event & EV_TIMEOUT)) {
4086 		/* Connection timed out.  */
4087 		cleanup_tcp_handler(data);
4088 		return;
4089 	}
4090 
4091 	assert((event & EV_WRITE));
4092 
4093 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
4094 		/* Writing the response packet length.  */
4095 		uint16_t n_tcplen = htons(q->tcplen);
4096 #ifdef HAVE_WRITEV
4097 		struct iovec iov[2];
4098 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
4099 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
4100 		iov[1].iov_base = buffer_begin(q->packet);
4101 		iov[1].iov_len = buffer_limit(q->packet);
4102 		sent = writev(fd, iov, 2);
4103 #else /* HAVE_WRITEV */
4104 		sent = write(fd,
4105 			     (const char *) &n_tcplen + data->bytes_transmitted,
4106 			     sizeof(n_tcplen) - data->bytes_transmitted);
4107 #endif /* HAVE_WRITEV */
4108 		if (sent == -1) {
4109 			if (errno == EAGAIN || errno == EINTR) {
4110 				/*
4111 				 * Write would block, wait until
4112 				 * socket becomes writable again.
4113 				 */
4114 				return;
4115 			} else {
4116 #ifdef ECONNRESET
4117 				if(verbosity >= 2 || errno != ECONNRESET)
4118 #endif /* ECONNRESET */
4119 #ifdef EPIPE
4120 				  if(verbosity >= 2 || errno != EPIPE)
4121 #endif /* EPIPE 'broken pipe' */
4122 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4123 				cleanup_tcp_handler(data);
4124 				return;
4125 			}
4126 		}
4127 
4128 		data->bytes_transmitted += sent;
4129 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
4130 			/*
4131 			 * Writing not complete, wait until socket
4132 			 * becomes writable again.
4133 			 */
4134 			return;
4135 		}
4136 
4137 #ifdef HAVE_WRITEV
4138 		sent -= sizeof(n_tcplen);
4139 		/* handle potential 'packet done' code */
4140 		goto packet_could_be_done;
4141 #endif
4142  	}
4143 
4144 	sent = write(fd,
4145 		     buffer_current(q->packet),
4146 		     buffer_remaining(q->packet));
4147 	if (sent == -1) {
4148 		if (errno == EAGAIN || errno == EINTR) {
4149 			/*
4150 			 * Write would block, wait until
4151 			 * socket becomes writable again.
4152 			 */
4153 			return;
4154 		} else {
4155 #ifdef ECONNRESET
4156 			if(verbosity >= 2 || errno != ECONNRESET)
4157 #endif /* ECONNRESET */
4158 #ifdef EPIPE
4159 				  if(verbosity >= 2 || errno != EPIPE)
4160 #endif /* EPIPE 'broken pipe' */
4161 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4162 			cleanup_tcp_handler(data);
4163 			return;
4164 		}
4165 	}
4166 
4167 	data->bytes_transmitted += sent;
4168 #ifdef HAVE_WRITEV
4169   packet_could_be_done:
4170 #endif
4171 	buffer_skip(q->packet, sent);
4172 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4173 		/*
4174 		 * Still more data to write when socket becomes
4175 		 * writable again.
4176 		 */
4177 		return;
4178 	}
4179 
4180 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4181 
4182 	if (data->query_state == QUERY_IN_AXFR ||
4183 		data->query_state == QUERY_IN_IXFR) {
4184 		/* Continue processing AXFR and writing back results.  */
4185 		buffer_clear(q->packet);
4186 		if(data->query_state == QUERY_IN_AXFR)
4187 			data->query_state = query_axfr(data->nsd, q, 0);
4188 		else data->query_state = query_ixfr(data->nsd, q);
4189 		if (data->query_state != QUERY_PROCESSED) {
4190 			query_add_optional(data->query, data->nsd, &now);
4191 
4192 			/* Reset data. */
4193 			buffer_flip(q->packet);
4194 			q->tcplen = buffer_remaining(q->packet);
4195 			data->bytes_transmitted = 0;
4196 			/* Reset timeout.  */
4197 			timeout.tv_sec = data->tcp_timeout / 1000;
4198 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4199 			ev_base = data->event.ev_base;
4200 			event_del(&data->event);
4201 			memset(&data->event, 0, sizeof(data->event));
4202 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4203 				handle_tcp_writing, data);
4204 			if(event_base_set(ev_base, &data->event) != 0)
4205 				log_msg(LOG_ERR, "event base set tcpw failed");
4206 			if(event_add(&data->event, &timeout) != 0)
4207 				log_msg(LOG_ERR, "event add tcpw failed");
4208 
4209 			/*
4210 			 * Write data if/when the socket is writable
4211 			 * again.
4212 			 */
4213 			return;
4214 		}
4215 	}
4216 
4217 	/*
4218 	 * Done sending, wait for the next request to arrive on the
4219 	 * TCP socket by installing the TCP read handler.
4220 	 */
4221 	if ((data->nsd->tcp_query_count > 0 &&
4222 		data->query_count >= data->nsd->tcp_query_count) ||
4223 		data->tcp_no_more_queries) {
4224 
4225 		(void) shutdown(fd, SHUT_WR);
4226 	}
4227 
4228 	data->bytes_transmitted = 0;
4229 
4230 	timeout.tv_sec = data->tcp_timeout / 1000;
4231 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4232 	ev_base = data->event.ev_base;
4233 	event_del(&data->event);
4234 	memset(&data->event, 0, sizeof(data->event));
4235 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4236 		handle_tcp_reading, data);
4237 	if(event_base_set(ev_base, &data->event) != 0)
4238 		log_msg(LOG_ERR, "event base set tcpw failed");
4239 	if(event_add(&data->event, &timeout) != 0)
4240 		log_msg(LOG_ERR, "event add tcpw failed");
4241 }
4242 
4243 #ifdef HAVE_SSL
4244 /** create SSL object and associate fd */
4245 static SSL*
4246 incoming_ssl_fd(SSL_CTX* ctx, int fd)
4247 {
4248 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
4249 	if(!ssl) {
4250 		log_crypto_err("could not SSL_new");
4251 		return NULL;
4252 	}
4253 	SSL_set_accept_state(ssl);
4254 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
4255 	if(!SSL_set_fd(ssl, fd)) {
4256 		log_crypto_err("could not SSL_set_fd");
4257 		SSL_free(ssl);
4258 		return NULL;
4259 	}
4260 	return ssl;
4261 }
4262 
/**
 * (Continue to) perform the TLS handshake on an upgraded TCP connection,
 * and resolve post-handshake read/write condition switches (the
 * tls_hs_read_event/tls_hs_write_event states set by the read/write
 * handlers when SSL_read/SSL_write wanted the opposite I/O direction).
 *
 * data: connection state; data->tls is the SSL object, data->shake_state
 *	tracks handshake progress.
 * fd: the connection socket, used when re-registering the libevent event.
 * writing: nonzero when invoked from the write handler, zero from the
 *	read handler; used to restore the proper event type on completion.
 * Returns 0 if the connection was cleaned up (caller must return
 * immediately), 1 otherwise (handshake done or still in progress).
 */
static int
tls_handshake(struct tcp_handler_data* data, int fd, int writing)
{
	int r;
	if(data->shake_state == tls_hs_read_event) {
		/* read condition satisfied back to writing */
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
		data->shake_state = tls_hs_none;
		return 1;
	}
	if(data->shake_state == tls_hs_write_event) {
		/* write condition satisfied back to reading */
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
		data->shake_state = tls_hs_none;
		return 1;
	}

	/* (continue to) setup the TLS connection */
	ERR_clear_error();
	r = SSL_do_handshake(data->tls);

	if(r != 1) {
		/* handshake not complete; find out if it needs to wait for
		 * the socket to become readable or writable */
		int want = SSL_get_error(data->tls, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(data->shake_state == tls_hs_read) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_read;
			/* switch back to reading mode */
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(data->shake_state == tls_hs_write) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_write;
			/* switch back to writing mode */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
			return 1;
		} else {
			/* hard failure: log (unless squelched) and drop the
			 * connection */
			if(r == 0)
				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
			else {
				unsigned long err = ERR_get_error();
				if(!squelch_err_ssl_handshake(err)) {
					char a[64], s[256];
					addr2str(&data->query->addr, a, sizeof(a));
					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
					log_crypto_from_err(s, err);
				}
			}
			cleanup_tcp_handler(data);
			return 0;
		}
	}

	/* Use to log successful upgrade for testing - could be removed*/
	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
	/* set back to the event we need to have when reading (or writing) */
	if(data->shake_state == tls_hs_read && writing) {
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
	} else if(data->shake_state == tls_hs_write && !writing) {
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
	}
	data->shake_state = tls_hs_none;
	return 1;
}
4333 
/**
 * Handle readable events on a TLS connection: finish the TLS handshake
 * if still in progress, read the 2-byte length prefix and then the query
 * itself via SSL_read, and once a complete query has arrived process it
 * and switch the connection to the TLS write handler for the response.
 * fd: connection socket, event: libevent flags, arg: tcp_handler_data.
 */
static void
handle_tls_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out.  */
		cleanup_tcp_handler(data);
		return;
	}

	if ((data->nsd->tcp_query_count > 0 &&
	    data->query_count >= data->nsd->tcp_query_count) ||
	    data->tcp_no_more_queries) {
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	if (data->bytes_transmitted == 0) {
		/* start of a new request: reset the query structure */
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
	}

	if(data->shake_state != tls_hs_none) {
		/* TLS handshake not finished; (continue to) perform it and
		 * bail out if it tore down the connection or still waits */
		if(!tls_handshake(data, fd, 0))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if(data->bytes_transmitted < sizeof(uint16_t)) {
		ERR_clear_error();
		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
		    + data->bytes_transmitted,
		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
			int want = SSL_get_error(data->tls, received);
			if(want == SSL_ERROR_ZERO_RETURN) {
				cleanup_tcp_handler(data);
				return; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* wants to be called again */
				return;
			}
			else if(want == SSL_ERROR_WANT_WRITE) {
				/* switch to writing */
				data->shake_state = tls_hs_write_event;
				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
				return;
			}
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_read");
			return;
		}

		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}

		assert(data->bytes_transmitted == sizeof(uint16_t));

		/* length prefix arrives in network byte order */
		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 *     Size of the header (12)
		 *   + Root domain name   (1)
		 *   + Query class        (2)
		 *   + Query type         (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data.  */
	ERR_clear_error();
	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
			    (int)buffer_remaining(data->query->packet));
	if(received <= 0) {
		int want = SSL_get_error(data->tls, received);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			return; /* shutdown, closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* wants to be called again */
			return;
		}
		else if(want == SSL_ERROR_WANT_WRITE) {
			/* switch back writing */
			data->shake_state = tls_hs_write_event;
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
			return;
		}
		cleanup_tcp_handler(data);
		log_crypto_err("could not SSL_read");
		return;
	}

	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
	/* NOTE(review): unlike handle_tcp_reading this is not wrapped in
	 * #ifdef BIND8_STATS; presumably STATUP expands to a no-op when
	 * stats are disabled -- confirm against nsd.h. */
#ifndef INET6
	STATUP(data->nsd, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctls6);
	}
#endif

	/* We have a complete query, process it.  */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * and send TCP-query with found address (local) and client address to dnstap process
	 */
	log_addr("query from client", &data->query->addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	/* count non-authoritative OK answers */
	if (RCODE(data->query->packet) == RCODE_OK
	    && !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctls6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler.  */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
	handle_tls_writing(fd, EV_WRITE, data);
}
4557 
/**
 * Handle writable events on a TLS connection: finish the TLS handshake
 * if still in progress, then send the response via SSL_write.  Because
 * SSL_write cannot scatter-gather like writev, the first write goes
 * through a shared temporary buffer that holds the 2-byte length prefix
 * followed by a copy of the packet.  Continues AXFR/IXFR transfers, and
 * when the response is fully sent switches back to the TLS read handler.
 * fd: connection socket, event: libevent flags, arg: tcp_handler_data.
 */
static void
handle_tls_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	/* static variable that holds reassembly buffer used to put the
	 * TCP length in front of the packet, like writev. */
	static buffer_type* global_tls_temp_buffer = NULL;
	buffer_type* write_buffer;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out.  */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if(data->shake_state != tls_hs_none) {
		/* TLS handshake not finished; (continue to) perform it and
		 * bail out if it tore down the connection or still waits */
		if(!tls_handshake(data, fd, 1))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	/* allow SSL_write to report progress after writing one TLS record
	 * instead of blocking until the whole buffer is consumed */
	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);

	/* If we are writing the start of a message, we must include the length
	 * this is done with a copy into write_buffer. */
	write_buffer = NULL;
	if (data->bytes_transmitted == 0) {
		if(!global_tls_temp_buffer) {
			/* gets deallocated when nsd shuts down from
			 * nsd.region */
			global_tls_temp_buffer = buffer_create(nsd.region,
				QIOBUFSZ + sizeof(q->tcplen));
			if (!global_tls_temp_buffer) {
				return;
			}
		}
		write_buffer = global_tls_temp_buffer;
		buffer_clear(write_buffer);
		buffer_write_u16(write_buffer, q->tcplen);
		buffer_write(write_buffer, buffer_current(q->packet),
			(int)buffer_remaining(q->packet));
		buffer_flip(write_buffer);
	} else {
		/* prefix already sent; write the packet buffer directly */
		write_buffer = q->packet;
	}

	/* Write the response */
	ERR_clear_error();
	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
	if(sent <= 0) {
		int want = SSL_get_error(data->tls, sent);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			/* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* switch back to reading */
			data->shake_state = tls_hs_read_event;
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
		} else if(want != SSL_ERROR_WANT_WRITE) {
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_write");
		}
		return;
	}

	buffer_skip(write_buffer, sent);
	if(buffer_remaining(write_buffer) != 0) {
		/* If not all sent, sync up the real buffer if it wasn't used.*/
		/* NOTE(review): this presumes a partial first write always
		 * covers at least the 2 prefix bytes (TLS-record-sized
		 * chunks); a write of fewer than sizeof(q->tcplen) bytes
		 * would leave the remaining prefix byte(s) unsent when the
		 * next call switches to q->packet -- confirm upstream. */
		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
		}
	}

	data->bytes_transmitted += sent;
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR ||
		data->query_state == QUERY_IN_IXFR) {
		/* Continue processing AXFR and writing back results.  */
		buffer_clear(q->packet);
		if(data->query_state == QUERY_IN_AXFR)
			data->query_state = query_axfr(data->nsd, q, 0);
		else data->query_state = query_ixfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset to writing mode.  */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
}
4689 #endif
4690 
4691 static void
4692 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4693 	void* ATTR_UNUSED(arg))
4694 {
4695 	if(slowaccept) {
4696 		configure_handler_event_types(EV_PERSIST | EV_READ);
4697 		slowaccept = 0;
4698 	}
4699 }
4700 
4701 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4702 {
4703 #ifndef HAVE_ACCEPT4
4704 	int s = accept(fd, addr, addrlen);
4705 	if (s != -1) {
4706 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4707 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4708 			close(s);
4709 			s = -1;
4710 			errno=EINTR; /* stop error printout as error in accept4
4711 				by setting this errno, it omits printout, in
4712 				later code that calls nsd_accept4 */
4713 		}
4714 	}
4715 	return s;
4716 #else
4717 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4718 #endif /* HAVE_ACCEPT4 */
4719 }
4720 
4721 /*
4722  * Handle an incoming TCP connection.  The connection is accepted and
4723  * a new TCP reader event handler is added.  The TCP handler
4724  * is responsible for cleanup when the connection is closed.
4725  */
4726 static void
4727 handle_tcp_accept(int fd, short event, void* arg)
4728 {
4729 	struct tcp_accept_handler_data *data
4730 		= (struct tcp_accept_handler_data *) arg;
4731 	int s;
4732 	int reject = 0;
4733 	struct tcp_handler_data *tcp_data;
4734 	region_type *tcp_region;
4735 #ifdef INET6
4736 	struct sockaddr_storage addr;
4737 #else
4738 	struct sockaddr_in addr;
4739 #endif
4740 	socklen_t addrlen;
4741 	struct timeval timeout;
4742 
4743 	if (!(event & EV_READ)) {
4744 		return;
4745 	}
4746 
4747 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4748 		reject = data->nsd->options->tcp_reject_overflow;
4749 		if (!reject) {
4750 			return;
4751 		}
4752 	}
4753 
4754 	/* Accept it... */
4755 	addrlen = sizeof(addr);
4756 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4757 	if (s == -1) {
4758 		/**
4759 		 * EMFILE and ENFILE is a signal that the limit of open
4760 		 * file descriptors has been reached. Pause accept().
4761 		 * EINTR is a signal interrupt. The others are various OS ways
4762 		 * of saying that the client has closed the connection.
4763 		 */
4764 		if (errno == EMFILE || errno == ENFILE) {
4765 			if (!slowaccept) {
4766 				/* disable accept events */
4767 				struct timeval tv;
4768 				configure_handler_event_types(0);
4769 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4770 				tv.tv_usec = 0L;
4771 				memset(&slowaccept_event, 0,
4772 					sizeof(slowaccept_event));
4773 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4774 					handle_slowaccept_timeout, NULL);
4775 				(void)event_base_set(data->event.ev_base,
4776 					&slowaccept_event);
4777 				(void)event_add(&slowaccept_event, &tv);
4778 				slowaccept = 1;
4779 				/* We don't want to spam the logs here */
4780 			}
4781 		} else if (errno != EINTR
4782 			&& errno != EWOULDBLOCK
4783 #ifdef ECONNABORTED
4784 			&& errno != ECONNABORTED
4785 #endif /* ECONNABORTED */
4786 #ifdef EPROTO
4787 			&& errno != EPROTO
4788 #endif /* EPROTO */
4789 			) {
4790 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4791 		}
4792 		return;
4793 	}
4794 
4795 	if (reject) {
4796 		shutdown(s, SHUT_RDWR);
4797 		close(s);
4798 		return;
4799 	}
4800 
4801 	/*
4802 	 * This region is deallocated when the TCP connection is
4803 	 * closed by the TCP handler.
4804 	 */
4805 	tcp_region = region_create(xalloc, free);
4806 	tcp_data = (struct tcp_handler_data *) region_alloc(
4807 		tcp_region, sizeof(struct tcp_handler_data));
4808 	tcp_data->region = tcp_region;
4809 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4810 		compression_table_size, compressed_dnames);
4811 	tcp_data->nsd = data->nsd;
4812 	tcp_data->query_count = 0;
4813 #ifdef HAVE_SSL
4814 	tcp_data->shake_state = tls_hs_none;
4815 	tcp_data->tls = NULL;
4816 #endif
4817 	tcp_data->prev = NULL;
4818 	tcp_data->next = NULL;
4819 
4820 	tcp_data->query_state = QUERY_PROCESSED;
4821 	tcp_data->bytes_transmitted = 0;
4822 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4823 	tcp_data->query->addrlen = addrlen;
4824 
4825 	tcp_data->tcp_no_more_queries = 0;
4826 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4827 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4828 		/* very busy, give smaller timeout */
4829 		tcp_data->tcp_timeout = 200;
4830 	}
4831 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4832 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4833 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
4834 
4835 #ifdef USE_DNSTAP
4836 	/* save the address of the connection */
4837 	tcp_data->socket = data->socket;
4838 #endif /* USE_DNSTAP */
4839 
4840 #ifdef HAVE_SSL
4841 	if (data->tls_accept) {
4842 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4843 		if(!tcp_data->tls) {
4844 			close(s);
4845 			return;
4846 		}
4847 		tcp_data->shake_state = tls_hs_read;
4848 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4849 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4850 			  handle_tls_reading, tcp_data);
4851 	} else {
4852 #endif
4853 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4854 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4855 			  handle_tcp_reading, tcp_data);
4856 #ifdef HAVE_SSL
4857 	}
4858 #endif
4859 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4860 		log_msg(LOG_ERR, "cannot set tcp event base");
4861 		close(s);
4862 		region_destroy(tcp_region);
4863 		return;
4864 	}
4865 	if(event_add(&tcp_data->event, &timeout) != 0) {
4866 		log_msg(LOG_ERR, "cannot add tcp to event base");
4867 		close(s);
4868 		region_destroy(tcp_region);
4869 		return;
4870 	}
4871 	if(tcp_active_list) {
4872 		tcp_active_list->prev = tcp_data;
4873 		tcp_data->next = tcp_active_list;
4874 	}
4875 	tcp_active_list = tcp_data;
4876 
4877 	/*
4878 	 * Keep track of the total number of TCP handlers installed so
4879 	 * we can stop accepting connections when the maximum number
4880 	 * of simultaneous TCP connections is reached.
4881 	 *
4882 	 * If tcp-reject-overflow is enabled, however, then we do not
4883 	 * change the handler event type; we keep it as-is and accept
4884 	 * overflow TCP connections only so that we can forcibly kill
4885 	 * them off.
4886 	 */
4887 	++data->nsd->current_tcp_count;
4888 	if (!data->nsd->options->tcp_reject_overflow &&
4889 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4890 	{
4891 		configure_handler_event_types(0);
4892 	}
4893 }
4894 
4895 static void
4896 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4897 {
4898 	size_t i;
4899 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4900 	for (i = 0; i < nsd->child_count; ++i) {
4901 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4902 			if (write(nsd->children[i].child_fd,
4903 				&command,
4904 				sizeof(command)) == -1)
4905 			{
4906 				if(errno != EAGAIN && errno != EINTR)
4907 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4908 					(int) command,
4909 					(int) nsd->children[i].pid,
4910 					strerror(errno));
4911 			} else if (timeout > 0) {
4912 				(void)block_read(NULL,
4913 					nsd->children[i].child_fd,
4914 					&command, sizeof(command), timeout);
4915 			}
4916 			fsync(nsd->children[i].child_fd);
4917 			close(nsd->children[i].child_fd);
4918 			nsd->children[i].child_fd = -1;
4919 		}
4920 	}
4921 }
4922 
4923 static void
4924 send_children_quit(struct nsd* nsd)
4925 {
4926 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4927 	send_children_command(nsd, NSD_QUIT, 0);
4928 }
4929 
4930 static void
4931 send_children_quit_and_wait(struct nsd* nsd)
4932 {
4933 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4934 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4935 }
4936 
#ifdef BIND8_STATS
/*
 * Flag every child so the parent's IPC handler transmits the current
 * statistics on its next write opportunity.
 */
static void
set_children_stats(struct nsd* nsd)
{
	size_t i;

	/* Only the main server process manages child statistics. */
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
	for (i = 0; i < nsd->child_count; ++i) {
		/* mark the child and enable write events so the IPC
		 * handler actually gets a chance to send */
		nsd->children[i].need_to_send_STATS = 1;
		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
	}
}
#endif /* BIND8_STATS */
4950 
4951 static void
4952 configure_handler_event_types(short event_types)
4953 {
4954 	size_t i;
4955 
4956 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4957 		struct event* handler = &tcp_accept_handlers[i].event;
4958 		if(event_types) {
4959 			/* reassign */
4960 			int fd = handler->ev_fd;
4961 			struct event_base* base = handler->ev_base;
4962 			if(tcp_accept_handlers[i].event_added)
4963 				event_del(handler);
4964 			memset(handler, 0, sizeof(*handler));
4965 			event_set(handler, fd, event_types,
4966 				handle_tcp_accept, &tcp_accept_handlers[i]);
4967 			if(event_base_set(base, handler) != 0)
4968 				log_msg(LOG_ERR, "conhand: cannot event_base");
4969 			if(event_add(handler, NULL) != 0)
4970 				log_msg(LOG_ERR, "conhand: cannot event_add");
4971 			tcp_accept_handlers[i].event_added = 1;
4972 		} else {
4973 			/* remove */
4974 			if(tcp_accept_handlers[i].event_added) {
4975 				event_del(handler);
4976 				tcp_accept_handlers[i].event_added = 0;
4977 			}
4978 		}
4979 	}
4980 }
4981