xref: /openbsd-src/usr.sbin/nsd/server.c (revision 25c4e8bd056e974b28f4a0ffd39d76c190a56013)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <limits.h>
15 #include <sys/socket.h>
16 #include <sys/uio.h>
17 #include <sys/wait.h>
18 
19 #include <netinet/in.h>
20 #ifdef USE_TCP_FASTOPEN
21   #include <netinet/tcp.h>
22 #endif
23 #include <arpa/inet.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <signal.h>
36 #include <netdb.h>
37 #include <poll.h>
38 #ifdef HAVE_SYS_RANDOM_H
39 #include <sys/random.h>
40 #endif
41 #ifndef SHUT_WR
42 #define SHUT_WR 1
43 #endif
44 #ifdef HAVE_MMAP
45 #include <sys/mman.h>
46 #endif /* HAVE_MMAP */
47 #ifdef HAVE_OPENSSL_RAND_H
48 #include <openssl/rand.h>
49 #endif
50 #ifdef HAVE_OPENSSL_SSL_H
51 #include <openssl/ssl.h>
52 #endif
53 #ifdef HAVE_OPENSSL_ERR_H
54 #include <openssl/err.h>
55 #endif
56 #ifdef HAVE_OPENSSL_OCSP_H
57 #include <openssl/ocsp.h>
58 #endif
59 #ifndef USE_MINI_EVENT
60 #  ifdef HAVE_EVENT_H
61 #    include <event.h>
62 #  else
63 #    include <event2/event.h>
64 #    include "event2/event_struct.h"
65 #    include "event2/event_compat.h"
66 #  endif
67 #else
68 #  include "mini_event.h"
69 #endif
70 
71 #include "axfr.h"
72 #include "namedb.h"
73 #include "netio.h"
74 #include "xfrd.h"
75 #include "xfrd-tcp.h"
76 #include "xfrd-disk.h"
77 #include "difffile.h"
78 #include "nsec3.h"
79 #include "ipc.h"
80 #include "udb.h"
81 #include "remote.h"
82 #include "lookup3.h"
83 #include "rrl.h"
84 #include "ixfr.h"
85 #ifdef USE_DNSTAP
86 #include "dnstap/dnstap_collector.h"
87 #endif
88 #include "verify.h"
89 
90 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
91 
92 #ifdef USE_DNSTAP
93 /*
94  * log_addr() - print the contents of a sockaddr_in/sockaddr_in6 structure,
95  * just like it is done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
96  */
97 static void
98 log_addr(const char* descr,
99 #ifdef INET6
100 	struct sockaddr_storage* addr
101 #else
102 	struct sockaddr_in* addr
103 #endif
104 	)
105 {
106 	char str_buf[64];
107 	if(verbosity < 6)
108 		return;
109 	if(
110 #ifdef INET6
111 		addr->ss_family == AF_INET
112 #else
113 		addr->sin_family == AF_INET
114 #endif
115 		) {
116 		struct sockaddr_in* s = (struct sockaddr_in*)addr;
117 		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
118 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
119 #ifdef INET6
120 	} else {
121 		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
122 		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
123 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
124 #endif
125 	}
126 }
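/*
 * Example (editor's addition, with made-up values): at verbosity 6 or
 * higher, a call like
 *   log_addr("dnstap collector", &addr);
 * logs "dnstap collector: address is: 192.0.2.1, port is: 53".
 */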
127 #endif /* USE_DNSTAP */
128 
129 #ifdef USE_TCP_FASTOPEN
130   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
131   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
132 #endif
133 
134 /*
135  * Data for the UDP handlers.
136  */
137 struct udp_handler_data
138 {
139 	struct nsd        *nsd;
140 	struct nsd_socket *socket;
141 	struct event       event;
142 };
143 
144 struct tcp_accept_handler_data {
145 	struct nsd        *nsd;
146 	struct nsd_socket *socket;
147 	int                event_added;
148 	struct event       event;
149 #ifdef HAVE_SSL
150 	/* handler accepts TLS connections on the dedicated port */
151 	int                tls_accept;
152 #endif
153 };
154 
155 /*
156  * These globals are used to enable the TCP accept handlers
157  * when the number of TCP connections drops below the maximum
158  * number of TCP connections.
159  */
160 static size_t tcp_accept_handler_count;
161 static struct tcp_accept_handler_data *tcp_accept_handlers;
162 
163 static struct event slowaccept_event;
164 static int slowaccept;
165 
166 #ifdef HAVE_SSL
167 static unsigned char *ocspdata = NULL;
168 static long ocspdata_len = 0;
169 #endif
170 
171 #ifdef NONBLOCKING_IS_BROKEN
172 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
173    read multiple times from a socket when reported ready by select. */
174 # define NUM_RECV_PER_SELECT (1)
175 #else /* !NONBLOCKING_IS_BROKEN */
176 # define NUM_RECV_PER_SELECT (100)
177 #endif /* NONBLOCKING_IS_BROKEN */
178 
179 #ifndef HAVE_MMSGHDR
180 struct mmsghdr {
181 	struct msghdr msg_hdr;
182 	unsigned int  msg_len;
183 };
184 #endif
185 
186 static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
187 static struct iovec iovecs[NUM_RECV_PER_SELECT];
188 static struct query *queries[NUM_RECV_PER_SELECT];
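
/*
 * Editor's sketch (illustrative only, compiled out; not part of NSD):
 * how arrays like msgs[] and iovecs[] above are typically wired up for
 * batched UDP receives with recvmmsg(2) on platforms that provide it.
 * Buffer handling is simplified here; NSD's own receive path attaches
 * packet buffers belonging to the queries[] objects.
 */
#if 0
static unsigned char example_bufs[NUM_RECV_PER_SELECT][65536];

static void
example_setup_mmsg(void)
{
	int i;
	for(i = 0; i < NUM_RECV_PER_SELECT; i++) {
		iovecs[i].iov_base = example_bufs[i];
		iovecs[i].iov_len = sizeof(example_bufs[i]);
		memset(&msgs[i].msg_hdr, 0, sizeof(msgs[i].msg_hdr));
		msgs[i].msg_hdr.msg_iov = &iovecs[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
	}
}

static void
example_batch_recv(int fd)
{
	/* one readable event can drain up to NUM_RECV_PER_SELECT datagrams */
	int i, n = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	for(i = 0; i < n; i++) {
		/* msgs[i].msg_len bytes were received into example_bufs[i] */
	}
}
#endif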
189 
190 /*
191  * Data for the TCP connection handlers.
192  *
193  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
194  * blocking the entire server on a slow TCP connection, but does make
195  * reading from and writing to the socket more complicated.
196  *
197  * Basically, whenever a read/write would block (indicated by errno
198  * being set to EAGAIN) we remember the position we were reading
199  * from/writing to and return from the TCP reading/writing event
200  * handler.  When the socket becomes readable/writable again we
201  * continue from the same position.  (An illustrative, compiled-out
 * sketch of this pattern follows the struct below.)
202  */
203 struct tcp_handler_data
204 {
205 	/*
206 	 * The region used to allocate all TCP connection related
207 	 * data, including this structure.  This region is destroyed
208 	 * when the connection is closed.
209 	 */
210 	region_type*		region;
211 
212 	/*
213 	 * The global nsd structure.
214 	 */
215 	struct nsd*			nsd;
216 
217 	/*
218 	 * The current query data for this TCP connection.
219 	 */
220 	query_type*			query;
221 
222 	/*
223 	 * The query_state is used to remember if we are performing an
224 	 * AXFR, if we're done processing, or if we should discard the
225 	 * query and connection.
226 	 */
227 	query_state_type	query_state;
228 
229 	/*
230 	 * The event for the file descriptor and tcp timeout
231 	 */
232 	struct event event;
233 
234 	/*
235 	 * The bytes_transmitted field is used to remember the number
236 	 * of bytes transmitted when receiving or sending a DNS
237 	 * packet.  The count includes the two additional bytes used
238 	 * to specify the packet length on a TCP connection.
239 	 */
240 	size_t				bytes_transmitted;
241 
242 	/*
243 	 * The number of queries handled by this specific TCP connection.
244 	 */
245 	int					query_count;
246 
247 	/*
248 	 * The timeout in msec for this tcp connection
249 	 */
250 	int	tcp_timeout;
251 
252 	/*
253 	 * If the connection is allowed to have further queries on it.
254 	 */
255 	int tcp_no_more_queries;
256 
257 #ifdef USE_DNSTAP
258 	/* the accept socket, used to find the proper service (local) address this connection's socket is bound to. */
259 	struct nsd_socket *socket;
260 #endif /* USE_DNSTAP */
261 
262 #ifdef HAVE_SSL
263 	/*
264 	 * TLS object.
265 	 */
266 	SSL* tls;
267 
268 	/*
269 	 * TLS handshake state.
270 	 */
271 	enum { tls_hs_none, tls_hs_read, tls_hs_write,
272 		tls_hs_read_event, tls_hs_write_event } shake_state;
273 #endif
274 	/* list of connections, for service of remaining tcp channels */
275 	struct tcp_handler_data *prev, *next;
276 };
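
/*
 * Editor's sketch (illustrative only, compiled out; not NSD code): the
 * resume-on-EAGAIN pattern described in the comment above this struct.
 * A progress counter like bytes_transmitted records how far we got;
 * when a call would block we return and the next readable event
 * resumes from the same position.
 */
#if 0
static int
example_continue_read(int fd, uint8_t *buf, size_t total, size_t *done)
{
	while(*done < total) {
		ssize_t n = read(fd, buf + *done, total - *done);
		if(n == -1) {
			if(errno == EAGAIN || errno == EINTR)
				return 0; /* try again on the next event */
			return -1; /* hard error: discard the connection */
		}
		if(n == 0)
			return -1; /* peer closed the connection */
		*done += (size_t)n;
	}
	return 1; /* buffer complete */
}
#endif
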
277 /* global that is the list of active tcp channels */
278 static struct tcp_handler_data *tcp_active_list = NULL;
279 
280 /*
281  * Handle incoming queries on the UDP server sockets.
282  */
283 static void handle_udp(int fd, short event, void* arg);
284 
285 /*
286  * Handle incoming connections on the TCP sockets.  These handlers
287  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
288  * connection) but are disabled when the number of current TCP
289  * connections is equal to the maximum number of TCP connections.
290  * Disabling is done by changing the handler to wait for the
291  * NETIO_EVENT_NONE type.  This is done using the function
292  * configure_tcp_accept_handlers.
293  */
294 static void handle_tcp_accept(int fd, short event, void* arg);
295 
296 /*
297  * Handle incoming queries on a TCP connection.  The TCP connections
298  * are configured to be non-blocking and the handler may be called
299  * multiple times before a complete query is received.
300  */
301 static void handle_tcp_reading(int fd, short event, void* arg);
302 
303 /*
304  * Handle outgoing responses on a TCP connection.  The TCP connections
305  * are configured to be non-blocking and the handler may be called
306  * multiple times before a complete response is sent.
307  */
308 static void handle_tcp_writing(int fd, short event, void* arg);
309 
310 #ifdef HAVE_SSL
311 /* Create SSL object and associate fd */
312 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
313 /*
314  * Handle TLS handshake. May be called multiple times if incomplete.
315  */
316 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
317 
318 /*
319  * Handle incoming queries on a TLS over TCP connection.  The TLS
320  * connections are configured to be non-blocking and the handler may
321  * be called multiple times before a complete query is received.
322  */
323 static void handle_tls_reading(int fd, short event, void* arg);
324 
325 /*
326  * Handle outgoing responses on a TLS over TCP connection.  The TLS
327  * connections are configured to be non-blocking and the handler may
328  * be called multiple times before a complete response is sent.
329  */
330 static void handle_tls_writing(int fd, short event, void* arg);
331 #endif
332 
333 /*
334  * Send all children the quit command (nonblocking), then close the pipe.
335  */
336 static void send_children_quit(struct nsd* nsd);
337 /* same, for shutdown time, waits for child to exit to avoid restart issues */
338 static void send_children_quit_and_wait(struct nsd* nsd);
339 
340 /* set children's flags to send NSD_STATS to them */
341 #ifdef BIND8_STATS
342 static void set_children_stats(struct nsd* nsd);
343 #endif /* BIND8_STATS */
344 
345 /*
346  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
347  */
348 static void configure_handler_event_types(short event_types);
349 
350 static uint16_t *compressed_dname_offsets = 0;
351 static uint32_t compression_table_capacity = 0;
352 static uint32_t compression_table_size = 0;
353 static domain_type* compressed_dnames[MAXRRSPP];
354 
355 #ifdef USE_TCP_FASTOPEN
356 /* Checks to see if the kernel value must be manually changed in order for
357    TCP Fast Open to support server mode */
358 static void report_tcp_fastopen_config() {
359 
360 	int tcp_fastopen_fp;
361 	uint8_t tcp_fastopen_value;
362 
363 	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
364 		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return;
365 	}
366 	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
367 		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
368 		close(tcp_fastopen_fp);
		return;
369 	}
370 	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
371 		log_msg(LOG_WARNING, "TCP Fast Open support is available and configured in NSD by default.\n");
372 		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
373 		log_msg(LOG_WARNING, "To enable TFO use the command:");
374 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
375 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
376 		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
378 	}
379 	close(tcp_fastopen_fp);
380 }
381 #endif
382 
383 /*
384  * Remove the specified pid from the list of child pids.  Returns -1 if
385  * the pid is not in the list, child_num otherwise.  The field is set to 0.
386  */
387 static int
388 delete_child_pid(struct nsd *nsd, pid_t pid)
389 {
390 	size_t i;
391 	for (i = 0; i < nsd->child_count; ++i) {
392 		if (nsd->children[i].pid == pid) {
393 			nsd->children[i].pid = 0;
394 			if(!nsd->children[i].need_to_exit) {
395 				if(nsd->children[i].child_fd != -1)
396 					close(nsd->children[i].child_fd);
397 				nsd->children[i].child_fd = -1;
398 				if(nsd->children[i].handler)
399 					nsd->children[i].handler->fd = -1;
400 			}
401 			return i;
402 		}
403 	}
404 	return -1;
405 }
406 
407 /*
408  * Restart child servers if necessary.
409  */
410 static int
411 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
412 	int* xfrd_sock_p)
413 {
414 	struct main_ipc_handler_data *ipc_data;
415 	size_t i;
416 	int sv[2];
417 
418 	/* Fork the child processes... */
419 	for (i = 0; i < nsd->child_count; ++i) {
420 		if (nsd->children[i].pid <= 0) {
421 			if (nsd->children[i].child_fd != -1)
422 				close(nsd->children[i].child_fd);
423 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
424 				log_msg(LOG_ERR, "socketpair: %s",
425 					strerror(errno));
426 				return -1;
427 			}
428 			nsd->children[i].child_fd = sv[0];
429 			nsd->children[i].parent_fd = sv[1];
430 			nsd->children[i].pid = fork();
431 			switch (nsd->children[i].pid) {
432 			default: /* SERVER MAIN */
433 				close(nsd->children[i].parent_fd);
434 				nsd->children[i].parent_fd = -1;
435 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
436 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
437 				}
438 				if(!nsd->children[i].handler)
439 				{
440 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
441 						region, sizeof(struct main_ipc_handler_data));
442 					ipc_data->nsd = nsd;
443 					ipc_data->child = &nsd->children[i];
444 					ipc_data->child_num = i;
445 					ipc_data->xfrd_sock = xfrd_sock_p;
446 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
447 					ipc_data->forward_mode = 0;
448 					ipc_data->got_bytes = 0;
449 					ipc_data->total_bytes = 0;
450 					ipc_data->acl_num = 0;
451 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
452 						region, sizeof(struct netio_handler));
453 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
454 					nsd->children[i].handler->timeout = NULL;
455 					nsd->children[i].handler->user_data = ipc_data;
456 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
457 					nsd->children[i].handler->event_handler = parent_handle_child_command;
458 					netio_add_handler(netio, nsd->children[i].handler);
459 				}
460 				/* clear any ongoing ipc */
461 				ipc_data = (struct main_ipc_handler_data*)
462 					nsd->children[i].handler->user_data;
463 				ipc_data->forward_mode = 0;
464 				/* restart - update fd */
465 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
466 				break;
467 			case 0: /* CHILD */
468 				/* the child need not be able to access the
469 				 * nsd.db file */
470 				namedb_close_udb(nsd->db);
471 #ifdef MEMCLEAN /* OS collects memory pages */
472 				region_destroy(region);
473 #endif
474 
475 				if (pledge("stdio rpath inet", NULL) == -1) {
476 					log_msg(LOG_ERR, "pledge");
477 					exit(1);
478 				}
479 
480 				nsd->pid = 0;
481 				nsd->child_count = 0;
482 				nsd->server_kind = nsd->children[i].kind;
483 				nsd->this_child = &nsd->children[i];
484 				nsd->this_child->child_num = i;
485 				/* remove signal flags inherited from parent;
486 				   the parent will handle them. */
487 				nsd->signal_hint_reload_hup = 0;
488 				nsd->signal_hint_reload = 0;
489 				nsd->signal_hint_child = 0;
490 				nsd->signal_hint_quit = 0;
491 				nsd->signal_hint_shutdown = 0;
492 				nsd->signal_hint_stats = 0;
493 				nsd->signal_hint_statsusr = 0;
494 				close(*xfrd_sock_p);
495 				close(nsd->this_child->child_fd);
496 				nsd->this_child->child_fd = -1;
497 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
498 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
499 				}
500 				server_child(nsd);
501 				/* NOTREACHED */
502 				exit(0);
503 			case -1:
504 				log_msg(LOG_ERR, "fork failed: %s",
505 					strerror(errno));
506 				return -1;
507 			}
508 		}
509 	}
510 	return 0;
511 }
512 
513 #ifdef BIND8_STATS
514 static void set_bind8_alarm(struct nsd* nsd)
515 {
516 	/* resync so that the next alarm is on the next whole minute */
517 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
518 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
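	/* Editor's example: with st.period == 60 and time(NULL) % 60 == 56,
	 * this arms alarm(4), i.e. the next whole minute. */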
519 }
520 #endif
521 
522 /* set zone stat ids for zones initially read in */
523 static void
524 zonestatid_tree_set(struct nsd* nsd)
525 {
526 	struct radnode* n;
527 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
528 		zone_type* zone = (zone_type*)n->elem;
529 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
530 	}
531 }
532 
533 #ifdef USE_ZONE_STATS
534 void
535 server_zonestat_alloc(struct nsd* nsd)
536 {
537 	size_t num = (nsd->options->zonestatnames->count==0?1:
538 			nsd->options->zonestatnames->count);
539 	size_t sz = sizeof(struct nsdst)*num;
540 	char tmpfile[256];
541 	uint8_t z = 0;
542 
543 	/* file names */
544 	nsd->zonestatfname[0] = 0;
545 	nsd->zonestatfname[1] = 0;
546 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
547 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
548 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
549 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
550 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
551 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
552 
553 	/* file descriptors */
554 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
555 	if(nsd->zonestatfd[0] == -1) {
556 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
557 			strerror(errno));
558 		exit(1);
559 	}
560 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
561 	if(nsd->zonestatfd[1] == -1) {
562 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
563 			strerror(errno));
564 		close(nsd->zonestatfd[0]);
565 		unlink(nsd->zonestatfname[0]);
566 		exit(1);
567 	}
568 
569 #ifdef HAVE_MMAP
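	/* Editor's note: the lseek()+write() pairs below grow each file to
	 * sz bytes (seek to sz-1, write a single byte) before mmap()ing it,
	 * a portable alternative to ftruncate(2) for extending a file. */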
570 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
571 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
572 			strerror(errno));
573 		exit(1);
574 	}
575 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
576 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
577 			nsd->zonestatfname[0], strerror(errno));
578 		exit(1);
579 	}
580 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
581 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
582 			strerror(errno));
583 		exit(1);
584 	}
585 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
586 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
587 			nsd->zonestatfname[1], strerror(errno));
588 		exit(1);
589 	}
590 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
591 		MAP_SHARED, nsd->zonestatfd[0], 0);
592 	if(nsd->zonestat[0] == MAP_FAILED) {
593 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
594 		unlink(nsd->zonestatfname[0]);
595 		unlink(nsd->zonestatfname[1]);
596 		exit(1);
597 	}
598 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
599 		MAP_SHARED, nsd->zonestatfd[1], 0);
600 	if(nsd->zonestat[1] == MAP_FAILED) {
601 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
602 		unlink(nsd->zonestatfname[0]);
603 		unlink(nsd->zonestatfname[1]);
604 		exit(1);
605 	}
606 	memset(nsd->zonestat[0], 0, sz);
607 	memset(nsd->zonestat[1], 0, sz);
608 	nsd->zonestatsize[0] = num;
609 	nsd->zonestatsize[1] = num;
610 	nsd->zonestatdesired = num;
611 	nsd->zonestatsizenow = num;
612 	nsd->zonestatnow = nsd->zonestat[0];
613 #endif /* HAVE_MMAP */
614 }
615 
616 void
617 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
618 {
619 #ifdef HAVE_MMAP
620 #ifdef MREMAP_MAYMOVE
621 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
622 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
623 		MREMAP_MAYMOVE);
624 	if(nsd->zonestat[idx] == MAP_FAILED) {
625 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
626 		exit(1);
627 	}
628 #else /* !MREMAP_MAYMOVE */
629 	if(msync(nsd->zonestat[idx],
630 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
631 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
632 	if(munmap(nsd->zonestat[idx],
633 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
634 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
635 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
636 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
637 	if(nsd->zonestat[idx] == MAP_FAILED) {
638 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
639 		exit(1);
640 	}
641 #endif /* MREMAP_MAYMOVE */
642 #endif /* HAVE_MMAP */
643 }
644 
645 /* realloc the zonestat array for the one that is not currently in use,
646  * to match the desired new size of the array (if applicable) */
647 void
648 server_zonestat_realloc(struct nsd* nsd)
649 {
650 #ifdef HAVE_MMAP
651 	uint8_t z = 0;
652 	size_t sz;
653 	int idx = 0; /* index of the zonestat array that is not in use */
654 	if(nsd->zonestatnow == nsd->zonestat[0])
655 		idx = 1;
656 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
657 		return;
658 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
659 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
660 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
661 			strerror(errno));
662 		exit(1);
663 	}
664 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
665 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
666 			nsd->zonestatfname[idx], strerror(errno));
667 		exit(1);
668 	}
669 	zonestat_remap(nsd, idx, sz);
670 	/* zero the newly allocated region */
671 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
672 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
673 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
674 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
675 	}
676 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
677 #endif /* HAVE_MMAP */
678 }
679 
680 /* switch over to the other array for the new children, which briefly
681  * coexist with the old children, so that old and new children do not
682  * write to the same statistics array. */
683 void
684 server_zonestat_switch(struct nsd* nsd)
685 {
686 	if(nsd->zonestatnow == nsd->zonestat[0]) {
687 		nsd->zonestatnow = nsd->zonestat[1];
688 		nsd->zonestatsizenow = nsd->zonestatsize[1];
689 	} else {
690 		nsd->zonestatnow = nsd->zonestat[0];
691 		nsd->zonestatsizenow = nsd->zonestatsize[0];
692 	}
693 }
694 #endif /* USE_ZONE_STATS */
695 
696 static void
697 cleanup_dname_compression_tables(void *ptr)
698 {
699 	free(ptr);
700 	compressed_dname_offsets = NULL;
701 	compression_table_capacity = 0;
702 }
703 
704 static void
705 initialize_dname_compression_tables(struct nsd *nsd)
706 {
707 	size_t needed = domain_table_count(nsd->db->domains) + 1;
708 	needed += EXTRA_DOMAIN_NUMBERS;
709 	if(compression_table_capacity < needed) {
710 		if(compressed_dname_offsets) {
711 			region_remove_cleanup(nsd->db->region,
712 				cleanup_dname_compression_tables,
713 				compressed_dname_offsets);
714 			free(compressed_dname_offsets);
715 		}
716 		compressed_dname_offsets = (uint16_t *) xmallocarray(
717 			needed, sizeof(uint16_t));
718 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
719 			compressed_dname_offsets);
720 		compression_table_capacity = needed;
721 		compression_table_size=domain_table_count(nsd->db->domains)+1;
722 	}
723 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
724 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
725 }
726 
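/*
 * Editor's note on the set_* socket option helpers below: by
 * convention they return 1 if the option was applied, -1 if setting it
 * failed, and 0 where the option is not supported on this platform
 * (compiled out).
 */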
727 static int
728 set_cloexec(struct nsd_socket *sock)
729 {
730 	assert(sock != NULL);
731 
732 	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
733 		const char *socktype =
734 			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
735 		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
736 			socktype, strerror(errno));
737 		return -1;
738 	}
739 
740 	return 1;
741 }
742 
743 static int
744 set_reuseport(struct nsd_socket *sock)
745 {
746 #ifdef SO_REUSEPORT
747 	int on = 1;
748 #ifdef SO_REUSEPORT_LB
749 	/* FreeBSD 12 has SO_REUSEPORT_LB, which does load balancing like
750 	 * SO_REUSEPORT on Linux. That is what users expect from the config
751 	 * option in nsd.conf; if local address and port reuse is actually
752 	 * needed, SO_REUSEPORT has to be set for them as well, so assume
753 	 * it is _LB they want.
754 	 */
755 	int opt = SO_REUSEPORT_LB;
756 	static const char optname[] = "SO_REUSEPORT_LB";
757 #else /* !SO_REUSEPORT_LB */
758 	int opt = SO_REUSEPORT;
759 	static const char optname[] = "SO_REUSEPORT";
760 #endif /* SO_REUSEPORT_LB */
761 
762 	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
763 		return 1;
764 	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
765 		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
766 			optname, strerror(errno));
767 	}
768 	return -1;
769 #else
770 	(void)sock;
771 #endif /* SO_REUSEPORT */
772 
773 	return 0;
774 }
775 
776 static int
777 set_reuseaddr(struct nsd_socket *sock)
778 {
779 #ifdef SO_REUSEADDR
780 	int on = 1;
781 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
782 		return 1;
783 	}
784 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
785 		strerror(errno));
786 	return -1;
787 #endif /* SO_REUSEADDR */
788 	return 0;
789 }
790 
791 static int
792 set_rcvbuf(struct nsd_socket *sock, int rcv)
793 {
794 #ifdef SO_RCVBUF
795 #ifdef SO_RCVBUFFORCE
796 	if(0 == setsockopt(
797 		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
798 	{
799 		return 1;
800 	}
801 	if(errno == EPERM || errno == ENOBUFS) {
802 		return 0;
803 	}
804 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
805 		strerror(errno));
806 	return -1;
807 #else /* !SO_RCVBUFFORCE */
808 	if (0 == setsockopt(
809 		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
810 	{
811 		return 1;
812 	}
813 	if(errno == ENOSYS || errno == ENOBUFS) {
814 		return 0;
815 	}
816 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
817 		strerror(errno));
818 	return -1;
819 #endif /* SO_RCVBUFFORCE */
820 #endif /* SO_RCVBUF */
821 
822 	return 0;
823 }
824 
825 static int
826 set_sndbuf(struct nsd_socket *sock, int snd)
827 {
828 #ifdef SO_SNDBUF
829 #ifdef SO_SNDBUFFORCE
830 	if(0 == setsockopt(
831 		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
832 	{
833 		return 1;
834 	}
835 	if(errno == EPERM || errno == ENOBUFS) {
836 		return 0;
837 	}
838 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
839 		strerror(errno));
840 	return -1;
841 #else /* !SO_SNDBUFFORCE */
842 	if(0 == setsockopt(
843 		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
844 	{
845 		return 1;
846 	}
847 	if(errno == ENOSYS || errno == ENOBUFS) {
848 		return 0;
849 	}
850 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
851 		strerror(errno));
852 	return -1;
853 #endif /* SO_SNDBUFFORCE */
854 #endif /* SO_SNDBUF */
855 
856 	return 0;
857 }
858 
859 static int
860 set_nonblock(struct nsd_socket *sock)
861 {
862 	const char *socktype =
863 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
864 
865 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
866 		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
867 			socktype, strerror(errno));
868 		return -1;
869 	}
870 
871 	return 1;
872 }
873 
874 #ifdef INET6
875 static int
876 set_ipv6_v6only(struct nsd_socket *sock)
877 {
878 #ifdef IPV6_V6ONLY
879 	int on = 1;
880 	const char *socktype =
881 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
882 
883 	if(0 == setsockopt(
884 		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
885 	{
886 		return 1;
887 	}
888 
889 	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
890 		socktype, strerror(errno));
891 	return -1;
892 #else
893 	(void)sock;
894 #endif /* IPV6_V6ONLY */
895 
896 	return 0;
897 }
898 #endif /* INET6 */
899 
900 #ifdef INET6
901 static int
902 set_ipv6_use_min_mtu(struct nsd_socket *sock)
903 {
904 #if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
905 #if defined(IPV6_USE_MIN_MTU)
906 	/* There is no fragmentation of IPv6 datagrams during forwarding in the
907 	 * network. Therefore we do not send UDP datagrams larger than the
908 	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
909 	 * larger if the network stack supports IPV6_USE_MIN_MTU.
910 	 */
911 	int opt = IPV6_USE_MIN_MTU;
912 	int optval = 1;
913 	static const char optname[] = "IPV6_USE_MIN_MTU";
914 #elif defined(IPV6_MTU)
915 	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
916 	 * to the MIN MTU to get the same.
917 	 */
918 	int opt = IPV6_MTU;
919 	int optval = IPV6_MIN_MTU;
920 	static const char optname[] = "IPV6_MTU";
921 #endif
922 	if(0 == setsockopt(
923 		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
924 	{
925 		return 1;
926 	}
927 
928 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
929 		optname, strerror(errno));
930 	return -1;
931 #else
932 	(void)sock;
933 #endif /* IPV6_USE_MIN_MTU || IPV6_MTU */
934 
935 	return 0;
936 }
937 #endif /* INET6 */
938 
939 static int
940 set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
941 {
942 	int ret = 0;
943 
944 #if defined(IP_MTU_DISCOVER)
945 	int opt = IP_MTU_DISCOVER;
946 	int optval;
947 # if defined(IP_PMTUDISC_OMIT)
948 	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
949 	 * information and send packets with DF=0. Fragmentation is allowed if
950 	 * and only if the packet size exceeds the outgoing interface MTU or
951 	 * the packet encounters a smaller MTU link in the network. This mitigates
952 	 * DNS fragmentation attacks by preventing forged PMTU information.
953 	 * FreeBSD already has the same semantics without setting the option.
954 	 */
955 	optval = IP_PMTUDISC_OMIT;
956 	if(0 == setsockopt(
957 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
958 	{
959 		return 1;
960 	}
961 
962 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
963 		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
964 # endif /* IP_PMTUDISC_OMIT */
965 # if defined(IP_PMTUDISC_DONT)
966 	/* Fall back to IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed or is undefined. */
967 	optval = IP_PMTUDISC_DONT;
968 	if(0 == setsockopt(
969 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
970 	{
971 		return 1;
972 	}
973 
974 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
975 		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
976 # endif
977 	ret = -1;
978 #elif defined(IP_DONTFRAG)
979 	int off = 0;
980 	if (0 == setsockopt(
981 		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
982 	{
983 		return 1;
984 	}
985 
986 	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
987 		strerror(errno));
988 	ret = -1;
989 #else
990 	(void)sock;
991 #endif
992 
993 	return ret;
994 }
995 
996 static int
997 set_ip_freebind(struct nsd_socket *sock)
998 {
999 #ifdef IP_FREEBIND
1000 	int on = 1;
1001 	const char *socktype =
1002 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1003 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
1004 	{
1005 		return 1;
1006 	}
1007 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
1008 		socktype, strerror(errno));
1009 	return -1;
1010 #else
1011 	(void)sock;
1012 #endif /* IP_FREEBIND */
1013 
1014 	return 0;
1015 }
1016 
1017 static int
1018 set_ip_transparent(struct nsd_socket *sock)
1019 {
1020 	/*
1021 	The scandalous preprocessor blob here calls for some explanation :)
1022 	POSIX does not specify an option to bind non-local IPs, so
1023 	platforms developed several implementation-specific options,
1024 	all set in the same way, but with different names.
1025 	For additional complexity, some platforms manage this setting
1026 	differently for different address families (IPv4 vs IPv6).
1027 	The preprocessor blob below abstracts such variability
1028 	in a way that leaves the C code as lean and clear as possible.
1029 	*/
1030 
1031 #if defined(IP_TRANSPARENT)
1032 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
1033 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1034 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
1035 /* as of 2020-01, Linux does not support this on IPv6 programmatically */
1036 #elif defined(SO_BINDANY)
1037 #	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
1038 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
1039 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
1040 #elif defined(IP_BINDANY)
1041 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
1042 #	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
1043 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1044 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
1045 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
1046 #endif
1047 
1048 #ifndef NSD_SOCKET_OPTION_TRANSPARENT
1049 	(void)sock;
1050 #else
1051 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
1052 #		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
1053 #	endif
1054 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
1055 #		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
1056 #	endif
1057 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
1058 #		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
1059 #	endif
1060 
1061 	int on = 1;
1062 	const char *socktype =
1063 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1064 	const int is_ip6 = (sock->addr.ai_family == AF_INET6);
1065 
1066 	if(0 == setsockopt(
1067 		sock->s,
1068 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
1069 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
1070 		&on, sizeof(on)))
1071 	{
1072 		return 1;
1073 	}
1074 
1075 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
1076 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
1077 	return -1;
1078 #endif
1079 
1080 	return 0;
1081 }
1082 
1083 static int
1084 set_tcp_maxseg(struct nsd_socket *sock, int mss)
1085 {
1086 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
1087 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
1088 		return 1;
1089 	}
1090 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
1091 		strerror(errno));
1092 	return -1;
1093 #else
1094 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
1095 #endif
1096 	return 0;
1097 }
1098 
1099 #ifdef USE_TCP_FASTOPEN
1100 static int
1101 set_tcp_fastopen(struct nsd_socket *sock)
1102 {
1103 	/* qlen specifies how many outstanding TFO requests to allow. The limit
1104 	 * is a defense against IP spoofing attacks, as suggested in RFC 7413.
1105 	 */
1106 	int qlen;
1107 
1108 #ifdef __APPLE__
1109 	/* The macOS implementation only supports a qlen of 1 via this call. The
1110 	 * actual value is configured by the net.inet.tcp.fastopen_backlog
1111 	 * kernel parameter.
1112 	 */
1113 	qlen = 1;
1114 #else
1115 	/* 5 is recommended on Linux. */
1116 	qlen = 5;
1117 #endif
1118 	if (0 == setsockopt(
1119 		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
1120 	{
1121 		return 1;
1122 	}
1123 
1124 	if (errno == EPERM) {
1125 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
1126 				 "; this could likely be because sysctl "
1127 				 "net.inet.tcp.fastopen.enabled, "
1128 				 "net.inet.tcp.fastopen.server_enable, or "
1129 				 "net.ipv4.tcp_fastopen is disabled",
1130 			strerror(errno));
1131 	/* Squelch ENOPROTOOPT (FreeBSD server mode with kernel TFO support
1132 	 * disabled), except when verbosity is enabled for debugging
1133 	 */
1134 	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
1135 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
1136 			strerror(errno));
1137 	}
1138 
1139 	return (errno == ENOPROTOOPT ? 0 : -1);
1140 }
1141 #endif /* USE_TCP_FASTOPEN */
1142 
1143 static int
1144 set_bindtodevice(struct nsd_socket *sock)
1145 {
1146 #if defined(SO_BINDTODEVICE)
1147 	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
1148 		sock->device, strlen(sock->device)) == -1)
1149 	{
1150 		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1151 		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
1152 		return -1;
1153 	}
1154 
1155 	return 1;
1156 #else
1157 	(void)sock;
1158 	return 0;
1159 #endif
1160 }
1161 
1162 static int
1163 set_setfib(struct nsd_socket *sock)
1164 {
1165 #if defined(SO_SETFIB)
1166 	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
1167 	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
1168 	{
1169 		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
1170 		                 "SO_SETFIB", sock->fib, strerror(errno));
1171 		return -1;
1172 	}
1173 
1174 	return 1;
1175 #else
1176 	(void)sock;
1177 	return 0;
1178 #endif
1179 }
1180 
1181 static int
1182 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1183 {
1184 	int rcv = 1*1024*1024, snd = 1*1024*1024;
1185 
1186 	if(-1 == (sock->s = socket(
1187 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1188 	{
1189 #ifdef INET6
1190 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1191 		   (sock->addr.ai_family == AF_INET6) &&
1192 		   (errno == EAFNOSUPPORT))
1193 		{
1194 			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1195 				"not supported");
1196 			return 0;
1197 		}
1198 #endif
1199 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1200 		return -1;
1201 	}
1202 
1203 	set_cloexec(sock);
1204 
1205 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1206 		*reuseport_works = (set_reuseport(sock) == 1);
1207 
1208 	if(nsd->options->receive_buffer_size > 0)
1209 		rcv = nsd->options->receive_buffer_size;
1210 	if(set_rcvbuf(sock, rcv) == -1)
1211 		return -1;
1212 
1213 	if(nsd->options->send_buffer_size > 0)
1214 		snd = nsd->options->send_buffer_size;
1215 	if(set_sndbuf(sock, snd) == -1)
1216 		return -1;
1217 #ifdef INET6
1218 	if(sock->addr.ai_family == AF_INET6) {
1219 		if(set_ipv6_v6only(sock) == -1 ||
1220 		   set_ipv6_use_min_mtu(sock) == -1)
1221 			return -1;
1222 	} else
1223 #endif /* INET6 */
1224 	if(sock->addr.ai_family == AF_INET) {
1225 		if(set_ipv4_no_pmtu_disc(sock) == -1)
1226 			return -1;
1227 	}
1228 
1229 	/* Set socket to non-blocking. Otherwise, on operating systems
1230 	 * with thundering herd problems, the UDP recv could block
1231 	 * after select returns readable.
1232 	 */
1233 	set_nonblock(sock);
1234 
1235 	if(nsd->options->ip_freebind)
1236 		(void)set_ip_freebind(sock);
1237 	if(nsd->options->ip_transparent)
1238 		(void)set_ip_transparent(sock);
1239 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1240 		return -1;
1241 	if(sock->fib != -1 && set_setfib(sock) == -1)
1242 		return -1;
1243 
1244 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1245 		char buf[256];
1246 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1247 		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1248 			buf, strerror(errno));
1249 		return -1;
1250 	}
1251 
1252 	return 1;
1253 }
1254 
1255 static int
1256 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1257 {
1258 #ifdef USE_TCP_FASTOPEN
1259 	report_tcp_fastopen_config();
1260 #endif
1261 
1262 	(void)reuseport_works;
1263 
1264 	if(-1 == (sock->s = socket(
1265 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1266 	{
1267 #ifdef INET6
1268 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1269 		   (sock->addr.ai_family == AF_INET6) &&
1270 		   (errno == EAFNOSUPPORT))
1271 		{
1272 			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1273 			                     "not supported");
1274 			return 0;
1275 		}
1276 #endif /* INET6 */
1277 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1278 		return -1;
1279 	}
1280 
1281 	set_cloexec(sock);
1282 
1283 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1284 		*reuseport_works = (set_reuseport(sock) == 1);
1285 
1286 	(void)set_reuseaddr(sock);
1287 
1288 #ifdef INET6
1289 	if(sock->addr.ai_family == AF_INET6) {
1290 		if (set_ipv6_v6only(sock) == -1 ||
1291 		    set_ipv6_use_min_mtu(sock) == -1)
1292 			return -1;
1293 	}
1294 #endif
1295 
1296 	if(nsd->tcp_mss > 0)
1297 		set_tcp_maxseg(sock, nsd->tcp_mss);
1298 	/* If the TCP listening socket is blocking, it may block in accept,
1299 	   even if select() says readable (Stevens, UNP p463). */
1300 	(void)set_nonblock(sock);
1301 	if(nsd->options->ip_freebind)
1302 		(void)set_ip_freebind(sock);
1303 	if(nsd->options->ip_transparent)
1304 		(void)set_ip_transparent(sock);
1305 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1306 		return -1;
1307 	if(sock->fib != -1 && set_setfib(sock) == -1)
1308 		return -1;
1309 
1310 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1311 		char buf[256];
1312 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1313 		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1314 			buf, strerror(errno));
1315 		return -1;
1316 	}
1317 
1318 #ifdef USE_TCP_FASTOPEN
1319 	(void)set_tcp_fastopen(sock);
1320 #endif
1321 
1322 	if(listen(sock->s, TCP_BACKLOG) == -1) {
1323 		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1324 		return -1;
1325 	}
1326 
1327 	return 1;
1328 }
1329 
1330 /*
1331  * Initialize the server, reuseport, create and bind the sockets.
1332  */
1333 int
1334 server_init(struct nsd *nsd)
1335 {
1336 	size_t i;
1337 	int reuseport = 1; /* Determine if REUSEPORT works. */
1338 
1339 	/* open server interface ports */
1340 	for(i = 0; i < nsd->ifs; i++) {
1341 		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1342 		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1343 		{
1344 			return -1;
1345 		}
1346 	}
1347 
1348 	if(nsd->reuseport && reuseport) {
1349 		size_t ifs = nsd->ifs * nsd->reuseport;
1350 
1351 		/* increase the size of the interface arrays, there are going
1352 		 * to be separate interface file descriptors for every server
1353 		 * instance */
1354 		region_remove_cleanup(nsd->region, free, nsd->udp);
1355 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1356 
1357 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1358 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1359 		region_add_cleanup(nsd->region, free, nsd->udp);
1360 		region_add_cleanup(nsd->region, free, nsd->tcp);
1361 		if(ifs > nsd->ifs) {
1362 			memset(&nsd->udp[nsd->ifs], 0,
1363 				(ifs-nsd->ifs)*sizeof(*nsd->udp));
1364 			memset(&nsd->tcp[nsd->ifs], 0,
1365 				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
1366 		}
1367 
1368 		for(i = nsd->ifs; i < ifs; i++) {
1369 			nsd->udp[i] = nsd->udp[i%nsd->ifs];
1370 			nsd->udp[i].s = -1;
1371 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1372 				return -1;
1373 			}
1374 			/* Turn off REUSEPORT for TCP by copying the socket
1375 			 * file descriptor.
1376 			 * This means we must not close TCP sockets used by
1377 			 * other servers in reuseport-enabled mode, in
1378 			 * server_child().
1379 			 */
1380 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1381 		}
1382 
1383 		nsd->ifs = ifs;
1384 	} else {
1385 		nsd->reuseport = 0;
1386 	}
1387 
1388 	/* open server interface ports for verifiers */
1389 	for(i = 0; i < nsd->verify_ifs; i++) {
1390 		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
1391 		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
1392 		{
1393 			return -1;
1394 		}
1395 	}
1396 
1397 	return 0;
1398 }
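/*
 * Editor's example for server_init() above: with nsd->ifs == 2 and
 * reuse-port 3 in nsd.conf, the arrays grow to 6 entries.  udp[2..5]
 * are freshly opened SO_REUSEPORT sockets bound to the same addresses
 * as udp[0..1], while tcp[2..5] simply alias the descriptors of
 * tcp[0..1], so those must not be closed twice in server_child().
 */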
1399 
1400 /*
1401  * Prepare the server for takeoff.
1402  *
1403  */
1404 int
1405 server_prepare(struct nsd *nsd)
1406 {
1407 #ifdef RATELIMIT
1408 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
1409 #ifdef HAVE_GETRANDOM
1410 	uint32_t v;
1411 	if(getrandom(&v, sizeof(v), 0) == -1) {
1412 		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
1413 		exit(1);
1414 	}
1415 	hash_set_raninit(v);
1416 #elif defined(HAVE_ARC4RANDOM)
1417 	hash_set_raninit(arc4random());
1418 #else
1419 	uint32_t v = getpid() ^ time(NULL);
1420 	srandom((unsigned long)v);
1421 #  ifdef HAVE_SSL
1422 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1423 		hash_set_raninit(v);
1424 	else
1425 #  endif
1426 		hash_set_raninit(random());
1427 #endif
1428 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1429 		nsd->options->rrl_ratelimit,
1430 		nsd->options->rrl_whitelist_ratelimit,
1431 		nsd->options->rrl_slip,
1432 		nsd->options->rrl_ipv4_prefix_length,
1433 		nsd->options->rrl_ipv6_prefix_length);
1434 #endif /* RATELIMIT */
1435 
1436 	/* Open the database... */
1437 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
1438 		log_msg(LOG_ERR, "unable to open the database %s: %s",
1439 			nsd->dbfile, strerror(errno));
1440 		unlink(nsd->task[0]->fname);
1441 		unlink(nsd->task[1]->fname);
1442 #ifdef USE_ZONE_STATS
1443 		unlink(nsd->zonestatfname[0]);
1444 		unlink(nsd->zonestatfname[1]);
1445 #endif
1446 		xfrd_del_tempdir(nsd);
1447 		return -1;
1448 	}
1449 	/* check if zone files have been modified */
1450 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1451 	 * for all zones */
1452 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
1453 		nsd->options->database[0] == 0))
1454 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1455 	zonestatid_tree_set(nsd);
1456 
1457 	compression_table_capacity = 0;
1458 	initialize_dname_compression_tables(nsd);
1459 
1460 #ifdef	BIND8_STATS
1461 	/* Initialize times... */
1462 	time(&nsd->st.boot);
1463 	set_bind8_alarm(nsd);
1464 #endif /* BIND8_STATS */
1465 
1466 	return 0;
1467 }
1468 
1469 /*
1470  * Fork the required number of servers.
1471  */
1472 static int
1473 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1474 	int* xfrd_sock_p)
1475 {
1476 	size_t i;
1477 
1478 	/* Start all child servers initially.  */
1479 	for (i = 0; i < nsd->child_count; ++i) {
1480 		nsd->children[i].pid = 0;
1481 	}
1482 
1483 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1484 }
1485 
1486 static void
1487 server_close_socket(struct nsd_socket *sock)
1488 {
1489 	if(sock->s != -1) {
1490 		close(sock->s);
1491 		sock->s = -1;
1492 	}
1493 }
1494 
1495 void
1496 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1497 {
1498 	size_t i;
1499 
1500 	/* Close all the sockets... */
1501 	for (i = 0; i < n; ++i) {
1502 		server_close_socket(&sockets[i]);
1503 	}
1504 }
1505 
1506 /*
1507  * Close the sockets, shutdown the server and exit.
1508  * Does not return.
1509  */
1510 void
1511 server_shutdown(struct nsd *nsd)
1512 {
1513 	size_t i;
1514 
1515 	server_close_all_sockets(nsd->udp, nsd->ifs);
1516 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1517 	/* CHILD: close command channel to parent */
1518 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1519 	{
1520 		close(nsd->this_child->parent_fd);
1521 		nsd->this_child->parent_fd = -1;
1522 	}
1523 	/* SERVER: close command channels to children */
1524 	if(!nsd->this_child)
1525 	{
1526 		for(i=0; i < nsd->child_count; ++i)
1527 			if(nsd->children[i].child_fd != -1)
1528 			{
1529 				close(nsd->children[i].child_fd);
1530 				nsd->children[i].child_fd = -1;
1531 			}
1532 	}
1533 
1534 	tsig_finalize();
1535 #ifdef HAVE_SSL
1536 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1537 	if (nsd->tls_ctx)
1538 		SSL_CTX_free(nsd->tls_ctx);
1539 #endif
1540 
1541 #ifdef MEMCLEAN /* OS collects memory pages */
1542 #ifdef RATELIMIT
1543 	rrl_mmap_deinit_keep_mmap();
1544 #endif
1545 #ifdef USE_DNSTAP
1546 	dt_collector_destroy(nsd->dt_collector, nsd);
1547 #endif
1548 	udb_base_free_keep_mmap(nsd->task[0]);
1549 	udb_base_free_keep_mmap(nsd->task[1]);
1550 	namedb_free_ixfr(nsd->db);
1551 	namedb_close_udb(nsd->db); /* keeps mmap */
1552 	namedb_close(nsd->db);
1553 	nsd_options_destroy(nsd->options);
1554 	region_destroy(nsd->region);
1555 #endif
1556 	log_finalize();
1557 	exit(0);
1558 }
1559 
1560 void
1561 server_prepare_xfrd(struct nsd* nsd)
1562 {
1563 	char tmpfile[256];
1564 	/* create task mmaps */
1565 	nsd->mytask = 0;
1566 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1567 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1568 	nsd->task[0] = task_file_create(tmpfile);
1569 	if(!nsd->task[0]) {
1570 #ifdef USE_ZONE_STATS
1571 		unlink(nsd->zonestatfname[0]);
1572 		unlink(nsd->zonestatfname[1]);
1573 #endif
1574 		xfrd_del_tempdir(nsd);
1575 		exit(1);
1576 	}
1577 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1578 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1579 	nsd->task[1] = task_file_create(tmpfile);
1580 	if(!nsd->task[1]) {
1581 		unlink(nsd->task[0]->fname);
1582 #ifdef USE_ZONE_STATS
1583 		unlink(nsd->zonestatfname[0]);
1584 		unlink(nsd->zonestatfname[1]);
1585 #endif
1586 		xfrd_del_tempdir(nsd);
1587 		exit(1);
1588 	}
1589 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1590 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1591 	/* create xfrd listener structure */
1592 	nsd->xfrd_listener = region_alloc(nsd->region,
1593 		sizeof(netio_handler_type));
1594 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1595 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1596 	nsd->xfrd_listener->fd = -1;
1597 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1598 		nsd;
1599 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1600 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1601 }
1602 
1603 
1604 void
1605 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1606 {
1607 	pid_t pid;
1608 	int sockets[2] = {0,0};
1609 	struct ipc_handler_conn_data *data;
1610 
1611 	if(nsd->xfrd_listener->fd != -1)
1612 		close(nsd->xfrd_listener->fd);
1613 	if(del_db) {
1614 		/* recreate the taskdb that xfrd was using; it may be corrupt */
1615 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1616 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1617 		nsd->task[1-nsd->mytask]->fname = NULL;
1618 		/* free alloc already, so udb does not shrink itself */
1619 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1620 		nsd->task[1-nsd->mytask]->alloc = NULL;
1621 		udb_base_free(nsd->task[1-nsd->mytask]);
1622 		/* create new file, overwrite the old one */
1623 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1624 		free(tmpfile);
1625 	}
1626 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1627 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1628 		return;
1629 	}
1630 	pid = fork();
1631 	switch (pid) {
1632 	case -1:
1633 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1634 		break;
1635 	default:
1636 		/* PARENT: close first socket, use second one */
1637 		close(sockets[0]);
1638 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1639 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1640 		}
1641 		if(del_db) xfrd_free_namedb(nsd);
1642 		/* use the task other than the one I am using; if xfrd died and
1643 		 * is restarted, the reload is using nsd->mytask */
1644 		nsd->mytask = 1 - nsd->mytask;
1645 
1646 #ifdef HAVE_SETPROCTITLE
1647 		setproctitle("xfrd");
1648 #endif
1649 #ifdef HAVE_CPUSET_T
1650 		if(nsd->use_cpu_affinity) {
1651 			set_cpu_affinity(nsd->xfrd_cpuset);
1652 		}
1653 #endif
1654 
1655 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1656 		/* NOTREACHED */
1657 		break;
1658 	case 0:
1659 		/* CHILD: close second socket, use first one */
1660 		close(sockets[1]);
1661 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1662 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1663 		}
1664 		nsd->xfrd_listener->fd = sockets[0];
1665 		break;
1666 	}
1667 	/* server-parent only */
1668 	nsd->xfrd_listener->timeout = NULL;
1669 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1670 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1671 	/* clear ongoing ipc reads */
1672 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1673 	data->conn->is_reading = 0;
1674 }
1675 
1676 /** add all soainfo to taskdb */
1677 static void
1678 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1679 {
1680 	struct radnode* n;
1681 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1682 	/* add all SOA INFO to mytask */
1683 	udb_ptr_init(&task_last, taskudb);
1684 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1685 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1686 	}
1687 	udb_ptr_unlink(&task_last, taskudb);
1688 }
1689 
1690 void
1691 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1692 {
1693 	/* normally this sends the SOAs to xfrd and receives the expire info back.
1694 	 *   parent fills one taskdb with soas, xfrd fills the other with expires.
1695 	 *   then they exchange and process.
1696 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1697 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1698 	 *   expire notifications can be sent back via a normal reload later
1699 	 *   (xfrd will wait for current running reload to finish if any).
1700 	 */
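	/*
	 * Editor's sketch of the normal (!shortsoa) exchange, as
	 * implemented below:
	 *
	 *   parent (this function)              xfrd
	 *     fill task[mytask] with SOAs
	 *       <-------- NSD_RELOAD ---------  (task ready)
	 *       --- NSD_RELOAD_DONE, mypid -->
	 *     swap mytask, process the expire
	 *     tasks filled in by xfrd
	 *       ------ NSD_RELOAD_DONE ------>  (task emptied)
	 */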
1701 	sig_atomic_t cmd = 0;
1702 	pid_t mypid;
1703 	int xfrd_sock = nsd->xfrd_listener->fd;
1704 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1705 	udb_ptr t;
1706 	if(!shortsoa) {
1707 		if(nsd->signal_hint_shutdown) {
1708 		shutdown:
1709 			log_msg(LOG_WARNING, "signal received, shutting down...");
1710 			server_close_all_sockets(nsd->udp, nsd->ifs);
1711 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1712 #ifdef HAVE_SSL
1713 			daemon_remote_close(nsd->rc);
1714 #endif
1715 			/* Unlink it if possible... */
1716 			unlinkpid(nsd->pidfile);
1717 			unlink(nsd->task[0]->fname);
1718 			unlink(nsd->task[1]->fname);
1719 #ifdef USE_ZONE_STATS
1720 			unlink(nsd->zonestatfname[0]);
1721 			unlink(nsd->zonestatfname[1]);
1722 #endif
1723 			/* write the nsd.db to disk, wait for it to complete */
1724 			udb_base_sync(nsd->db->udb, 1);
1725 			udb_base_close(nsd->db->udb);
1726 			server_shutdown(nsd);
1727 			/* NOTREACHED */
1728 			exit(0);
1729 		}
1730 	}
1731 	if(shortsoa) {
1732 		/* put SOA in xfrd task because mytask may be in use */
1733 		taskudb = nsd->task[1-nsd->mytask];
1734 	}
1735 
1736 	add_all_soa_to_task(nsd, taskudb);
1737 	if(!shortsoa) {
1738 		/* wait for xfrd to signal task is ready, RELOAD signal */
1739 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1740 			cmd != NSD_RELOAD) {
1741 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1742 			exit(1);
1743 		}
1744 		if(nsd->signal_hint_shutdown) {
1745 			goto shutdown;
1746 		}
1747 	}
1748 	/* give xfrd our task, signal it with RELOAD_DONE */
1749 	task_process_sync(taskudb);
1750 	cmd = NSD_RELOAD_DONE;
1751 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1752 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1753 			(int)nsd->pid, strerror(errno));
1754 	}
1755 	mypid = getpid();
1756 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1757 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1758 			strerror(errno));
1759 	}
1760 
1761 	if(!shortsoa) {
1762 		/* process the tasks xfrd wrote (expiry data) */
1763 		nsd->mytask = 1 - nsd->mytask;
1764 		taskudb = nsd->task[nsd->mytask];
1765 		task_remap(taskudb);
1766 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1767 		while(!udb_ptr_is_null(&t)) {
1768 			task_process_expire(nsd->db, TASKLIST(&t));
1769 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1770 		}
1771 		udb_ptr_unlink(&t, taskudb);
1772 		task_clear(taskudb);
1773 
1774 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1775 		cmd = NSD_RELOAD_DONE;
1776 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1777 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1778 				(int)nsd->pid, strerror(errno));
1779 		}
1780 	}
1781 }
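
/*
 * Illustrative sketch (not nsd's actual xfrd code) of the receiving
 * side of the handshake above: xfrd reads the command first and the
 * pid second, so the two write_socket() calls above must stay in this
 * order.  Assuming fd is the connected ipc socket:
 *
 *	sig_atomic_t cmd;
 *	pid_t reload_pid;
 *	if(block_read(NULL, fd, &cmd, sizeof(cmd), -1) == sizeof(cmd)
 *	   && cmd == NSD_RELOAD_DONE
 *	   && block_read(NULL, fd, &reload_pid, sizeof(reload_pid), -1)
 *	      == sizeof(reload_pid)) {
 *		... the task udb is ready; remember reload_pid ...
 *	}
 */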
1782 
1783 #ifdef HAVE_SSL
1784 static void
1785 log_crypto_from_err(const char* str, unsigned long err)
1786 {
1787 	/* error:[error code]:[library name]:[function name]:[reason string] */
1788 	char buf[128];
1789 	unsigned long e;
1790 	ERR_error_string_n(err, buf, sizeof(buf));
1791 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1792 	while( (e=ERR_get_error()) ) {
1793 		ERR_error_string_n(e, buf, sizeof(buf));
1794 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1795 	}
1796 }
1797 
1798 void
1799 log_crypto_err(const char* str)
1800 {
1801 	log_crypto_from_err(str, ERR_get_error());
1802 }
1803 
1804 /** true if the ssl handshake error has to be squelched from the logs */
1805 static int
1806 squelch_err_ssl_handshake(unsigned long err)
1807 {
1808 	if(verbosity >= 3)
1809 		return 0; /* only squelch on low verbosity */
1810 	/* this is very specific, we could filter on ERR_GET_REASON()
1811 	 * (the third element in ERR_PACK) */
1812 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1813 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1814 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1815 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1816 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1817 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1818 #endif
1819 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1820 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1821 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1822 #  ifdef SSL_R_VERSION_TOO_LOW
1823 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1824 #  endif
1825 #endif
1826 		)
1827 		return 1;
1828 	return 0;
1829 }
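
/*
 * As the comment above notes, a broader filter could match on the
 * reason code alone instead of full ERR_PACK() triples.  Hypothetical
 * sketch, not enabled here:
 *
 *	if(ERR_GET_LIB(err) == ERR_LIB_SSL &&
 *	   ERR_GET_REASON(err) == SSL_R_WRONG_VERSION_NUMBER)
 *		return 1;
 */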
1830 
1831 void
1832 perform_openssl_init(void)
1833 {
1834 	/* init SSL library */
1835 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1836 	ERR_load_crypto_strings();
1837 #endif
1838 #if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
1839 	ERR_load_SSL_strings();
1840 #endif
1841 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1842 	OpenSSL_add_all_algorithms();
1843 #else
1844 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1845 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1846 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1847 #endif
1848 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1849 	(void)SSL_library_init();
1850 #else
1851 	OPENSSL_init_ssl(0, NULL);
1852 #endif
1853 
1854 	if(!RAND_status()) {
1855 		/* try to seed it */
1856 		unsigned char buf[256];
1857 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1858 		size_t i;
1859 		v = seed;
1860 		for(i=0; i<256/sizeof(v); i++) {
1861 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1862 			v = v*seed + (unsigned int)i;
1863 		}
1864 		RAND_seed(buf, 256);
1865 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1866 	}
1867 }
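
/*
 * Sketch of a stronger fallback seed where getrandom(2) is available;
 * HAVE_GETRANDOM is an assumed configure test, named here only for
 * illustration:
 *
 * #ifdef HAVE_GETRANDOM
 *	unsigned char rnd[32];
 *	if(getrandom(rnd, sizeof(rnd), 0) == (ssize_t)sizeof(rnd))
 *		RAND_seed(rnd, sizeof(rnd));
 * #endif
 */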
1868 
1869 static int
1870 get_ocsp(char *filename, unsigned char **ocsp)
1871 {
1872 	BIO *bio;
1873 	OCSP_RESPONSE *response;
1874 	int len = -1;
1875 	unsigned char *p, *buf;
1876 	assert(filename);
1877 
1878 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1879 		log_crypto_err("get_ocsp: BIO_new_file failed");
1880 		return -1;
1881 	}
1882 
1883 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1884 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1885 		BIO_free(bio);
1886 		return -1;
1887 	}
1888 
1889 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1890 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1891 		OCSP_RESPONSE_free(response);
1892 		BIO_free(bio);
1893 		return -1;
1894 	}
1895 
1896 	if ((buf = malloc((size_t) len)) == NULL) {
1897 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1898 		OCSP_RESPONSE_free(response);
1899 		BIO_free(bio);
1900 		return -1;
1901 	}
1902 
1903 	p = buf;
1904 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1905 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1906 		free(buf);
1907 		OCSP_RESPONSE_free(response);
1908 		BIO_free(bio);
1909 		return -1;
1910 	}
1911 
1912 	OCSP_RESPONSE_free(response);
1913 	BIO_free(bio);
1914 
1915 	*ocsp = buf;
1916 	return len;
1917 }
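
/*
 * Usage sketch: the caller owns the returned DER buffer and must
 * free() it (the path below is only an example):
 *
 *	unsigned char* der = NULL;
 *	int derlen = get_ocsp("/etc/nsd/server.ocsp", &der);
 *	if(derlen > 0) {
 *		... hand der/derlen to the TLS stack ...
 *		free(der);
 *	}
 */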
1918 
1919 /* further setup ssl ctx after the keys are loaded */
1920 static void
1921 listen_sslctx_setup_2(void* ctxt)
1922 {
1923 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
1924 	(void)ctx;
1925 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1926 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1927 		/* ENOTREACH */
1928 		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
1929 	}
1930 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1931 	if(1) {
1932 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1933 		if (!ecdh) {
1934 			log_crypto_err("could not find p256, not enabling ECDHE");
1935 		} else {
1936 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1937 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1938 			}
1939 			EC_KEY_free (ecdh);
1940 		}
1941 	}
1942 #endif
1943 }
1944 
1945 static int
1946 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1947 {
1948 	if(ocspdata) {
1949 		unsigned char *p;
1950 		if ((p=malloc(ocspdata_len)) == NULL) {
1951 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1952 			return SSL_TLSEXT_ERR_NOACK;
1953 		}
1954 		memcpy(p, ocspdata, ocspdata_len);
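		/* SSL_set_tlsext_status_ocsp_resp() passes ownership of the
		 * buffer to OpenSSL, which frees it itself; hence the heap
		 * copy of the shared ocspdata made above. */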
1955 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1956 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1957 			free(p);
1958 			return SSL_TLSEXT_ERR_NOACK;
1959 		}
1960 		return SSL_TLSEXT_ERR_OK;
1961 	} else {
1962 		return SSL_TLSEXT_ERR_NOACK;
1963 	}
1964 }
1965 
1966 SSL_CTX*
1967 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1968 {
1969 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1970 	if(!ctx) {
1971 		log_crypto_err("could not SSL_CTX_new");
1972 		return NULL;
1973 	}
1974 	/* no SSLv2, SSLv3 because they have defects */
1975 #if SSL_OP_NO_SSLv2 != 0
1976 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1977 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1978 		SSL_CTX_free(ctx);
1979 		return NULL;
1980 	}
1981 #endif
1982 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1983 		!= SSL_OP_NO_SSLv3){
1984 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1985 		SSL_CTX_free(ctx);
1986 		return NULL;
1987 	}
1988 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1989 	/* if we have tls 1.1 disable 1.0 */
1990 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1991 		!= SSL_OP_NO_TLSv1){
1992 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1993 		SSL_CTX_free(ctx);
1994 		return NULL;
1995 	}
1996 #endif
1997 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1998 	/* if we have tls 1.2 disable 1.1 */
1999 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
2000 		!= SSL_OP_NO_TLSv1_1){
2001 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
2002 		SSL_CTX_free(ctx);
2003 		return NULL;
2004 	}
2005 #endif
2006 #if defined(SSL_OP_NO_RENEGOTIATION)
2007 	/* disable client renegotiation */
2008 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
2009 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
2010 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
2011 		SSL_CTX_free(ctx);
2012 		return NULL;
2013 	}
2014 #endif
2015 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2016 	/* if system-wide crypto policies are configured, defer to those */
2017 	if (access("/etc/crypto-policies/config", F_OK) != 0) {
2018 		/* otherwise, if we have sha256, set a cipher list with no known vulns */
2019 		if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2020 			log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2021 	}
2022 #endif
2023 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2024 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2025 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
2026 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2027 		SSL_CTX_free(ctx);
2028 		return NULL;
2029 	}
2030 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2031 	SSL_CTX_set_security_level(ctx, 0);
2032 #endif
2033 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2034 		log_msg(LOG_ERR, "error for cert file: %s", pem);
2035 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2036 		SSL_CTX_free(ctx);
2037 		return NULL;
2038 	}
2039 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2040 		log_msg(LOG_ERR, "error for private key file: %s", key);
2041 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2042 		SSL_CTX_free(ctx);
2043 		return NULL;
2044 	}
2045 	if(!SSL_CTX_check_private_key(ctx)) {
2046 		log_msg(LOG_ERR, "error for key file: %s", key);
2047 		log_crypto_err("Error in SSL_CTX check_private_key");
2048 		SSL_CTX_free(ctx);
2049 		return NULL;
2050 	}
2051 	listen_sslctx_setup_2(ctx);
2052 	if(verifypem && verifypem[0]) {
2053 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2054 			log_crypto_err("Error in SSL_CTX verify locations");
2055 			SSL_CTX_free(ctx);
2056 			return NULL;
2057 		}
2058 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2059 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2060 	}
2061 	return ctx;
2062 }
2063 
2064 SSL_CTX*
2065 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2066 {
2067 	char *key, *pem;
2068 	SSL_CTX *ctx;
2069 
2070 	key = nsd->options->tls_service_key;
2071 	pem = nsd->options->tls_service_pem;
2072 	if(!key || key[0] == 0) {
2073 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2074 		return NULL;
2075 	}
2076 	if(!pem || pem[0] == 0) {
2077 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2078 		return NULL;
2079 	}
2080 
2081 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL,
2082 	 * but draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
2083 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2084 	if(!ctx) {
2085 		log_msg(LOG_ERR, "could not setup server TLS context");
2086 		return NULL;
2087 	}
2088 	if(ocspfile && ocspfile[0]) {
2089 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2090 			log_crypto_err("Error reading OCSPfile");
2091 			SSL_CTX_free(ctx);
2092 			return NULL;
2093 		} else {
2094 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2095 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2096 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2097 				SSL_CTX_free(ctx);
2098 				return NULL;
2099 			}
2100 		}
2101 	}
2102 	return ctx;
2103 }
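
/*
 * Example configuration (a sketch; option names as in nsd.conf(5))
 * under which server_tls_ctx_create() loads a private key, a
 * certificate chain and an OCSP response:
 *
 *	server:
 *		tls-service-key: "/etc/nsd/server.key"
 *		tls-service-pem: "/etc/nsd/server.pem"
 *		tls-service-ocsp: "/etc/nsd/server.ocsp"
 *		tls-port: 853
 */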
2104 
2105 /* check if the tcp_handler_accept_data was created for the dedicated TLS port */
2106 int
2107 using_tls_port(struct sockaddr* addr, const char* tls_port)
2108 {
2109 	in_port_t port = 0;
2110 
2111 	if (addr->sa_family == AF_INET)
2112 		port = ((struct sockaddr_in*)addr)->sin_port;
2113 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2114 	else if (addr->sa_family == AF_INET6)
2115 		port = ((struct sockaddr_in6*)addr)->sin6_port;
2116 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2117 	if (atoi(tls_port) == ntohs(port))
2118 		return 1;
2119 
2120 	return 0;
2121 }
2122 #endif
2123 
2124 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
2125 ssize_t
2126 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2127 {
2128 	uint8_t* buf = (uint8_t*) p;
2129 	ssize_t total = 0;
2130 	struct pollfd fd;
2131 	memset(&fd, 0, sizeof(fd));
2132 	fd.fd = s;
2133 	fd.events = POLLIN;
2134 
2135 	while(total < sz) {
2136 		ssize_t ret;
2137 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2138 		if(ret == -1) {
2139 			if(errno == EAGAIN)
2140 				/* blocking read */
2141 				continue;
2142 			if(errno == EINTR) {
2143 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2144 					return -1;
2145 				/* other signals can be handled later */
2146 				continue;
2147 			}
2148 			/* some error */
2149 			return -1;
2150 		}
2151 		if(ret == 0) {
2152 			/* operation timed out */
2153 			return -2;
2154 		}
2155 		ret = read(s, buf+total, sz-total);
2156 		if(ret == -1) {
2157 			if(errno == EAGAIN)
2158 				/* blocking read */
2159 				continue;
2160 			if(errno == EINTR) {
2161 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2162 					return -1;
2163 				/* other signals can be handled later */
2164 				continue;
2165 			}
2166 			/* some error */
2167 			return -1;
2168 		}
2169 		if(ret == 0) {
2170 			/* closed connection! */
2171 			return 0;
2172 		}
2173 		total += ret;
2174 	}
2175 	return total;
2176 }
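
/*
 * Usage sketch covering each documented return class:
 *
 *	sig_atomic_t cmd;
 *	ssize_t r = block_read(nsd, fd, &cmd, sizeof(cmd), 5);
 *	if(r == sizeof(cmd))	... full command read ...
 *	else if(r == 0)		... peer closed the connection ...
 *	else if(r == -2)	... timed out after 5 seconds ...
 *	else			... error, or quit/shutdown was signalled ...
 */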
2177 
2178 static void
2179 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2180 {
2181 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2182 	udb_ptr t, next;
2183 	udb_base* u = nsd->task[nsd->mytask];
2184 	udb_ptr_init(&next, u);
2185 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2186 	udb_base_set_userdata(u, 0);
2187 	while(!udb_ptr_is_null(&t)) {
2188 		/* store next in list so this one can be deleted or reused */
2189 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2190 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2191 
2192 		/* process task t */
2193 		/* append results for task t and update last_task */
2194 		task_process_in_reload(nsd, u, last_task, &t);
2195 
2196 		/* go to next */
2197 		udb_ptr_set_ptr(&t, u, &next);
2198 
2199 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2200 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2201 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2202 			if(cmd == NSD_QUIT) {
2203 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2204 				/* sync to disk (if needed) */
2205 				udb_base_sync(nsd->db->udb, 0);
2206 				/* unlink files of remainder of tasks */
2207 				while(!udb_ptr_is_null(&t)) {
2208 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2209 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2210 					}
2211 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2212 				}
2213 				udb_ptr_unlink(&t, u);
2214 				udb_ptr_unlink(&next, u);
2215 				exit(0);
2216 			}
2217 		}
2218 
2219 	}
2220 	udb_ptr_unlink(&t, u);
2221 	udb_ptr_unlink(&next, u);
2222 }
2223 
2224 #ifdef BIND8_STATS
2225 static void
2226 parent_send_stats(struct nsd* nsd, int cmdfd)
2227 {
2228 	size_t i;
2229 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2230 		log_msg(LOG_ERR, "could not write stats to reload");
2231 		return;
2232 	}
2233 	for(i=0; i<nsd->child_count; i++)
2234 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2235 			sizeof(stc_type))) {
2236 			log_msg(LOG_ERR, "could not write stats to reload");
2237 			return;
2238 		}
2239 }
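
/* Note: reload_do_stats() below reads in exactly this order, one
 * struct nsdst followed by child_count stc_type counters; the two
 * functions must stay in sync. */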
2240 
2241 static void
2242 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2243 {
2244 	struct nsdst s;
2245 	stc_type* p;
2246 	size_t i;
2247 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2248 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2249 		log_msg(LOG_ERR, "could not read stats from old parent");
2250 		return;
2251 	}
2252 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2253 	s.db_mem = region_get_mem(nsd->db->region);
2254 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2255 		nsd->child_count);
2256 	if(!p) return;
2257 	for(i=0; i<nsd->child_count; i++) {
2258 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2259 			sizeof(stc_type))
2260 			return;
2261 	}
2262 }
2263 #endif /* BIND8_STATS */
2264 
2265 void server_verify(struct nsd *nsd, int cmdsocket);
2266 
2267 /*
2268  * Reload the database, stop the parent, re-fork the children and
2269  * continue as server_main.
2270  */
2271 static void
2272 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2273 	int cmdsocket)
2274 {
2275 	pid_t mypid;
2276 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2277 	int ret;
2278 	udb_ptr last_task;
2279 	struct sigaction old_sigchld, ign_sigchld;
2280 	struct radnode* node;
2281 	zone_type* zone;
2282 	enum soainfo_hint hint;
2283 	/* ignore SIGCHLD from the previous server_main that used this pid */
2284 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2285 	ign_sigchld.sa_handler = SIG_IGN;
2286 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2287 
2288 #ifdef HAVE_SETPROCTITLE
2289 	setproctitle("main");
2290 #endif
2291 #ifdef HAVE_CPUSET_T
2292 	if(nsd->use_cpu_affinity) {
2293 		set_cpu_affinity(nsd->cpuset);
2294 	}
2295 #endif
2296 
2297 	/* see what tasks we got from xfrd */
2298 	task_remap(nsd->task[nsd->mytask]);
2299 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2300 	udb_compact_inhibited(nsd->db->udb, 1);
2301 	reload_process_tasks(nsd, &last_task, cmdsocket);
2302 	udb_compact_inhibited(nsd->db->udb, 0);
2303 	udb_compact(nsd->db->udb);
2304 
2305 #ifndef NDEBUG
2306 	if(nsd_debug_level >= 1)
2307 		region_log_stats(nsd->db->region);
2308 #endif /* NDEBUG */
2309 	/* sync to disk (if needed) */
2310 	udb_base_sync(nsd->db->udb, 0);
2311 
2312 	initialize_dname_compression_tables(nsd);
2313 
2314 #ifdef BIND8_STATS
2315 	/* Restart dumping stats if required.  */
2316 	time(&nsd->st.boot);
2317 	set_bind8_alarm(nsd);
2318 #endif
2319 #ifdef USE_ZONE_STATS
2320 	server_zonestat_realloc(nsd); /* realloc for new children */
2321 	server_zonestat_switch(nsd);
2322 #endif
2323 
2324 	if(nsd->options->verify_enable) {
2325 #ifdef RATELIMIT
2326 		/* allocate resources for rate limiting. use a slot that is guaranteed
2327 		   not to be mapped to a file so no persistent data is overwritten */
2328 		rrl_init(nsd->child_count + 1);
2329 #endif
2330 
2331 		/* spin up the server and execute verifiers for each zone */
2332 		server_verify(nsd, cmdsocket);
2333 #ifdef RATELIMIT
2334 		/* deallocate rate limiting resources */
2335 		rrl_deinit(nsd->child_count + 1);
2336 #endif
2337 	}
2338 
2339 	for(node = radix_first(nsd->db->zonetree);
2340 	    node != NULL;
2341 	    node = radix_next(node))
2342 	{
2343 		zone = (zone_type *)node->elem;
2344 		if(zone->is_updated) {
2345 			if(zone->is_bad) {
2346 				nsd->mode = NSD_RELOAD_FAILED;
2347 				hint = soainfo_bad;
2348 			} else {
2349 				hint = soainfo_ok;
2350 			}
2351 			/* update(s), verified or not, possibly with subsequent
2352 			   skipped update(s). skipped update(s) are picked up
2353 			   by failed update check in xfrd */
2354 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2355 			                 zone, hint);
2356 		} else if(zone->is_skipped) {
2357 			/* corrupt or inconsistent update without preceding
2358 			   update(s), communicate soainfo_gone */
2359 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2360 			                 zone, soainfo_gone);
2361 		}
2362 		zone->is_updated = 0;
2363 		zone->is_skipped = 0;
2364 	}
2365 
2366 	if(nsd->mode == NSD_RELOAD_FAILED) {
2367 		exit(NSD_RELOAD_FAILED);
2368 	}
2369 
2370 	/* listen for the signals of failed children again */
2371 	sigaction(SIGCHLD, &old_sigchld, NULL);
2372 #ifdef USE_DNSTAP
2373 	if (nsd->dt_collector) {
2374 		int *swap_fd_send;
2375 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2376 		/* Swap fd_send with fd_swap so the old and new serve
2377 		 * children will not write to the same pipe ends simultaneously */
2378 		swap_fd_send = nsd->dt_collector_fd_send;
2379 		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2380 		nsd->dt_collector_fd_swap = swap_fd_send;
2381 
2382 	}
2383 #endif
2384 	/* Start new child processes */
2385 	if (server_start_children(nsd, server_region, netio, &nsd->
2386 		xfrd_listener->fd) != 0) {
2387 		send_children_quit(nsd);
2388 		exit(1);
2389 	}
2390 
2391 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2392 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2393 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2394 		if(cmd == NSD_QUIT) {
2395 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2396 			send_children_quit(nsd);
2397 			exit(0);
2398 		}
2399 	}
2400 
2401 	/* Send quit command to parent: blocking, wait for receipt. */
2402 	do {
2403 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2404 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2405 		{
2406 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2407 				strerror(errno));
2408 		}
2409 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2410 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2411 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2412 			RELOAD_SYNC_TIMEOUT);
2413 		if(ret == -2) {
2414 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2415 		}
2416 	} while (ret == -2);
2417 	if(ret == -1) {
2418 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2419 			strerror(errno));
2420 	}
2421 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2422 	if(cmd == NSD_QUIT) {
2423 		/* small race condition possible here, parent got quit cmd. */
2424 		send_children_quit(nsd);
2425 		exit(1);
2426 	}
2427 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2428 #ifdef BIND8_STATS
2429 	reload_do_stats(cmdsocket, nsd, &last_task);
2430 #endif
2431 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2432 	task_process_sync(nsd->task[nsd->mytask]);
2433 #ifdef USE_ZONE_STATS
2434 	server_zonestat_realloc(nsd); /* realloc for next children */
2435 #endif
2436 
2437 	/* send soainfo to the xfrd process, signal it that reload is done,
2438 	 * it picks up the taskudb */
2439 	cmd = NSD_RELOAD_DONE;
2440 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2441 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2442 			strerror(errno));
2443 	}
2444 	mypid = getpid();
2445 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2446 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2447 			strerror(errno));
2448 	}
2449 
2450 	/* try to reopen file */
2451 	if (nsd->file_rotation_ok)
2452 		log_reopen(nsd->log_filename, 1);
2453 	/* exit reload, continue as new server_main */
2454 }
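
/*
 * Sketch of the hand-over implemented above (message names are the
 * actual commands; the layout is only illustrative):
 *
 *	reload child              old main                      xfrd
 *	------------              --------                      ----
 *	NSD_QUIT_SYNC  -------->
 *	                          NSD_RELOAD (stop writes) -->
 *	                                  <------------------  ack
 *	            <----------   NSD_RELOAD (ack) + stats
 *	                          (sends children quit, exits)
 *	NSD_RELOAD_DONE + pid  ----------------------------->
 */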
2455 
2456 /*
2457  * Get the mode depending on the signal hints that have been received.
2458  * Multiple signal hints can be received and will be handled in turn.
2459  */
2460 static sig_atomic_t
2461 server_signal_mode(struct nsd *nsd)
2462 {
2463 	if(nsd->signal_hint_quit) {
2464 		nsd->signal_hint_quit = 0;
2465 		return NSD_QUIT;
2466 	}
2467 	else if(nsd->signal_hint_shutdown) {
2468 		nsd->signal_hint_shutdown = 0;
2469 		return NSD_SHUTDOWN;
2470 	}
2471 	else if(nsd->signal_hint_child) {
2472 		nsd->signal_hint_child = 0;
2473 		return NSD_REAP_CHILDREN;
2474 	}
2475 	else if(nsd->signal_hint_reload) {
2476 		nsd->signal_hint_reload = 0;
2477 		return NSD_RELOAD;
2478 	}
2479 	else if(nsd->signal_hint_reload_hup) {
2480 		nsd->signal_hint_reload_hup = 0;
2481 		return NSD_RELOAD_REQ;
2482 	}
2483 	else if(nsd->signal_hint_stats) {
2484 		nsd->signal_hint_stats = 0;
2485 #ifdef BIND8_STATS
2486 		set_bind8_alarm(nsd);
2487 #endif
2488 		return NSD_STATS;
2489 	}
2490 	else if(nsd->signal_hint_statsusr) {
2491 		nsd->signal_hint_statsusr = 0;
2492 		return NSD_STATS;
2493 	}
2494 	return NSD_RUN;
2495 }
2496 
2497 /*
2498  * The main server simply waits for signals and child processes to
2499  * terminate.  Child processes are restarted as necessary.
2500  */
2501 void
2502 server_main(struct nsd *nsd)
2503 {
2504 	region_type *server_region = region_create(xalloc, free);
2505 	netio_type *netio = netio_create(server_region);
2506 	netio_handler_type reload_listener;
2507 	int reload_sockets[2] = {-1, -1};
2508 	struct timespec timeout_spec;
2509 	int status;
2510 	pid_t child_pid;
2511 	pid_t reload_pid = -1;
2512 	sig_atomic_t mode;
2513 
2514 	/* Ensure we are the main process */
2515 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2516 
2517 	/* Add listener for the XFRD process */
2518 	netio_add_handler(netio, nsd->xfrd_listener);
2519 
2520 	/* Start the child processes that handle incoming queries */
2521 	if (server_start_children(nsd, server_region, netio,
2522 		&nsd->xfrd_listener->fd) != 0) {
2523 		send_children_quit(nsd);
2524 		exit(1);
2525 	}
2526 	reload_listener.fd = -1;
2527 
2528 	/* This_child MUST be 0, because this is the parent process */
2529 	assert(nsd->this_child == 0);
2530 
2531 	/* Run the server until we get a shutdown signal */
2532 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2533 		/* Did we receive a signal that changes our mode? */
2534 		if(mode == NSD_RUN) {
2535 			nsd->mode = mode = server_signal_mode(nsd);
2536 		}
2537 
2538 		switch (mode) {
2539 		case NSD_RUN:
2540 			/* see if any child processes terminated */
2541 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2542 				int is_child = delete_child_pid(nsd, child_pid);
2543 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2544 					if(nsd->children[is_child].child_fd == -1)
2545 						nsd->children[is_child].has_exited = 1;
2546 					parent_check_all_children_exited(nsd);
2547 				} else if(is_child != -1) {
2548 					log_msg(LOG_WARNING,
2549 					       "server %d died unexpectedly with status %d, restarting",
2550 					       (int) child_pid, status);
2551 					restart_child_servers(nsd, server_region, netio,
2552 						&nsd->xfrd_listener->fd);
2553 				} else if (child_pid == reload_pid) {
2554 					sig_atomic_t cmd = NSD_RELOAD_FAILED;
2555 					pid_t mypid;
2556 					log_msg(LOG_WARNING,
2557 					       "Reload process %d failed with status %d, continuing with old database",
2558 					       (int) child_pid, status);
2559 					reload_pid = -1;
2560 					if(reload_listener.fd != -1) close(reload_listener.fd);
2561 					netio_remove_handler(netio, &reload_listener);
2562 					reload_listener.fd = -1;
2563 					reload_listener.event_types = NETIO_EVENT_NONE;
2564 					task_process_sync(nsd->task[nsd->mytask]);
2565 					/* inform xfrd reload attempt ended */
2566 					if(!write_socket(nsd->xfrd_listener->fd,
2567 						&cmd, sizeof(cmd))) {
2568 						log_msg(LOG_ERR, "problems "
2569 						  "sending SOAEND to xfrd: %s",
2570 						  strerror(errno));
2571 					}
2572 					mypid = getpid();
2573 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2574 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2575 							strerror(errno));
2576 					}
2577 #ifdef USE_DNSTAP
2578 				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2579 					log_msg(LOG_WARNING,
2580 					       "dnstap-collector %d terminated with status %d",
2581 					       (int) child_pid, status);
2582 					if(nsd->dt_collector) {
2583 						dt_collector_close(nsd->dt_collector, nsd);
2584 						dt_collector_destroy(nsd->dt_collector, nsd);
2585 						nsd->dt_collector = NULL;
2586 					}
2587 					/* Only respawn a crashed (or exited)
2588 					 * dnstap-collector when not reloading,
2589 					 * to not induce a reload during a
2590 					 * reload (which would seriously
2591 					 * disrupt nsd procedures and lead to
2592 					 * unpredictable results)!
2593 					 *
2594 					 * This will *leave* a dnstap-collector
2595 					 * process terminated, but because
2596 					 * signalling of the reload process to
2597 					 * the main process to respawn in this
2598 					 * situation would be cumbersome, and
2599 					 * because this situation is so
2600 					 * specific (and therefore hopefully
2601 					 * extremely rare or non-existing at
2602 					 * all), plus the fact that we are left
2603 					 * with a perfectly functioning NSD
2604 					 * (besides not logging dnstap
2605 					 * messages), I consider it acceptable
2606 					 * to leave this unresolved.
2607 					 */
2608 					if(reload_pid == -1 && nsd->options->dnstap_enable) {
2609 						nsd->dt_collector = dt_collector_create(nsd);
2610 						dt_collector_start(nsd->dt_collector, nsd);
2611 						nsd->mode = NSD_RELOAD_REQ;
2612 					}
2613 #endif
2614 				} else if(status != 0) {
2615 					/* check the status, because we also
2616 					 * reap the old server main (reload is
2617 					 * the process-parent of old-main) and
2618 					 * older server processes that exit
2619 					 * after a reload */
2620 					log_msg(LOG_WARNING,
2621 					       "process %d terminated with status %d",
2622 					       (int) child_pid, status);
2623 				}
2624 			}
2625 			if (child_pid == -1) {
2626 				if (errno == EINTR) {
2627 					continue;
2628 				}
2629 				if (errno != ECHILD)
2630 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2631 			}
2632 			if (nsd->mode != NSD_RUN)
2633 				break;
2634 
2635 			/* timeout to collect processes. In case no sigchild happens. */
2636 			timeout_spec.tv_sec = 60;
2637 			timeout_spec.tv_nsec = 0;
2638 
2639 			/* listen on ports, timeout for collecting terminated children */
2640 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2641 				if (errno != EINTR) {
2642 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2643 				}
2644 			}
2645 			if(nsd->restart_children) {
2646 				restart_child_servers(nsd, server_region, netio,
2647 					&nsd->xfrd_listener->fd);
2648 				nsd->restart_children = 0;
2649 			}
2650 			if(nsd->reload_failed) {
2651 				sig_atomic_t cmd = NSD_RELOAD_FAILED;
2652 				pid_t mypid;
2653 				nsd->reload_failed = 0;
2654 				log_msg(LOG_WARNING,
2655 				       "Reload process %d failed, continuing with old database",
2656 				       (int) reload_pid);
2657 				reload_pid = -1;
2658 				if(reload_listener.fd != -1) close(reload_listener.fd);
2659 				netio_remove_handler(netio, &reload_listener);
2660 				reload_listener.fd = -1;
2661 				reload_listener.event_types = NETIO_EVENT_NONE;
2662 				task_process_sync(nsd->task[nsd->mytask]);
2663 				/* inform xfrd reload attempt ended */
2664 				if(!write_socket(nsd->xfrd_listener->fd,
2665 					&cmd, sizeof(cmd))) {
2666 					log_msg(LOG_ERR, "problems "
2667 					  "sending SOAEND to xfrd: %s",
2668 					  strerror(errno));
2669 				}
2670 				mypid = getpid();
2671 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2672 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2673 						strerror(errno));
2674 				}
2675 			}
2676 
2677 			break;
2678 		case NSD_RELOAD_REQ: {
2679 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2680 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2681 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2682 				"main: ipc send reload_req to xfrd"));
2683 			if(!write_socket(nsd->xfrd_listener->fd,
2684 				&cmd, sizeof(cmd))) {
2685 				log_msg(LOG_ERR, "server_main: could not send "
2686 				"reload_req to xfrd: %s", strerror(errno));
2687 			}
2688 			nsd->mode = NSD_RUN;
2689 			} break;
2690 		case NSD_RELOAD:
2691 			/* Continue to run nsd after reload */
2692 			nsd->mode = NSD_RUN;
2693 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2694 			if (reload_pid != -1) {
2695 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2696 				       (int) reload_pid);
2697 				break;
2698 			}
2699 
2700 			/* switch mytask to keep track of who owns the task */
2701 			nsd->mytask = 1 - nsd->mytask;
2702 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2703 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2704 				reload_pid = -1;
2705 				break;
2706 			}
2707 
2708 			/* Do actual reload */
2709 			reload_pid = fork();
2710 			switch (reload_pid) {
2711 			case -1:
2712 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2713 				break;
2714 			default:
2715 				/* PARENT */
2716 				close(reload_sockets[0]);
2717 				server_reload(nsd, server_region, netio,
2718 					reload_sockets[1]);
2719 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2720 				close(reload_sockets[1]);
2721 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2722 				/* drop stale xfrd ipc data */
2723 				((struct ipc_handler_conn_data*)nsd->
2724 					xfrd_listener->user_data)
2725 					->conn->is_reading = 0;
2726 				reload_pid = -1;
2727 				reload_listener.fd = -1;
2728 				reload_listener.event_types = NETIO_EVENT_NONE;
2729 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2730 				break;
2731 			case 0:
2732 				/* CHILD */
2733 				/* server_main keeps running until NSD_QUIT_SYNC
2734 				 * is received from reload. */
2735 				close(reload_sockets[1]);
2736 				reload_listener.fd = reload_sockets[0];
2737 				reload_listener.timeout = NULL;
2738 				reload_listener.user_data = nsd;
2739 				reload_listener.event_types = NETIO_EVENT_READ;
2740 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2741 				netio_add_handler(netio, &reload_listener);
2742 				reload_pid = getppid();
2743 				break;
2744 			}
2745 			break;
2746 		case NSD_QUIT_SYNC:
2747 			/* synchronisation of xfrd, parent and reload */
2748 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2749 				sig_atomic_t cmd = NSD_RELOAD;
2750 				/* stop xfrd ipc writes in progress */
2751 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2752 					"main: ipc send indication reload"));
2753 				if(!write_socket(nsd->xfrd_listener->fd,
2754 					&cmd, sizeof(cmd))) {
2755 					log_msg(LOG_ERR, "server_main: could not send reload "
2756 					"indication to xfrd: %s", strerror(errno));
2757 				}
2758 				/* wait for ACK from xfrd */
2759 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2760 				nsd->quit_sync_done = 1;
2761 			}
2762 			nsd->mode = NSD_RUN;
2763 			break;
2764 		case NSD_QUIT:
2765 			/* silent shutdown during reload */
2766 			if(reload_listener.fd != -1) {
2767 				/* acknowledge the quit, to sync reload that we will really quit now */
2768 				sig_atomic_t cmd = NSD_RELOAD;
2769 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2770 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2771 					log_msg(LOG_ERR, "server_main: "
2772 						"could not ack quit: %s", strerror(errno));
2773 				}
2774 #ifdef BIND8_STATS
2775 				parent_send_stats(nsd, reload_listener.fd);
2776 #endif /* BIND8_STATS */
2777 				close(reload_listener.fd);
2778 			}
2779 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2780 			/* only quit children after xfrd has acked */
2781 			send_children_quit(nsd);
2782 
2783 #ifdef MEMCLEAN /* OS collects memory pages */
2784 			region_destroy(server_region);
2785 #endif
2786 			server_shutdown(nsd);
2787 
2788 			/* ENOTREACH */
2789 			break;
2790 		case NSD_SHUTDOWN:
2791 			break;
2792 		case NSD_REAP_CHILDREN:
2793 			/* continue; wait for child in run loop */
2794 			nsd->mode = NSD_RUN;
2795 			break;
2796 		case NSD_STATS:
2797 #ifdef BIND8_STATS
2798 			set_children_stats(nsd);
2799 #endif
2800 			nsd->mode = NSD_RUN;
2801 			break;
2802 		default:
2803 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2804 			nsd->mode = NSD_RUN;
2805 			break;
2806 		}
2807 	}
2808 	log_msg(LOG_WARNING, "signal received, shutting down...");
2809 
2810 	/* close opened ports to avoid race with restart of nsd */
2811 	server_close_all_sockets(nsd->udp, nsd->ifs);
2812 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2813 #ifdef HAVE_SSL
2814 	daemon_remote_close(nsd->rc);
2815 #endif
2816 	send_children_quit_and_wait(nsd);
2817 
2818 	/* Unlink it if possible... */
2819 	unlinkpid(nsd->pidfile);
2820 	unlink(nsd->task[0]->fname);
2821 	unlink(nsd->task[1]->fname);
2822 #ifdef USE_ZONE_STATS
2823 	unlink(nsd->zonestatfname[0]);
2824 	unlink(nsd->zonestatfname[1]);
2825 #endif
2826 #ifdef USE_DNSTAP
2827 	dt_collector_close(nsd->dt_collector, nsd);
2828 #endif
2829 
2830 	if(reload_listener.fd != -1) {
2831 		sig_atomic_t cmd = NSD_QUIT;
2832 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2833 			"main: ipc send quit to reload-process"));
2834 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2835 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2836 				strerror(errno));
2837 		}
2838 		fsync(reload_listener.fd);
2839 		close(reload_listener.fd);
2840 		/* wait for reload to finish processing */
2841 		while(1) {
2842 			if(waitpid(reload_pid, NULL, 0) == -1) {
2843 				if(errno == EINTR) continue;
2844 				if(errno == ECHILD) break;
2845 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2846 					(int)reload_pid, strerror(errno));
2847 			}
2848 			break;
2849 		}
2850 	}
2851 	if(nsd->xfrd_listener->fd != -1) {
2852 		/* complete quit, stop xfrd */
2853 		sig_atomic_t cmd = NSD_QUIT;
2854 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2855 			"main: ipc send quit to xfrd"));
2856 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2857 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2858 				strerror(errno));
2859 		}
2860 		fsync(nsd->xfrd_listener->fd);
2861 		close(nsd->xfrd_listener->fd);
2862 		(void)kill(nsd->pid, SIGTERM);
2863 	}
2864 
2865 #ifdef MEMCLEAN /* OS collects memory pages */
2866 	region_destroy(server_region);
2867 #endif
2868 	/* write the nsd.db to disk, wait for it to complete */
2869 	udb_base_sync(nsd->db->udb, 1);
2870 	udb_base_close(nsd->db->udb);
2871 	server_shutdown(nsd);
2872 }
2873 
2874 static query_state_type
2875 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
2876 {
2877 	return query_process(query, nsd, now_p);
2878 }
2879 
2880 static query_state_type
2881 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
2882 {
2883 #ifdef RATELIMIT
2884 	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
2885 		if(query->edns.cookie_status != COOKIE_VALID
2886 		&& query->edns.cookie_status != COOKIE_VALID_REUSE
2887 		&& rrl_process_query(query))
2888 			return rrl_slip(query);
2889 		else	return QUERY_PROCESSED;
2890 	}
2891 	return QUERY_DISCARDED;
2892 #else
2893 	return query_process(query, nsd, now_p);
2894 #endif
2895 }
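
/*
 * The resulting policy, in brief: a query carrying a valid DNS cookie
 * (RFC 7873) bypasses rate limiting entirely; otherwise, once
 * rrl_process_query() reports the source over its limit, rrl_slip()
 * either discards the response or lets a truncated (TC) reply slip
 * through so legitimate clients can retry over TCP.
 */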
2896 
2897 const char*
2898 nsd_event_vs(void)
2899 {
2900 #ifdef USE_MINI_EVENT
2901 	return "";
2902 #else
2903 	return event_get_version();
2904 #endif
2905 }
2906 
2907 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2908 static const char* ub_ev_backend2str(int b)
2909 {
2910 	switch(b) {
2911 	case EVBACKEND_SELECT:	return "select";
2912 	case EVBACKEND_POLL:	return "poll";
2913 	case EVBACKEND_EPOLL:	return "epoll";
2914 	case EVBACKEND_KQUEUE:	return "kqueue";
2915 	case EVBACKEND_DEVPOLL: return "devpoll";
2916 	case EVBACKEND_PORT:	return "evport";
2917 	}
2918 	return "unknown";
2919 }
2920 #endif
2921 
2922 const char*
2923 nsd_event_method(void)
2924 {
2925 #ifdef USE_MINI_EVENT
2926 	return "select";
2927 #else
2928 	struct event_base* b = nsd_child_event_base();
2929 	const char* m = "?";
2930 #  ifdef EV_FEATURE_BACKENDS
2931 	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
2932 #  elif defined(HAVE_EVENT_BASE_GET_METHOD)
2933 	m = event_base_get_method(b);
2934 #  endif
2935 #  ifdef MEMCLEAN
2936 	event_base_free(b);
2937 #  endif
2938 	return m;
2939 #endif
2940 }
2941 
2942 struct event_base*
2943 nsd_child_event_base(void)
2944 {
2945 	struct event_base* base;
2946 #ifdef USE_MINI_EVENT
2947 	static time_t secs;
2948 	static struct timeval now;
2949 	base = event_init(&secs, &now);
2950 #else
2951 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2952 	/* libev */
2953 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2954 #  else
2955 	/* libevent */
2956 #    ifdef HAVE_EVENT_BASE_NEW
2957 	base = event_base_new();
2958 #    else
2959 	base = event_init();
2960 #    endif
2961 #  endif
2962 #endif
2963 	return base;
2964 }
2965 
2966 static void
2967 add_udp_handler(
2968 	struct nsd *nsd,
2969 	struct nsd_socket *sock,
2970 	struct udp_handler_data *data)
2971 {
2972 	struct event *handler = &data->event;
2973 
2974 	data->nsd = nsd;
2975 	data->socket = sock;
2976 
2977 	memset(handler, 0, sizeof(*handler));
2978 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2979 	if(event_base_set(nsd->event_base, handler) != 0)
2980 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2981 	if(event_add(handler, NULL) != 0)
2982 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2983 }
2984 
2985 void
2986 add_tcp_handler(
2987 	struct nsd *nsd,
2988 	struct nsd_socket *sock,
2989 	struct tcp_accept_handler_data *data)
2990 {
2991 	struct event *handler = &data->event;
2992 
2993 	data->nsd = nsd;
2994 	data->socket = sock;
2995 
2996 #ifdef HAVE_SSL
2997 	if (nsd->tls_ctx &&
2998 	    nsd->options->tls_port &&
2999 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
3000 	{
3001 		data->tls_accept = 1;
3002 		if(verbosity >= 2) {
3003 			char buf[48];
3004 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
3005 			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
3006 		}
3007 	} else {
3008 		data->tls_accept = 0;
3009 	}
3010 #endif
3011 
3012 	memset(handler, 0, sizeof(*handler));
3013 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data);
3014 	if(event_base_set(nsd->event_base, handler) != 0)
3015 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
3016 	if(event_add(handler, NULL) != 0)
3017 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
3018 	data->event_added = 1;
3019 }
3020 
3021 /*
3022  * Serve DNS request to verifiers (short-lived)
3023  * Serve DNS requests to verifiers (short-lived)
3024 void server_verify(struct nsd *nsd, int cmdsocket)
3025 {
3026 	size_t size = 0;
3027 	struct event cmd_event, signal_event, exit_event;
3028 	struct zone *zone;
3029 	size_t i;
3030 
3031 	assert(nsd != NULL);
3032 
3033 	zone = verify_next_zone(nsd, NULL);
3034 	if(zone == NULL)
3035 		return;
3036 
3037 	nsd->server_region = region_create(xalloc, free);
3038 	nsd->event_base = nsd_child_event_base();
3039 
3040 	nsd->next_zone_to_verify = zone;
3041 	nsd->verifier_count = 0;
3042 	nsd->verifier_limit = nsd->options->verifier_count;
3043 	size = sizeof(struct verifier) * nsd->verifier_limit;
3044 	if(pipe(nsd->verifier_pipe) == -1) {
		log_msg(LOG_ERR, "verify: could not create pipe: %s",
			strerror(errno));
		event_base_free(nsd->event_base);
		region_destroy(nsd->server_region);
		nsd->event_base = NULL;
		nsd->server_region = NULL;
		return;
	}
3045 	fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC);
3046 	fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC);
3047 	nsd->verifiers = region_alloc_zero(nsd->server_region, size);
3048 
3049 	for(i = 0; i < nsd->verifier_limit; i++) {
3050 		nsd->verifiers[i].nsd = nsd;
3051 		nsd->verifiers[i].zone = NULL;
3052 		nsd->verifiers[i].pid = -1;
3053 		nsd->verifiers[i].output_stream.fd = -1;
3054 		nsd->verifiers[i].output_stream.priority = LOG_INFO;
3055 		nsd->verifiers[i].error_stream.fd = -1;
3056 		nsd->verifiers[i].error_stream.priority = LOG_ERR;
3057 	}
3058 
3059 	event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd);
3060 	if(event_base_set(nsd->event_base, &cmd_event) != 0 ||
3061 	   event_add(&cmd_event, NULL) != 0)
3062 	{
3063 		log_msg(LOG_ERR, "verify: could not add command event");
3064 		goto fail;
3065 	}
3066 
3067 	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd);
3068 	if(event_base_set(nsd->event_base, &signal_event) != 0 ||
3069 	   signal_add(&signal_event, NULL) != 0)
3070 	{
3071 		log_msg(LOG_ERR, "verify: could not add signal event");
3072 		goto fail;
3073 	}
3074 
3075 	event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd);
3076 	if(event_base_set(nsd->event_base, &exit_event) != 0 ||
3077 	   event_add(&exit_event, NULL) != 0)
3078 	{
3079 		log_msg(LOG_ERR, "verify: could not add exit event");
3080 		goto fail;
3081 	}
3082 
3083 	memset(msgs, 0, sizeof(msgs));
3084 	for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3085 		queries[i] = query_create(nsd->server_region,
3086 			compressed_dname_offsets,
3087 			compression_table_size, compressed_dnames);
3088 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3089 		iovecs[i].iov_base = buffer_begin(queries[i]->packet);
3090 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3091 		msgs[i].msg_hdr.msg_iov = &iovecs[i];
3092 		msgs[i].msg_hdr.msg_iovlen = 1;
3093 		msgs[i].msg_hdr.msg_name = &queries[i]->addr;
3094 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3095 	}
3096 
3097 	for (i = 0; i < nsd->verify_ifs; i++) {
3098 		struct udp_handler_data *data;
3099 		data = region_alloc_zero(
3100 			nsd->server_region, sizeof(*data));
3101 		add_udp_handler(nsd, &nsd->verify_udp[i], data);
3102 	}
3103 
3104 	tcp_accept_handler_count = nsd->verify_ifs;
3105 	tcp_accept_handlers = region_alloc_array(nsd->server_region,
3106 		nsd->verify_ifs, sizeof(*tcp_accept_handlers));
3107 
3108 	for (i = 0; i < nsd->verify_ifs; i++) {
3109 		struct tcp_accept_handler_data *data;
3110 		data = &tcp_accept_handlers[i];
3111 		memset(data, 0, sizeof(*data));
3112 		add_tcp_handler(nsd, &nsd->verify_tcp[i], data);
3113 	}
3114 
3115 	while(nsd->next_zone_to_verify != NULL &&
3116 	      nsd->verifier_count < nsd->verifier_limit)
3117 	{
3118 		verify_zone(nsd, nsd->next_zone_to_verify);
3119 		nsd->next_zone_to_verify
3120 			= verify_next_zone(nsd, nsd->next_zone_to_verify);
3121 	}
3122 
3123 	/* short-lived main loop */
3124 	event_base_dispatch(nsd->event_base);
3125 
3126 	/* remove command and exit event handlers */
3127 	event_del(&exit_event);
3128 	event_del(&signal_event);
3129 	event_del(&cmd_event);
3130 
3131 	assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT);
3132 	assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT);
3133 fail:
3134 	event_base_free(nsd->event_base);
3135 	close(nsd->verifier_pipe[0]);
3136 	close(nsd->verifier_pipe[1]);
3137 	region_destroy(nsd->server_region);
3138 
3139 	nsd->event_base = NULL;
3140 	nsd->server_region = NULL;
3141 	nsd->verifier_limit = 0;
3142 	nsd->verifier_pipe[0] = -1;
3143 	nsd->verifier_pipe[1] = -1;
3144 	nsd->verifiers = NULL;
3145 }
3146 
3147 /*
3148  * Serve DNS requests.
3149  */
3150 void
3151 server_child(struct nsd *nsd)
3152 {
3153 	size_t i, from, numifs;
3154 	region_type *server_region = region_create(xalloc, free);
3155 	struct event_base* event_base = nsd_child_event_base();
3156 	sig_atomic_t mode;
3157 
3158 	if(!event_base) {
3159 		log_msg(LOG_ERR, "nsd server could not create event base");
3160 		exit(1);
3161 	}
3162 	nsd->event_base = event_base;
3163 	nsd->server_region = server_region;
3164 
3165 #ifdef RATELIMIT
3166 	rrl_init(nsd->this_child->child_num);
3167 #endif
3168 
3169 	assert(nsd->server_kind != NSD_SERVER_MAIN);
3170 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
3171 
3172 #ifdef HAVE_SETPROCTITLE
3173 	setproctitle("server %d", nsd->this_child->child_num + 1);
3174 #endif
3175 #ifdef HAVE_CPUSET_T
3176 	if(nsd->use_cpu_affinity) {
3177 		set_cpu_affinity(nsd->this_child->cpuset);
3178 	}
3179 #endif
3180 
3181 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
3182 		server_close_all_sockets(nsd->tcp, nsd->ifs);
3183 	}
3184 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
3185 		server_close_all_sockets(nsd->udp, nsd->ifs);
3186 	}
3187 
3188 	if (nsd->this_child->parent_fd != -1) {
3189 		struct event *handler;
3190 		struct ipc_handler_conn_data* user_data =
3191 			(struct ipc_handler_conn_data*)region_alloc(
3192 			server_region, sizeof(struct ipc_handler_conn_data));
3193 		user_data->nsd = nsd;
3194 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
3195 
3196 		handler = (struct event*) region_alloc(
3197 			server_region, sizeof(*handler));
3198 		memset(handler, 0, sizeof(*handler));
3199 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
3200 			EV_READ, child_handle_parent_command, user_data);
3201 		if(event_base_set(event_base, handler) != 0)
3202 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
3203 		if(event_add(handler, NULL) != 0)
3204 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
3205 	}
3206 
3207 	if(nsd->reuseport) {
3208 		numifs = nsd->ifs / nsd->reuseport;
3209 		from = numifs * nsd->this_child->child_num;
3210 		if(from+numifs > nsd->ifs) { /* should not happen */
3211 			from = 0;
3212 			numifs = nsd->ifs;
3213 		}
3214 	} else {
3215 		from = 0;
3216 		numifs = nsd->ifs;
3217 	}
3218 
3219 	if (nsd->server_kind & NSD_SERVER_UDP) {
3220 		int child = nsd->this_child->child_num;
3221 		memset(msgs, 0, sizeof(msgs));
3222 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3223 			queries[i] = query_create(server_region,
3224 				compressed_dname_offsets,
3225 				compression_table_size, compressed_dnames);
3226 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3227 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
3228 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
3229 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
3230 			msgs[i].msg_hdr.msg_iovlen  = 1;
3231 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
3232 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3233 		}
3234 
3235 		for (i = 0; i < nsd->ifs; i++) {
3236 			int listen;
3237 			struct udp_handler_data *data;
3238 
3239 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
3240 
3241 			if(i >= from && i < (from + numifs) && listen) {
3242 				data = region_alloc_zero(
3243 					nsd->server_region, sizeof(*data));
3244 				add_udp_handler(nsd, &nsd->udp[i], data);
3245 			} else {
3246 				/* close sockets intended for other servers */
3247 				server_close_socket(&nsd->udp[i]);
3248 			}
3249 		}
3250 	}
3251 
3252 	/*
3253 	 * Keep track of all the TCP accept handlers so we can enable
3254 	 * and disable them based on the current number of active TCP
3255 	 * connections.
3256 	 */
3257 	if (nsd->server_kind & NSD_SERVER_TCP) {
3258 		int child = nsd->this_child->child_num;
3259 		tcp_accept_handler_count = numifs;
3260 		tcp_accept_handlers = region_alloc_array(server_region,
3261 			numifs, sizeof(*tcp_accept_handlers));
3262 
3263 		for (i = 0; i < nsd->ifs; i++) {
3264 			int listen;
3265 			struct tcp_accept_handler_data *data;
3266 
3267 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
3268 
3269 			if(i >= from && i < (from + numifs) && listen) {
3270 				data = &tcp_accept_handlers[i-from];
3271 				memset(data, 0, sizeof(*data));
3272 				add_tcp_handler(nsd, &nsd->tcp[i], data);
3273 			} else {
3274 				/* close sockets intended for other servers */
3275 				/*
3276 				 * uncomment this once tcp servers are no
3277 				 * longer copied in the tcp fd copy line
3278 				 * in server_init().
3279 				server_close_socket(&nsd->tcp[i]);
3280 				*/
3281 				/* close sockets not meant for this server*/
3282 				if(!listen)
3283 					server_close_socket(&nsd->tcp[i]);
3284 			}
3285 		}
3286 	} else {
3287 		tcp_accept_handler_count = 0;
3288 	}
3289 
3290 	/* The main loop... */
3291 	while ((mode = nsd->mode) != NSD_QUIT) {
3292 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
3293 
3294 		/* Do we need to do the statistics... */
3295 		if (mode == NSD_STATS) {
3296 #ifdef BIND8_STATS
3297 			int p = nsd->st.period;
3298 			nsd->st.period = 1; /* force stats printout */
3299 			/* Dump the statistics */
3300 			bind8_stats(nsd);
3301 			nsd->st.period = p;
3302 #else /* !BIND8_STATS */
3303 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3304 #endif /* BIND8_STATS */
3305 
3306 			nsd->mode = NSD_RUN;
3307 		}
3308 		else if (mode == NSD_REAP_CHILDREN) {
3309 			/* got signal, notify parent. parent reaps terminated children. */
3310 			if (nsd->this_child->parent_fd != -1) {
3311 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3312 				if (write(nsd->this_child->parent_fd,
3313 				    &parent_notify,
3314 				    sizeof(parent_notify)) == -1)
3315 				{
3316 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3317 						(int) nsd->this_child->pid, strerror(errno));
3318 				}
3319 			} else /* no parent, so reap 'em */
3320 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3321 			nsd->mode = NSD_RUN;
3322 		}
3323 		else if(mode == NSD_RUN) {
3324 			/* Wait for a query... */
3325 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3326 				if (errno != EINTR) {
3327 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3328 					break;
3329 				}
3330 			}
3331 		} else if(mode == NSD_QUIT) {
3332 			/* ignore here, quit */
3333 		} else {
3334 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
3335 				(int)mode);
3336 			nsd->mode = NSD_RUN;
3337 		}
3338 	}
3339 
3340 	service_remaining_tcp(nsd);
3341 #ifdef	BIND8_STATS
3342 	bind8_stats(nsd);
3343 #endif /* BIND8_STATS */
3344 
3345 #ifdef MEMCLEAN /* OS collects memory pages */
3346 #ifdef RATELIMIT
3347 	rrl_deinit(nsd->this_child->child_num);
3348 #endif
3349 	event_base_free(event_base);
3350 	region_destroy(server_region);
3351 #endif
3352 	server_shutdown(nsd);
3353 }
3354 
3355 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3356 {
3357 	int* timed_out = (int*)arg;
3358 	assert(event & EV_TIMEOUT); (void)event;
3359 	/* wake up the service tcp thread, note event is no longer
3360 	 * registered */
3361 	*timed_out = 1;
3362 }
3363 
3364 void
3365 service_remaining_tcp(struct nsd* nsd)
3366 {
3367 	struct tcp_handler_data* p;
3368 	struct event_base* event_base;
3369 	/* check if it is needed */
3370 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3371 		return;
3372 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3373 #ifdef USE_DNSTAP
3374 	/* remove dnstap collector, we cannot write there because the new
3375 	 * child process is using the file descriptor, or the child
3376 	 * process after that. */
3377 	dt_collector_destroy(nsd->dt_collector, nsd);
3378 	nsd->dt_collector = NULL;
3379 #endif
3380 	/* setup event base */
3381 	event_base = nsd_child_event_base();
3382 	if(!event_base) {
3383 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3384 		return;
3385 	}
3386 	/* register tcp connections */
3387 	for(p = tcp_active_list; p != NULL; p = p->next) {
3388 		struct timeval timeout;
3389 		int fd = p->event.ev_fd;
3390 #ifdef USE_MINI_EVENT
3391 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3392 #else
3393 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3394 #endif
3395 		void (*fn)(int, short, void*);
3396 #ifdef HAVE_SSL
3397 		if(p->tls) {
3398 			if((event&EV_READ))
3399 				fn = handle_tls_reading;
3400 			else	fn = handle_tls_writing;
3401 		} else {
3402 #endif
3403 			if((event&EV_READ))
3404 				fn = handle_tcp_reading;
3405 			else	fn = handle_tcp_writing;
3406 #ifdef HAVE_SSL
3407 		}
3408 #endif
3409 
3410 		p->tcp_no_more_queries = 1;
3411 		/* set timeout to 1/10 second */
3412 		if(p->tcp_timeout > 100)
3413 			p->tcp_timeout = 100;
3414 		timeout.tv_sec = p->tcp_timeout / 1000;
3415 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3416 		event_del(&p->event);
3417 		memset(&p->event, 0, sizeof(p->event));
3418 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3419 			fn, p);
3420 		if(event_base_set(event_base, &p->event) != 0)
3421 			log_msg(LOG_ERR, "event base set failed");
3422 		if(event_add(&p->event, &timeout) != 0)
3423 			log_msg(LOG_ERR, "event add failed");
3424 	}
3425 
3426 	/* handle it */
3427 	while(nsd->current_tcp_count > 0) {
3428 		mode_t m = server_signal_mode(nsd);
3429 		struct event timeout;
3430 		struct timeval tv;
3431 		int timed_out = 0;
3432 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3433 			m == NSD_REAP_CHILDREN) {
3434 			/* quit */
3435 			break;
3436 		}
3437 		/* timer */
3438 		/* have to do something every second */
3439 		tv.tv_sec = 1;
3440 		tv.tv_usec = 0;
3441 		memset(&timeout, 0, sizeof(timeout));
3442 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3443 			&timed_out);
3444 		if(event_base_set(event_base, &timeout) != 0)
3445 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3446 		if(event_add(&timeout, &tv) != 0)
3447 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3448 
3449 		/* service loop */
3450 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3451 			if (errno != EINTR) {
3452 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3453 				break;
3454 			}
3455 		}
3456 		if(!timed_out) {
3457 			event_del(&timeout);
3458 		} else {
3459 			/* timed out, quit */
3460 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3461 			break;
3462 		}
3463 	}
3464 #ifdef MEMCLEAN
3465 	event_base_free(event_base);
3466 #endif
3467 	/* continue to quit after return */
3468 }
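
/*
 * Illustrative sketch, not part of the original source: the drain loop
 * above pairs a one-second EV_TIMEOUT event with event_base_loop(EVLOOP_ONCE)
 * so it can re-check the signal mode every second while connections finish.
 * Reduced to the bare tick pattern (names here are hypothetical):
 */
#if 0
static void example_tick_cb(int fd, short ev, void* arg)
{
	(void)fd; (void)ev;
	*(int*)arg = 1; /* note that the timer fired */
}

static void example_drain(struct event_base* base, const int* done)
{
	while(!*done) {
		struct event tick;
		struct timeval tv = {1, 0}; /* wake up every second */
		int fired = 0;
		memset(&tick, 0, sizeof(tick));
		event_set(&tick, -1, EV_TIMEOUT, example_tick_cb, &fired);
		if(event_base_set(base, &tick) != 0 ||
		   event_add(&tick, &tv) != 0)
			return;
		/* process one batch of events, or the tick */
		if(event_base_loop(base, EVLOOP_ONCE) == -1)
			return;
		if(!fired)
			event_del(&tick); /* some other event woke us */
	}
}
#endif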
3469 
3470 /* Implement recvmmsg and sendmmsg if the platform does not provide them.
3471  * These functions are always used, even if nonblocking operations are
3472  * broken, in which case NUM_RECV_PER_SELECT is defined to 1 (one).
3473  */
3474 #if defined(HAVE_RECVMMSG)
3475 #define nsd_recvmmsg recvmmsg
3476 #else /* !HAVE_RECVMMSG */
3477 
3478 static int
3479 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3480              int flags, struct timespec *timeout)
3481 {
3482 	unsigned int vpos = 0;
3483 	ssize_t rcvd;
3484 
3485 	/* timeout is ignored, ensure caller does not expect it to work */
3486 	assert(timeout == NULL); (void)timeout;
3487 
3488 	while(vpos < vlen) {
3489 		rcvd = recvfrom(sockfd,
3490 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3491 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3492 		                flags,
3493 		                msgvec[vpos].msg_hdr.msg_name,
3494 		               &msgvec[vpos].msg_hdr.msg_namelen);
3495 		if(rcvd < 0) {
3496 			break;
3497 		} else {
3498 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3499 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3500 			vpos++;
3501 		}
3502 	}
3503 
3504 	if(vpos) {
3505 		/* error will be picked up next time */
3506 		return (int)vpos;
3507 	} else if(errno == 0) {
3508 		return 0;
3509 	} else if(errno == EAGAIN) {
3510 		return 0;
3511 	}
3512 
3513 	return -1;
3514 }
3515 #endif /* HAVE_RECVMMSG */
3516 
3517 #ifdef HAVE_SENDMMSG
3518 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3519 #else /* !HAVE_SENDMMSG */
3520 
3521 static int
3522 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3523 {
3524 	unsigned int vpos = 0;
3525 	ssize_t snd;
3526 
3527 	while(vpos < vlen) {
3528 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3529 		snd = sendto(sockfd,
3530 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3531 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3532 		             flags,
3533 		             msgvec[vpos].msg_hdr.msg_name,
3534 		             msgvec[vpos].msg_hdr.msg_namelen);
3535 		if(snd < 0) {
3536 			break;
3537 		} else {
3538 			msgvec[vpos].msg_len = (unsigned int)snd;
3539 			vpos++;
3540 		}
3541 	}
3542 
3543 	if(vpos) {
3544 		return (int)vpos;
3545 	} else if(errno == 0) {
3546 		return 0;
3547 	}
3548 
3549 	return -1;
3550 }
3551 #endif /* HAVE_SENDMMSG */
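
/*
 * Illustrative sketch, an assumption rather than the code this file uses
 * elsewhere to set up its msgs/iovecs/queries arrays: each mmsghdr carries
 * exactly one iovec, which is what the sendto()-based fallback above asserts.
 */
#if 0
static void example_init_mmsg(struct mmsghdr* msgs, struct iovec* iovecs,
	unsigned char (*bufs)[512], unsigned int count)
{
	unsigned int i;
	for(i = 0; i < count; i++) {
		iovecs[i].iov_base = bufs[i];
		iovecs[i].iov_len = sizeof(bufs[i]); /* 512 octets each */
		memset(&msgs[i], 0, sizeof(msgs[i]));
		msgs[i].msg_hdr.msg_iov = &iovecs[i];
		msgs[i].msg_hdr.msg_iovlen = 1; /* one buffer per message */
	}
}
#endif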
3552 
3553 static int
3554 port_is_zero(
3555 #ifdef INET6
3556         struct sockaddr_storage *addr
3557 #else
3558         struct sockaddr_in *addr
3559 #endif
3560 	)
3561 {
3562 #ifdef INET6
3563 	if(addr->ss_family == AF_INET6) {
3564 		return (((struct sockaddr_in6 *)addr)->sin6_port) == 0;
3565 	} else if(addr->ss_family == AF_INET) {
3566 		return (((struct sockaddr_in *)addr)->sin_port) == 0;
3567 	}
3568 	return 0;
3569 #else
3570 	if(addr->sin_family == AF_INET) {
3571 		return addr->sin_port == 0;
3572 	}
3573 	return 0;
3574 #endif
3575 }
3576 
3577 static void
3578 handle_udp(int fd, short event, void* arg)
3579 {
3580 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3581 	int received, sent, recvcount, i;
3582 	struct query *q;
3583 	uint32_t now = 0;
3584 
3585 	if (!(event & EV_READ)) {
3586 		return;
3587 	}
3588 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3589 	/* this printf strangely gave a performance increase on Linux */
3590 	/* printf("recvcount %d \n", recvcount); */
3591 	if (recvcount == -1) {
3592 		if (errno != EAGAIN && errno != EINTR) {
3593 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3594 			STATUP(data->nsd, rxerr);
3595 			/* No zone statup */
3596 		}
3597 		/* Simply no data available */
3598 		return;
3599 	}
3600 	for (i = 0; i < recvcount; i++) {
3601 	loopstart:
3602 		received = msgs[i].msg_len;
3603 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
3604 		q = queries[i];
3605 		if (received == -1) {
3606 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3607 #if defined(HAVE_RECVMMSG)
3608 				msgs[i].msg_hdr.msg_flags
3609 #else
3610 				errno
3611 #endif
3612 				));
3613 			STATUP(data->nsd, rxerr);
3614 			/* No zone statup */
3615 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3616 			iovecs[i].iov_len = buffer_remaining(q->packet);
3617 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3618 			goto swap_drop;
3619 		}
3620 
3621 		/* Account... */
3622 #ifdef BIND8_STATS
3623 		if (data->socket->addr.ai_family == AF_INET) {
3624 			STATUP(data->nsd, qudp);
3625 		} else if (data->socket->addr.ai_family == AF_INET6) {
3626 			STATUP(data->nsd, qudp6);
3627 		}
3628 #endif
3629 
3630 		buffer_skip(q->packet, received);
3631 		buffer_flip(q->packet);
3632 #ifdef USE_DNSTAP
3633 		/*
3634 		 * send the UDP query, with server (local) and client addresses, to the dnstap process
3635 		 */
3636 		log_addr("query from client", &q->addr);
3637 		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3638 		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->addr, q->addrlen,
3639 			q->tcp, q->packet);
3640 #endif /* USE_DNSTAP */
3641 
3642 		/* Process and answer the query... */
3643 		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
3644 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3645 				STATUP(data->nsd, nona);
3646 				ZTATUP(data->nsd, q->zone, nona);
3647 			}
3648 
3649 #ifdef USE_ZONE_STATS
3650 			if (data->socket->addr.ai_family == AF_INET) {
3651 				ZTATUP(data->nsd, q->zone, qudp);
3652 			} else if (data->socket->addr.ai_family == AF_INET6) {
3653 				ZTATUP(data->nsd, q->zone, qudp6);
3654 			}
3655 #endif
3656 
3657 			/* Add EDNS0 and TSIG info if necessary.  */
3658 			query_add_optional(q, data->nsd, &now);
3659 
3660 			buffer_flip(q->packet);
3661 			iovecs[i].iov_len = buffer_remaining(q->packet);
3662 #ifdef BIND8_STATS
3663 			/* Account the rcode & TC... */
3664 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3665 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3666 			if (TC(q->packet)) {
3667 				STATUP(data->nsd, truncated);
3668 				ZTATUP(data->nsd, q->zone, truncated);
3669 			}
3670 #endif /* BIND8_STATS */
3671 #ifdef USE_DNSTAP
3672 			/*
3673 			 * send the UDP response, with server (local) and client addresses, to the dnstap process
3674 			 */
3675 			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3676 			log_addr("response to client", &q->addr);
3677 			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
3678 				&q->addr, q->addrlen, q->tcp, q->packet,
3679 				q->zone);
3680 #endif /* USE_DNSTAP */
3681 		} else {
3682 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3683 			iovecs[i].iov_len = buffer_remaining(q->packet);
3684 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3685 		swap_drop:
3686 			STATUP(data->nsd, dropped);
3687 			ZTATUP(data->nsd, q->zone, dropped);
3688 			if(i != recvcount-1) {
3689 				/* swap with last and decrease recvcount */
3690 				struct mmsghdr mtmp = msgs[i];
3691 				struct iovec iotmp = iovecs[i];
3692 				recvcount--;
3693 				msgs[i] = msgs[recvcount];
3694 				iovecs[i] = iovecs[recvcount];
3695 				queries[i] = queries[recvcount];
3696 				msgs[recvcount] = mtmp;
3697 				iovecs[recvcount] = iotmp;
3698 				queries[recvcount] = q;
3699 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3700 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3701 				goto loopstart;
3702 			} else { recvcount --; }
3703 		}
3704 	}
3705 
3706 	/* send until all are sent */
3707 	i = 0;
3708 	while(i<recvcount) {
3709 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3710 		if(sent == -1) {
3711 			if(errno == ENOBUFS ||
3712 #ifdef EWOULDBLOCK
3713 				errno == EWOULDBLOCK ||
3714 #endif
3715 				errno == EAGAIN) {
3716 				/* block to wait until send buffer avail */
3717 				int flag, errstore;
3718 				if((flag = fcntl(fd, F_GETFL)) == -1) {
3719 					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
3720 					flag = 0;
3721 				}
3722 				flag &= ~O_NONBLOCK;
3723 				if(fcntl(fd, F_SETFL, flag) == -1)
3724 					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
3725 				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3726 				errstore = errno;
3727 				flag |= O_NONBLOCK;
3728 				if(fcntl(fd, F_SETFL, flag) == -1)
3729 					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
3730 				if(sent != -1) {
3731 					i += sent;
3732 					continue;
3733 				}
3734 				errno = errstore;
3735 			}
3736 			if(errno == EINVAL) {
3737 				/* skip the invalid argument entry,
3738 				 * send the remaining packets in the list */
3739 				if(!(port_is_zero((void*)&queries[i]->addr) &&
3740 					verbosity < 3)) {
3741 					const char* es = strerror(errno);
3742 					char a[64];
3743 					addrport2str((void*)&queries[i]->addr, a, sizeof(a));
3744 					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3745 				}
3746 				i += 1;
3747 				continue;
3748 			}
3749 			/* don't log transient network full errors, unless
3750 			 * on higher verbosity */
3751 			if(!(errno == ENOBUFS && verbosity < 1) &&
3752 #ifdef EWOULDBLOCK
3753 			   errno != EWOULDBLOCK &&
3754 #endif
3755 			   errno != EAGAIN) {
3756 				const char* es = strerror(errno);
3757 				char a[64];
3758 				addrport2str((void*)&queries[i]->addr, a, sizeof(a));
3759 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3760 			}
3761 #ifdef BIND8_STATS
3762 			data->nsd->st.txerr += recvcount-i;
3763 #endif /* BIND8_STATS */
3764 			break;
3765 		}
3766 		i += sent;
3767 	}
3768 	for(i=0; i<recvcount; i++) {
3769 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3770 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3771 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3772 	}
3773 }
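
/*
 * Illustrative sketch, not part of the original source: the swap_drop path
 * above compacts the receive batch by swapping a dropped entry to the end
 * and shrinking recvcount, so the surviving answers stay contiguous for
 * sendmmsg. The array-compaction idea in isolation (hypothetical names):
 */
#if 0
static int example_swap_drop(int* entries, int count, int i)
{
	int tmp = entries[i];
	count--;
	entries[i] = entries[count];
	entries[count] = tmp; /* keep dropped entry around for buffer reuse */
	/* caller re-examines index i; it now holds a different entry */
	return count;
}
#endif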
3774 
3775 #ifdef HAVE_SSL
3776 /*
3777  * Setup an event for the tcp handler.
3778  */
3779 static void
3780 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3781        int fd, short event)
3782 {
3783 	struct timeval timeout;
3784 	struct event_base* ev_base;
3785 
3786 	timeout.tv_sec = data->nsd->tcp_timeout;
3787 	timeout.tv_usec = 0L;
3788 
3789 	ev_base = data->event.ev_base;
3790 	event_del(&data->event);
3791 	memset(&data->event, 0, sizeof(data->event));
3792 	event_set(&data->event, fd, event, fn, data);
3793 	if(event_base_set(ev_base, &data->event) != 0)
3794 		log_msg(LOG_ERR, "event base set failed");
3795 	if(event_add(&data->event, &timeout) != 0)
3796 		log_msg(LOG_ERR, "event add failed");
3797 }
3798 #endif /* HAVE_SSL */
3799 
3800 static void
3801 cleanup_tcp_handler(struct tcp_handler_data* data)
3802 {
3803 	event_del(&data->event);
3804 #ifdef HAVE_SSL
3805 	if(data->tls) {
3806 		SSL_shutdown(data->tls);
3807 		SSL_free(data->tls);
3808 		data->tls = NULL;
3809 	}
3810 #endif
3811 	close(data->event.ev_fd);
3812 	if(data->prev)
3813 		data->prev->next = data->next;
3814 	else	tcp_active_list = data->next;
3815 	if(data->next)
3816 		data->next->prev = data->prev;
3817 
3818 	/*
3819 	 * Enable the TCP accept handlers when the current number of
3820 	 * TCP connections is about to drop below the maximum number
3821 	 * of TCP connections.
3822 	 */
3823 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3824 		configure_handler_event_types(EV_READ|EV_PERSIST);
3825 		if(slowaccept) {
3826 			event_del(&slowaccept_event);
3827 			slowaccept = 0;
3828 		}
3829 	}
3830 	--data->nsd->current_tcp_count;
3831 	assert(data->nsd->current_tcp_count >= 0);
3832 
3833 	region_destroy(data->region);
3834 }
3835 
3836 static void
3837 handle_tcp_reading(int fd, short event, void* arg)
3838 {
3839 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3840 	ssize_t received;
3841 	struct event_base* ev_base;
3842 	struct timeval timeout;
3843 	uint32_t now = 0;
3844 
3845 	if ((event & EV_TIMEOUT)) {
3846 		/* Connection timed out.  */
3847 		cleanup_tcp_handler(data);
3848 		return;
3849 	}
3850 
3851 	if ((data->nsd->tcp_query_count > 0 &&
3852 		data->query_count >= data->nsd->tcp_query_count) ||
3853 		data->tcp_no_more_queries) {
3854 		/* No more queries allowed on this tcp connection. */
3855 		cleanup_tcp_handler(data);
3856 		return;
3857 	}
3858 
3859 	assert((event & EV_READ));
3860 
3861 	if (data->bytes_transmitted == 0) {
3862 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3863 	}
3864 
3865 	/*
3866 	 * Check if we received the leading packet length bytes yet.
3867 	 */
3868 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3869 		received = read(fd,
3870 				(char *) &data->query->tcplen
3871 				+ data->bytes_transmitted,
3872 				sizeof(uint16_t) - data->bytes_transmitted);
3873 		if (received == -1) {
3874 			if (errno == EAGAIN || errno == EINTR) {
3875 				/*
3876 				 * Read would block, wait until more
3877 				 * data is available.
3878 				 */
3879 				return;
3880 			} else {
3881 				char buf[48];
3882 				addr2str(&data->query->addr, buf, sizeof(buf));
3883 #ifdef ECONNRESET
3884 				if (verbosity >= 2 || errno != ECONNRESET)
3885 #endif /* ECONNRESET */
3886 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3887 				cleanup_tcp_handler(data);
3888 				return;
3889 			}
3890 		} else if (received == 0) {
3891 			/* EOF */
3892 			cleanup_tcp_handler(data);
3893 			return;
3894 		}
3895 
3896 		data->bytes_transmitted += received;
3897 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3898 			/*
3899 			 * Not done with the tcplen yet, wait for more
3900 			 * data to become available.
3901 			 */
3902 			return;
3903 		}
3904 
3905 		assert(data->bytes_transmitted == sizeof(uint16_t));
3906 
3907 		data->query->tcplen = ntohs(data->query->tcplen);
3908 
3909 		/*
3910 		 * Minimum query size is:
3911 		 *
3912 		 *     Size of the header (12)
3913 		 *   + Root domain name   (1)
3914 		 *   + Query class        (2)
3915 		 *   + Query type         (2)
3916 		 */
3917 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3918 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3919 			cleanup_tcp_handler(data);
3920 			return;
3921 		}
3922 
3923 		if (data->query->tcplen > data->query->maxlen) {
3924 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3925 			cleanup_tcp_handler(data);
3926 			return;
3927 		}
3928 
3929 		buffer_set_limit(data->query->packet, data->query->tcplen);
3930 	}
3931 
3932 	assert(buffer_remaining(data->query->packet) > 0);
3933 
3934 	/* Read the (remaining) query data.  */
3935 	received = read(fd,
3936 			buffer_current(data->query->packet),
3937 			buffer_remaining(data->query->packet));
3938 	if (received == -1) {
3939 		if (errno == EAGAIN || errno == EINTR) {
3940 			/*
3941 			 * Read would block, wait until more data is
3942 			 * available.
3943 			 */
3944 			return;
3945 		} else {
3946 			char buf[48];
3947 			addr2str(&data->query->addr, buf, sizeof(buf));
3948 #ifdef ECONNRESET
3949 			if (verbosity >= 2 || errno != ECONNRESET)
3950 #endif /* ECONNRESET */
3951 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3952 			cleanup_tcp_handler(data);
3953 			return;
3954 		}
3955 	} else if (received == 0) {
3956 		/* EOF */
3957 		cleanup_tcp_handler(data);
3958 		return;
3959 	}
3960 
3961 	data->bytes_transmitted += received;
3962 	buffer_skip(data->query->packet, received);
3963 	if (buffer_remaining(data->query->packet) > 0) {
3964 		/*
3965 		 * Message not yet complete, wait for more data to
3966 		 * become available.
3967 		 */
3968 		return;
3969 	}
3970 
3971 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3972 
3973 	/* Account... */
3974 #ifdef BIND8_STATS
3975 #ifndef INET6
3976 	STATUP(data->nsd, ctcp);
3977 #else
3978 	if (data->query->addr.ss_family == AF_INET) {
3979 		STATUP(data->nsd, ctcp);
3980 	} else if (data->query->addr.ss_family == AF_INET6) {
3981 		STATUP(data->nsd, ctcp6);
3982 	}
3983 #endif
3984 #endif /* BIND8_STATS */
3985 
3986 	/* We have a complete query, process it.  */
3987 
3988 	/* tcp-query-count: handle query counter ++ */
3989 	data->query_count++;
3990 
3991 	buffer_flip(data->query->packet);
3992 #ifdef USE_DNSTAP
3993 	/*
3994 	 * send the TCP query, with the found server (local) and client addresses, to the dnstap process
3995 	 */
3996 	log_addr("query from client", &data->query->addr);
3997 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3998 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
3999 		data->query->addrlen, data->query->tcp, data->query->packet);
4000 #endif /* USE_DNSTAP */
4001 	data->query_state = server_process_query(data->nsd, data->query, &now);
4002 	if (data->query_state == QUERY_DISCARDED) {
4003 		/* Drop the packet and the entire connection... */
4004 		STATUP(data->nsd, dropped);
4005 		ZTATUP(data->nsd, data->query->zone, dropped);
4006 		cleanup_tcp_handler(data);
4007 		return;
4008 	}
4009 
4010 #ifdef BIND8_STATS
4011 	if (RCODE(data->query->packet) == RCODE_OK
4012 	    && !AA(data->query->packet))
4013 	{
4014 		STATUP(data->nsd, nona);
4015 		ZTATUP(data->nsd, data->query->zone, nona);
4016 	}
4017 #endif /* BIND8_STATS */
4018 
4019 #ifdef USE_ZONE_STATS
4020 #ifndef INET6
4021 	ZTATUP(data->nsd, data->query->zone, ctcp);
4022 #else
4023 	if (data->query->addr.ss_family == AF_INET) {
4024 		ZTATUP(data->nsd, data->query->zone, ctcp);
4025 	} else if (data->query->addr.ss_family == AF_INET6) {
4026 		ZTATUP(data->nsd, data->query->zone, ctcp6);
4027 	}
4028 #endif
4029 #endif /* USE_ZONE_STATS */
4030 
4031 	query_add_optional(data->query, data->nsd, &now);
4032 
4033 	/* Switch to the tcp write handler.  */
4034 	buffer_flip(data->query->packet);
4035 	data->query->tcplen = buffer_remaining(data->query->packet);
4036 #ifdef BIND8_STATS
4037 	/* Account the rcode & TC... */
4038 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4039 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4040 	if (TC(data->query->packet)) {
4041 		STATUP(data->nsd, truncated);
4042 		ZTATUP(data->nsd, data->query->zone, truncated);
4043 	}
4044 #endif /* BIND8_STATS */
4045 #ifdef USE_DNSTAP
4046 	/*
4047 	 * send the TCP response, with the earlier-found server (local) and client addresses, to the dnstap process
4048 	 */
4049 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4050 	log_addr("response to client", &data->query->addr);
4051 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4052 		data->query->addrlen, data->query->tcp, data->query->packet,
4053 		data->query->zone);
4054 #endif /* USE_DNSTAP */
4055 	data->bytes_transmitted = 0;
4056 
4057 	timeout.tv_sec = data->tcp_timeout / 1000;
4058 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4059 
4060 	ev_base = data->event.ev_base;
4061 	event_del(&data->event);
4062 	memset(&data->event, 0, sizeof(data->event));
4063 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4064 		handle_tcp_reading, data);
4065 	if(event_base_set(ev_base, &data->event) != 0)
4066 		log_msg(LOG_ERR, "event base set tcpr failed");
4067 	if(event_add(&data->event, &timeout) != 0)
4068 		log_msg(LOG_ERR, "event add tcpr failed");
4069 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
4070 	handle_tcp_writing(fd, EV_WRITE, data);
4071 }
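
/*
 * Illustrative sketch, an assumption for clarity: DNS over TCP prefixes
 * every message with a two-octet network-order length (RFC 1035 4.2.2),
 * which is why the reader above first collects sizeof(uint16_t) bytes into
 * query->tcplen before sizing the packet buffer. Decoding such a prefix
 * from a raw byte pair amounts to:
 */
#if 0
static uint16_t example_decode_tcplen(const unsigned char* wire)
{
	/* equivalent to ntohs() on a uint16_t copied from the wire */
	return (uint16_t)(((uint16_t)wire[0] << 8) | wire[1]);
}
#endif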
4072 
4073 static void
4074 handle_tcp_writing(int fd, short event, void* arg)
4075 {
4076 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4077 	ssize_t sent;
4078 	struct query *q = data->query;
4079 	struct timeval timeout;
4080 	struct event_base* ev_base;
4081 	uint32_t now = 0;
4082 
4083 	if ((event & EV_TIMEOUT)) {
4084 		/* Connection timed out.  */
4085 		cleanup_tcp_handler(data);
4086 		return;
4087 	}
4088 
4089 	assert((event & EV_WRITE));
4090 
4091 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
4092 		/* Writing the response packet length.  */
4093 		uint16_t n_tcplen = htons(q->tcplen);
4094 #ifdef HAVE_WRITEV
4095 		struct iovec iov[2];
4096 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
4097 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
4098 		iov[1].iov_base = buffer_begin(q->packet);
4099 		iov[1].iov_len = buffer_limit(q->packet);
4100 		sent = writev(fd, iov, 2);
4101 #else /* HAVE_WRITEV */
4102 		sent = write(fd,
4103 			     (const char *) &n_tcplen + data->bytes_transmitted,
4104 			     sizeof(n_tcplen) - data->bytes_transmitted);
4105 #endif /* HAVE_WRITEV */
4106 		if (sent == -1) {
4107 			if (errno == EAGAIN || errno == EINTR) {
4108 				/*
4109 				 * Write would block, wait until
4110 				 * socket becomes writable again.
4111 				 */
4112 				return;
4113 			} else {
4114 #ifdef ECONNRESET
4115 				if(verbosity >= 2 || errno != ECONNRESET)
4116 #endif /* ECONNRESET */
4117 #ifdef EPIPE
4118 				  if(verbosity >= 2 || errno != EPIPE)
4119 #endif /* EPIPE 'broken pipe' */
4120 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4121 				cleanup_tcp_handler(data);
4122 				return;
4123 			}
4124 		}
4125 
4126 		data->bytes_transmitted += sent;
4127 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
4128 			/*
4129 			 * Writing not complete, wait until socket
4130 			 * becomes writable again.
4131 			 */
4132 			return;
4133 		}
4134 
4135 #ifdef HAVE_WRITEV
4136 		sent -= sizeof(n_tcplen);
4137 		/* jump to the check whether the packet is now done */
4138 		goto packet_could_be_done;
4139 #endif
4140 	}
4141 
4142 	sent = write(fd,
4143 		     buffer_current(q->packet),
4144 		     buffer_remaining(q->packet));
4145 	if (sent == -1) {
4146 		if (errno == EAGAIN || errno == EINTR) {
4147 			/*
4148 			 * Write would block, wait until
4149 			 * socket becomes writable again.
4150 			 */
4151 			return;
4152 		} else {
4153 #ifdef ECONNRESET
4154 			if(verbosity >= 2 || errno != ECONNRESET)
4155 #endif /* ECONNRESET */
4156 #ifdef EPIPE
4157 			  if(verbosity >= 2 || errno != EPIPE)
4158 #endif /* EPIPE 'broken pipe' */
4159 			    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4160 			cleanup_tcp_handler(data);
4161 			return;
4162 		}
4163 	}
4164 
4165 	data->bytes_transmitted += sent;
4166 #ifdef HAVE_WRITEV
4167   packet_could_be_done:
4168 #endif
4169 	buffer_skip(q->packet, sent);
4170 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4171 		/*
4172 		 * Still more data to write when socket becomes
4173 		 * writable again.
4174 		 */
4175 		return;
4176 	}
4177 
4178 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4179 
4180 	if (data->query_state == QUERY_IN_AXFR ||
4181 		data->query_state == QUERY_IN_IXFR) {
4182 		/* Continue processing AXFR and writing back results.  */
4183 		buffer_clear(q->packet);
4184 		if(data->query_state == QUERY_IN_AXFR)
4185 			data->query_state = query_axfr(data->nsd, q, 0);
4186 		else data->query_state = query_ixfr(data->nsd, q);
4187 		if (data->query_state != QUERY_PROCESSED) {
4188 			query_add_optional(data->query, data->nsd, &now);
4189 
4190 			/* Reset data. */
4191 			buffer_flip(q->packet);
4192 			q->tcplen = buffer_remaining(q->packet);
4193 			data->bytes_transmitted = 0;
4194 			/* Reset timeout.  */
4195 			timeout.tv_sec = data->tcp_timeout / 1000;
4196 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4197 			ev_base = data->event.ev_base;
4198 			event_del(&data->event);
4199 			memset(&data->event, 0, sizeof(data->event));
4200 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4201 				handle_tcp_writing, data);
4202 			if(event_base_set(ev_base, &data->event) != 0)
4203 				log_msg(LOG_ERR, "event base set tcpw failed");
4204 			if(event_add(&data->event, &timeout) != 0)
4205 				log_msg(LOG_ERR, "event add tcpw failed");
4206 
4207 			/*
4208 			 * Write data if/when the socket is writable
4209 			 * again.
4210 			 */
4211 			return;
4212 		}
4213 	}
4214 
4215 	/*
4216 	 * Done sending, wait for the next request to arrive on the
4217 	 * TCP socket by installing the TCP read handler.
4218 	 */
4219 	if ((data->nsd->tcp_query_count > 0 &&
4220 		data->query_count >= data->nsd->tcp_query_count) ||
4221 		data->tcp_no_more_queries) {
4222 
4223 		(void) shutdown(fd, SHUT_WR);
4224 	}
4225 
4226 	data->bytes_transmitted = 0;
4227 
4228 	timeout.tv_sec = data->tcp_timeout / 1000;
4229 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4230 	ev_base = data->event.ev_base;
4231 	event_del(&data->event);
4232 	memset(&data->event, 0, sizeof(data->event));
4233 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4234 		handle_tcp_reading, data);
4235 	if(event_base_set(ev_base, &data->event) != 0)
4236 		log_msg(LOG_ERR, "event base set tcpw failed");
4237 	if(event_add(&data->event, &timeout) != 0)
4238 		log_msg(LOG_ERR, "event add tcpw failed");
4239 }
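
/*
 * Illustrative sketch, not part of the original source: with HAVE_WRITEV
 * the writer above sends the two-octet length and the packet payload in a
 * single gather-write instead of two separate write() calls. The bare
 * idiom, leaving short-write handling to the caller as above:
 */
#if 0
static ssize_t example_send_framed(int fd, const void* pkt, uint16_t len)
{
	uint16_t n = htons(len); /* network-order length prefix */
	struct iovec iov[2];
	iov[0].iov_base = &n;
	iov[0].iov_len = sizeof(n);
	iov[1].iov_base = (void*)pkt;
	iov[1].iov_len = len;
	return writev(fd, iov, 2); /* may be short; caller must continue */
}
#endif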
4240 
4241 #ifdef HAVE_SSL
4242 /** create SSL object and associate fd */
4243 static SSL*
4244 incoming_ssl_fd(SSL_CTX* ctx, int fd)
4245 {
4246 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
4247 	if(!ssl) {
4248 		log_crypto_err("could not SSL_new");
4249 		return NULL;
4250 	}
4251 	SSL_set_accept_state(ssl);
4252 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
4253 	if(!SSL_set_fd(ssl, fd)) {
4254 		log_crypto_err("could not SSL_set_fd");
4255 		SSL_free(ssl);
4256 		return NULL;
4257 	}
4258 	return ssl;
4259 }
4260 
4261 /** TLS handshake to upgrade TCP connection */
4262 static int
4263 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
4264 {
4265 	int r;
4266 	if(data->shake_state == tls_hs_read_event) {
4267 		/* read condition satisfied; back to writing */
4268 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4269 		data->shake_state = tls_hs_none;
4270 		return 1;
4271 	}
4272 	if(data->shake_state == tls_hs_write_event) {
4273 		/* write condition satisfied; back to reading */
4274 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4275 		data->shake_state = tls_hs_none;
4276 		return 1;
4277 	}
4278 
4279 	/* (continue to) setup the TLS connection */
4280 	ERR_clear_error();
4281 	r = SSL_do_handshake(data->tls);
4282 
4283 	if(r != 1) {
4284 		int want = SSL_get_error(data->tls, r);
4285 		if(want == SSL_ERROR_WANT_READ) {
4286 			if(data->shake_state == tls_hs_read) {
4287 				/* try again later */
4288 				return 1;
4289 			}
4290 			data->shake_state = tls_hs_read;
4291 			/* switch back to reading mode */
4292 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4293 			return 1;
4294 		} else if(want == SSL_ERROR_WANT_WRITE) {
4295 			if(data->shake_state == tls_hs_write) {
4296 				/* try again later */
4297 				return 1;
4298 			}
4299 			data->shake_state = tls_hs_write;
4300 			/* switch back to writing mode */
4301 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4302 			return 1;
4303 		} else {
4304 			if(r == 0)
4305 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
4306 			else {
4307 				unsigned long err = ERR_get_error();
4308 				if(!squelch_err_ssl_handshake(err)) {
4309 					char a[64], s[256];
4310 					addr2str(&data->query->addr, a, sizeof(a));
4311 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
4312 					log_crypto_from_err(s, err);
4313 				}
4314 			}
4315 			cleanup_tcp_handler(data);
4316 			return 0;
4317 		}
4318 	}
4319 
4320 	/* Log the successful upgrade, useful for testing; could be removed. */
4321 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
4322 	/* set back to the event we need to have when reading (or writing) */
4323 	if(data->shake_state == tls_hs_read && writing) {
4324 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4325 	} else if(data->shake_state == tls_hs_write && !writing) {
4326 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4327 	}
4328 	data->shake_state = tls_hs_none;
4329 	return 1;
4330 }
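
/*
 * Illustrative sketch, an assumption (a blocking socket, unlike the
 * event-driven state machine above): the core of the handshake logic is
 * that SSL_ERROR_WANT_READ / SSL_ERROR_WANT_WRITE are not failures, only
 * requests to wait until the fd is readable or writable and retry.
 */
#if 0
static int example_blocking_handshake(SSL* ssl)
{
	for(;;) {
		int r = SSL_do_handshake(ssl);
		if(r == 1)
			return 1; /* handshake completed */
		switch(SSL_get_error(ssl, r)) {
		case SSL_ERROR_WANT_READ:
		case SSL_ERROR_WANT_WRITE:
			continue; /* on a blocking fd, simply retry */
		default:
			return 0; /* real failure or peer closed */
		}
	}
}
#endif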
4331 
4332 /** handle TLS reading of incoming query */
4333 static void
4334 handle_tls_reading(int fd, short event, void* arg)
4335 {
4336 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4337 	ssize_t received;
4338 	uint32_t now = 0;
4339 
4340 	if ((event & EV_TIMEOUT)) {
4341 		/* Connection timed out.  */
4342 		cleanup_tcp_handler(data);
4343 		return;
4344 	}
4345 
4346 	if ((data->nsd->tcp_query_count > 0 &&
4347 	    data->query_count >= data->nsd->tcp_query_count) ||
4348 	    data->tcp_no_more_queries) {
4349 		/* No more queries allowed on this tcp connection. */
4350 		cleanup_tcp_handler(data);
4351 		return;
4352 	}
4353 
4354 	assert((event & EV_READ));
4355 
4356 	if (data->bytes_transmitted == 0) {
4357 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4358 	}
4359 
4360 	if(data->shake_state != tls_hs_none) {
4361 		if(!tls_handshake(data, fd, 0))
4362 			return;
4363 		if(data->shake_state != tls_hs_none)
4364 			return;
4365 	}
4366 
4367 	/*
4368 	 * Check if we received the leading packet length bytes yet.
4369 	 */
4370 	if(data->bytes_transmitted < sizeof(uint16_t)) {
4371 		ERR_clear_error();
4372 		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
4373 		    + data->bytes_transmitted,
4374 		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
4375 			int want = SSL_get_error(data->tls, received);
4376 			if(want == SSL_ERROR_ZERO_RETURN) {
4377 				cleanup_tcp_handler(data);
4378 				return; /* shutdown, closed */
4379 			} else if(want == SSL_ERROR_WANT_READ) {
4380 				/* wants to be called again */
4381 				return;
4382 			}
4383 			else if(want == SSL_ERROR_WANT_WRITE) {
4384 				/* switch to writing */
4385 				data->shake_state = tls_hs_write_event;
4386 				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4387 				return;
4388 			}
4389 			cleanup_tcp_handler(data);
4390 			log_crypto_err("could not SSL_read");
4391 			return;
4392 		}
4393 
4394 		data->bytes_transmitted += received;
4395 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4396 			/*
4397 			 * Not done with the tcplen yet, wait for more
4398 			 * data to become available.
4399 			 */
4400 			return;
4401 		}
4402 
4403 		assert(data->bytes_transmitted == sizeof(uint16_t));
4404 
4405 		data->query->tcplen = ntohs(data->query->tcplen);
4406 
4407 		/*
4408 		 * Minimum query size is:
4409 		 *
4410 		 *     Size of the header (12)
4411 		 *   + Root domain name   (1)
4412 		 *   + Query class        (2)
4413 		 *   + Query type         (2)
4414 		 */
4415 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4416 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4417 			cleanup_tcp_handler(data);
4418 			return;
4419 		}
4420 
4421 		if (data->query->tcplen > data->query->maxlen) {
4422 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4423 			cleanup_tcp_handler(data);
4424 			return;
4425 		}
4426 
4427 		buffer_set_limit(data->query->packet, data->query->tcplen);
4428 	}
4429 
4430 	assert(buffer_remaining(data->query->packet) > 0);
4431 
4432 	/* Read the (remaining) query data.  */
4433 	ERR_clear_error();
4434 	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
4435 			    (int)buffer_remaining(data->query->packet));
4436 	if(received <= 0) {
4437 		int want = SSL_get_error(data->tls, received);
4438 		if(want == SSL_ERROR_ZERO_RETURN) {
4439 			cleanup_tcp_handler(data);
4440 			return; /* shutdown, closed */
4441 		} else if(want == SSL_ERROR_WANT_READ) {
4442 			/* wants to be called again */
4443 			return;
4444 		}
4445 		else if(want == SSL_ERROR_WANT_WRITE) {
4446 			/* switch back to writing */
4447 			data->shake_state = tls_hs_write_event;
4448 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4449 			return;
4450 		}
4451 		cleanup_tcp_handler(data);
4452 		log_crypto_err("could not SSL_read");
4453 		return;
4454 	}
4455 
4456 	data->bytes_transmitted += received;
4457 	buffer_skip(data->query->packet, received);
4458 	if (buffer_remaining(data->query->packet) > 0) {
4459 		/*
4460 		 * Message not yet complete, wait for more data to
4461 		 * become available.
4462 		 */
4463 		return;
4464 	}
4465 
4466 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4467 
4468 	/* Account... */
4469 #ifndef INET6
4470 	STATUP(data->nsd, ctls);
4471 #else
4472 	if (data->query->addr.ss_family == AF_INET) {
4473 		STATUP(data->nsd, ctls);
4474 	} else if (data->query->addr.ss_family == AF_INET6) {
4475 		STATUP(data->nsd, ctls6);
4476 	}
4477 #endif
4478 
4479 	/* We have a complete query, process it.  */
4480 
4481 	/* tcp-query-count: handle query counter ++ */
4482 	data->query_count++;
4483 
4484 	buffer_flip(data->query->packet);
4485 #ifdef USE_DNSTAP
4486 	/*
4487 	 * send the TCP query, with the found server (local) and client addresses, to the dnstap process
4488 	 */
4489 	log_addr("query from client", &data->query->addr);
4490 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4491 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4492 		data->query->addrlen, data->query->tcp, data->query->packet);
4493 #endif /* USE_DNSTAP */
4494 	data->query_state = server_process_query(data->nsd, data->query, &now);
4495 	if (data->query_state == QUERY_DISCARDED) {
4496 		/* Drop the packet and the entire connection... */
4497 		STATUP(data->nsd, dropped);
4498 		ZTATUP(data->nsd, data->query->zone, dropped);
4499 		cleanup_tcp_handler(data);
4500 		return;
4501 	}
4502 
4503 #ifdef BIND8_STATS
4504 	if (RCODE(data->query->packet) == RCODE_OK
4505 	    && !AA(data->query->packet))
4506 	{
4507 		STATUP(data->nsd, nona);
4508 		ZTATUP(data->nsd, data->query->zone, nona);
4509 	}
4510 #endif /* BIND8_STATS */
4511 
4512 #ifdef USE_ZONE_STATS
4513 #ifndef INET6
4514 	ZTATUP(data->nsd, data->query->zone, ctls);
4515 #else
4516 	if (data->query->addr.ss_family == AF_INET) {
4517 		ZTATUP(data->nsd, data->query->zone, ctls);
4518 	} else if (data->query->addr.ss_family == AF_INET6) {
4519 		ZTATUP(data->nsd, data->query->zone, ctls6);
4520 	}
4521 #endif
4522 #endif /* USE_ZONE_STATS */
4523 
4524 	query_add_optional(data->query, data->nsd, &now);
4525 
4526 	/* Switch to the tcp write handler.  */
4527 	buffer_flip(data->query->packet);
4528 	data->query->tcplen = buffer_remaining(data->query->packet);
4529 #ifdef BIND8_STATS
4530 	/* Account the rcode & TC... */
4531 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4532 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4533 	if (TC(data->query->packet)) {
4534 		STATUP(data->nsd, truncated);
4535 		ZTATUP(data->nsd, data->query->zone, truncated);
4536 	}
4537 #endif /* BIND8_STATS */
4538 #ifdef USE_DNSTAP
4539 	/*
4540 	 * send the TCP response, with the earlier-found server (local) and client addresses, to the dnstap process
4541 	 */
4542 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4543 	log_addr("response to client", &data->query->addr);
4544 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4545 		data->query->addrlen, data->query->tcp, data->query->packet,
4546 		data->query->zone);
4547 #endif /* USE_DNSTAP */
4548 	data->bytes_transmitted = 0;
4549 
4550 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4551 
4552 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
4553 	handle_tls_writing(fd, EV_WRITE, data);
4554 }
4555 
4556 /** handle TLS writing of outgoing response */
4557 static void
4558 handle_tls_writing(int fd, short event, void* arg)
4559 {
4560 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4561 	ssize_t sent;
4562 	struct query *q = data->query;
4563 	/* static variable holding the reassembly buffer used to put the
4564 	 * TCP length in front of the packet, like writev does for plain TCP. */
4565 	static buffer_type* global_tls_temp_buffer = NULL;
4566 	buffer_type* write_buffer;
4567 	uint32_t now = 0;
4568 
4569 	if ((event & EV_TIMEOUT)) {
4570 		/* Connection timed out.  */
4571 		cleanup_tcp_handler(data);
4572 		return;
4573 	}
4574 
4575 	assert((event & EV_WRITE));
4576 
4577 	if(data->shake_state != tls_hs_none) {
4578 		if(!tls_handshake(data, fd, 1))
4579 			return;
4580 		if(data->shake_state != tls_hs_none)
4581 			return;
4582 	}
4583 
4584 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
4585 
4586 	/* If we are writing the start of a message, we must include the length;
4587 	 * this is done with a copy into write_buffer. */
4588 	write_buffer = NULL;
4589 	if (data->bytes_transmitted == 0) {
4590 		if(!global_tls_temp_buffer) {
4591 			/* gets deallocated when nsd shuts down from
4592 			 * nsd.region */
4593 			global_tls_temp_buffer = buffer_create(nsd.region,
4594 				QIOBUFSZ + sizeof(q->tcplen));
4595 			if (!global_tls_temp_buffer) {
4596 				return;
4597 			}
4598 		}
4599 		write_buffer = global_tls_temp_buffer;
4600 		buffer_clear(write_buffer);
4601 		buffer_write_u16(write_buffer, q->tcplen);
4602 		buffer_write(write_buffer, buffer_current(q->packet),
4603 			(int)buffer_remaining(q->packet));
4604 		buffer_flip(write_buffer);
4605 	} else {
4606 		write_buffer = q->packet;
4607 	}
4608 
4609 	/* Write the response */
4610 	ERR_clear_error();
4611 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4612 	if(sent <= 0) {
4613 		int want = SSL_get_error(data->tls, sent);
4614 		if(want == SSL_ERROR_ZERO_RETURN) {
4615 			cleanup_tcp_handler(data);
4616 			/* closed */
4617 		} else if(want == SSL_ERROR_WANT_READ) {
4618 			/* switch back to reading */
4619 			data->shake_state = tls_hs_read_event;
4620 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4621 		} else if(want != SSL_ERROR_WANT_WRITE) {
4622 			cleanup_tcp_handler(data);
4623 			log_crypto_err("could not SSL_write");
4624 		}
4625 		return;
4626 	}
4627 
4628 	buffer_skip(write_buffer, sent);
4629 	if(buffer_remaining(write_buffer) != 0) {
4630 		/* Not all was sent; if the temp buffer was used, sync up the real packet buffer to match. */
4631 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4632 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4633 		}
4634 	}
4635 
4636 	data->bytes_transmitted += sent;
4637 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4638 		/*
4639 		 * Still more data to write when socket becomes
4640 		 * writable again.
4641 		 */
4642 		return;
4643 	}
4644 
4645 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4646 
4647 	if (data->query_state == QUERY_IN_AXFR ||
4648 		data->query_state == QUERY_IN_IXFR) {
4649 		/* Continue processing AXFR and writing back results.  */
4650 		buffer_clear(q->packet);
4651 		if(data->query_state == QUERY_IN_AXFR)
4652 			data->query_state = query_axfr(data->nsd, q, 0);
4653 		else data->query_state = query_ixfr(data->nsd, q);
4654 		if (data->query_state != QUERY_PROCESSED) {
4655 			query_add_optional(data->query, data->nsd, &now);
4656 
4657 			/* Reset data. */
4658 			buffer_flip(q->packet);
4659 			q->tcplen = buffer_remaining(q->packet);
4660 			data->bytes_transmitted = 0;
4661 			/* Reset to writing mode.  */
4662 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4663 
4664 			/*
4665 			 * Write data if/when the socket is writable
4666 			 * again.
4667 			 */
4668 			return;
4669 		}
4670 	}
4671 
4672 	/*
4673 	 * Done sending, wait for the next request to arrive on the
4674 	 * TCP socket by installing the TCP read handler.
4675 	 */
4676 	if ((data->nsd->tcp_query_count > 0 &&
4677 		data->query_count >= data->nsd->tcp_query_count) ||
4678 		data->tcp_no_more_queries) {
4679 
4680 		(void) shutdown(fd, SHUT_WR);
4681 	}
4682 
4683 	data->bytes_transmitted = 0;
4684 
4685 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4686 }
4687 #endif
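
/*
 * Illustrative sketch, not part of the original source: SSL_write() has no
 * gather-write counterpart to writev(), so handle_tls_writing above copies
 * the two-octet length and the packet into one temporary buffer and sends
 * that instead. Stripped of the buffer_type plumbing, the copy step is:
 */
#if 0
static size_t example_frame_into(unsigned char* out, size_t outlen,
	const unsigned char* pkt, uint16_t len)
{
	if(outlen < (size_t)len + 2)
		return 0; /* does not fit */
	out[0] = (unsigned char)(len >> 8); /* network-order length */
	out[1] = (unsigned char)(len & 0xff);
	memcpy(out + 2, pkt, len);
	return (size_t)len + 2; /* octets ready for a single send */
}
#endif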
4688 
4689 static void
4690 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4691 	void* ATTR_UNUSED(arg))
4692 {
4693 	if(slowaccept) {
4694 		configure_handler_event_types(EV_PERSIST | EV_READ);
4695 		slowaccept = 0;
4696 	}
4697 }
4698 
4699 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4700 {
4701 #ifndef HAVE_ACCEPT4
4702 	int s = accept(fd, addr, addrlen);
4703 	if (s != -1) {
4704 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4705 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4706 			close(s);
4707 			s = -1;
4708 			errno=EINTR; /* set errno to EINTR so that the later
4709 				code that checks the accept result omits the
4710 				error printout for this failure */
4711 		}
4712 	}
4713 	return s;
4714 #else
4715 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4716 #endif /* HAVE_ACCEPT4 */
4717 }
4718 
4719 /*
4720  * Handle an incoming TCP connection.  The connection is accepted and
4721  * a new TCP reader event handler is added.  The TCP handler
4722  * is responsible for cleanup when the connection is closed.
4723  */
4724 static void
4725 handle_tcp_accept(int fd, short event, void* arg)
4726 {
4727 	struct tcp_accept_handler_data *data
4728 		= (struct tcp_accept_handler_data *) arg;
4729 	int s;
4730 	int reject = 0;
4731 	struct tcp_handler_data *tcp_data;
4732 	region_type *tcp_region;
4733 #ifdef INET6
4734 	struct sockaddr_storage addr;
4735 #else
4736 	struct sockaddr_in addr;
4737 #endif
4738 	socklen_t addrlen;
4739 	struct timeval timeout;
4740 
4741 	if (!(event & EV_READ)) {
4742 		return;
4743 	}
4744 
4745 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4746 		reject = data->nsd->options->tcp_reject_overflow;
4747 		if (!reject) {
4748 			return;
4749 		}
4750 	}
4751 
4752 	/* Accept it... */
4753 	addrlen = sizeof(addr);
4754 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4755 	if (s == -1) {
4756 		/**
4757 		 * EMFILE and ENFILE signal that the limit of open
4758 		 * file descriptors has been reached. Pause accept().
4759 		 * EINTR is a signal interrupt. The others are various OS ways
4760 		 * of saying that the client has closed the connection.
4761 		 */
4762 		if (errno == EMFILE || errno == ENFILE) {
4763 			if (!slowaccept) {
4764 				/* disable accept events */
4765 				struct timeval tv;
4766 				configure_handler_event_types(0);
4767 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4768 				tv.tv_usec = 0L;
4769 				memset(&slowaccept_event, 0,
4770 					sizeof(slowaccept_event));
4771 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4772 					handle_slowaccept_timeout, NULL);
4773 				(void)event_base_set(data->event.ev_base,
4774 					&slowaccept_event);
4775 				(void)event_add(&slowaccept_event, &tv);
4776 				slowaccept = 1;
4777 				/* We don't want to spam the logs here */
4778 			}
4779 		} else if (errno != EINTR
4780 			&& errno != EWOULDBLOCK
4781 #ifdef ECONNABORTED
4782 			&& errno != ECONNABORTED
4783 #endif /* ECONNABORTED */
4784 #ifdef EPROTO
4785 			&& errno != EPROTO
4786 #endif /* EPROTO */
4787 			) {
4788 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4789 		}
4790 		return;
4791 	}
4792 
4793 	if (reject) {
4794 		shutdown(s, SHUT_RDWR);
4795 		close(s);
4796 		return;
4797 	}
4798 
4799 	/*
4800 	 * This region is deallocated when the TCP connection is
4801 	 * closed by the TCP handler.
4802 	 */
4803 	tcp_region = region_create(xalloc, free);
4804 	tcp_data = (struct tcp_handler_data *) region_alloc(
4805 		tcp_region, sizeof(struct tcp_handler_data));
4806 	tcp_data->region = tcp_region;
4807 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4808 		compression_table_size, compressed_dnames);
4809 	tcp_data->nsd = data->nsd;
4810 	tcp_data->query_count = 0;
4811 #ifdef HAVE_SSL
4812 	tcp_data->shake_state = tls_hs_none;
4813 	tcp_data->tls = NULL;
4814 #endif
4815 	tcp_data->prev = NULL;
4816 	tcp_data->next = NULL;
4817 
4818 	tcp_data->query_state = QUERY_PROCESSED;
4819 	tcp_data->bytes_transmitted = 0;
4820 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4821 	tcp_data->query->addrlen = addrlen;
4822 
4823 	tcp_data->tcp_no_more_queries = 0;
4824 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4825 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4826 		/* very busy, give smaller timeout */
4827 		tcp_data->tcp_timeout = 200;
4828 	}
4829 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4830 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4831 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
4832 
4833 #ifdef USE_DNSTAP
4834 	/* save the address of the connection */
4835 	tcp_data->socket = data->socket;
4836 #endif /* USE_DNSTAP */
4837 
4838 #ifdef HAVE_SSL
4839 	if (data->tls_accept) {
4840 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4841 		if(!tcp_data->tls) {
4842 			close(s);
4843 			return;
4844 		}
4845 		tcp_data->shake_state = tls_hs_read;
4846 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4847 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4848 			  handle_tls_reading, tcp_data);
4849 	} else {
4850 #endif
4851 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4852 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4853 			  handle_tcp_reading, tcp_data);
4854 #ifdef HAVE_SSL
4855 	}
4856 #endif
4857 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4858 		log_msg(LOG_ERR, "cannot set tcp event base");
4859 		close(s);
4860 		region_destroy(tcp_region);
4861 		return;
4862 	}
4863 	if(event_add(&tcp_data->event, &timeout) != 0) {
4864 		log_msg(LOG_ERR, "cannot add tcp to event base");
4865 		close(s);
4866 		region_destroy(tcp_region);
4867 		return;
4868 	}
4869 	if(tcp_active_list) {
4870 		tcp_active_list->prev = tcp_data;
4871 		tcp_data->next = tcp_active_list;
4872 	}
4873 	tcp_active_list = tcp_data;
4874 
4875 	/*
4876 	 * Keep track of the total number of TCP handlers installed so
4877 	 * we can stop accepting connections when the maximum number
4878 	 * of simultaneous TCP connections is reached.
4879 	 *
4880 	 * If tcp-reject-overflow is enabled, however, then we do not
4881 	 * change the handler event type; we keep it as-is and accept
4882 	 * overflow TCP connections only so that we can forcibly kill
4883 	 * them off.
4884 	 */
4885 	++data->nsd->current_tcp_count;
4886 	if (!data->nsd->options->tcp_reject_overflow &&
4887 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4888 	{
4889 		configure_handler_event_types(0);
4890 	}
4891 }
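
/*
 * Illustrative sketch, an assumption for clarity: the EMFILE/ENFILE branch
 * above is a classic accept() backoff, disable the accept events, arm a
 * one-shot timer, and let its callback re-enable them. The skeleton of
 * that arming step (hypothetical names, assumed two-second pause):
 */
#if 0
static void example_arm_backoff(struct event_base* base, struct event* timer,
	void (*reenable_cb)(int, short, void*))
{
	struct timeval tv = {2, 0}; /* assumed pause before accepting again */
	memset(timer, 0, sizeof(*timer));
	event_set(timer, -1, EV_TIMEOUT, reenable_cb, NULL);
	(void)event_base_set(base, timer);
	(void)event_add(timer, &tv); /* accepts stay off until this fires */
}
#endif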
4892 
4893 static void
4894 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4895 {
4896 	size_t i;
4897 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4898 	for (i = 0; i < nsd->child_count; ++i) {
4899 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4900 			if (write(nsd->children[i].child_fd,
4901 				&command,
4902 				sizeof(command)) == -1)
4903 			{
4904 				if(errno != EAGAIN && errno != EINTR)
4905 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4906 					(int) command,
4907 					(int) nsd->children[i].pid,
4908 					strerror(errno));
4909 			} else if (timeout > 0) {
4910 				(void)block_read(NULL,
4911 					nsd->children[i].child_fd,
4912 					&command, sizeof(command), timeout);
4913 			}
4914 			fsync(nsd->children[i].child_fd);
4915 			close(nsd->children[i].child_fd);
4916 			nsd->children[i].child_fd = -1;
4917 		}
4918 	}
4919 }
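
/*
 * Illustrative sketch, an assumption rather than the actual child-side
 * code: the parent above writes a sig_atomic_t command down each child's
 * fd, so the receiving end reads back the same fixed-size value.
 */
#if 0
static int example_read_command(int fd, sig_atomic_t* cmd)
{
	ssize_t r = read(fd, cmd, sizeof(*cmd));
	return r == (ssize_t)sizeof(*cmd); /* 1 when a full command arrived */
}
#endif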
4920 
4921 static void
4922 send_children_quit(struct nsd* nsd)
4923 {
4924 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4925 	send_children_command(nsd, NSD_QUIT, 0);
4926 }
4927 
4928 static void
4929 send_children_quit_and_wait(struct nsd* nsd)
4930 {
4931 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4932 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4933 }
4934 
4935 #ifdef BIND8_STATS
4936 static void
4937 set_children_stats(struct nsd* nsd)
4938 {
4939 	size_t i;
4940 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4941 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
4942 	for (i = 0; i < nsd->child_count; ++i) {
4943 		nsd->children[i].need_to_send_STATS = 1;
4944 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
4945 	}
4946 }
4947 #endif /* BIND8_STATS */
4948 
4949 static void
4950 configure_handler_event_types(short event_types)
4951 {
4952 	size_t i;
4953 
4954 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4955 		struct event* handler = &tcp_accept_handlers[i].event;
4956 		if(event_types) {
4957 			/* reassign */
4958 			int fd = handler->ev_fd;
4959 			struct event_base* base = handler->ev_base;
4960 			if(tcp_accept_handlers[i].event_added)
4961 				event_del(handler);
4962 			memset(handler, 0, sizeof(*handler));
4963 			event_set(handler, fd, event_types,
4964 				handle_tcp_accept, &tcp_accept_handlers[i]);
4965 			if(event_base_set(base, handler) != 0)
4966 				log_msg(LOG_ERR, "conhand: cannot event_base");
4967 			if(event_add(handler, NULL) != 0)
4968 				log_msg(LOG_ERR, "conhand: cannot event_add");
4969 			tcp_accept_handlers[i].event_added = 1;
4970 		} else {
4971 			/* remove */
4972 			if(tcp_accept_handlers[i].event_added) {
4973 				event_del(handler);
4974 				tcp_accept_handlers[i].event_added = 0;
4975 			}
4976 		}
4977 	}
4978 }
4979