/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
  #include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_DNSTAP
/*
 * log_addr() - print the contents of a sockaddr_in/sockaddr_in6 structure,
 * just as is done in Unbound via the same
 * log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
 */
static void
log_addr(const char* descr,
#ifdef INET6
	struct sockaddr_storage* addr
#else
	struct sockaddr_in* addr
#endif
	)
{
	char str_buf[64];
	if(verbosity < 6)
		return;
	if(
#ifdef INET6
		addr->ss_family == AF_INET
#else
		addr->sin_family == AF_INET
#endif
		) {
		struct sockaddr_in* s = (struct sockaddr_in*)addr;
		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
#ifdef INET6
	} else {
		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
#endif
	}
}
#endif /* USE_DNSTAP */

#ifdef USE_TCP_FASTOPEN
  #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
  #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
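  /* editorial note: in the Linux tcp_fastopen sysctl value, bit 0x1
   * enables client support and bit 0x2 enables server support, hence
   * the mask above (per the Linux ip-sysctl documentation) */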
#endif

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd        *nsd;
	struct nsd_socket *socket;
	struct event       event;
};

struct tcp_accept_handler_data {
	struct nsd        *nsd;
	struct nsd_socket *socket;
	int                event_added;
	struct event       event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int                tls_accept;
#endif
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int  msg_len;
};
#endif

static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
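
/*
 * Illustrative sketch (not part of NSD; names are hypothetical): how
 * arrays like msgs/iovecs above are typically primed for batched UDP
 * receives with recvmmsg(2).  The real initialization lives elsewhere
 * in this file, using the query buffers instead of flat arrays.
 */
#if 0
static void
prime_udp_batch(uint8_t bufs[NUM_RECV_PER_SELECT][512])
{
	int i;
	memset(msgs, 0, sizeof(msgs));
	for(i = 0; i < NUM_RECV_PER_SELECT; i++) {
		iovecs[i].iov_base = bufs[i];
		iovecs[i].iov_len = 512;
		msgs[i].msg_hdr.msg_iov = &iovecs[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
	}
	/* later: recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL)
	 * returns up to NUM_RECV_PER_SELECT datagrams in one call */
}
#endif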

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler.  When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef USE_DNSTAP
	/* the accept socket, used to find the proper service (local)
	 * address this connection is bound to. */
	struct nsd_socket *socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;
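
/*
 * Illustrative sketch (not part of NSD; names are hypothetical): the
 * resume-on-EAGAIN pattern described above, reduced to its essentials.
 * The position pointer stands in for bytes_transmitted.
 */
#if 0
static int
write_resumable(int fd, const uint8_t *buf, size_t len, size_t *pos)
{
	while(*pos < len) {
		ssize_t n = write(fd, buf + *pos, len - *pos);
		if(n == -1) {
			if(errno == EAGAIN || errno == EINTR)
				return 0; /* retry when the event fires again */
			return -1; /* hard error: discard the connection */
		}
		*pos += (size_t)n; /* remember how far we got */
	}
	return 1; /* complete */
}
#endif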

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets.  These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type.  This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit nonblocking, then close pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void) {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	close(tcp_fastopen_fp);
}
#endif

/*
 * Remove the specified pid from the list of child pids.  Returns -1 if
 * the pid is not in the list, child_num otherwise.  The field is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge");
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACHED */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm fires on the next whole period
	 * boundary (e.g. with a period of 3600, on the hour) */
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
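	/* a file must be grown to the full mapped size before it can be
	 * mmap'd: seek to sz-1 and write one byte so the backing pages
	 * exist, otherwise touching the mapping could fault with SIGBUS */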
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; we want to avoid both generations
 * writing to the same statistics arrays. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

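/* The set_* socket-option helpers below generally share one return
 * convention: 1 on success, 0 when the option is not available on this
 * platform (compiled out), and -1 when applying it failed. */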
724 
725 static int
726 set_cloexec(struct nsd_socket *sock)
727 {
728 	assert(sock != NULL);
729 
730 	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
731 		const char *socktype =
732 			sock->addr.ai_family == SOCK_DGRAM ? "udp" : "tcp";
733 		log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s",
734 			socktype, strerror(errno));
735 		return -1;
736 	}
737 
738 	return 1;
739 }
740 
741 static int
742 set_reuseport(struct nsd_socket *sock)
743 {
744 #ifdef SO_REUSEPORT
745 	int on = 1;
746 #ifdef SO_REUSEPORT_LB
747 	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
748 	 * SO_REUSEPORT on Linux. This is what the users want with the config
749 	 * option in nsd.conf; if we actually need local address and port reuse
750 	 * they'll also need to have SO_REUSEPORT set for them, assume it was
751 	 * _LB they want.
752 	 */
753 	int opt = SO_REUSEPORT_LB;
754 	static const char optname[] = "SO_REUSEPORT_LB";
755 #else /* !SO_REUSEPORT_LB */
756 	int opt = SO_REUSEPORT;
757 	static const char optname[] = "SO_REUSEPORT";
758 #endif /* SO_REUSEPORT_LB */
759 
760 	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
761 		return 1;
762 	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
763 		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
764 			optname, strerror(errno));
765 	}
766 	return -1;
767 #else
768 	(void)sock;
769 #endif /* SO_REUSEPORT */
770 
771 	return 0;
772 }
773 
774 static int
775 set_reuseaddr(struct nsd_socket *sock)
776 {
777 #ifdef SO_REUSEADDR
778 	int on = 1;
779 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
780 		return 1;
781 	}
782 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
783 		strerror(errno));
784 	return -1;
785 #endif /* SO_REUSEADDR */
786 	return 0;
787 }
788 
789 static int
790 set_rcvbuf(struct nsd_socket *sock, int rcv)
791 {
792 #ifdef SO_RCVBUF
793 #ifdef SO_RCVBUFFORCE
794 	if(0 == setsockopt(
795 		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
796 	{
797 		return 1;
798 	}
799 	if(errno == EPERM || errno == ENOBUFS) {
800 		return 0;
801 	}
802 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
803 		strerror(errno));
804 	return -1;
805 #else /* !SO_RCVBUFFORCE */
806 	if (0 == setsockopt(
807 		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
808 	{
809 		return 1;
810 	}
811 	if(errno == ENOSYS || errno == ENOBUFS) {
812 		return 0;
813 	}
814 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
815 		strerror(errno));
816 	return -1;
817 #endif /* SO_RCVBUFFORCE */
818 #endif /* SO_RCVBUF */
819 
820 	return 0;
821 }
822 
823 static int
824 set_sndbuf(struct nsd_socket *sock, int snd)
825 {
826 #ifdef SO_SNDBUF
827 #ifdef SO_SNDBUFFORCE
828 	if(0 == setsockopt(
829 		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
830 	{
831 		return 1;
832 	}
833 	if(errno == EPERM || errno == ENOBUFS) {
834 		return 0;
835 	}
836 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
837 		strerror(errno));
838 	return -1;
839 #else /* !SO_SNDBUFFORCE */
840 	if(0 == setsockopt(
841 		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
842 	{
843 		return 1;
844 	}
845 	if(errno == ENOSYS || errno == ENOBUFS) {
846 		return 0;
847 	}
848 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
849 		strerror(errno));
850 	return -1;
851 #endif /* SO_SNDBUFFORCE */
852 #endif /* SO_SNDBUF */
853 
854 	return 0;
855 }
856 
857 static int
858 set_nonblock(struct nsd_socket *sock)
859 {
860 	const char *socktype =
861 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
862 
863 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
864 		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
865 			socktype, strerror(errno));
866 		return -1;
867 	}
868 
869 	return 1;
870 }
871 
872 #ifdef INET6
static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef IPV6_V6ONLY
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_V6ONLY */

	return 0;
}
#endif /* INET6 */

#ifdef INET6
static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network. Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
	 * to the MIN MTU to get the same.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_USE_MIN_MTU || IPV6_MTU */

	return 0;
}
#endif /* INET6 */

static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed
	 * if and only if the packet size exceeds the outgoing interface MTU
	 * or the packet encounters a smaller MTU link in the network. This
	 * mitigates DNS fragmentation attacks by preventing forged PMTU
	 * information. FreeBSD already has the same semantics without
	 * setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platforms manage this setting
	differently for different address families (IPv4 vs IPv6).
	The scandalous preprocessor blob below abstracts such variability
	in a way that leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
#	define NSD_SOCKET_OPTION_TRANSPARENT 			IP_TRANSPARENT
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 		"IP_TRANSPARENT"
/* as of 2020-01, Linux does not support this on IPv6 programmatically */
#elif defined(SO_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT			SO_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 		"SO_BINDANY"
#elif defined(IP_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT 			IP_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT6			IPV6_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6		IPPROTO_IPV6
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 		"IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
#	endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
	return -1;
#endif

	return 0;
}

static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. Limit is
	 * a defense against IP spoofing attacks as suggested in RFC7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* The macOS implementation only supports a qlen of 1 via this call.
	 * The actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
				 "; this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
		                 "SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
	int rcv = 1*1024*1024, snd = 1*1024*1024;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(nsd->options->receive_buffer_size > 0)
		rcv = nsd->options->receive_buffer_size;
	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(nsd->options->send_buffer_size > 0)
		snd = nsd->options->send_buffer_size;
	if(set_sndbuf(sock, snd) == -1)
		return -1;
#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable.
	 */
	set_nonblock(sock);

	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}

static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
			                     "not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;
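		/* e.g. with 2 configured interfaces and reuseport: 4 in
		 * nsd.conf, ifs becomes 8: entries 0..1 keep the original
		 * sockets and 2..7 get fresh UDP sockets bound to the same
		 * addresses, while the TCP descriptors are shared */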

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			memset(&nsd->udp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
			 * This means we should not close the TCP sockets used
			 * by other servers in reuseport enabled mode, in
			 * server_child().
			 */
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	return 0;
}

/*
 * Prepare the server for take off.
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
#  ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
#  endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef	BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially.  */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

static void
server_close_socket(struct nsd_socket *sock)
{
	if(sock->s != -1) {
		close(sock->s);
		sock->s = -1;
	}
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		server_close_socket(&sockets[i]);
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_close_udb(nsd->db); /* keeps mmap */
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}


void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* NOTREACHED */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 *   parent fills one taskdb with soas, xfrd fills other with expires.
	 *   then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
	 *   expire notifications can be sent back via a normal reload later
	 *   (xfrd will wait for current running reload to finish if any).
	 */
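	/* timeline for the normal (!shortsoa) case:
	 *   1. fill our taskdb (task[mytask]) with SOA INFO for all zones
	 *   2. wait for NSD_RELOAD from xfrd (the task-ready signal)
	 *   3. sync the taskdb and reply NSD_RELOAD_DONE plus our pid
	 *   4. switch to the other taskdb, process the expire entries
	 *      xfrd left there, and reply NSD_RELOAD_DONE again
	 */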
1689 	sig_atomic_t cmd = 0;
1690 	pid_t mypid;
1691 	int xfrd_sock = nsd->xfrd_listener->fd;
1692 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1693 	udb_ptr t;
1694 	if(!shortsoa) {
1695 		if(nsd->signal_hint_shutdown) {
1696 		shutdown:
1697 			log_msg(LOG_WARNING, "signal received, shutting down...");
1698 			server_close_all_sockets(nsd->udp, nsd->ifs);
1699 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1700 #ifdef HAVE_SSL
1701 			daemon_remote_close(nsd->rc);
1702 #endif
1703 			/* Unlink it if possible... */
1704 			unlinkpid(nsd->pidfile);
1705 			unlink(nsd->task[0]->fname);
1706 			unlink(nsd->task[1]->fname);
1707 #ifdef USE_ZONE_STATS
1708 			unlink(nsd->zonestatfname[0]);
1709 			unlink(nsd->zonestatfname[1]);
1710 #endif
1711 			/* write the nsd.db to disk, wait for it to complete */
1712 			udb_base_sync(nsd->db->udb, 1);
1713 			udb_base_close(nsd->db->udb);
1714 			server_shutdown(nsd);
1715 			/* ENOTREACH */
1716 			exit(0);
1717 		}
1718 	}
1719 	if(shortsoa) {
1720 		/* put SOA in xfrd task because mytask may be in use */
1721 		taskudb = nsd->task[1-nsd->mytask];
1722 	}
1723 
1724 	add_all_soa_to_task(nsd, taskudb);
1725 	if(!shortsoa) {
1726 		/* wait for xfrd to signal task is ready, RELOAD signal */
1727 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1728 			cmd != NSD_RELOAD) {
1729 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1730 			exit(1);
1731 		}
1732 		if(nsd->signal_hint_shutdown) {
1733 			goto shutdown;
1734 		}
1735 	}
1736 	/* give xfrd our task, signal it with RELOAD_DONE */
1737 	task_process_sync(taskudb);
1738 	cmd = NSD_RELOAD_DONE;
1739 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1740 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1741 			(int)nsd->pid, strerror(errno));
1742 	}
1743 	mypid = getpid();
1744 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1745 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1746 			strerror(errno));
1747 	}
1748 
1749 	if(!shortsoa) {
1750 		/* process the xfrd task works (expiry data) */
1751 		nsd->mytask = 1 - nsd->mytask;
1752 		taskudb = nsd->task[nsd->mytask];
1753 		task_remap(taskudb);
1754 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1755 		while(!udb_ptr_is_null(&t)) {
1756 			task_process_expire(nsd->db, TASKLIST(&t));
1757 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1758 		}
1759 		udb_ptr_unlink(&t, taskudb);
1760 		task_clear(taskudb);
1761 
1762 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1763 		cmd = NSD_RELOAD_DONE;
1764 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1765 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1766 				(int)nsd->pid, strerror(errno));
1767 		}
1768 	}
1769 }
1770 
1771 #ifdef HAVE_SSL
1772 static void
1773 log_crypto_from_err(const char* str, unsigned long err)
1774 {
1775 	/* error:[error code]:[library name]:[function name]:[reason string] */
1776 	char buf[128];
1777 	unsigned long e;
1778 	ERR_error_string_n(err, buf, sizeof(buf));
1779 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1780 	while( (e=ERR_get_error()) ) {
1781 		ERR_error_string_n(e, buf, sizeof(buf));
1782 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1783 	}
1784 }
1785 
1786 void
1787 log_crypto_err(const char* str)
1788 {
1789 	log_crypto_from_err(str, ERR_get_error());
1790 }
1791 
1792 /** true if the ssl handshake error has to be squelched from the logs */
1793 static int
1794 squelch_err_ssl_handshake(unsigned long err)
1795 {
1796 	if(verbosity >= 3)
1797 		return 0; /* only squelch on low verbosity */
1798 	/* this is very specific, we could filter on ERR_GET_REASON()
1799 	 * (the third element in ERR_PACK) */
1800 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1801 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1802 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1803 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1804 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1805 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1806 #endif
1807 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1808 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1809 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1810 #  ifdef SSL_R_VERSION_TOO_LOW
1811 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1812 #  endif
1813 #endif
1814 		)
1815 		return 1;
1816 	return 0;
1817 }
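
/*
 * Editor's sketch of the coarser filter the comment above alludes to,
 * matching on the reason code alone instead of on full ERR_PACK
 * triples (an illustration, not the code's actual behaviour):
 *
 *   switch(ERR_GET_REASON(err)) {
 *   case SSL_R_HTTP_REQUEST:
 *   case SSL_R_HTTPS_PROXY_REQUEST:
 *   case SSL_R_WRONG_VERSION_NUMBER:
 *           return 1;
 *   }
 *   return 0;
 */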
1818 
1819 void
1820 perform_openssl_init(void)
1821 {
1822 	/* init SSL library */
1823 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1824 	ERR_load_crypto_strings();
1825 #endif
1826 	ERR_load_SSL_strings();
1827 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1828 	OpenSSL_add_all_algorithms();
1829 #else
1830 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1831 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1832 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1833 #endif
1834 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1835 	(void)SSL_library_init();
1836 #else
1837 	OPENSSL_init_ssl(0, NULL);
1838 #endif
1839 
1840 	if(!RAND_status()) {
1841 		/* try to seed it */
1842 		unsigned char buf[256];
1843 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1844 		size_t i;
1845 		v = seed;
1846 		for(i=0; i<256/sizeof(v); i++) {
1847 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1848 			v = v*seed + (unsigned int)i;
1849 		}
1850 		RAND_seed(buf, 256);
1851 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1852 	}
1853 }
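
/*
 * Editor's note: the fallback above seeds from time and pid only,
 * which is predictable.  A minimal sketch of a stronger fallback,
 * assuming getrandom(2) is available on the platform:
 *
 *   unsigned char rnd[32];
 *   if(getrandom(rnd, sizeof(rnd), 0) == (ssize_t)sizeof(rnd))
 *           RAND_seed(rnd, sizeof(rnd));
 */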
1854 
1855 static int
1856 get_ocsp(char *filename, unsigned char **ocsp)
1857 {
1858 	BIO *bio;
1859 	OCSP_RESPONSE *response;
1860 	int len = -1;
1861 	unsigned char *p, *buf;
1862 	assert(filename);
1863 
1864 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1865 		log_crypto_err("get_ocsp: BIO_new_file failed");
1866 		return -1;
1867 	}
1868 
1869 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1870 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1871 		BIO_free(bio);
1872 		return -1;
1873 	}
1874 
1875 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1876 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1877 		OCSP_RESPONSE_free(response);
1878 		BIO_free(bio);
1879 		return -1;
1880 	}
1881 
1882 	if ((buf = malloc((size_t) len)) == NULL) {
1883 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1884 		OCSP_RESPONSE_free(response);
1885 		BIO_free(bio);
1886 		return -1;
1887 	}
1888 
1889 	p = buf;
1890 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1891 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1892 		free(buf);
1893 		OCSP_RESPONSE_free(response);
1894 		BIO_free(bio);
1895 		return -1;
1896 	}
1897 
1898 	OCSP_RESPONSE_free(response);
1899 	BIO_free(bio);
1900 
1901 	*ocsp = buf;
1902 	return len;
1903 }
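
/*
 * The two i2d_OCSP_RESPONSE() calls above follow the standard OpenSSL
 * i2d idiom: with a NULL output pointer i2d returns the DER length
 * only; the second call writes the bytes and advances the pointer.
 * Generic sketch, where i2d_X stands for any i2d_* encoder:
 *
 *   int len = i2d_X(obj, NULL);        // query the encoded length
 *   unsigned char *buf = malloc(len);
 *   unsigned char *p = buf;            // i2d advances p, keep buf
 *   len = i2d_X(obj, &p);              // write DER into buf
 */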
1904 
1905 /* further setup ssl ctx after the keys are loaded */
1906 static void
1907 listen_sslctx_setup_2(void* ctxt)
1908 {
1909 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
1910 	(void)ctx;
1911 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1912 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1913 		/* ENOTREACH */
1914 		log_crypto_err("Error in SSL_CTX_set_ecdh_auto, not enabling ECDHE");
1915 	}
1916 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1917 	if(1) {
1918 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1919 		if (!ecdh) {
1920 			log_crypto_err("could not find p256, not enabling ECDHE");
1921 		} else {
1922 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1923 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1924 			}
1925 			EC_KEY_free (ecdh);
1926 		}
1927 	}
1928 #endif
1929 }
1930 
1931 static int
1932 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1933 {
1934 	if(ocspdata) {
1935 		unsigned char *p;
1936 		if ((p=malloc(ocspdata_len)) == NULL) {
1937 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1938 			return SSL_TLSEXT_ERR_NOACK;
1939 		}
1940 		memcpy(p, ocspdata, ocspdata_len);
1941 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1942 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1943 			free(p);
1944 			return SSL_TLSEXT_ERR_NOACK;
1945 		}
1946 		return SSL_TLSEXT_ERR_OK;
1947 	} else {
1948 		return SSL_TLSEXT_ERR_NOACK;
1949 	}
1950 }
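
/*
 * Editor's note: the callback above hands OpenSSL a fresh copy of
 * ocspdata on every handshake because SSL_set_tlsext_status_ocsp_resp
 * takes ownership of the buffer and frees it itself; passing the
 * shared ocspdata pointer directly would lead to a double free.
 */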
1951 
1952 SSL_CTX*
1953 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1954 {
1955 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1956 	if(!ctx) {
1957 		log_crypto_err("could not SSL_CTX_new");
1958 		return NULL;
1959 	}
1960 	/* no SSLv2, SSLv3 because they have defects */
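	/* SSL_CTX_set_options() returns the updated option bitmask, so
	 * masking the result checks that each option really took effect */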
1961 #if SSL_OP_NO_SSLv2 != 0
1962 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1963 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1964 		SSL_CTX_free(ctx);
1965 		return NULL;
1966 	}
1967 #endif
1968 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1969 		!= SSL_OP_NO_SSLv3){
1970 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1971 		SSL_CTX_free(ctx);
1972 		return NULL;
1973 	}
1974 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1975 	/* if we have tls 1.1 disable 1.0 */
1976 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1977 		!= SSL_OP_NO_TLSv1){
1978 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1979 		SSL_CTX_free(ctx);
1980 		return NULL;
1981 	}
1982 #endif
1983 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1984 	/* if we have tls 1.2 disable 1.1 */
1985 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1986 		!= SSL_OP_NO_TLSv1_1){
1987 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
1988 		SSL_CTX_free(ctx);
1989 		return NULL;
1990 	}
1991 #endif
1992 #if defined(SSL_OP_NO_RENEGOTIATION)
1993 	/* disable client renegotiation */
1994 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
1995 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
1996 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
1997 		SSL_CTX_free(ctx);
1998 		return NULL;
1999 	}
2000 #endif
2001 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2002 	/* if we have sha256, set the cipher list to have no known vulns */
2003 	if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2004 		log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2005 #endif
2006 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2007 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2008 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
2009 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2010 		SSL_CTX_free(ctx);
2011 		return NULL;
2012 	}
2013 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2014 	SSL_CTX_set_security_level(ctx, 0);
2015 #endif
2016 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2017 		log_msg(LOG_ERR, "error for cert file: %s", pem);
2018 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2019 		SSL_CTX_free(ctx);
2020 		return NULL;
2021 	}
2022 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2023 		log_msg(LOG_ERR, "error for private key file: %s", key);
2024 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2025 		SSL_CTX_free(ctx);
2026 		return NULL;
2027 	}
2028 	if(!SSL_CTX_check_private_key(ctx)) {
2029 		log_msg(LOG_ERR, "error for key file: %s", key);
2030 		log_crypto_err("Error in SSL_CTX check_private_key");
2031 		SSL_CTX_free(ctx);
2032 		return NULL;
2033 	}
2034 	listen_sslctx_setup_2(ctx);
2035 	if(verifypem && verifypem[0]) {
2036 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2037 			log_crypto_err("Error in SSL_CTX verify locations");
2038 			SSL_CTX_free(ctx);
2039 			return NULL;
2040 		}
2041 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2042 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2043 	}
2044 	return ctx;
2045 }
2046 
2047 SSL_CTX*
2048 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2049 {
2050 	char *key, *pem;
2051 	SSL_CTX *ctx;
2052 
2053 	key = nsd->options->tls_service_key;
2054 	pem = nsd->options->tls_service_pem;
2055 	if(!key || key[0] == 0) {
2056 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2057 		return NULL;
2058 	}
2059 	if(!pem || pem[0] == 0) {
2060 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2061 		return NULL;
2062 	}
2063 
2064 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL, but
2065 	 * draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
2066 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2067 	if(!ctx) {
2068 		log_msg(LOG_ERR, "could not setup server TLS context");
2069 		return NULL;
2070 	}
2071 	if(ocspfile && ocspfile[0]) {
2072 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2073 			log_crypto_err("Error reading OCSPfile");
2074 			SSL_CTX_free(ctx);
2075 			return NULL;
2076 		} else {
2077 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2078 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2079 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2080 				SSL_CTX_free(ctx);
2081 				return NULL;
2082 			}
2083 		}
2084 	}
2085 	return ctx;
2086 }
2087 
2088 /* check if tcp_accept_handler_data is created for the dedicated TLS port */
2089 int
2090 using_tls_port(struct sockaddr* addr, const char* tls_port)
2091 {
2092 	in_port_t port = 0;
2093 
2094 	if (addr->sa_family == AF_INET)
2095 		port = ((struct sockaddr_in*)addr)->sin_port;
2096 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2097 	else
2098 		port = ((struct sockaddr_in6*)addr)->sin6_port;
2099 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2100 	if (atoi(tls_port) == ntohs(port))
2101 		return 1;
2102 
2103 	return 0;
2104 }
2105 #endif
2106 
2107 /* pass timeout=-1 to block, else the timeout in seconds; returns size, 0 (closed), -1 (error), or -2 (timeout) */
2108 ssize_t
2109 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2110 {
2111 	uint8_t* buf = (uint8_t*) p;
2112 	ssize_t total = 0;
2113 	struct pollfd fd;
2114 	memset(&fd, 0, sizeof(fd));
2115 	fd.fd = s;
2116 	fd.events = POLLIN;
2117 
2118 	while(total < sz) {
2119 		ssize_t ret;
2120 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2121 		if(ret == -1) {
2122 			if(errno == EAGAIN)
2123 				/* blocking read */
2124 				continue;
2125 			if(errno == EINTR) {
2126 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2127 					return -1;
2128 				/* other signals can be handled later */
2129 				continue;
2130 			}
2131 			/* some error */
2132 			return -1;
2133 		}
2134 		if(ret == 0) {
2135 			/* operation timed out */
2136 			return -2;
2137 		}
2138 		ret = read(s, buf+total, sz-total);
2139 		if(ret == -1) {
2140 			if(errno == EAGAIN)
2141 				/* blocking read */
2142 				continue;
2143 			if(errno == EINTR) {
2144 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2145 					return -1;
2146 				/* other signals can be handled later */
2147 				continue;
2148 			}
2149 			/* some error */
2150 			return -1;
2151 		}
2152 		if(ret == 0) {
2153 			/* closed connection! */
2154 			return 0;
2155 		}
2156 		total += ret;
2157 	}
2158 	return total;
2159 }
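
/*
 * Editor's sketch of the block_read() contract for callers; fd and cmd
 * are hypothetical names:
 *
 *   sig_atomic_t cmd = 0;
 *   ssize_t r = block_read(nsd, fd, &cmd, sizeof(cmd), 5);
 *   if(r == sizeof(cmd))  ... full command read within 5 seconds
 *   else if(r == 0)       ... peer closed the connection
 *   else if(r == -2)      ... poll timed out
 *   else                  ... r is -1: error or shutdown signal hint
 */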
2160 
2161 static void
2162 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2163 {
2164 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2165 	udb_ptr t, next;
2166 	udb_base* u = nsd->task[nsd->mytask];
2167 	udb_ptr_init(&next, u);
2168 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2169 	udb_base_set_userdata(u, 0);
2170 	while(!udb_ptr_is_null(&t)) {
2171 		/* store next in list so this one can be deleted or reused */
2172 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2173 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2174 
2175 		/* process task t */
2176 		/* append results for task t and update last_task */
2177 		task_process_in_reload(nsd, u, last_task, &t);
2178 
2179 		/* go to next */
2180 		udb_ptr_set_ptr(&t, u, &next);
2181 
2182 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2183 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2184 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2185 			if(cmd == NSD_QUIT) {
2186 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2187 				/* sync to disk (if needed) */
2188 				udb_base_sync(nsd->db->udb, 0);
2189 				/* unlink files of remainder of tasks */
2190 				while(!udb_ptr_is_null(&t)) {
2191 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2192 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2193 					}
2194 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2195 				}
2196 				udb_ptr_unlink(&t, u);
2197 				udb_ptr_unlink(&next, u);
2198 				exit(0);
2199 			}
2200 		}
2201 
2202 	}
2203 	udb_ptr_unlink(&t, u);
2204 	udb_ptr_unlink(&next, u);
2205 }
2206 
2207 #ifdef BIND8_STATS
2208 static void
2209 parent_send_stats(struct nsd* nsd, int cmdfd)
2210 {
2211 	size_t i;
2212 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2213 		log_msg(LOG_ERR, "could not write stats to reload");
2214 		return;
2215 	}
2216 	for(i=0; i<nsd->child_count; i++)
2217 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2218 			sizeof(stc_type))) {
2219 			log_msg(LOG_ERR, "could not write stats to reload");
2220 			return;
2221 		}
2222 }
2223 
2224 static void
2225 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2226 {
2227 	struct nsdst s;
2228 	stc_type* p;
2229 	size_t i;
2230 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2231 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2232 		log_msg(LOG_ERR, "could not read stats from oldpar");
2233 		return;
2234 	}
2235 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2236 	s.db_mem = region_get_mem(nsd->db->region);
2237 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2238 		nsd->child_count);
2239 	if(!p) return;
2240 	for(i=0; i<nsd->child_count; i++) {
2241 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2242 			sizeof(stc_type))
2243 			return;
2244 	}
2245 }
2246 #endif /* BIND8_STATS */
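
/*
 * Editor's note on the stats handoff above: parent_send_stats() writes
 * one struct nsdst followed by child_count stc_type query counters;
 * reload_do_stats() reads them back in the same order, patches in the
 * database sizes and appends the result to the task list for xfrd.
 */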
2247 
2248 /*
2249  * Reload the database, stop the parent, re-fork the children and
2250  * continue as server_main.
2251  */
2252 static void
2253 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2254 	int cmdsocket)
2255 {
2256 	pid_t mypid;
2257 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2258 	int ret;
2259 	udb_ptr last_task;
2260 	struct sigaction old_sigchld, ign_sigchld;
2261 	/* ignore SIGCHLD from the previous server_main that used this pid */
2262 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2263 	ign_sigchld.sa_handler = SIG_IGN;
2264 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2265 
2266 #ifdef HAVE_SETPROCTITLE
2267 	setproctitle("main");
2268 #endif
2269 #ifdef HAVE_CPUSET_T
2270 	if(nsd->use_cpu_affinity) {
2271 		set_cpu_affinity(nsd->cpuset);
2272 	}
2273 #endif
2274 
2275 	/* see what tasks we got from xfrd */
2276 	task_remap(nsd->task[nsd->mytask]);
2277 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2278 	udb_compact_inhibited(nsd->db->udb, 1);
2279 	reload_process_tasks(nsd, &last_task, cmdsocket);
2280 	udb_compact_inhibited(nsd->db->udb, 0);
2281 	udb_compact(nsd->db->udb);
2282 
2283 #ifndef NDEBUG
2284 	if(nsd_debug_level >= 1)
2285 		region_log_stats(nsd->db->region);
2286 #endif /* NDEBUG */
2287 	/* sync to disk (if needed) */
2288 	udb_base_sync(nsd->db->udb, 0);
2289 
2290 	initialize_dname_compression_tables(nsd);
2291 
2292 #ifdef BIND8_STATS
2293 	/* Restart dumping stats if required.  */
2294 	time(&nsd->st.boot);
2295 	set_bind8_alarm(nsd);
2296 #endif
2297 #ifdef USE_ZONE_STATS
2298 	server_zonestat_realloc(nsd); /* realloc for new children */
2299 	server_zonestat_switch(nsd);
2300 #endif
2301 
2302 	/* listen for the signals of failed children again */
2303 	sigaction(SIGCHLD, &old_sigchld, NULL);
2304 #ifdef USE_DNSTAP
2305 	if (nsd->dt_collector) {
2306 		int *swap_fd_send;
2307 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2308 		/* Swap fd_send with fd_swap so the old and the new server
2309 		 * children will not write to the same pipe ends simultaneously */
2310 		swap_fd_send = nsd->dt_collector_fd_send;
2311 		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2312 		nsd->dt_collector_fd_swap = swap_fd_send;
2313 
2314 	}
2315 #endif
2316 	/* Start new child processes */
2317 	if (server_start_children(nsd, server_region, netio, &nsd->
2318 		xfrd_listener->fd) != 0) {
2319 		send_children_quit(nsd);
2320 		exit(1);
2321 	}
2322 
2323 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2324 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2325 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2326 		if(cmd == NSD_QUIT) {
2327 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2328 			send_children_quit(nsd);
2329 			exit(0);
2330 		}
2331 	}
2332 
2333 	/* Send quit command to parent: blocking, wait for receipt. */
2334 	do {
2335 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2336 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2337 		{
2338 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2339 				strerror(errno));
2340 		}
2341 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2342 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2343 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2344 			RELOAD_SYNC_TIMEOUT);
2345 		if(ret == -2) {
2346 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2347 		}
2348 	} while (ret == -2);
2349 	if(ret == -1) {
2350 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2351 			strerror(errno));
2352 	}
2353 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2354 	if(cmd == NSD_QUIT) {
2355 		/* small race condition possible here, parent got quit cmd. */
2356 		send_children_quit(nsd);
2357 		exit(1);
2358 	}
2359 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2360 #ifdef BIND8_STATS
2361 	reload_do_stats(cmdsocket, nsd, &last_task);
2362 #endif
2363 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2364 	task_process_sync(nsd->task[nsd->mytask]);
2365 #ifdef USE_ZONE_STATS
2366 	server_zonestat_realloc(nsd); /* realloc for next children */
2367 #endif
2368 
2369 	/* send soainfo to the xfrd process, signal it that reload is done,
2370 	 * it picks up the taskudb */
2371 	cmd = NSD_RELOAD_DONE;
2372 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2373 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2374 			strerror(errno));
2375 	}
2376 	mypid = getpid();
2377 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2378 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2379 			strerror(errno));
2380 	}
2381 
2382 	/* try to reopen file */
2383 	if (nsd->file_rotation_ok)
2384 		log_reopen(nsd->log_filename, 1);
2385 	/* exit reload, continue as new server_main */
2386 }
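
/*
 * Editor's sketch of the quit handshake implemented above, between the
 * reload process (this function) and the old main over cmdsocket:
 *
 *   reload   -> old main : NSD_QUIT_SYNC         retried while reads time out
 *   old main -> reload   : NSD_RELOAD            ack; old main really quits
 *   old main -> reload   : statistics            only with BIND8_STATS
 *   reload   -> xfrd     : NSD_RELOAD_DONE, pid  reload becomes the new main
 */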
2387 
2388 /*
2389  * Get the mode depending on the signal hints that have been received.
2390  * Multiple signal hints can be received and will be handled in turn.
2391  */
2392 static sig_atomic_t
2393 server_signal_mode(struct nsd *nsd)
2394 {
2395 	if(nsd->signal_hint_quit) {
2396 		nsd->signal_hint_quit = 0;
2397 		return NSD_QUIT;
2398 	}
2399 	else if(nsd->signal_hint_shutdown) {
2400 		nsd->signal_hint_shutdown = 0;
2401 		return NSD_SHUTDOWN;
2402 	}
2403 	else if(nsd->signal_hint_child) {
2404 		nsd->signal_hint_child = 0;
2405 		return NSD_REAP_CHILDREN;
2406 	}
2407 	else if(nsd->signal_hint_reload) {
2408 		nsd->signal_hint_reload = 0;
2409 		return NSD_RELOAD;
2410 	}
2411 	else if(nsd->signal_hint_reload_hup) {
2412 		nsd->signal_hint_reload_hup = 0;
2413 		return NSD_RELOAD_REQ;
2414 	}
2415 	else if(nsd->signal_hint_stats) {
2416 		nsd->signal_hint_stats = 0;
2417 #ifdef BIND8_STATS
2418 		set_bind8_alarm(nsd);
2419 #endif
2420 		return NSD_STATS;
2421 	}
2422 	else if(nsd->signal_hint_statsusr) {
2423 		nsd->signal_hint_statsusr = 0;
2424 		return NSD_STATS;
2425 	}
2426 	return NSD_RUN;
2427 }
2428 
2429 /*
2430  * The main server simply waits for signals and child processes to
2431  * terminate.  Child processes are restarted as necessary.
2432  */
2433 void
2434 server_main(struct nsd *nsd)
2435 {
2436 	region_type *server_region = region_create(xalloc, free);
2437 	netio_type *netio = netio_create(server_region);
2438 	netio_handler_type reload_listener;
2439 	int reload_sockets[2] = {-1, -1};
2440 	struct timespec timeout_spec;
2441 	int status;
2442 	pid_t child_pid;
2443 	pid_t reload_pid = -1;
2444 	sig_atomic_t mode;
2445 
2446 	/* Ensure we are the main process */
2447 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2448 
2449 	/* Add listener for the XFRD process */
2450 	netio_add_handler(netio, nsd->xfrd_listener);
2451 
2452 	/* Start the child processes that handle incoming queries */
2453 	if (server_start_children(nsd, server_region, netio,
2454 		&nsd->xfrd_listener->fd) != 0) {
2455 		send_children_quit(nsd);
2456 		exit(1);
2457 	}
2458 	reload_listener.fd = -1;
2459 
2460 	/* this_child MUST be 0, because this is the parent process */
2461 	assert(nsd->this_child == 0);
2462 
2463 	/* Run the server until we get a shutdown signal */
2464 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2465 		/* Did we receive a signal that changes our mode? */
2466 		if(mode == NSD_RUN) {
2467 			nsd->mode = mode = server_signal_mode(nsd);
2468 		}
2469 
2470 		switch (mode) {
2471 		case NSD_RUN:
2472 			/* see if any child processes terminated */
2473 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2474 				int is_child = delete_child_pid(nsd, child_pid);
2475 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2476 					if(nsd->children[is_child].child_fd == -1)
2477 						nsd->children[is_child].has_exited = 1;
2478 					parent_check_all_children_exited(nsd);
2479 				} else if(is_child != -1) {
2480 					log_msg(LOG_WARNING,
2481 					       "server %d died unexpectedly with status %d, restarting",
2482 					       (int) child_pid, status);
2483 					restart_child_servers(nsd, server_region, netio,
2484 						&nsd->xfrd_listener->fd);
2485 				} else if (child_pid == reload_pid) {
2486 					sig_atomic_t cmd = NSD_RELOAD_DONE;
2487 					pid_t mypid;
2488 					log_msg(LOG_WARNING,
2489 					       "Reload process %d failed with status %d, continuing with old database",
2490 					       (int) child_pid, status);
2491 					reload_pid = -1;
2492 					if(reload_listener.fd != -1) close(reload_listener.fd);
2493 					reload_listener.fd = -1;
2494 					reload_listener.event_types = NETIO_EVENT_NONE;
2495 					task_process_sync(nsd->task[nsd->mytask]);
2496 					/* inform xfrd reload attempt ended */
2497 					if(!write_socket(nsd->xfrd_listener->fd,
2498 						&cmd, sizeof(cmd))) {
2499 						log_msg(LOG_ERR, "problems "
2500 						  "sending SOAEND to xfrd: %s",
2501 						  strerror(errno));
2502 					}
2503 					mypid = getpid();
2504 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2505 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2506 							strerror(errno));
2507 					}
2508 #ifdef USE_DNSTAP
2509 				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2510 					log_msg(LOG_WARNING,
2511 					       "dnstap-collector %d terminated with status %d",
2512 					       (int) child_pid, status);
2513 					if(nsd->dt_collector) {
2514 						dt_collector_close(nsd->dt_collector, nsd);
2515 						dt_collector_destroy(nsd->dt_collector, nsd);
2516 						nsd->dt_collector = NULL;
2517 					}
2518 					/* Only respawn a crashed (or exited)
2519 					 * dnstap-collector when not reloading,
2520 					 * to not induce a reload during a
2521 					 * reload (which would seriously
2522 					 * disrupt nsd procedures and lead to
2523 					 * unpredictable results)!
2524 					 *
2525 					 * This will *leave* a dnstap-collector
2526 					 * process terminated, but because
2527 					 * signalling of the reload process to
2528 					 * the main process to respawn in this
2529 					 * situation will be cumbersome, and
2530 					 * because this situation is so
2531 					 * specific (and therefore hopefully
2532 					 * extremely rare or non-existing at
2533 					 * all), plus the fact that we are left
2534 					 * with a perfectly functional NSD
2535 					 * (besides not logging dnstap
2536 					 * messages), I consider it acceptable
2537 					 * to leave this unresolved.
2538 					 */
2539 					if(reload_pid == -1 && nsd->options->dnstap_enable) {
2540 						nsd->dt_collector = dt_collector_create(nsd);
2541 						dt_collector_start(nsd->dt_collector, nsd);
2542 						nsd->mode = NSD_RELOAD_REQ;
2543 					}
2544 #endif
2545 				} else if(status != 0) {
2546 					/* check the status, because we also
2547 					 * reap the old server main here (reload
2548 					 * is the process parent of the old
2549 					 * main), and older server processes
2550 					 * that exit after a reload */
2551 					log_msg(LOG_WARNING,
2552 					       "process %d terminated with status %d",
2553 					       (int) child_pid, status);
2554 				}
2555 			}
2556 			if (child_pid == -1) {
2557 				if (errno == EINTR) {
2558 					continue;
2559 				}
2560 				if (errno != ECHILD)
2561 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2562 			}
2563 			if (nsd->mode != NSD_RUN)
2564 				break;
2565 
2566 			/* timeout to collect processes, in case no SIGCHLD happens */
2567 			timeout_spec.tv_sec = 60;
2568 			timeout_spec.tv_nsec = 0;
2569 
2570 			/* listen on ports, timeout for collecting terminated children */
2571 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2572 				if (errno != EINTR) {
2573 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2574 				}
2575 			}
2576 			if(nsd->restart_children) {
2577 				restart_child_servers(nsd, server_region, netio,
2578 					&nsd->xfrd_listener->fd);
2579 				nsd->restart_children = 0;
2580 			}
2581 			if(nsd->reload_failed) {
2582 				sig_atomic_t cmd = NSD_RELOAD_DONE;
2583 				pid_t mypid;
2584 				nsd->reload_failed = 0;
2585 				log_msg(LOG_WARNING,
2586 				       "Reload process %d failed, continuing with old database",
2587 				       (int) reload_pid);
2588 				reload_pid = -1;
2589 				if(reload_listener.fd != -1) close(reload_listener.fd);
2590 				reload_listener.fd = -1;
2591 				reload_listener.event_types = NETIO_EVENT_NONE;
2592 				task_process_sync(nsd->task[nsd->mytask]);
2593 				/* inform xfrd reload attempt ended */
2594 				if(!write_socket(nsd->xfrd_listener->fd,
2595 					&cmd, sizeof(cmd))) {
2596 					log_msg(LOG_ERR, "problems "
2597 					  "sending SOAEND to xfrd: %s",
2598 					  strerror(errno));
2599 				}
2600 				mypid = getpid();
2601 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2602 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2603 						strerror(errno));
2604 				}
2605 			}
2606 
2607 			break;
2608 		case NSD_RELOAD_REQ: {
2609 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2610 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2611 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2612 				"main: ipc send reload_req to xfrd"));
2613 			if(!write_socket(nsd->xfrd_listener->fd,
2614 				&cmd, sizeof(cmd))) {
2615 				log_msg(LOG_ERR, "server_main: could not send "
2616 				"reload_req to xfrd: %s", strerror(errno));
2617 			}
2618 			nsd->mode = NSD_RUN;
2619 			} break;
2620 		case NSD_RELOAD:
2621 			/* Continue to run nsd after reload */
2622 			nsd->mode = NSD_RUN;
2623 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2624 			if (reload_pid != -1) {
2625 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2626 				       (int) reload_pid);
2627 				break;
2628 			}
2629 
2630 			/* switch mytask to keep track of who owns the taskdb */
2631 			nsd->mytask = 1 - nsd->mytask;
2632 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2633 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2634 				reload_pid = -1;
2635 				break;
2636 			}
2637 
2638 			/* Do actual reload */
2639 			reload_pid = fork();
2640 			switch (reload_pid) {
2641 			case -1:
2642 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2643 				break;
2644 			default:
2645 				/* PARENT */
2646 				close(reload_sockets[0]);
2647 				server_reload(nsd, server_region, netio,
2648 					reload_sockets[1]);
2649 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2650 				close(reload_sockets[1]);
2651 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2652 				/* drop stale xfrd ipc data */
2653 				((struct ipc_handler_conn_data*)nsd->
2654 					xfrd_listener->user_data)
2655 					->conn->is_reading = 0;
2656 				reload_pid = -1;
2657 				reload_listener.fd = -1;
2658 				reload_listener.event_types = NETIO_EVENT_NONE;
2659 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2660 				break;
2661 			case 0:
2662 				/* CHILD */
2663 				/* server_main keeps running until NSD_QUIT_SYNC
2664 				 * is received from reload. */
2665 				close(reload_sockets[1]);
2666 				reload_listener.fd = reload_sockets[0];
2667 				reload_listener.timeout = NULL;
2668 				reload_listener.user_data = nsd;
2669 				reload_listener.event_types = NETIO_EVENT_READ;
2670 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2671 				netio_add_handler(netio, &reload_listener);
2672 				reload_pid = getppid();
2673 				break;
2674 			}
2675 			break;
2676 		case NSD_QUIT_SYNC:
2677 			/* synchronisation of xfrd, parent and reload */
2678 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2679 				sig_atomic_t cmd = NSD_RELOAD;
2680 				/* stop xfrd ipc writes in progress */
2681 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2682 					"main: ipc send indication reload"));
2683 				if(!write_socket(nsd->xfrd_listener->fd,
2684 					&cmd, sizeof(cmd))) {
2685 					log_msg(LOG_ERR, "server_main: could not send reload "
2686 					"indication to xfrd: %s", strerror(errno));
2687 				}
2688 				/* wait for ACK from xfrd */
2689 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2690 				nsd->quit_sync_done = 1;
2691 			}
2692 			nsd->mode = NSD_RUN;
2693 			break;
2694 		case NSD_QUIT:
2695 			/* silent shutdown during reload */
2696 			if(reload_listener.fd != -1) {
2697 				/* acknowledge the quit, to sync reload that we will really quit now */
2698 				sig_atomic_t cmd = NSD_RELOAD;
2699 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2700 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2701 					log_msg(LOG_ERR, "server_main: "
2702 						"could not ack quit: %s", strerror(errno));
2703 				}
2704 #ifdef BIND8_STATS
2705 				parent_send_stats(nsd, reload_listener.fd);
2706 #endif /* BIND8_STATS */
2707 				close(reload_listener.fd);
2708 			}
2709 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2710 			/* only quit children after xfrd has acked */
2711 			send_children_quit(nsd);
2712 
2713 #ifdef MEMCLEAN /* OS collects memory pages */
2714 			region_destroy(server_region);
2715 #endif
2716 			server_shutdown(nsd);
2717 
2718 			/* ENOTREACH */
2719 			break;
2720 		case NSD_SHUTDOWN:
2721 			break;
2722 		case NSD_REAP_CHILDREN:
2723 			/* continue; wait for child in run loop */
2724 			nsd->mode = NSD_RUN;
2725 			break;
2726 		case NSD_STATS:
2727 #ifdef BIND8_STATS
2728 			set_children_stats(nsd);
2729 #endif
2730 			nsd->mode = NSD_RUN;
2731 			break;
2732 		default:
2733 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2734 			nsd->mode = NSD_RUN;
2735 			break;
2736 		}
2737 	}
2738 	log_msg(LOG_WARNING, "signal received, shutting down...");
2739 
2740 	/* close opened ports to avoid race with restart of nsd */
2741 	server_close_all_sockets(nsd->udp, nsd->ifs);
2742 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2743 #ifdef HAVE_SSL
2744 	daemon_remote_close(nsd->rc);
2745 #endif
2746 	send_children_quit_and_wait(nsd);
2747 
2748 	/* Unlink it if possible... */
2749 	unlinkpid(nsd->pidfile);
2750 	unlink(nsd->task[0]->fname);
2751 	unlink(nsd->task[1]->fname);
2752 #ifdef USE_ZONE_STATS
2753 	unlink(nsd->zonestatfname[0]);
2754 	unlink(nsd->zonestatfname[1]);
2755 #endif
2756 #ifdef USE_DNSTAP
2757 	dt_collector_close(nsd->dt_collector, nsd);
2758 #endif
2759 
2760 	if(reload_listener.fd != -1) {
2761 		sig_atomic_t cmd = NSD_QUIT;
2762 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2763 			"main: ipc send quit to reload-process"));
2764 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2765 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2766 				strerror(errno));
2767 		}
2768 		fsync(reload_listener.fd);
2769 		close(reload_listener.fd);
2770 		/* wait for reload to finish processing */
2771 		while(1) {
2772 			if(waitpid(reload_pid, NULL, 0) == -1) {
2773 				if(errno == EINTR) continue;
2774 				if(errno == ECHILD) break;
2775 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2776 					(int)reload_pid, strerror(errno));
2777 			}
2778 			break;
2779 		}
2780 	}
2781 	if(nsd->xfrd_listener->fd != -1) {
2782 		/* complete quit, stop xfrd */
2783 		sig_atomic_t cmd = NSD_QUIT;
2784 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2785 			"main: ipc send quit to xfrd"));
2786 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2787 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2788 				strerror(errno));
2789 		}
2790 		fsync(nsd->xfrd_listener->fd);
2791 		close(nsd->xfrd_listener->fd);
2792 		(void)kill(nsd->pid, SIGTERM);
2793 	}
2794 
2795 #ifdef MEMCLEAN /* OS collects memory pages */
2796 	region_destroy(server_region);
2797 #endif
2798 	/* write the nsd.db to disk, wait for it to complete */
2799 	udb_base_sync(nsd->db->udb, 1);
2800 	udb_base_close(nsd->db->udb);
2801 	server_shutdown(nsd);
2802 }
2803 
2804 static query_state_type
2805 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
2806 {
2807 	return query_process(query, nsd, now_p);
2808 }
2809 
2810 static query_state_type
2811 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
2812 {
2813 #ifdef RATELIMIT
2814 	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
2815 		if(query->edns.cookie_status != COOKIE_VALID
2816 		&& query->edns.cookie_status != COOKIE_VALID_REUSE
2817 		&& rrl_process_query(query))
2818 			return rrl_slip(query);
2819 		else	return QUERY_PROCESSED;
2820 	}
2821 	return QUERY_DISCARDED;
2822 #else
2823 	return query_process(query, nsd, now_p);
2824 #endif
2825 }
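
/*
 * Editor's note: queries carrying a valid DNS cookie (RFC 7873) skip
 * rrl_process_query() above because the cookie already proves a
 * return-routable client, so rate-limiting them would not help against
 * spoofed reflection traffic.
 */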
2826 
2827 const char*
2828 nsd_event_vs(void)
2829 {
2830 #ifdef USE_MINI_EVENT
2831 	return "";
2832 #else
2833 	return event_get_version();
2834 #endif
2835 }
2836 
2837 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2838 static const char* ub_ev_backend2str(int b)
2839 {
2840 	switch(b) {
2841 	case EVBACKEND_SELECT:	return "select";
2842 	case EVBACKEND_POLL:	return "poll";
2843 	case EVBACKEND_EPOLL:	return "epoll";
2844 	case EVBACKEND_KQUEUE:	return "kqueue";
2845 	case EVBACKEND_DEVPOLL: return "devpoll";
2846 	case EVBACKEND_PORT:	return "evport";
2847 	}
2848 	return "unknown";
2849 }
2850 #endif
2851 
2852 const char*
2853 nsd_event_method(void)
2854 {
2855 #ifdef USE_MINI_EVENT
2856 	return "select";
2857 #else
2858 	struct event_base* b = nsd_child_event_base();
2859 	const char* m = "?";
2860 #  ifdef EV_FEATURE_BACKENDS
2861 	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
2862 #  elif defined(HAVE_EVENT_BASE_GET_METHOD)
2863 	m = event_base_get_method(b);
2864 #  endif
2865 #  ifdef MEMCLEAN
2866 	event_base_free(b);
2867 #  endif
2868 	return m;
2869 #endif
2870 }
2871 
2872 struct event_base*
2873 nsd_child_event_base(void)
2874 {
2875 	struct event_base* base;
2876 #ifdef USE_MINI_EVENT
2877 	static time_t secs;
2878 	static struct timeval now;
2879 	base = event_init(&secs, &now);
2880 #else
2881 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2882 	/* libev */
2883 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2884 #  else
2885 	/* libevent */
2886 #    ifdef HAVE_EVENT_BASE_NEW
2887 	base = event_base_new();
2888 #    else
2889 	base = event_init();
2890 #    endif
2891 #  endif
2892 #endif
2893 	return base;
2894 }
2895 
2896 static void
2897 add_udp_handler(
2898 	struct nsd *nsd,
2899 	struct nsd_socket *sock,
2900 	struct udp_handler_data *data)
2901 {
2902 	struct event *handler = &data->event;
2903 
2904 	data->nsd = nsd;
2905 	data->socket = sock;
2906 
2907 	memset(handler, 0, sizeof(*handler));
2908 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2909 	if(event_base_set(nsd->event_base, handler) != 0)
2910 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2911 	if(event_add(handler, NULL) != 0)
2912 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2913 }
2914 
2915 void
2916 add_tcp_handler(
2917 	struct nsd *nsd,
2918 	struct nsd_socket *sock,
2919 	struct tcp_accept_handler_data *data)
2920 {
2921 	struct event *handler = &data->event;
2922 
2923 	data->nsd = nsd;
2924 	data->socket = sock;
2925 
2926 #ifdef HAVE_SSL
2927 	if (nsd->tls_ctx &&
2928 	    nsd->options->tls_port &&
2929 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2930 	{
2931 		data->tls_accept = 1;
2932 		if(verbosity >= 2) {
2933 			char buf[48];
2934 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
2935 			VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
2936 		}
2937 	} else {
2938 		data->tls_accept = 0;
2939 	}
2940 #endif
2941 
2942 	memset(handler, 0, sizeof(*handler));
2943 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data);
2944 	if(event_base_set(nsd->event_base, handler) != 0)
2945 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2946 	if(event_add(handler, NULL) != 0)
2947 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
2948 	data->event_added = 1;
2949 }
2950 
2951 /*
2952  * Serve DNS requests.
2953  */
2954 void
2955 server_child(struct nsd *nsd)
2956 {
2957 	size_t i, from, numifs;
2958 	region_type *server_region = region_create(xalloc, free);
2959 	struct event_base* event_base = nsd_child_event_base();
2960 	sig_atomic_t mode;
2961 
2962 	if(!event_base) {
2963 		log_msg(LOG_ERR, "nsd server could not create event base");
2964 		exit(1);
2965 	}
2966 	nsd->event_base = event_base;
2967 	nsd->server_region = server_region;
2968 
2969 #ifdef RATELIMIT
2970 	rrl_init(nsd->this_child->child_num);
2971 #endif
2972 
2973 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2974 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2975 
2976 #ifdef HAVE_SETPROCTITLE
2977 	setproctitle("server %d", nsd->this_child->child_num + 1);
2978 #endif
2979 #ifdef HAVE_CPUSET_T
2980 	if(nsd->use_cpu_affinity) {
2981 		set_cpu_affinity(nsd->this_child->cpuset);
2982 	}
2983 #endif
2984 
2985 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2986 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2987 	}
2988 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2989 		server_close_all_sockets(nsd->udp, nsd->ifs);
2990 	}
2991 
2992 	if (nsd->this_child->parent_fd != -1) {
2993 		struct event *handler;
2994 		struct ipc_handler_conn_data* user_data =
2995 			(struct ipc_handler_conn_data*)region_alloc(
2996 			server_region, sizeof(struct ipc_handler_conn_data));
2997 		user_data->nsd = nsd;
2998 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2999 
3000 		handler = (struct event*) region_alloc(
3001 			server_region, sizeof(*handler));
3002 		memset(handler, 0, sizeof(*handler));
3003 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
3004 			EV_READ, child_handle_parent_command, user_data);
3005 		if(event_base_set(event_base, handler) != 0)
3006 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
3007 		if(event_add(handler, NULL) != 0)
3008 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
3009 	}
3010 
3011 	if(nsd->reuseport) {
3012 		numifs = nsd->ifs / nsd->reuseport;
3013 		from = numifs * nsd->this_child->child_num;
3014 		if(from+numifs > nsd->ifs) { /* should not happen */
3015 			from = 0;
3016 			numifs = nsd->ifs;
3017 		}
3018 	} else {
3019 		from = 0;
3020 		numifs = nsd->ifs;
3021 	}
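
	/*
	 * Editor's example of the partitioning above, with hypothetical
	 * numbers: for 2 configured interfaces and reuseport: 4, nsd->ifs
	 * holds 8 sockets; numifs = 8/4 = 2 per child, and child_num 1
	 * serves sockets [2,4), i.e. sockets 2 and 3.
	 */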
3022 
3023 	if (nsd->server_kind & NSD_SERVER_UDP) {
3024 		int child = nsd->this_child->child_num;
3025 		memset(msgs, 0, sizeof(msgs));
3026 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3027 			queries[i] = query_create(server_region,
3028 				compressed_dname_offsets,
3029 				compression_table_size, compressed_dnames);
3030 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3031 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
3032 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
3033 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
3034 			msgs[i].msg_hdr.msg_iovlen  = 1;
3035 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
3036 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3037 		}
3038 
3039 		for (i = 0; i < nsd->ifs; i++) {
3040 			int listen;
3041 			struct udp_handler_data *data;
3042 
3043 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
3044 
3045 			if(i >= from && i < (from + numifs) && listen) {
3046 				data = region_alloc_zero(
3047 					nsd->server_region, sizeof(*data));
3048 				add_udp_handler(nsd, &nsd->udp[i], data);
3049 			} else {
3050 				/* close sockets intended for other servers */
3051 				server_close_socket(&nsd->udp[i]);
3052 			}
3053 		}
3054 	}
3055 
3056 	/*
3057 	 * Keep track of all the TCP accept handlers so we can enable
3058 	 * and disable them based on the current number of active TCP
3059 	 * connections.
3060 	 */
3061 	if (nsd->server_kind & NSD_SERVER_TCP) {
3062 		int child = nsd->this_child->child_num;
3063 		tcp_accept_handler_count = numifs;
3064 		tcp_accept_handlers = region_alloc_array(server_region,
3065 			numifs, sizeof(*tcp_accept_handlers));
3066 
3067 		for (i = 0; i < nsd->ifs; i++) {
3068 			int listen;
3069 			struct tcp_accept_handler_data *data;
3070 
3071 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
3072 
3073 			if(i >= from && i < (from + numifs) && listen) {
3074 				data = &tcp_accept_handlers[i-from];
3075 				memset(data, 0, sizeof(*data));
3076 				add_tcp_handler(nsd, &nsd->tcp[i], data);
3077 			} else {
3078 				/* close sockets intended for other servers */
3079 				/*
3080 				 * uncomment this once tcp servers are no
3081 				 * longer copied in the tcp fd copy line
3082 				 * in server_init().
3083 				server_close_socket(&nsd->tcp[i]);
3084 				*/
3085 				/* close sockets not meant for this server */
3086 				if(!listen)
3087 					server_close_socket(&nsd->tcp[i]);
3088 			}
3089 		}
3090 	} else {
3091 		tcp_accept_handler_count = 0;
3092 	}
3093 
3094 	/* The main loop... */
3095 	while ((mode = nsd->mode) != NSD_QUIT) {
3096 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
3097 
3098 		/* Do we need to do the statistics... */
3099 		if (mode == NSD_STATS) {
3100 #ifdef BIND8_STATS
3101 			int p = nsd->st.period;
3102 			nsd->st.period = 1; /* force stats printout */
3103 			/* Dump the statistics */
3104 			bind8_stats(nsd);
3105 			nsd->st.period = p;
3106 #else /* !BIND8_STATS */
3107 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3108 #endif /* BIND8_STATS */
3109 
3110 			nsd->mode = NSD_RUN;
3111 		}
3112 		else if (mode == NSD_REAP_CHILDREN) {
3113 			/* got signal, notify parent. parent reaps terminated children. */
3114 			if (nsd->this_child->parent_fd != -1) {
3115 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3116 				if (write(nsd->this_child->parent_fd,
3117 				    &parent_notify,
3118 				    sizeof(parent_notify)) == -1)
3119 				{
3120 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3121 						(int) nsd->this_child->pid, strerror(errno));
3122 				}
3123 			} else /* no parent, so reap 'em */
3124 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3125 			nsd->mode = NSD_RUN;
3126 		}
3127 		else if(mode == NSD_RUN) {
3128 			/* Wait for a query... */
3129 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3130 				if (errno != EINTR) {
3131 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3132 					break;
3133 				}
3134 			}
3135 		} else if(mode == NSD_QUIT) {
3136 			/* ignore here, quit */
3137 		} else {
3138 			log_msg(LOG_ERR, "bad mode value %d, back to service.",
3139 				(int)mode);
3140 			nsd->mode = NSD_RUN;
3141 		}
3142 	}
3143 
3144 	service_remaining_tcp(nsd);
3145 #ifdef	BIND8_STATS
3146 	bind8_stats(nsd);
3147 #endif /* BIND8_STATS */
3148 
3149 #ifdef MEMCLEAN /* OS collects memory pages */
3150 #ifdef RATELIMIT
3151 	rrl_deinit(nsd->this_child->child_num);
3152 #endif
3153 	event_base_free(event_base);
3154 	region_destroy(server_region);
3155 #endif
3156 	server_shutdown(nsd);
3157 }
3158 
3159 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3160 {
3161 	int* timed_out = (int*)arg;
3162 	assert(event & EV_TIMEOUT); (void)event;
3163 	/* wake up the service tcp thread, note event is no longer
3164 	 * registered */
3165 	*timed_out = 1;
3166 }
3167 
3168 void
3169 service_remaining_tcp(struct nsd* nsd)
3170 {
3171 	struct tcp_handler_data* p;
3172 	struct event_base* event_base;
3173 	/* check if it is needed */
3174 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3175 		return;
3176 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3177 #ifdef USE_DNSTAP
3178 	/* remove dnstap collector, we cannot write there because the new
3179 	 * child process is using the file descriptor, or the child
3180 	 * process after that. */
3181 	dt_collector_destroy(nsd->dt_collector, nsd);
3182 	nsd->dt_collector = NULL;
3183 #endif
3184 	/* setup event base */
3185 	event_base = nsd_child_event_base();
3186 	if(!event_base) {
3187 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3188 		return;
3189 	}
3190 	/* register tcp connections */
3191 	for(p = tcp_active_list; p != NULL; p = p->next) {
3192 		struct timeval timeout;
3193 		int fd = p->event.ev_fd;
3194 #ifdef USE_MINI_EVENT
3195 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3196 #else
3197 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3198 #endif
3199 		void (*fn)(int, short, void*);
3200 #ifdef HAVE_SSL
3201 		if(p->tls) {
3202 			if((event&EV_READ))
3203 				fn = handle_tls_reading;
3204 			else	fn = handle_tls_writing;
3205 		} else {
3206 #endif
3207 			if((event&EV_READ))
3208 				fn = handle_tcp_reading;
3209 			else	fn = handle_tcp_writing;
3210 #ifdef HAVE_SSL
3211 		}
3212 #endif
3213 
3214 		p->tcp_no_more_queries = 1;
3215 		/* set timeout to 1/10 second */
3216 		if(p->tcp_timeout > 100)
3217 			p->tcp_timeout = 100;
3218 		timeout.tv_sec = p->tcp_timeout / 1000;
3219 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3220 		event_del(&p->event);
3221 		memset(&p->event, 0, sizeof(p->event));
3222 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3223 			fn, p);
3224 		if(event_base_set(event_base, &p->event) != 0)
3225 			log_msg(LOG_ERR, "event base set failed");
3226 		if(event_add(&p->event, &timeout) != 0)
3227 			log_msg(LOG_ERR, "event add failed");
3228 	}
3229 
3230 	/* handle it */
3231 		sig_atomic_t m = server_signal_mode(nsd);
3232 		mode_t m = server_signal_mode(nsd);
3233 		struct event timeout;
3234 		struct timeval tv;
3235 		int timed_out = 0;
3236 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3237 			m == NSD_REAP_CHILDREN) {
3238 			/* quit */
3239 			break;
3240 		}
3241 		/* timer */
3242 		/* have to do something every second */
3243 		tv.tv_sec = 1;
3244 		tv.tv_usec = 0;
3245 		memset(&timeout, 0, sizeof(timeout));
3246 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3247 			&timed_out);
3248 		if(event_base_set(event_base, &timeout) != 0)
3249 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3250 		if(event_add(&timeout, &tv) != 0)
3251 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3252 
3253 		/* service loop */
3254 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3255 			if (errno != EINTR) {
3256 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3257 				break;
3258 			}
3259 		}
3260 		if(!timed_out) {
3261 			event_del(&timeout);
3262 		} else {
3263 			/* timed out, quit */
3264 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3265 			break;
3266 		}
3267 	}
3268 #ifdef MEMCLEAN
3269 	event_base_free(event_base);
3270 #endif
3271 	/* continue to quit after return */
3272 }
3273 
3274 /* Implement recvmmsg and sendmmsg if the platform does not. These functions
3275  * are always used, even if nonblocking operations are broken, in which case
3276  * NUM_RECV_PER_SELECT is defined to 1 (one).
3277  */
3278 #if defined(HAVE_RECVMMSG)
3279 #define nsd_recvmmsg recvmmsg
3280 #else /* !HAVE_RECVMMSG */
3281 
3282 static int
3283 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3284              int flags, struct timespec *timeout)
3285 {
3286 	unsigned int vpos = 0;
3287 	ssize_t rcvd;
3288 
3289 	/* timeout is ignored, ensure caller does not expect it to work */
3290 	assert(timeout == NULL); (void)timeout;
3291 
3292 	while(vpos < vlen) {
3293 		rcvd = recvfrom(sockfd,
3294 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3295 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3296 		                flags,
3297 		                msgvec[vpos].msg_hdr.msg_name,
3298 		               &msgvec[vpos].msg_hdr.msg_namelen);
3299 		if(rcvd < 0) {
3300 			break;
3301 		} else {
3302 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3303 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3304 			vpos++;
3305 		}
3306 	}
3307 
3308 	if(vpos) {
3309 		/* error will be picked up next time */
3310 		return (int)vpos;
3311 	} else if(errno == 0) {
3312 		return 0;
3313 	} else if(errno == EAGAIN) {
3314 		return 0;
3315 	}
3316 
3317 	return -1;
3318 }
3319 #endif /* HAVE_RECVMMSG */
3320 
3321 #ifdef HAVE_SENDMMSG
3322 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3323 #else /* !HAVE_SENDMMSG */
3324 
3325 static int
3326 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3327 {
3328 	unsigned int vpos = 0;
3329 	ssize_t snd;
3330 
3331 	while(vpos < vlen) {
3332 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3333 		snd = sendto(sockfd,
3334 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3335 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3336 		             flags,
3337 		             msgvec[vpos].msg_hdr.msg_name,
3338 		             msgvec[vpos].msg_hdr.msg_namelen);
3339 		if(snd < 0) {
3340 			break;
3341 		} else {
3342 			msgvec[vpos].msg_len = (unsigned int)snd;
3343 			vpos++;
3344 		}
3345 	}
3346 
3347 	if(vpos) {
3348 		return (int)vpos;
3349 	} else if(errno == 0) {
3350 		return 0;
3351 	}
3352 
3353 	return -1;
3354 }
3355 #endif /* HAVE_SENDMMSG */
3356 
3357 static int
3358 port_is_zero(
3359 #ifdef INET6
3360         struct sockaddr_storage *addr
3361 #else
3362         struct sockaddr_in *addr
3363 #endif
3364 	)
3365 {
3366 #ifdef INET6
3367 	if(addr->ss_family == AF_INET6) {
3368 		return (((struct sockaddr_in6 *)addr)->sin6_port) == 0;
3369 	} else if(addr->ss_family == AF_INET) {
3370 		return (((struct sockaddr_in *)addr)->sin_port) == 0;
3371 	}
3372 	return 0;
3373 #else
3374 	if(addr->sin_family == AF_INET) {
3375 		return addr->sin_port == 0;
3376 	}
3377 	return 0;
3378 #endif
3379 }
3380 
3381 static void
3382 handle_udp(int fd, short event, void* arg)
3383 {
3384 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3385 	int received, sent, recvcount, i;
3386 	struct query *q;
3387 	uint32_t now = 0;
3388 
3389 	if (!(event & EV_READ)) {
3390 		return;
3391 	}
3392 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3393 	/* this printf strangely gave a performance increase on Linux */
3394 	/* printf("recvcount %d \n", recvcount); */
3395 	if (recvcount == -1) {
3396 		if (errno != EAGAIN && errno != EINTR) {
3397 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3398 			STATUP(data->nsd, rxerr);
3399 			/* No zone statup */
3400 		}
3401 		/* Simply no data available */
3402 		return;
3403 	}
3404 	for (i = 0; i < recvcount; i++) {
3405 	loopstart:
3406 		received = msgs[i].msg_len;
3407 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
3408 		q = queries[i];
3409 		if (received == -1) {
3410 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3411 #if defined(HAVE_RECVMMSG)
3412 				msgs[i].msg_hdr.msg_flags
3413 #else
3414 				errno
3415 #endif
3416 				));
3417 			STATUP(data->nsd, rxerr);
3418 			/* No zone statup */
3419 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3420 			iovecs[i].iov_len = buffer_remaining(q->packet);
3421 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3422 			goto swap_drop;
3423 		}
3424 
3425 		/* Account... */
3426 #ifdef BIND8_STATS
3427 		if (data->socket->addr.ai_family == AF_INET) {
3428 			STATUP(data->nsd, qudp);
3429 		} else if (data->socket->addr.ai_family == AF_INET6) {
3430 			STATUP(data->nsd, qudp6);
3431 		}
3432 #endif
3433 
3434 		buffer_skip(q->packet, received);
3435 		buffer_flip(q->packet);
3436 #ifdef USE_DNSTAP
3437 		/*
3438 		 * sending UDP-query with server address (local) and client address to dnstap process
3439 		 */
3440 		log_addr("query from client", &q->addr);
3441 		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3442 		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->addr, q->addrlen,
3443 			q->tcp, q->packet);
3444 #endif /* USE_DNSTAP */
3445 
3446 		/* Process and answer the query... */
3447 		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
3448 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3449 				STATUP(data->nsd, nona);
3450 				ZTATUP(data->nsd, q->zone, nona);
3451 			}
3452 
3453 #ifdef USE_ZONE_STATS
3454 			if (data->socket->addr.ai_family == AF_INET) {
3455 				ZTATUP(data->nsd, q->zone, qudp);
3456 			} else if (data->socket->addr.ai_family == AF_INET6) {
3457 				ZTATUP(data->nsd, q->zone, qudp6);
3458 			}
3459 #endif
3460 
3461 			/* Add EDNS0 and TSIG info if necessary.  */
3462 			query_add_optional(q, data->nsd, &now);
3463 
3464 			buffer_flip(q->packet);
3465 			iovecs[i].iov_len = buffer_remaining(q->packet);
3466 #ifdef BIND8_STATS
3467 			/* Account the rcode & TC... */
3468 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3469 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3470 			if (TC(q->packet)) {
3471 				STATUP(data->nsd, truncated);
3472 				ZTATUP(data->nsd, q->zone, truncated);
3473 			}
3474 #endif /* BIND8_STATS */
3475 #ifdef USE_DNSTAP
3476 			/*
3477 			 * sending UDP-response with server address (local) and client address to dnstap process
3478 			 */
3479 			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3480 			log_addr("response to client", &q->addr);
3481 			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
3482 				&q->addr, q->addrlen, q->tcp, q->packet,
3483 				q->zone);
3484 #endif /* USE_DNSTAP */
3485 		} else {
3486 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3487 			iovecs[i].iov_len = buffer_remaining(q->packet);
3488 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3489 		swap_drop:
3490 			STATUP(data->nsd, dropped);
3491 			ZTATUP(data->nsd, q->zone, dropped);
3492 			if(i != recvcount-1) {
3493 				/* swap with last and decrease recvcount */
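				/* The message swapped into slot i has not been
				 * processed yet, so jump back to loopstart and
				 * handle it without advancing i; the dropped
				 * query is parked past the new recvcount so
				 * its buffers can be reused. */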
3494 				struct mmsghdr mtmp = msgs[i];
3495 				struct iovec iotmp = iovecs[i];
3496 				recvcount--;
3497 				msgs[i] = msgs[recvcount];
3498 				iovecs[i] = iovecs[recvcount];
3499 				queries[i] = queries[recvcount];
3500 				msgs[recvcount] = mtmp;
3501 				iovecs[recvcount] = iotmp;
3502 				queries[recvcount] = q;
3503 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3504 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3505 				goto loopstart;
3506 			} else { recvcount --; }
3507 		}
3508 	}
3509 
3510 	/* send until all are sent */
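	/* sendmmsg may transmit only a prefix of the list; advance i by the
	 * number actually sent and retry the remainder */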
3511 	i = 0;
3512 	while(i<recvcount) {
3513 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3514 		if(sent == -1) {
3515 			if(errno == ENOBUFS ||
3516 #ifdef EWOULDBLOCK
3517 				errno == EWOULDBLOCK ||
3518 #endif
3519 				errno == EAGAIN) {
3520 				/* block to wait until send buffer avail */
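				/* Temporarily clear O_NONBLOCK so this one
				 * sendmmsg blocks until buffer space frees up,
				 * then restore non-blocking mode; errstore
				 * carries errno across the fcntl calls. */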
3521 				int flag, errstore;
3522 				if((flag = fcntl(fd, F_GETFL)) == -1) {
3523 					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
3524 					flag = 0;
3525 				}
3526 				flag &= ~O_NONBLOCK;
3527 				if(fcntl(fd, F_SETFL, flag) == -1)
3528 					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
3529 				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3530 				errstore = errno;
3531 				flag |= O_NONBLOCK;
3532 				if(fcntl(fd, F_SETFL, flag) == -1)
3533 					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
3534 				if(sent != -1) {
3535 					i += sent;
3536 					continue;
3537 				}
3538 				errno = errstore;
3539 			}
3540 			if(errno == EINVAL) {
3541 				/* skip the invalid argument entry,
3542 				 * send the remaining packets in the list */
3543 				if(!(port_is_zero((void*)&queries[i]->addr) &&
3544 					verbosity < 3)) {
3545 					const char* es = strerror(errno);
3546 					char a[64];
3547 					addrport2str((void*)&queries[i]->addr, a, sizeof(a));
3548 					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3549 				}
3550 				i += 1;
3551 				continue;
3552 			}
3553 			/* don't log transient network full errors, unless
3554 			 * on higher verbosity */
3555 			if(!(errno == ENOBUFS && verbosity < 1) &&
3556 #ifdef EWOULDBLOCK
3557 			   errno != EWOULDBLOCK &&
3558 #endif
3559 			   errno != EAGAIN) {
3560 				const char* es = strerror(errno);
3561 				char a[64];
3562 				addrport2str((void*)&queries[i]->addr, a, sizeof(a));
3563 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3564 			}
3565 #ifdef BIND8_STATS
3566 			data->nsd->st.txerr += recvcount-i;
3567 #endif /* BIND8_STATS */
3568 			break;
3569 		}
3570 		i += sent;
3571 	}
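	/* reset the queries and receive vectors for the next batch */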
3572 	for(i=0; i<recvcount; i++) {
3573 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3574 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3575 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3576 	}
3577 }
3578 
3579 #ifdef HAVE_SSL
3580 /*
3581  * Setup an event for the tcp handler.
3582  */
3583 static void
3584 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3585        int fd, short event)
3586 {
3587 	struct timeval timeout;
3588 	struct event_base* ev_base;
3589 
3590 	timeout.tv_sec = data->nsd->tcp_timeout;
3591 	timeout.tv_usec = 0L;
3592 
3593 	ev_base = data->event.ev_base;
3594 	event_del(&data->event);
3595 	memset(&data->event, 0, sizeof(data->event));
3596 	event_set(&data->event, fd, event, fn, data);
3597 	if(event_base_set(ev_base, &data->event) != 0)
3598 		log_msg(LOG_ERR, "event base set failed");
3599 	if(event_add(&data->event, &timeout) != 0)
3600 		log_msg(LOG_ERR, "event add failed");
3601 }
3602 #endif /* HAVE_SSL */
3603 
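/*
 * Tear down a TCP connection: shut down TLS if it is active, close the
 * socket, unlink the handler from the active list, and re-enable the
 * accept handlers if we were at the connection limit.
 */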
3604 static void
3605 cleanup_tcp_handler(struct tcp_handler_data* data)
3606 {
3607 	event_del(&data->event);
3608 #ifdef HAVE_SSL
3609 	if(data->tls) {
3610 		SSL_shutdown(data->tls);
3611 		SSL_free(data->tls);
3612 		data->tls = NULL;
3613 	}
3614 #endif
3615 	close(data->event.ev_fd);
3616 	if(data->prev)
3617 		data->prev->next = data->next;
3618 	else	tcp_active_list = data->next;
3619 	if(data->next)
3620 		data->next->prev = data->prev;
3621 
3622 	/*
3623 	 * Enable the TCP accept handlers when the current number of
3624 	 * TCP connections is about to drop below the maximum number
3625 	 * of TCP connections.
3626 	 */
3627 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3628 		configure_handler_event_types(EV_READ|EV_PERSIST);
3629 		if(slowaccept) {
3630 			event_del(&slowaccept_event);
3631 			slowaccept = 0;
3632 		}
3633 	}
3634 	--data->nsd->current_tcp_count;
3635 	assert(data->nsd->current_tcp_count >= 0);
3636 
3637 	region_destroy(data->region);
3638 }
3639 
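/*
 * Read a query from a plain TCP connection: first the two-byte length
 * prefix, then the message itself, handling partial reads across events.
 */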
3640 static void
3641 handle_tcp_reading(int fd, short event, void* arg)
3642 {
3643 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3644 	ssize_t received;
3645 	struct event_base* ev_base;
3646 	struct timeval timeout;
3647 	uint32_t now = 0;
3648 
3649 	if ((event & EV_TIMEOUT)) {
3650 		/* Connection timed out.  */
3651 		cleanup_tcp_handler(data);
3652 		return;
3653 	}
3654 
3655 	if ((data->nsd->tcp_query_count > 0 &&
3656 		data->query_count >= data->nsd->tcp_query_count) ||
3657 		data->tcp_no_more_queries) {
3658 		/* No more queries allowed on this tcp connection. */
3659 		cleanup_tcp_handler(data);
3660 		return;
3661 	}
3662 
3663 	assert((event & EV_READ));
3664 
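	/* bytes_transmitted == 0 means this is the first read of a new
	 * request on this connection; start from a fresh query */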
3665 	if (data->bytes_transmitted == 0) {
3666 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3667 	}
3668 
3669 	/*
3670 	 * Check if we received the leading packet length bytes yet.
3671 	 */
3672 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3673 		received = read(fd,
3674 				(char *) &data->query->tcplen
3675 				+ data->bytes_transmitted,
3676 				sizeof(uint16_t) - data->bytes_transmitted);
3677 		if (received == -1) {
3678 			if (errno == EAGAIN || errno == EINTR) {
3679 				/*
3680 				 * Read would block, wait until more
3681 				 * data is available.
3682 				 */
3683 				return;
3684 			} else {
3685 				char buf[48];
3686 				addr2str(&data->query->addr, buf, sizeof(buf));
3687 #ifdef ECONNRESET
3688 				if (verbosity >= 2 || errno != ECONNRESET)
3689 #endif /* ECONNRESET */
3690 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3691 				cleanup_tcp_handler(data);
3692 				return;
3693 			}
3694 		} else if (received == 0) {
3695 			/* EOF */
3696 			cleanup_tcp_handler(data);
3697 			return;
3698 		}
3699 
3700 		data->bytes_transmitted += received;
3701 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3702 			/*
3703 			 * Not done with the tcplen yet, wait for more
3704 			 * data to become available.
3705 			 */
3706 			return;
3707 		}
3708 
3709 		assert(data->bytes_transmitted == sizeof(uint16_t));
3710 
3711 		data->query->tcplen = ntohs(data->query->tcplen);
3712 
3713 		/*
3714 		 * Minimum query size is:
3715 		 *
3716 		 *     Size of the header (12)
3717 		 *   + Root domain name   (1)
3718 		 *   + Query class        (2)
3719 		 *   + Query type         (2)
3720 		 */
3721 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3722 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3723 			cleanup_tcp_handler(data);
3724 			return;
3725 		}
3726 
3727 		if (data->query->tcplen > data->query->maxlen) {
3728 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3729 			cleanup_tcp_handler(data);
3730 			return;
3731 		}
3732 
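		/* Limit the packet buffer to tcplen so the reads below stop
		 * exactly at the end of this DNS message. */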
3733 		buffer_set_limit(data->query->packet, data->query->tcplen);
3734 	}
3735 
3736 	assert(buffer_remaining(data->query->packet) > 0);
3737 
3738 	/* Read the (remaining) query data.  */
3739 	received = read(fd,
3740 			buffer_current(data->query->packet),
3741 			buffer_remaining(data->query->packet));
3742 	if (received == -1) {
3743 		if (errno == EAGAIN || errno == EINTR) {
3744 			/*
3745 			 * Read would block, wait until more data is
3746 			 * available.
3747 			 */
3748 			return;
3749 		} else {
3750 			char buf[48];
3751 			addr2str(&data->query->addr, buf, sizeof(buf));
3752 #ifdef ECONNRESET
3753 			if (verbosity >= 2 || errno != ECONNRESET)
3754 #endif /* ECONNRESET */
3755 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3756 			cleanup_tcp_handler(data);
3757 			return;
3758 		}
3759 	} else if (received == 0) {
3760 		/* EOF */
3761 		cleanup_tcp_handler(data);
3762 		return;
3763 	}
3764 
3765 	data->bytes_transmitted += received;
3766 	buffer_skip(data->query->packet, received);
3767 	if (buffer_remaining(data->query->packet) > 0) {
3768 		/*
3769 		 * Message not yet complete, wait for more data to
3770 		 * become available.
3771 		 */
3772 		return;
3773 	}
3774 
3775 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3776 
3777 	/* Account... */
3778 #ifdef BIND8_STATS
3779 #ifndef INET6
3780 	STATUP(data->nsd, ctcp);
3781 #else
3782 	if (data->query->addr.ss_family == AF_INET) {
3783 		STATUP(data->nsd, ctcp);
3784 	} else if (data->query->addr.ss_family == AF_INET6) {
3785 		STATUP(data->nsd, ctcp6);
3786 	}
3787 #endif
3788 #endif /* BIND8_STATS */
3789 
3790 	/* We have a complete query, process it.  */
3791 
	/* tcp-query-count: increment the per-connection query counter */
3793 	data->query_count++;
3794 
3795 	buffer_flip(data->query->packet);
3796 #ifdef USE_DNSTAP
	/*
	 * Send the TCP query, with the server (local) address and the
	 * client address, to the dnstap collector process.
	 */
3800 	log_addr("query from client", &data->query->addr);
3801 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3802 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
3803 		data->query->addrlen, data->query->tcp, data->query->packet);
3804 #endif /* USE_DNSTAP */
3805 	data->query_state = server_process_query(data->nsd, data->query, &now);
3806 	if (data->query_state == QUERY_DISCARDED) {
3807 		/* Drop the packet and the entire connection... */
3808 		STATUP(data->nsd, dropped);
3809 		ZTATUP(data->nsd, data->query->zone, dropped);
3810 		cleanup_tcp_handler(data);
3811 		return;
3812 	}
3813 
3814 #ifdef BIND8_STATS
3815 	if (RCODE(data->query->packet) == RCODE_OK
3816 	    && !AA(data->query->packet))
3817 	{
3818 		STATUP(data->nsd, nona);
3819 		ZTATUP(data->nsd, data->query->zone, nona);
3820 	}
3821 #endif /* BIND8_STATS */
3822 
3823 #ifdef USE_ZONE_STATS
3824 #ifndef INET6
3825 	ZTATUP(data->nsd, data->query->zone, ctcp);
3826 #else
3827 	if (data->query->addr.ss_family == AF_INET) {
3828 		ZTATUP(data->nsd, data->query->zone, ctcp);
3829 	} else if (data->query->addr.ss_family == AF_INET6) {
3830 		ZTATUP(data->nsd, data->query->zone, ctcp6);
3831 	}
3832 #endif
3833 #endif /* USE_ZONE_STATS */
3834 
3835 	query_add_optional(data->query, data->nsd, &now);
3836 
3837 	/* Switch to the tcp write handler.  */
3838 	buffer_flip(data->query->packet);
3839 	data->query->tcplen = buffer_remaining(data->query->packet);
3840 #ifdef BIND8_STATS
3841 	/* Account the rcode & TC... */
3842 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
3843 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
3844 	if (TC(data->query->packet)) {
3845 		STATUP(data->nsd, truncated);
3846 		ZTATUP(data->nsd, data->query->zone, truncated);
3847 	}
3848 #endif /* BIND8_STATS */
3849 #ifdef USE_DNSTAP
	/*
	 * Send the TCP response, with the server (local) address found
	 * earlier and the client address, to the dnstap collector process.
	 */
3853 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3854 	log_addr("response to client", &data->query->addr);
3855 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
3856 		data->query->addrlen, data->query->tcp, data->query->packet,
3857 		data->query->zone);
3858 #endif /* USE_DNSTAP */
3859 	data->bytes_transmitted = 0;
3860 
3861 	timeout.tv_sec = data->tcp_timeout / 1000;
3862 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3863 
3864 	ev_base = data->event.ev_base;
3865 	event_del(&data->event);
3866 	memset(&data->event, 0, sizeof(data->event));
3867 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3868 		handle_tcp_reading, data);
3869 	if(event_base_set(ev_base, &data->event) != 0)
3870 		log_msg(LOG_ERR, "event base set tcpr failed");
3871 	if(event_add(&data->event, &timeout) != 0)
3872 		log_msg(LOG_ERR, "event add tcpr failed");
	/* see if we can write the answer right away (usually we can; if not, EAGAIN) */
3874 	handle_tcp_writing(fd, EV_WRITE, data);
3875 }
3876 
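/*
 * Write the response to a plain TCP connection: the two-byte length prefix
 * followed by the packet, continuing AXFR output where needed, then switch
 * back to the read handler for the next query.
 */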
3877 static void
3878 handle_tcp_writing(int fd, short event, void* arg)
3879 {
3880 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3881 	ssize_t sent;
3882 	struct query *q = data->query;
3883 	struct timeval timeout;
3884 	struct event_base* ev_base;
3885 	uint32_t now = 0;
3886 
3887 	if ((event & EV_TIMEOUT)) {
3888 		/* Connection timed out.  */
3889 		cleanup_tcp_handler(data);
3890 		return;
3891 	}
3892 
3893 	assert((event & EV_WRITE));
3894 
3895 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
3896 		/* Writing the response packet length.  */
3897 		uint16_t n_tcplen = htons(q->tcplen);
3898 #ifdef HAVE_WRITEV
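		/* with writev the two-byte length prefix and the response
		 * payload can go out in a single system call */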
3899 		struct iovec iov[2];
3900 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
3901 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
3902 		iov[1].iov_base = buffer_begin(q->packet);
3903 		iov[1].iov_len = buffer_limit(q->packet);
3904 		sent = writev(fd, iov, 2);
3905 #else /* HAVE_WRITEV */
3906 		sent = write(fd,
3907 			     (const char *) &n_tcplen + data->bytes_transmitted,
3908 			     sizeof(n_tcplen) - data->bytes_transmitted);
3909 #endif /* HAVE_WRITEV */
3910 		if (sent == -1) {
3911 			if (errno == EAGAIN || errno == EINTR) {
3912 				/*
3913 				 * Write would block, wait until
3914 				 * socket becomes writable again.
3915 				 */
3916 				return;
3917 			} else {
3918 #ifdef ECONNRESET
3919 				if(verbosity >= 2 || errno != ECONNRESET)
3920 #endif /* ECONNRESET */
3921 #ifdef EPIPE
3922 				  if(verbosity >= 2 || errno != EPIPE)
3923 #endif /* EPIPE 'broken pipe' */
3924 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3925 				cleanup_tcp_handler(data);
3926 				return;
3927 			}
3928 		}
3929 
3930 		data->bytes_transmitted += sent;
3931 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
3932 			/*
3933 			 * Writing not complete, wait until socket
3934 			 * becomes writable again.
3935 			 */
3936 			return;
3937 		}
3938 
3939 #ifdef HAVE_WRITEV
3940 		sent -= sizeof(n_tcplen);
		/* the writev may have sent (part of) the packet as well;
		 * jump to the shared 'packet done' bookkeeping */
3942 		goto packet_could_be_done;
3943 #endif
3944  	}
3945 
3946 	sent = write(fd,
3947 		     buffer_current(q->packet),
3948 		     buffer_remaining(q->packet));
3949 	if (sent == -1) {
3950 		if (errno == EAGAIN || errno == EINTR) {
3951 			/*
3952 			 * Write would block, wait until
3953 			 * socket becomes writable again.
3954 			 */
3955 			return;
3956 		} else {
3957 #ifdef ECONNRESET
3958 			if(verbosity >= 2 || errno != ECONNRESET)
3959 #endif /* ECONNRESET */
3960 #ifdef EPIPE
3961 				  if(verbosity >= 2 || errno != EPIPE)
3962 #endif /* EPIPE 'broken pipe' */
3963 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3964 			cleanup_tcp_handler(data);
3965 			return;
3966 		}
3967 	}
3968 
3969 	data->bytes_transmitted += sent;
3970 #ifdef HAVE_WRITEV
3971   packet_could_be_done:
3972 #endif
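	/* both the writev shortcut and the plain write path arrive here with
	 * 'sent' counting payload bytes only */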
3973 	buffer_skip(q->packet, sent);
3974 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
3975 		/*
3976 		 * Still more data to write when socket becomes
3977 		 * writable again.
3978 		 */
3979 		return;
3980 	}
3981 
3982 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
3983 
3984 	if (data->query_state == QUERY_IN_AXFR) {
3985 		/* Continue processing AXFR and writing back results.  */
3986 		buffer_clear(q->packet);
3987 		data->query_state = query_axfr(data->nsd, q);
3988 		if (data->query_state != QUERY_PROCESSED) {
3989 			query_add_optional(data->query, data->nsd, &now);
3990 
3991 			/* Reset data. */
3992 			buffer_flip(q->packet);
3993 			q->tcplen = buffer_remaining(q->packet);
3994 			data->bytes_transmitted = 0;
3995 			/* Reset timeout.  */
3996 			timeout.tv_sec = data->tcp_timeout / 1000;
3997 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3998 			ev_base = data->event.ev_base;
3999 			event_del(&data->event);
4000 			memset(&data->event, 0, sizeof(data->event));
4001 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4002 				handle_tcp_writing, data);
4003 			if(event_base_set(ev_base, &data->event) != 0)
4004 				log_msg(LOG_ERR, "event base set tcpw failed");
4005 			if(event_add(&data->event, &timeout) != 0)
4006 				log_msg(LOG_ERR, "event add tcpw failed");
4007 
4008 			/*
4009 			 * Write data if/when the socket is writable
4010 			 * again.
4011 			 */
4012 			return;
4013 		}
4014 	}
4015 
4016 	/*
4017 	 * Done sending, wait for the next request to arrive on the
4018 	 * TCP socket by installing the TCP read handler.
4019 	 */
4020 	if ((data->nsd->tcp_query_count > 0 &&
4021 		data->query_count >= data->nsd->tcp_query_count) ||
4022 		data->tcp_no_more_queries) {
4023 
4024 		(void) shutdown(fd, SHUT_WR);
4025 	}
4026 
4027 	data->bytes_transmitted = 0;
4028 
4029 	timeout.tv_sec = data->tcp_timeout / 1000;
4030 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4031 	ev_base = data->event.ev_base;
4032 	event_del(&data->event);
4033 	memset(&data->event, 0, sizeof(data->event));
4034 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4035 		handle_tcp_reading, data);
4036 	if(event_base_set(ev_base, &data->event) != 0)
4037 		log_msg(LOG_ERR, "event base set tcpw failed");
4038 	if(event_add(&data->event, &timeout) != 0)
4039 		log_msg(LOG_ERR, "event add tcpw failed");
4040 }
4041 
4042 #ifdef HAVE_SSL
4043 /** create SSL object and associate fd */
4044 static SSL*
4045 incoming_ssl_fd(SSL_CTX* ctx, int fd)
4046 {
4047 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
4048 	if(!ssl) {
4049 		log_crypto_err("could not SSL_new");
4050 		return NULL;
4051 	}
4052 	SSL_set_accept_state(ssl);
4053 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
4054 	if(!SSL_set_fd(ssl, fd)) {
4055 		log_crypto_err("could not SSL_set_fd");
4056 		SSL_free(ssl);
4057 		return NULL;
4058 	}
4059 	return ssl;
4060 }
4061 
4062 /** TLS handshake to upgrade TCP connection */
4063 static int
4064 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
4065 {
4066 	int r;
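	/* the *_event states mean the socket condition the handshake was
	 * waiting for has occurred; re-install the handler for the I/O
	 * direction the connection was originally engaged in and resume */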
4067 	if(data->shake_state == tls_hs_read_event) {
		/* read condition satisfied; switch back to writing */
4069 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4070 		data->shake_state = tls_hs_none;
4071 		return 1;
4072 	}
4073 	if(data->shake_state == tls_hs_write_event) {
		/* write condition satisfied; switch back to reading */
4075 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4076 		data->shake_state = tls_hs_none;
4077 		return 1;
4078 	}
4079 
4080 	/* (continue to) setup the TLS connection */
4081 	ERR_clear_error();
4082 	r = SSL_do_handshake(data->tls);
4083 
4084 	if(r != 1) {
4085 		int want = SSL_get_error(data->tls, r);
4086 		if(want == SSL_ERROR_WANT_READ) {
4087 			if(data->shake_state == tls_hs_read) {
4088 				/* try again later */
4089 				return 1;
4090 			}
4091 			data->shake_state = tls_hs_read;
4092 			/* switch back to reading mode */
4093 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4094 			return 1;
4095 		} else if(want == SSL_ERROR_WANT_WRITE) {
4096 			if(data->shake_state == tls_hs_write) {
4097 				/* try again later */
4098 				return 1;
4099 			}
4100 			data->shake_state = tls_hs_write;
4101 			/* switch back to writing mode */
4102 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4103 			return 1;
4104 		} else {
4105 			if(r == 0)
4106 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
4107 			else {
4108 				unsigned long err = ERR_get_error();
4109 				if(!squelch_err_ssl_handshake(err)) {
4110 					char a[64], s[256];
4111 					addr2str(&data->query->addr, a, sizeof(a));
4112 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
4113 					log_crypto_from_err(s, err);
4114 				}
4115 			}
4116 			cleanup_tcp_handler(data);
4117 			return 0;
4118 		}
4119 	}
4120 
	/* Used to log a successful upgrade, for testing; could be removed. */
4122 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
4123 	/* set back to the event we need to have when reading (or writing) */
4124 	if(data->shake_state == tls_hs_read && writing) {
4125 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4126 	} else if(data->shake_state == tls_hs_write && !writing) {
4127 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4128 	}
4129 	data->shake_state = tls_hs_none;
4130 	return 1;
4131 }
4132 
4133 /** handle TLS reading of incoming query */
4134 static void
4135 handle_tls_reading(int fd, short event, void* arg)
4136 {
4137 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4138 	ssize_t received;
4139 	uint32_t now = 0;
4140 
4141 	if ((event & EV_TIMEOUT)) {
4142 		/* Connection timed out.  */
4143 		cleanup_tcp_handler(data);
4144 		return;
4145 	}
4146 
4147 	if ((data->nsd->tcp_query_count > 0 &&
4148 	    data->query_count >= data->nsd->tcp_query_count) ||
4149 	    data->tcp_no_more_queries) {
4150 		/* No more queries allowed on this tcp connection. */
4151 		cleanup_tcp_handler(data);
4152 		return;
4153 	}
4154 
4155 	assert((event & EV_READ));
4156 
4157 	if (data->bytes_transmitted == 0) {
4158 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4159 	}
4160 
4161 	if(data->shake_state != tls_hs_none) {
4162 		if(!tls_handshake(data, fd, 0))
4163 			return;
4164 		if(data->shake_state != tls_hs_none)
4165 			return;
4166 	}
4167 
4168 	/*
4169 	 * Check if we received the leading packet length bytes yet.
4170 	 */
4171 	if(data->bytes_transmitted < sizeof(uint16_t)) {
4172 		ERR_clear_error();
4173 		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
4174 		    + data->bytes_transmitted,
4175 		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
4176 			int want = SSL_get_error(data->tls, received);
4177 			if(want == SSL_ERROR_ZERO_RETURN) {
4178 				cleanup_tcp_handler(data);
4179 				return; /* shutdown, closed */
4180 			} else if(want == SSL_ERROR_WANT_READ) {
4181 				/* wants to be called again */
4182 				return;
4183 			}
4184 			else if(want == SSL_ERROR_WANT_WRITE) {
4185 				/* switch to writing */
4186 				data->shake_state = tls_hs_write_event;
4187 				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4188 				return;
4189 			}
4190 			cleanup_tcp_handler(data);
4191 			log_crypto_err("could not SSL_read");
4192 			return;
4193 		}
4194 
4195 		data->bytes_transmitted += received;
4196 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4197 			/*
4198 			 * Not done with the tcplen yet, wait for more
4199 			 * data to become available.
4200 			 */
4201 			return;
4202 		}
4203 
4204 		assert(data->bytes_transmitted == sizeof(uint16_t));
4205 
4206 		data->query->tcplen = ntohs(data->query->tcplen);
4207 
4208 		/*
4209 		 * Minimum query size is:
4210 		 *
4211 		 *     Size of the header (12)
4212 		 *   + Root domain name   (1)
4213 		 *   + Query class        (2)
4214 		 *   + Query type         (2)
4215 		 */
4216 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4217 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4218 			cleanup_tcp_handler(data);
4219 			return;
4220 		}
4221 
4222 		if (data->query->tcplen > data->query->maxlen) {
4223 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4224 			cleanup_tcp_handler(data);
4225 			return;
4226 		}
4227 
4228 		buffer_set_limit(data->query->packet, data->query->tcplen);
4229 	}
4230 
4231 	assert(buffer_remaining(data->query->packet) > 0);
4232 
4233 	/* Read the (remaining) query data.  */
4234 	ERR_clear_error();
4235 	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
4236 			    (int)buffer_remaining(data->query->packet));
4237 	if(received <= 0) {
4238 		int want = SSL_get_error(data->tls, received);
4239 		if(want == SSL_ERROR_ZERO_RETURN) {
4240 			cleanup_tcp_handler(data);
4241 			return; /* shutdown, closed */
4242 		} else if(want == SSL_ERROR_WANT_READ) {
4243 			/* wants to be called again */
4244 			return;
4245 		}
4246 		else if(want == SSL_ERROR_WANT_WRITE) {
			/* switch back to writing */
4248 			data->shake_state = tls_hs_write_event;
4249 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4250 			return;
4251 		}
4252 		cleanup_tcp_handler(data);
4253 		log_crypto_err("could not SSL_read");
4254 		return;
4255 	}
4256 
4257 	data->bytes_transmitted += received;
4258 	buffer_skip(data->query->packet, received);
4259 	if (buffer_remaining(data->query->packet) > 0) {
4260 		/*
4261 		 * Message not yet complete, wait for more data to
4262 		 * become available.
4263 		 */
4264 		return;
4265 	}
4266 
4267 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4268 
4269 	/* Account... */
4270 #ifndef INET6
4271 	STATUP(data->nsd, ctls);
4272 #else
4273 	if (data->query->addr.ss_family == AF_INET) {
4274 		STATUP(data->nsd, ctls);
4275 	} else if (data->query->addr.ss_family == AF_INET6) {
4276 		STATUP(data->nsd, ctls6);
4277 	}
4278 #endif
4279 
4280 	/* We have a complete query, process it.  */
4281 
	/* tcp-query-count: increment the per-connection query counter */
4283 	data->query_count++;
4284 
4285 	buffer_flip(data->query->packet);
4286 #ifdef USE_DNSTAP
	/*
	 * Send the TCP query, with the server (local) address and the
	 * client address, to the dnstap collector process.
	 */
4290 	log_addr("query from client", &data->query->addr);
4291 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4292 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4293 		data->query->addrlen, data->query->tcp, data->query->packet);
4294 #endif /* USE_DNSTAP */
4295 	data->query_state = server_process_query(data->nsd, data->query, &now);
4296 	if (data->query_state == QUERY_DISCARDED) {
4297 		/* Drop the packet and the entire connection... */
4298 		STATUP(data->nsd, dropped);
4299 		ZTATUP(data->nsd, data->query->zone, dropped);
4300 		cleanup_tcp_handler(data);
4301 		return;
4302 	}
4303 
4304 #ifdef BIND8_STATS
4305 	if (RCODE(data->query->packet) == RCODE_OK
4306 	    && !AA(data->query->packet))
4307 	{
4308 		STATUP(data->nsd, nona);
4309 		ZTATUP(data->nsd, data->query->zone, nona);
4310 	}
4311 #endif /* BIND8_STATS */
4312 
4313 #ifdef USE_ZONE_STATS
4314 #ifndef INET6
4315 	ZTATUP(data->nsd, data->query->zone, ctls);
4316 #else
4317 	if (data->query->addr.ss_family == AF_INET) {
4318 		ZTATUP(data->nsd, data->query->zone, ctls);
4319 	} else if (data->query->addr.ss_family == AF_INET6) {
4320 		ZTATUP(data->nsd, data->query->zone, ctls6);
4321 	}
4322 #endif
4323 #endif /* USE_ZONE_STATS */
4324 
4325 	query_add_optional(data->query, data->nsd, &now);
4326 
4327 	/* Switch to the tcp write handler.  */
4328 	buffer_flip(data->query->packet);
4329 	data->query->tcplen = buffer_remaining(data->query->packet);
4330 #ifdef BIND8_STATS
4331 	/* Account the rcode & TC... */
4332 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4333 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4334 	if (TC(data->query->packet)) {
4335 		STATUP(data->nsd, truncated);
4336 		ZTATUP(data->nsd, data->query->zone, truncated);
4337 	}
4338 #endif /* BIND8_STATS */
4339 #ifdef USE_DNSTAP
	/*
	 * Send the TCP response, with the server (local) address found
	 * earlier and the client address, to the dnstap collector process.
	 */
4343 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4344 	log_addr("response to client", &data->query->addr);
4345 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4346 		data->query->addrlen, data->query->tcp, data->query->packet,
4347 		data->query->zone);
4348 #endif /* USE_DNSTAP */
4349 	data->bytes_transmitted = 0;
4350 
4351 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4352 
	/* see if we can write the answer right away (usually we can; if not, EAGAIN) */
4354 	handle_tls_writing(fd, EV_WRITE, data);
4355 }
4356 
4357 /** handle TLS writing of outgoing response */
4358 static void
4359 handle_tls_writing(int fd, short event, void* arg)
4360 {
4361 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4362 	ssize_t sent;
4363 	struct query *q = data->query;
	/* static reassembly buffer used to put the two-byte TCP length in
	 * front of the packet, emulating writev; SSL_write offers no
	 * scatter-gather interface. */
4366 	static buffer_type* global_tls_temp_buffer = NULL;
4367 	buffer_type* write_buffer;
4368 	uint32_t now = 0;
4369 
4370 	if ((event & EV_TIMEOUT)) {
4371 		/* Connection timed out.  */
4372 		cleanup_tcp_handler(data);
4373 		return;
4374 	}
4375 
4376 	assert((event & EV_WRITE));
4377 
4378 	if(data->shake_state != tls_hs_none) {
4379 		if(!tls_handshake(data, fd, 1))
4380 			return;
4381 		if(data->shake_state != tls_hs_none)
4382 			return;
4383 	}
4384 
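	/* let SSL_write report partial progress instead of insisting on
	 * writing the whole buffer; the bookkeeping below handles it */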
4385 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
4386 
	/* If we are writing the start of a message, we must include the
	 * length; this is done by copying it, together with the packet,
	 * into write_buffer. */
4389 	write_buffer = NULL;
4390 	if (data->bytes_transmitted == 0) {
4391 		if(!global_tls_temp_buffer) {
			/* allocated from nsd.region, so it is deallocated
			 * when nsd shuts down */
4394 			global_tls_temp_buffer = buffer_create(nsd.region,
4395 				QIOBUFSZ + sizeof(q->tcplen));
4396 			if (!global_tls_temp_buffer) {
4397 				return;
4398 			}
4399 		}
4400 		write_buffer = global_tls_temp_buffer;
4401 		buffer_clear(write_buffer);
4402 		buffer_write_u16(write_buffer, q->tcplen);
4403 		buffer_write(write_buffer, buffer_current(q->packet),
4404 			(int)buffer_remaining(q->packet));
4405 		buffer_flip(write_buffer);
4406 	} else {
4407 		write_buffer = q->packet;
4408 	}
4409 
4410 	/* Write the response */
4411 	ERR_clear_error();
4412 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4413 	if(sent <= 0) {
4414 		int want = SSL_get_error(data->tls, sent);
4415 		if(want == SSL_ERROR_ZERO_RETURN) {
4416 			cleanup_tcp_handler(data);
4417 			/* closed */
4418 		} else if(want == SSL_ERROR_WANT_READ) {
4419 			/* switch back to reading */
4420 			data->shake_state = tls_hs_read_event;
4421 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4422 		} else if(want != SSL_ERROR_WANT_WRITE) {
4423 			cleanup_tcp_handler(data);
4424 			log_crypto_err("could not SSL_write");
4425 		}
4426 		return;
4427 	}
4428 
4429 	buffer_skip(write_buffer, sent);
4430 	if(buffer_remaining(write_buffer) != 0) {
		/* Not everything was sent; if we wrote from the temp buffer,
		 * advance the real packet buffer past the payload bytes that
		 * actually went out. */
4432 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4433 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4434 		}
4435 	}
4436 
4437 	data->bytes_transmitted += sent;
4438 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4439 		/*
4440 		 * Still more data to write when socket becomes
4441 		 * writable again.
4442 		 */
4443 		return;
4444 	}
4445 
4446 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4447 
4448 	if (data->query_state == QUERY_IN_AXFR) {
4449 		/* Continue processing AXFR and writing back results.  */
4450 		buffer_clear(q->packet);
4451 		data->query_state = query_axfr(data->nsd, q);
4452 		if (data->query_state != QUERY_PROCESSED) {
4453 			query_add_optional(data->query, data->nsd, &now);
4454 
4455 			/* Reset data. */
4456 			buffer_flip(q->packet);
4457 			q->tcplen = buffer_remaining(q->packet);
4458 			data->bytes_transmitted = 0;
4459 			/* Reset to writing mode.  */
4460 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4461 
4462 			/*
4463 			 * Write data if/when the socket is writable
4464 			 * again.
4465 			 */
4466 			return;
4467 		}
4468 	}
4469 
4470 	/*
4471 	 * Done sending, wait for the next request to arrive on the
4472 	 * TCP socket by installing the TCP read handler.
4473 	 */
4474 	if ((data->nsd->tcp_query_count > 0 &&
4475 		data->query_count >= data->nsd->tcp_query_count) ||
4476 		data->tcp_no_more_queries) {
4477 
4478 		(void) shutdown(fd, SHUT_WR);
4479 	}
4480 
4481 	data->bytes_transmitted = 0;
4482 
4483 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4484 }
4485 #endif
4486 
4487 static void
4488 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4489 	void* ATTR_UNUSED(arg))
4490 {
4491 	if(slowaccept) {
4492 		configure_handler_event_types(EV_PERSIST | EV_READ);
4493 		slowaccept = 0;
4494 	}
4495 }
4496 
4497 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4498 {
4499 #ifndef HAVE_ACCEPT4
4500 	int s = accept(fd, addr, addrlen);
4501 	if (s != -1) {
4502 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4503 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4504 			close(s);
4505 			s = -1;
			errno=EINTR; /* setting errno to EINTR makes the
				caller treat this as a transient accept
				failure and omit the error printout */
4509 		}
4510 	}
4511 	return s;
4512 #else
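	/* accept4 makes the new socket non-blocking atomically with the
	 * accept itself, so no separate fcntl call is needed */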
4513 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4514 #endif /* HAVE_ACCEPT4 */
4515 }
4516 
4517 /*
4518  * Handle an incoming TCP connection.  The connection is accepted and
4519  * a new TCP reader event handler is added.  The TCP handler
4520  * is responsible for cleanup when the connection is closed.
4521  */
4522 static void
4523 handle_tcp_accept(int fd, short event, void* arg)
4524 {
4525 	struct tcp_accept_handler_data *data
4526 		= (struct tcp_accept_handler_data *) arg;
4527 	int s;
4528 	int reject = 0;
4529 	struct tcp_handler_data *tcp_data;
4530 	region_type *tcp_region;
4531 #ifdef INET6
4532 	struct sockaddr_storage addr;
4533 #else
4534 	struct sockaddr_in addr;
4535 #endif
4536 	socklen_t addrlen;
4537 	struct timeval timeout;
4538 
4539 	if (!(event & EV_READ)) {
4540 		return;
4541 	}
4542 
4543 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4544 		reject = data->nsd->options->tcp_reject_overflow;
4545 		if (!reject) {
4546 			return;
4547 		}
4548 	}
4549 
4550 	/* Accept it... */
4551 	addrlen = sizeof(addr);
4552 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4553 	if (s == -1) {
		/**
		 * EMFILE and ENFILE signal that the limit of open file
		 * descriptors has been reached; pause accept().  EINTR is
		 * a signal interrupt.  The others are various OS ways of
		 * saying that the client has closed the connection.
		 */
4560 		if (errno == EMFILE || errno == ENFILE) {
4561 			if (!slowaccept) {
4562 				/* disable accept events */
4563 				struct timeval tv;
4564 				configure_handler_event_types(0);
4565 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4566 				tv.tv_usec = 0L;
4567 				memset(&slowaccept_event, 0,
4568 					sizeof(slowaccept_event));
4569 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4570 					handle_slowaccept_timeout, NULL);
4571 				(void)event_base_set(data->event.ev_base,
4572 					&slowaccept_event);
4573 				(void)event_add(&slowaccept_event, &tv);
4574 				slowaccept = 1;
4575 				/* We don't want to spam the logs here */
4576 			}
4577 		} else if (errno != EINTR
4578 			&& errno != EWOULDBLOCK
4579 #ifdef ECONNABORTED
4580 			&& errno != ECONNABORTED
4581 #endif /* ECONNABORTED */
4582 #ifdef EPROTO
4583 			&& errno != EPROTO
4584 #endif /* EPROTO */
4585 			) {
4586 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4587 		}
4588 		return;
4589 	}
4590 
4591 	if (reject) {
4592 		shutdown(s, SHUT_RDWR);
4593 		close(s);
4594 		return;
4595 	}
4596 
4597 	/*
4598 	 * This region is deallocated when the TCP connection is
4599 	 * closed by the TCP handler.
4600 	 */
4601 	tcp_region = region_create(xalloc, free);
4602 	tcp_data = (struct tcp_handler_data *) region_alloc(
4603 		tcp_region, sizeof(struct tcp_handler_data));
4604 	tcp_data->region = tcp_region;
4605 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4606 		compression_table_size, compressed_dnames);
4607 	tcp_data->nsd = data->nsd;
4608 	tcp_data->query_count = 0;
4609 #ifdef HAVE_SSL
4610 	tcp_data->shake_state = tls_hs_none;
4611 	tcp_data->tls = NULL;
4612 #endif
4613 	tcp_data->prev = NULL;
4614 	tcp_data->next = NULL;
4615 
4616 	tcp_data->query_state = QUERY_PROCESSED;
4617 	tcp_data->bytes_transmitted = 0;
4618 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4619 	tcp_data->query->addrlen = addrlen;
4620 
4621 	tcp_data->tcp_no_more_queries = 0;
4622 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4623 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4624 		/* very busy, give smaller timeout */
4625 		tcp_data->tcp_timeout = 200;
4626 	}
4627 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4628 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4629 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
4630 
4631 #ifdef USE_DNSTAP
4632 	/* save the address of the connection */
4633 	tcp_data->socket = data->socket;
4634 #endif /* USE_DNSTAP */
4635 
4636 #ifdef HAVE_SSL
4637 	if (data->tls_accept) {
4638 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4639 		if(!tcp_data->tls) {
4640 			close(s);
4641 			return;
4642 		}
4643 		tcp_data->shake_state = tls_hs_read;
4644 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4645 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4646 			  handle_tls_reading, tcp_data);
4647 	} else {
4648 #endif
4649 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4650 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4651 			  handle_tcp_reading, tcp_data);
4652 #ifdef HAVE_SSL
4653 	}
4654 #endif
4655 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4656 		log_msg(LOG_ERR, "cannot set tcp event base");
4657 		close(s);
4658 		region_destroy(tcp_region);
4659 		return;
4660 	}
4661 	if(event_add(&tcp_data->event, &timeout) != 0) {
4662 		log_msg(LOG_ERR, "cannot add tcp to event base");
4663 		close(s);
4664 		region_destroy(tcp_region);
4665 		return;
4666 	}
4667 	if(tcp_active_list) {
4668 		tcp_active_list->prev = tcp_data;
4669 		tcp_data->next = tcp_active_list;
4670 	}
4671 	tcp_active_list = tcp_data;
4672 
4673 	/*
4674 	 * Keep track of the total number of TCP handlers installed so
4675 	 * we can stop accepting connections when the maximum number
4676 	 * of simultaneous TCP connections is reached.
4677 	 *
4678 	 * If tcp-reject-overflow is enabled, however, then we do not
4679 	 * change the handler event type; we keep it as-is and accept
4680 	 * overflow TCP connections only so that we can forcibly kill
4681 	 * them off.
4682 	 */
4683 	++data->nsd->current_tcp_count;
4684 	if (!data->nsd->options->tcp_reject_overflow &&
4685 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4686 	{
4687 		configure_handler_event_types(0);
4688 	}
4689 }
4690 
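/*
 * Send a command over the IPC pipe to each child server process; with a
 * positive timeout, wait (up to the timeout) for the child's
 * acknowledgement before closing the pipe.
 */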
4691 static void
4692 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4693 {
4694 	size_t i;
4695 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4696 	for (i = 0; i < nsd->child_count; ++i) {
4697 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4698 			if (write(nsd->children[i].child_fd,
4699 				&command,
4700 				sizeof(command)) == -1)
4701 			{
4702 				if(errno != EAGAIN && errno != EINTR)
4703 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4704 					(int) command,
4705 					(int) nsd->children[i].pid,
4706 					strerror(errno));
4707 			} else if (timeout > 0) {
4708 				(void)block_read(NULL,
4709 					nsd->children[i].child_fd,
4710 					&command, sizeof(command), timeout);
4711 			}
4712 			fsync(nsd->children[i].child_fd);
4713 			close(nsd->children[i].child_fd);
4714 			nsd->children[i].child_fd = -1;
4715 		}
4716 	}
4717 }
4718 
4719 static void
4720 send_children_quit(struct nsd* nsd)
4721 {
4722 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4723 	send_children_command(nsd, NSD_QUIT, 0);
4724 }
4725 
4726 static void
4727 send_children_quit_and_wait(struct nsd* nsd)
4728 {
4729 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4730 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4731 }
4732 
4733 #ifdef BIND8_STATS
4734 static void
4735 set_children_stats(struct nsd* nsd)
4736 {
4737 	size_t i;
4738 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4739 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
4740 	for (i = 0; i < nsd->child_count; ++i) {
4741 		nsd->children[i].need_to_send_STATS = 1;
4742 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
4743 	}
4744 }
4745 #endif /* BIND8_STATS */
4746 
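/*
 * (Re)register every TCP accept handler with the given libevent event
 * types; passing 0 removes the handlers, pausing acceptance of new TCP
 * connections.
 */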
4747 static void
4748 configure_handler_event_types(short event_types)
4749 {
4750 	size_t i;
4751 
4752 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4753 		struct event* handler = &tcp_accept_handlers[i].event;
4754 		if(event_types) {
4755 			/* reassign */
4756 			int fd = handler->ev_fd;
4757 			struct event_base* base = handler->ev_base;
4758 			if(tcp_accept_handlers[i].event_added)
4759 				event_del(handler);
4760 			memset(handler, 0, sizeof(*handler));
4761 			event_set(handler, fd, event_types,
4762 				handle_tcp_accept, &tcp_accept_handlers[i]);
4763 			if(event_base_set(base, handler) != 0)
4764 				log_msg(LOG_ERR, "conhand: cannot event_base");
4765 			if(event_add(handler, NULL) != 0)
4766 				log_msg(LOG_ERR, "conhand: cannot event_add");
4767 			tcp_accept_handlers[i].event_added = 1;
4768 		} else {
4769 			/* remove */
4770 			if(tcp_accept_handlers[i].event_added) {
4771 				event_del(handler);
4772 				tcp_accept_handlers[i].event_added = 0;
4773 			}
4774 		}
4775 	}
4776 }
4777