xref: /openbsd-src/usr.sbin/nsd/server.c (revision 5a38ef86d0b61900239c7913d24a05e7b88a58f0)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <limits.h>
15 #include <sys/socket.h>
16 #include <sys/uio.h>
17 #include <sys/wait.h>
18 
19 #include <netinet/in.h>
20 #ifdef USE_TCP_FASTOPEN
21   #include <netinet/tcp.h>
22 #endif
23 #include <arpa/inet.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <signal.h>
36 #include <netdb.h>
37 #include <poll.h>
38 #ifdef HAVE_SYS_RANDOM_H
39 #include <sys/random.h>
40 #endif
41 #ifndef SHUT_WR
42 #define SHUT_WR 1
43 #endif
44 #ifdef HAVE_MMAP
45 #include <sys/mman.h>
46 #endif /* HAVE_MMAP */
47 #ifdef HAVE_OPENSSL_RAND_H
48 #include <openssl/rand.h>
49 #endif
50 #ifdef HAVE_OPENSSL_SSL_H
51 #include <openssl/ssl.h>
52 #endif
53 #ifdef HAVE_OPENSSL_ERR_H
54 #include <openssl/err.h>
55 #endif
56 #ifdef HAVE_OPENSSL_OCSP_H
57 #include <openssl/ocsp.h>
58 #endif
59 #ifndef USE_MINI_EVENT
60 #  ifdef HAVE_EVENT_H
61 #    include <event.h>
62 #  else
63 #    include <event2/event.h>
64 #    include "event2/event_struct.h"
65 #    include "event2/event_compat.h"
66 #  endif
67 #else
68 #  include "mini_event.h"
69 #endif
70 
71 #include "axfr.h"
72 #include "namedb.h"
73 #include "netio.h"
74 #include "xfrd.h"
75 #include "xfrd-tcp.h"
76 #include "xfrd-disk.h"
77 #include "difffile.h"
78 #include "nsec3.h"
79 #include "ipc.h"
80 #include "udb.h"
81 #include "remote.h"
82 #include "lookup3.h"
83 #include "rrl.h"
84 #ifdef USE_DNSTAP
85 #include "dnstap/dnstap_collector.h"
86 #endif
87 
88 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
89 
90 #ifdef USE_DNSTAP
91 /*
92  * log_addr() - the function to print sockaddr_in/sockaddr_in6 structures content
93  * just like its done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
94  */
95 static void
96 log_addr(const char* descr,
97 #ifdef INET6
98 	struct sockaddr_storage* addr
99 #else
100 	struct sockaddr_in* addr
101 #endif
102 	)
103 {
104 	char str_buf[64];
105 	if(verbosity < 6)
106 		return;
107 	if(
108 #ifdef INET6
109 		addr->ss_family == AF_INET
110 #else
111 		addr->sin_family == AF_INET
112 #endif
113 		) {
114 		struct sockaddr_in* s = (struct sockaddr_in*)addr;
115 		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
116 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
117 #ifdef INET6
118 	} else {
119 		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
120 		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
121 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
122 #endif
123 	}
124 }
125 #endif /* USE_DNSTAP */
126 
127 #ifdef USE_TCP_FASTOPEN
128   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
129   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
130 #endif
131 
/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	/* the global nsd structure */
	struct nsd        *nsd;
	/* NOTE(review): presumably the UDP socket this handler serves -- confirm */
	struct nsd_socket *socket;
	/* event object belonging to this handler */
	struct event       event;
};
141 
/*
 * Data for the TCP accept handlers.
 */
struct tcp_accept_handler_data {
	/* the global nsd structure */
	struct nsd        *nsd;
	/* NOTE(review): presumably the listening socket connections are accepted on -- confirm */
	struct nsd_socket *socket;
	/* NOTE(review): looks like a flag recording whether `event` is currently registered -- confirm */
	int                event_added;
	/* event object belonging to this handler */
	struct event       event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int                tls_accept;
#endif
};
152 
/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

/*
 * NOTE(review): presumably a timer event and flag used to temporarily slow
 * down accepting new connections -- confirm against the accept handler.
 */
static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
/* NOTE(review): presumably a cached OCSP response and its length in bytes -- confirm */
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif
168 
#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
/* Fallback definition for platforms that do not provide struct mmsghdr. */
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int  msg_len;
};
#endif

/*
 * Per-batch arrays, each with NUM_RECV_PER_SELECT entries.
 * NOTE(review): presumably entry i of msgs, iovecs and queries belong
 * together when receiving a batch of UDP packets -- confirm.
 */
static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
187 
/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler.  When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef USE_DNSTAP
	/* the socket of the accept socket to find proper service (local) address the socket is bound to. */
	struct nsd_socket *socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 * NOTE(review): the exact meaning of the individual states (plain
	 * read/write versus the *_event variants) is defined by the TLS
	 * handshake code, which is not visible here -- confirm there.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/*
	 * list of connections, for service of remaining tcp channels;
	 * prev and next point at the neighbouring entries of that list.
	 */
	struct tcp_handler_data *prev, *next;
};
275 /* global that is the list of active tcp channels */
276 static struct tcp_handler_data *tcp_active_list = NULL;
277 
278 /*
279  * Handle incoming queries on the UDP server sockets.
280  */
281 static void handle_udp(int fd, short event, void* arg);
282 
283 /*
284  * Handle incoming connections on the TCP sockets.  These handlers
285  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
286  * connection) but are disabled when the number of current TCP
287  * connections is equal to the maximum number of TCP connections.
288  * Disabling is done by changing the handler to wait for the
289  * NETIO_EVENT_NONE type.  This is done using the function
290  * configure_tcp_accept_handlers.
291  */
292 static void handle_tcp_accept(int fd, short event, void* arg);
293 
294 /*
295  * Handle incoming queries on a TCP connection.  The TCP connections
296  * are configured to be non-blocking and the handler may be called
297  * multiple times before a complete query is received.
298  */
299 static void handle_tcp_reading(int fd, short event, void* arg);
300 
301 /*
302  * Handle outgoing responses on a TCP connection.  The TCP connections
303  * are configured to be non-blocking and the handler may be called
304  * multiple times before a complete response is sent.
305  */
306 static void handle_tcp_writing(int fd, short event, void* arg);
307 
308 #ifdef HAVE_SSL
309 /* Create SSL object and associate fd */
310 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
311 /*
312  * Handle TLS handshake. May be called multiple times if incomplete.
313  */
314 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
315 
316 /*
317  * Handle incoming queries on a TLS over TCP connection.  The TLS
318  * connections are configured to be non-blocking and the handler may
319  * be called multiple times before a complete query is received.
320  */
321 static void handle_tls_reading(int fd, short event, void* arg);
322 
323 /*
324  * Handle outgoing responses on a TLS over TCP connection.  The TLS
325  * connections are configured to be non-blocking and the handler may
326  * be called multiple times before a complete response is sent.
327  */
328 static void handle_tls_writing(int fd, short event, void* arg);
329 #endif
330 
331 /*
332  * Send all children the quit nonblocking, then close pipe.
333  */
334 static void send_children_quit(struct nsd* nsd);
335 /* same, for shutdown time, waits for child to exit to avoid restart issues */
336 static void send_children_quit_and_wait(struct nsd* nsd);
337 
/* Set the children's flags so that NSD_STATS is sent to them. */
339 #ifdef BIND8_STATS
340 static void set_children_stats(struct nsd* nsd);
341 #endif /* BIND8_STATS */
342 
343 /*
344  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
345  */
346 static void configure_handler_event_types(short event_types);
347 
/*
 * Tables used for domain name compression.
 * compressed_dname_offsets is a heap-allocated array of 16-bit offsets;
 * compression_table_capacity is the number of entries allocated for it.
 * NOTE(review): compression_table_size and compressed_dnames are only
 * declared/assigned in the visible code; their consumers are elsewhere --
 * confirm their exact meaning there.
 */
static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];
352 
353 #ifdef USE_TCP_FASTOPEN
354 /* Checks to see if the kernel value must be manually changed in order for
355    TCP Fast Open to support server mode */
356 static void report_tcp_fastopen_config() {
357 
358 	int tcp_fastopen_fp;
359 	uint8_t tcp_fastopen_value;
360 
361 	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
362 		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
363 	}
364 	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
365 		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
366 		close(tcp_fastopen_fp);
367 	}
368 	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
369 		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
370 		log_msg(LOG_WARNING, "However the kernel paramenters are not configured to support TCP_FASTOPEN in server mode.\n");
371 		log_msg(LOG_WARNING, "To enable TFO use the command:");
372 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
373 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
374 		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
375 		close(tcp_fastopen_fp);
376 	}
377 	close(tcp_fastopen_fp);
378 }
379 #endif
380 
381 /*
382  * Remove the specified pid from the list of child pids.  Returns -1 if
383  * the pid is not in the list, child_num otherwise.  The field is set to 0.
384  */
385 static int
386 delete_child_pid(struct nsd *nsd, pid_t pid)
387 {
388 	size_t i;
389 	for (i = 0; i < nsd->child_count; ++i) {
390 		if (nsd->children[i].pid == pid) {
391 			nsd->children[i].pid = 0;
392 			if(!nsd->children[i].need_to_exit) {
393 				if(nsd->children[i].child_fd != -1)
394 					close(nsd->children[i].child_fd);
395 				nsd->children[i].child_fd = -1;
396 				if(nsd->children[i].handler)
397 					nsd->children[i].handler->fd = -1;
398 			}
399 			return i;
400 		}
401 	}
402 	return -1;
403 }
404 
405 /*
406  * Restart child servers if necessary.
407  */
408 static int
409 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
410 	int* xfrd_sock_p)
411 {
412 	struct main_ipc_handler_data *ipc_data;
413 	size_t i;
414 	int sv[2];
415 
416 	/* Fork the child processes... */
417 	for (i = 0; i < nsd->child_count; ++i) {
418 		if (nsd->children[i].pid <= 0) {
419 			if (nsd->children[i].child_fd != -1)
420 				close(nsd->children[i].child_fd);
421 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
422 				log_msg(LOG_ERR, "socketpair: %s",
423 					strerror(errno));
424 				return -1;
425 			}
426 			nsd->children[i].child_fd = sv[0];
427 			nsd->children[i].parent_fd = sv[1];
428 			nsd->children[i].pid = fork();
429 			switch (nsd->children[i].pid) {
430 			default: /* SERVER MAIN */
431 				close(nsd->children[i].parent_fd);
432 				nsd->children[i].parent_fd = -1;
433 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
434 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
435 				}
436 				if(!nsd->children[i].handler)
437 				{
438 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
439 						region, sizeof(struct main_ipc_handler_data));
440 					ipc_data->nsd = nsd;
441 					ipc_data->child = &nsd->children[i];
442 					ipc_data->child_num = i;
443 					ipc_data->xfrd_sock = xfrd_sock_p;
444 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
445 					ipc_data->forward_mode = 0;
446 					ipc_data->got_bytes = 0;
447 					ipc_data->total_bytes = 0;
448 					ipc_data->acl_num = 0;
449 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
450 						region, sizeof(struct netio_handler));
451 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
452 					nsd->children[i].handler->timeout = NULL;
453 					nsd->children[i].handler->user_data = ipc_data;
454 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
455 					nsd->children[i].handler->event_handler = parent_handle_child_command;
456 					netio_add_handler(netio, nsd->children[i].handler);
457 				}
458 				/* clear any ongoing ipc */
459 				ipc_data = (struct main_ipc_handler_data*)
460 					nsd->children[i].handler->user_data;
461 				ipc_data->forward_mode = 0;
462 				/* restart - update fd */
463 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
464 				break;
465 			case 0: /* CHILD */
466 				/* the child need not be able to access the
467 				 * nsd.db file */
468 				namedb_close_udb(nsd->db);
469 #ifdef MEMCLEAN /* OS collects memory pages */
470 				region_destroy(region);
471 #endif
472 
473 				if (pledge("stdio rpath inet", NULL) == -1) {
474 					log_msg(LOG_ERR, "pledge");
475 					exit(1);
476 				}
477 
478 				nsd->pid = 0;
479 				nsd->child_count = 0;
480 				nsd->server_kind = nsd->children[i].kind;
481 				nsd->this_child = &nsd->children[i];
482 				nsd->this_child->child_num = i;
483 				/* remove signal flags inherited from parent
484 				   the parent will handle them. */
485 				nsd->signal_hint_reload_hup = 0;
486 				nsd->signal_hint_reload = 0;
487 				nsd->signal_hint_child = 0;
488 				nsd->signal_hint_quit = 0;
489 				nsd->signal_hint_shutdown = 0;
490 				nsd->signal_hint_stats = 0;
491 				nsd->signal_hint_statsusr = 0;
492 				close(*xfrd_sock_p);
493 				close(nsd->this_child->child_fd);
494 				nsd->this_child->child_fd = -1;
495 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
496 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
497 				}
498 				server_child(nsd);
499 				/* NOTREACH */
500 				exit(0);
501 			case -1:
502 				log_msg(LOG_ERR, "fork failed: %s",
503 					strerror(errno));
504 				return -1;
505 			}
506 		}
507 	}
508 	return 0;
509 }
510 
511 #ifdef BIND8_STATS
512 static void set_bind8_alarm(struct nsd* nsd)
513 {
514 	/* resync so that the next alarm is on the next whole minute */
515 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
516 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
517 }
518 #endif
519 
520 /* set zone stat ids for zones initially read in */
521 static void
522 zonestatid_tree_set(struct nsd* nsd)
523 {
524 	struct radnode* n;
525 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
526 		zone_type* zone = (zone_type*)n->elem;
527 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
528 	}
529 }
530 
531 #ifdef USE_ZONE_STATS
532 void
533 server_zonestat_alloc(struct nsd* nsd)
534 {
535 	size_t num = (nsd->options->zonestatnames->count==0?1:
536 			nsd->options->zonestatnames->count);
537 	size_t sz = sizeof(struct nsdst)*num;
538 	char tmpfile[256];
539 	uint8_t z = 0;
540 
541 	/* file names */
542 	nsd->zonestatfname[0] = 0;
543 	nsd->zonestatfname[1] = 0;
544 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
545 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
546 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
547 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
548 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
549 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
550 
551 	/* file descriptors */
552 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
553 	if(nsd->zonestatfd[0] == -1) {
554 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
555 			strerror(errno));
556 		exit(1);
557 	}
558 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
559 	if(nsd->zonestatfd[0] == -1) {
560 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
561 			strerror(errno));
562 		close(nsd->zonestatfd[0]);
563 		unlink(nsd->zonestatfname[0]);
564 		exit(1);
565 	}
566 
567 #ifdef HAVE_MMAP
568 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
569 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
570 			strerror(errno));
571 		exit(1);
572 	}
573 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
574 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
575 			nsd->zonestatfname[0], strerror(errno));
576 		exit(1);
577 	}
578 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
579 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
580 			strerror(errno));
581 		exit(1);
582 	}
583 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
584 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
585 			nsd->zonestatfname[1], strerror(errno));
586 		exit(1);
587 	}
588 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
589 		MAP_SHARED, nsd->zonestatfd[0], 0);
590 	if(nsd->zonestat[0] == MAP_FAILED) {
591 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
592 		unlink(nsd->zonestatfname[0]);
593 		unlink(nsd->zonestatfname[1]);
594 		exit(1);
595 	}
596 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
597 		MAP_SHARED, nsd->zonestatfd[1], 0);
598 	if(nsd->zonestat[1] == MAP_FAILED) {
599 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
600 		unlink(nsd->zonestatfname[0]);
601 		unlink(nsd->zonestatfname[1]);
602 		exit(1);
603 	}
604 	memset(nsd->zonestat[0], 0, sz);
605 	memset(nsd->zonestat[1], 0, sz);
606 	nsd->zonestatsize[0] = num;
607 	nsd->zonestatsize[1] = num;
608 	nsd->zonestatdesired = num;
609 	nsd->zonestatsizenow = num;
610 	nsd->zonestatnow = nsd->zonestat[0];
611 #endif /* HAVE_MMAP */
612 }
613 
/*
 * Change the mapping of zonestat array IDX to SZ bytes.
 * Uses mremap() where MREMAP_MAYMOVE is available; otherwise the old
 * mapping is synced and unmapped and the file is mapped anew.
 * A failed (re)map is logged and terminates the process with exit(1).
 * Does nothing when mmap is not available.
 */
void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	void* remapped = mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);

	nsd->zonestat[idx] = (struct nsdst*)remapped;
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !HAVE MREMAP */
	void* remapped;

	/* flush and drop the old mapping first */
	if(0 != msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC))
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(0 != munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]))
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));

	/* then map the file again with the new size */
	remapped = mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED,
		nsd->zonestatfd[idx], 0);
	nsd->zonestat[idx] = (struct nsdst*)remapped;
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP */
#endif /* HAVE_MMAP */
}
642 
/*
 * Resize the zonestat array that is NOT currently in use so that it holds
 * nsd->zonestatdesired entries.  Nothing happens if it already has that
 * size.  The backing file is extended, the mapping is redone through
 * zonestat_remap() and entries added by a grow are zeroed.
 * lseek/write failures are logged and terminate the process with exit(1).
 */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t zero_byte = 0;
	size_t newsz;
	/* the array in use is [0] -> work on [1], and the other way around */
	int idx = (nsd->zonestatnow == nsd->zonestat[0]) ? 1 : 0;

	if(nsd->zonestatdesired == nsd->zonestatsize[idx])
		return;

	newsz = sizeof(struct nsdst)*nsd->zonestatdesired;

	/* make sure the file is at least newsz bytes long */
	if(lseek(nsd->zonestatfd[idx], (off_t)newsz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &zero_byte, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}

	zonestat_remap(nsd, idx, newsz);

	/* on growth, clear the entries past the old end of the array */
	if(nsd->zonestatsize[idx] < nsd->zonestatdesired) {
		memset(nsd->zonestat[idx] + nsd->zonestatsize[idx], 0,
			sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}
677 
678 /* switchover to use the other array for the new children, that
679  * briefly coexist with the old children.  And we want to avoid them
680  * both writing to the same statistics arrays. */
681 void
682 server_zonestat_switch(struct nsd* nsd)
683 {
684 	if(nsd->zonestatnow == nsd->zonestat[0]) {
685 		nsd->zonestatnow = nsd->zonestat[1];
686 		nsd->zonestatsizenow = nsd->zonestatsize[1];
687 	} else {
688 		nsd->zonestatnow = nsd->zonestat[0];
689 		nsd->zonestatsizenow = nsd->zonestatsize[0];
690 	}
691 }
692 #endif /* USE_ZONE_STATS */
693 
694 static void
695 cleanup_dname_compression_tables(void *ptr)
696 {
697 	free(ptr);
698 	compressed_dname_offsets = NULL;
699 	compression_table_capacity = 0;
700 }
701 
702 static void
703 initialize_dname_compression_tables(struct nsd *nsd)
704 {
705 	size_t needed = domain_table_count(nsd->db->domains) + 1;
706 	needed += EXTRA_DOMAIN_NUMBERS;
707 	if(compression_table_capacity < needed) {
708 		if(compressed_dname_offsets) {
709 			region_remove_cleanup(nsd->db->region,
710 				cleanup_dname_compression_tables,
711 				compressed_dname_offsets);
712 			free(compressed_dname_offsets);
713 		}
714 		compressed_dname_offsets = (uint16_t *) xmallocarray(
715 			needed, sizeof(uint16_t));
716 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
717 			compressed_dname_offsets);
718 		compression_table_capacity = needed;
719 		compression_table_size=domain_table_count(nsd->db->domains)+1;
720 	}
721 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
722 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
723 }
724 
725 static int
726 set_cloexec(struct nsd_socket *sock)
727 {
728 	assert(sock != NULL);
729 
730 	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
731 		const char *socktype =
732 			sock->addr.ai_family == SOCK_DGRAM ? "udp" : "tcp";
733 		log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s",
734 			socktype, strerror(errno));
735 		return -1;
736 	}
737 
738 	return 1;
739 }
740 
/*
 * Enable port reuse on SOCK (SO_REUSEPORT_LB where available, otherwise
 * SO_REUSEPORT).
 *
 * Returns 1 on success, -1 on failure and 0 when the platform has no
 * SO_REUSEPORT.  A failure is logged, except that ENOPROTOOPT is only
 * logged at verbosity 3 or higher.
 */
static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
#  ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT on Linux. This is what the users want with the config
	 * option in nsd.conf; if we actually need local address and port reuse
	 * they'll also need to have SO_REUSEPORT set for them, assume it was
	 * _LB they want.
	 */
	static const char optname[] = "SO_REUSEPORT_LB";
	int opt = SO_REUSEPORT_LB;
#  else /* !SO_REUSEPORT_LB */
	static const char optname[] = "SO_REUSEPORT";
	int opt = SO_REUSEPORT;
#  endif /* SO_REUSEPORT_LB */
	int enable = 1;

	if (setsockopt(sock->s, SOL_SOCKET, opt, &enable, sizeof(enable)) == 0)
		return 1;

	if (errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
			optname, strerror(errno));
	}
	return -1;
#else
	(void)sock;
	return 0;
#endif /* SO_REUSEPORT */
}
773 
774 static int
775 set_reuseaddr(struct nsd_socket *sock)
776 {
777 #ifdef SO_REUSEADDR
778 	int on = 1;
779 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
780 		return 1;
781 	}
782 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
783 		strerror(errno));
784 	return -1;
785 #endif /* SO_REUSEADDR */
786 	return 0;
787 }
788 
789 static int
790 set_rcvbuf(struct nsd_socket *sock, int rcv)
791 {
792 #ifdef SO_RCVBUF
793 #ifdef SO_RCVBUFFORCE
794 	if(0 == setsockopt(
795 		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
796 	{
797 		return 1;
798 	}
799 	if(errno == EPERM || errno == ENOBUFS) {
800 		return 0;
801 	}
802 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
803 		strerror(errno));
804 	return -1;
805 #else /* !SO_RCVBUFFORCE */
806 	if (0 == setsockopt(
807 		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
808 	{
809 		return 1;
810 	}
811 	if(errno == ENOSYS || errno == ENOBUFS) {
812 		return 0;
813 	}
814 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
815 		strerror(errno));
816 	return -1;
817 #endif /* SO_RCVBUFFORCE */
818 #endif /* SO_RCVBUF */
819 
820 	return 0;
821 }
822 
823 static int
824 set_sndbuf(struct nsd_socket *sock, int snd)
825 {
826 #ifdef SO_SNDBUF
827 #ifdef SO_SNDBUFFORCE
828 	if(0 == setsockopt(
829 		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
830 	{
831 		return 1;
832 	}
833 	if(errno == EPERM || errno == ENOBUFS) {
834 		return 0;
835 	}
836 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
837 		strerror(errno));
838 	return -1;
839 #else /* !SO_SNDBUFFORCE */
840 	if(0 == setsockopt(
841 		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
842 	{
843 		return 1;
844 	}
845 	if(errno == ENOSYS || errno == ENOBUFS) {
846 		return 0;
847 	}
848 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
849 		strerror(errno));
850 	return -1;
851 #endif /* SO_SNDBUFFORCE */
852 #endif /* SO_SNDBUF */
853 
854 	return 0;
855 }
856 
857 static int
858 set_nonblock(struct nsd_socket *sock)
859 {
860 	const char *socktype =
861 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
862 
863 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
864 		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
865 			socktype, strerror(errno));
866 		return -1;
867 	}
868 
869 	return 1;
870 }
871 
872 #ifdef INET6
873 static int
874 set_ipv6_v6only(struct nsd_socket *sock)
875 {
876 #ifdef IPV6_V6ONLY
877 	int on = 1;
878 	const char *socktype =
879 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
880 
881 	if(0 == setsockopt(
882 		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
883 	{
884 		return 1;
885 	}
886 
887 	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
888 		socktype, strerror(errno));
889 	return -1;
890 #else
891 	(void)sock;
892 #endif /* IPV6_V6ONLY */
893 
894 	return 0;
895 }
896 #endif /* INET6 */
897 
898 #ifdef INET6
899 static int
900 set_ipv6_use_min_mtu(struct nsd_socket *sock)
901 {
902 #if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
903 #if defined(IPV6_USE_MIN_MTU)
904 	/* There is no fragmentation of IPv6 datagrams during forwarding in the
905 	 * network. Therefore we do not send UDP datagrams larger than the
906 	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
907 	 * larger if the network stack supports IPV6_USE_MIN_MTU.
908 	 */
909 	int opt = IPV6_USE_MIN_MTU;
910 	int optval = 1;
911 	static const char optname[] = "IPV6_USE_MIN_MTU";
912 #elif defined(IPV6_MTU)
913 	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
914 	 * to the MIN MTU to get the same.
915 	 */
916 	int opt = IPV6_MTU;
917 	int optval = IPV6_MIN_MTU;
918 	static const char optname[] = "IPV6_MTU";
919 #endif
920 	if(0 == setsockopt(
921 		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
922 	{
923 		return 1;
924 	}
925 
926 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
927 		optname, strerror(errno));
928 	return -1;
929 #else
930 	(void)sock;
931 #endif /* INET6 */
932 
933 	return 0;
934 }
935 #endif /* INET6 */
936 
937 static int
938 set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
939 {
940 	int ret = 0;
941 
942 #if defined(IP_MTU_DISCOVER)
943 	int opt = IP_MTU_DISCOVER;
944 	int optval;
945 # if defined(IP_PMTUDISC_OMIT)
946 	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
947 	 * information and send packets with DF=0. Fragmentation is allowed if
948 	 * and only if the packet size exceeds the outgoing interface MTU or
949 	 * the packet encounters smaller MTU link in network. This mitigates
950 	 * DNS fragmentation attacks by preventing forged PMTU information.
951 	 * FreeBSD already has same semantics without setting the option.
952 	 */
953 	optval = IP_PMTUDISC_OMIT;
954 	if(0 == setsockopt(
955 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
956 	{
957 		return 1;
958 	}
959 
960 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
961 		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
962 # endif /* IP_PMTUDISC_OMIT */
963 # if defined(IP_PMTUDISC_DONT)
964 	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
965 	optval = IP_PMTUDISC_DONT;
966 	if(0 == setsockopt(
967 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
968 	{
969 		return 1;
970 	}
971 
972 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
973 		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
974 # endif
975 	ret = -1;
976 #elif defined(IP_DONTFRAG)
977 	int off = 0;
978 	if (0 == setsockopt(
979 		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
980 	{
981 		return 1;
982 	}
983 
984 	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
985 		strerror(errno));
986 	ret = -1;
987 #else
988 	(void)sock;
989 #endif
990 
991 	return ret;
992 }
993 
994 static int
995 set_ip_freebind(struct nsd_socket *sock)
996 {
997 #ifdef IP_FREEBIND
998 	int on = 1;
999 	const char *socktype =
1000 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1001 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
1002 	{
1003 		return 1;
1004 	}
1005 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
1006 		socktype, strerror(errno));
1007 	return -1;
1008 #else
1009 	(void)sock;
1010 #endif /* IP_FREEBIND */
1011 
1012 	return 0;
1013 }
1014 
1015 static int
1016 set_ip_transparent(struct nsd_socket *sock)
1017 {
1018 	/*
1019 	The scandalous preprocessor blob here calls for some explanation :)
1020 	POSIX does not specify an option to bind non-local IPs, so
1021 	platforms developed several implementation-specific options,
1022 	all set in the same way, but with different names.
1023 	For additional complexity, some platform manage this setting
1024 	differently for different address families (IPv4 vs IPv6).
1025 	This scandalous preprocessor blob below abstracts such variability
1026 	in the way which leaves the C code as lean and clear as possible.
1027 	*/
1028 
1029 #if defined(IP_TRANSPARENT)
1030 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
1031 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1032 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
1033 // as of 2020-01, Linux does not support this on IPv6 programmatically
1034 #elif defined(SO_BINDANY)
1035 #	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
1036 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
1037 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
1038 #elif defined(IP_BINDANY)
1039 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
1040 #	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
1041 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1042 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
1043 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
1044 #endif
1045 
1046 #ifndef NSD_SOCKET_OPTION_TRANSPARENT
1047 	(void)sock;
1048 #else
1049 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
1050 #		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
1051 #	endif
1052 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
1053 #		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
1054 #	endif
1055 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
1056 #		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
1057 #	endif
1058 
1059 	int on = 1;
1060 	const char *socktype =
1061 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1062 	const int is_ip6 = (sock->addr.ai_family == AF_INET6);
1063 
1064 	if(0 == setsockopt(
1065 		sock->s,
1066 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
1067 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
1068 		&on, sizeof(on)))
1069 	{
1070 		return 1;
1071 	}
1072 
1073 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
1074 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
1075 	return -1;
1076 #endif
1077 
1078 	return 0;
1079 }
1080 
1081 static int
1082 set_tcp_maxseg(struct nsd_socket *sock, int mss)
1083 {
1084 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
1085 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
1086 		return 1;
1087 	}
1088 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
1089 		strerror(errno));
1090 	return -1;
1091 #else
1092 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
1093 #endif
1094 	return 0;
1095 }
1096 
1097 #ifdef USE_TCP_FASTOPEN
1098 static int
1099 set_tcp_fastopen(struct nsd_socket *sock)
1100 {
1101 	/* qlen specifies how many outstanding TFO requests to allow. Limit is
1102 	 * a defense against IP spoofing attacks as suggested in RFC7413.
1103 	 */
1104 	int qlen;
1105 
1106 #ifdef __APPLE__
1107 	/* macOS X implementation only supports qlen of 1 via this call. The
1108 	 * actual value is configured by the net.inet.tcp.fastopen_backlog
1109 	 * kernel parameter.
1110 	 */
1111 	qlen = 1;
1112 #else
1113 	/* 5 is recommended on Linux. */
1114 	qlen = 5;
1115 #endif
1116 	if (0 == setsockopt(
1117 		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
1118 	{
1119 		return 1;
1120 	}
1121 
1122 	if (errno == EPERM) {
1123 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
1124 				 "; this could likely be because sysctl "
1125 				 "net.inet.tcp.fastopen.enabled, "
1126 				 "net.inet.tcp.fastopen.server_enable, or "
1127 				 "net.ipv4.tcp_fastopen is disabled",
1128 			strerror(errno));
1129 	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
1130 	 * disabled, except when verbosity enabled for debugging
1131 	 */
1132 	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
1133 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
1134 			strerror(errno));
1135 	}
1136 
1137 	return (errno == ENOPROTOOPT ? 0 : -1);
1138 }
1139 #endif /* USE_TCP_FASTOPEN */
1140 
/*
 * Bind the socket to the network device named in sock->device.
 *
 * Returns 1 when the option was set, -1 when setsockopt() failed (the
 * failure is logged) and 0 when SO_BINDTODEVICE is not defined.
 */
static int
set_bindtodevice(struct nsd_socket *sock)
{
#if !defined(SO_BINDTODEVICE)
	(void)sock;
	return 0;
#else
	int rc;

	rc = setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device));
	if(rc != -1)
		return 1;

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"SO_BINDTODEVICE", sock->device, strerror(errno));
	return -1;
#endif
}
1159 
/*
 * Apply the routing table (FIB) number stored in sock->fib to the socket.
 *
 * Returns 1 when the option was set, -1 when setsockopt() failed (the
 * failure is logged) and 0 when SO_SETFIB is not defined.
 */
static int
set_setfib(struct nsd_socket *sock)
{
#if !defined(SO_SETFIB)
	(void)sock;
	return 0;
#else
	int rc;

	rc = setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
		(const void *)&sock->fib, sizeof(sock->fib));
	if(rc != -1)
		return 1;

	log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
		"SO_SETFIB", sock->fib, strerror(errno));
	return -1;
#endif
}
1178 
1179 static int
1180 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1181 {
1182 	int rcv = 1*1024*1024, snd = 1*1024*1024;
1183 
1184 	if(-1 == (sock->s = socket(
1185 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1186 	{
1187 #ifdef INET6
1188 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1189 		   (sock->addr.ai_family == AF_INET6) &&
1190 		   (errno == EAFNOSUPPORT))
1191 		{
1192 			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1193 				"not supported");
1194 			return 0;
1195 		}
1196 #endif
1197 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1198 		return -1;
1199 	}
1200 
1201 	set_cloexec(sock);
1202 
1203 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1204 		*reuseport_works = (set_reuseport(sock) == 1);
1205 
1206 	if(nsd->options->receive_buffer_size > 0)
1207 		rcv = nsd->options->receive_buffer_size;
1208 	if(set_rcvbuf(sock, rcv) == -1)
1209 		return -1;
1210 
1211 	if(nsd->options->send_buffer_size > 0)
1212 		snd = nsd->options->send_buffer_size;
1213 	if(set_sndbuf(sock, snd) == -1)
1214 		return -1;
1215 #ifdef INET6
1216 	if(sock->addr.ai_family == AF_INET6) {
1217 		if(set_ipv6_v6only(sock) == -1 ||
1218 		   set_ipv6_use_min_mtu(sock) == -1)
1219 			return -1;
1220 	} else
1221 #endif /* INET6 */
1222 	if(sock->addr.ai_family == AF_INET) {
1223 		if(set_ipv4_no_pmtu_disc(sock) == -1)
1224 			return -1;
1225 	}
1226 
1227 	/* Set socket to non-blocking. Otherwise, on operating systems
1228 	 * with thundering herd problems, the UDP recv could block
1229 	 * after select returns readable.
1230 	 */
1231 	set_nonblock(sock);
1232 
1233 	if(nsd->options->ip_freebind)
1234 		(void)set_ip_freebind(sock);
1235 	if(nsd->options->ip_transparent)
1236 		(void)set_ip_transparent(sock);
1237 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1238 		return -1;
1239 	if(sock->fib != -1 && set_setfib(sock) == -1)
1240 		return -1;
1241 
1242 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1243 		char buf[256];
1244 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1245 		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1246 			buf, strerror(errno));
1247 		return -1;
1248 	}
1249 
1250 	return 1;
1251 }
1252 
1253 static int
1254 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1255 {
1256 #ifdef USE_TCP_FASTOPEN
1257 	report_tcp_fastopen_config();
1258 #endif
1259 
1260 	(void)reuseport_works;
1261 
1262 	if(-1 == (sock->s = socket(
1263 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1264 	{
1265 #ifdef INET6
1266 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1267 		   (sock->addr.ai_family == AF_INET6) &&
1268 		   (errno == EAFNOSUPPORT))
1269 		{
1270 			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1271 			                     "not supported");
1272 			return 0;
1273 		}
1274 #endif /* INET6 */
1275 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1276 		return -1;
1277 	}
1278 
1279 	set_cloexec(sock);
1280 
1281 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1282 		*reuseport_works = (set_reuseport(sock) == 1);
1283 
1284 	(void)set_reuseaddr(sock);
1285 
1286 #ifdef INET6
1287 	if(sock->addr.ai_family == AF_INET6) {
1288 		if (set_ipv6_v6only(sock) == -1 ||
1289 		    set_ipv6_use_min_mtu(sock) == -1)
1290 			return -1;
1291 	}
1292 #endif
1293 
1294 	if(nsd->tcp_mss > 0)
1295 		set_tcp_maxseg(sock, nsd->tcp_mss);
1296 	/* (StevensUNP p463), if TCP listening socket is blocking, then
1297 	   it may block in accept, even if select() says readable. */
1298 	(void)set_nonblock(sock);
1299 	if(nsd->options->ip_freebind)
1300 		(void)set_ip_freebind(sock);
1301 	if(nsd->options->ip_transparent)
1302 		(void)set_ip_transparent(sock);
1303 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1304 		return -1;
1305 	if(sock->fib != -1 && set_setfib(sock) == -1)
1306 		return -1;
1307 
1308 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1309 		char buf[256];
1310 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1311 		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1312 			buf, strerror(errno));
1313 		return -1;
1314 	}
1315 
1316 #ifdef USE_TCP_FASTOPEN
1317 	(void)set_tcp_fastopen(sock);
1318 #endif
1319 
1320 	if(listen(sock->s, TCP_BACKLOG) == -1) {
1321 		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1322 		return -1;
1323 	}
1324 
1325 	return 1;
1326 }
1327 
1328 /*
1329  * Initialize the server, reuseport, create and bind the sockets.
1330  */
1331 int
1332 server_init(struct nsd *nsd)
1333 {
1334 	size_t i;
1335 	int reuseport = 1; /* Determine if REUSEPORT works. */
1336 
1337 	/* open server interface ports */
1338 	for(i = 0; i < nsd->ifs; i++) {
1339 		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1340 		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1341 		{
1342 			return -1;
1343 		}
1344 	}
1345 
1346 	if(nsd->reuseport && reuseport) {
1347 		size_t ifs = nsd->ifs * nsd->reuseport;
1348 
1349 		/* increase the size of the interface arrays, there are going
1350 		 * to be separate interface file descriptors for every server
1351 		 * instance */
1352 		region_remove_cleanup(nsd->region, free, nsd->udp);
1353 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1354 
1355 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1356 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1357 		region_add_cleanup(nsd->region, free, nsd->udp);
1358 		region_add_cleanup(nsd->region, free, nsd->tcp);
1359 		if(ifs > nsd->ifs) {
1360 			memset(&nsd->udp[nsd->ifs], 0,
1361 				(ifs-nsd->ifs)*sizeof(*nsd->udp));
1362 			memset(&nsd->tcp[nsd->ifs], 0,
1363 				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
1364 		}
1365 
1366 		for(i = nsd->ifs; i < ifs; i++) {
1367 			nsd->udp[i] = nsd->udp[i%nsd->ifs];
1368 			nsd->udp[i].s = -1;
1369 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1370 				return -1;
1371 			}
1372 			/* Turn off REUSEPORT for TCP by copying the socket
1373 			 * file descriptor.
1374 			 * This means we should not close TCP used by
1375 			 * other servers in reuseport enabled mode, in
1376 			 * server_child().
1377 			 */
1378 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1379 		}
1380 
1381 		nsd->ifs = ifs;
1382 	} else {
1383 		nsd->reuseport = 0;
1384 	}
1385 
1386 	return 0;
1387 }
1388 
1389 /*
1390  * Prepare the server for take off.
1391  *
1392  */
1393 int
1394 server_prepare(struct nsd *nsd)
1395 {
1396 #ifdef RATELIMIT
1397 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
1398 #ifdef HAVE_GETRANDOM
1399 	uint32_t v;
1400 	if(getrandom(&v, sizeof(v), 0) == -1) {
1401 		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
1402 		exit(1);
1403 	}
1404 	hash_set_raninit(v);
1405 #elif defined(HAVE_ARC4RANDOM)
1406 	hash_set_raninit(arc4random());
1407 #else
1408 	uint32_t v = getpid() ^ time(NULL);
1409 	srandom((unsigned long)v);
1410 #  ifdef HAVE_SSL
1411 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1412 		hash_set_raninit(v);
1413 	else
1414 #  endif
1415 		hash_set_raninit(random());
1416 #endif
1417 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1418 		nsd->options->rrl_ratelimit,
1419 		nsd->options->rrl_whitelist_ratelimit,
1420 		nsd->options->rrl_slip,
1421 		nsd->options->rrl_ipv4_prefix_length,
1422 		nsd->options->rrl_ipv6_prefix_length);
1423 #endif /* RATELIMIT */
1424 
1425 	/* Open the database... */
1426 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
1427 		log_msg(LOG_ERR, "unable to open the database %s: %s",
1428 			nsd->dbfile, strerror(errno));
1429 		unlink(nsd->task[0]->fname);
1430 		unlink(nsd->task[1]->fname);
1431 #ifdef USE_ZONE_STATS
1432 		unlink(nsd->zonestatfname[0]);
1433 		unlink(nsd->zonestatfname[1]);
1434 #endif
1435 		xfrd_del_tempdir(nsd);
1436 		return -1;
1437 	}
1438 	/* check if zone files have been modified */
1439 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1440 	 * for all zones */
1441 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
1442 		nsd->options->database[0] == 0))
1443 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1444 	zonestatid_tree_set(nsd);
1445 
1446 	compression_table_capacity = 0;
1447 	initialize_dname_compression_tables(nsd);
1448 
1449 #ifdef	BIND8_STATS
1450 	/* Initialize times... */
1451 	time(&nsd->st.boot);
1452 	set_bind8_alarm(nsd);
1453 #endif /* BIND8_STATS */
1454 
1455 	return 0;
1456 }
1457 
1458 /*
1459  * Fork the required number of servers.
1460  */
1461 static int
1462 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1463 	int* xfrd_sock_p)
1464 {
1465 	size_t i;
1466 
1467 	/* Start all child servers initially.  */
1468 	for (i = 0; i < nsd->child_count; ++i) {
1469 		nsd->children[i].pid = 0;
1470 	}
1471 
1472 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1473 }
1474 
1475 static void
1476 server_close_socket(struct nsd_socket *sock)
1477 {
1478 	if(sock->s != -1) {
1479 		close(sock->s);
1480 		sock->s = -1;
1481 	}
1482 }
1483 
1484 void
1485 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1486 {
1487 	size_t i;
1488 
1489 	/* Close all the sockets... */
1490 	for (i = 0; i < n; ++i) {
1491 		server_close_socket(&sockets[i]);
1492 	}
1493 }
1494 
1495 /*
1496  * Close the sockets, shutdown the server and exit.
1497  * Does not return.
1498  */
1499 void
1500 server_shutdown(struct nsd *nsd)
1501 {
1502 	size_t i;
1503 
1504 	server_close_all_sockets(nsd->udp, nsd->ifs);
1505 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1506 	/* CHILD: close command channel to parent */
1507 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1508 	{
1509 		close(nsd->this_child->parent_fd);
1510 		nsd->this_child->parent_fd = -1;
1511 	}
1512 	/* SERVER: close command channels to children */
1513 	if(!nsd->this_child)
1514 	{
1515 		for(i=0; i < nsd->child_count; ++i)
1516 			if(nsd->children[i].child_fd != -1)
1517 			{
1518 				close(nsd->children[i].child_fd);
1519 				nsd->children[i].child_fd = -1;
1520 			}
1521 	}
1522 
1523 	tsig_finalize();
1524 #ifdef HAVE_SSL
1525 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1526 	if (nsd->tls_ctx)
1527 		SSL_CTX_free(nsd->tls_ctx);
1528 #endif
1529 
1530 #ifdef MEMCLEAN /* OS collects memory pages */
1531 #ifdef RATELIMIT
1532 	rrl_mmap_deinit_keep_mmap();
1533 #endif
1534 #ifdef USE_DNSTAP
1535 	dt_collector_destroy(nsd->dt_collector, nsd);
1536 #endif
1537 	udb_base_free_keep_mmap(nsd->task[0]);
1538 	udb_base_free_keep_mmap(nsd->task[1]);
1539 	namedb_close_udb(nsd->db); /* keeps mmap */
1540 	namedb_close(nsd->db);
1541 	nsd_options_destroy(nsd->options);
1542 	region_destroy(nsd->region);
1543 #endif
1544 	log_finalize();
1545 	exit(0);
1546 }
1547 
1548 void
1549 server_prepare_xfrd(struct nsd* nsd)
1550 {
1551 	char tmpfile[256];
1552 	/* create task mmaps */
1553 	nsd->mytask = 0;
1554 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1555 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1556 	nsd->task[0] = task_file_create(tmpfile);
1557 	if(!nsd->task[0]) {
1558 #ifdef USE_ZONE_STATS
1559 		unlink(nsd->zonestatfname[0]);
1560 		unlink(nsd->zonestatfname[1]);
1561 #endif
1562 		xfrd_del_tempdir(nsd);
1563 		exit(1);
1564 	}
1565 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1566 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1567 	nsd->task[1] = task_file_create(tmpfile);
1568 	if(!nsd->task[1]) {
1569 		unlink(nsd->task[0]->fname);
1570 #ifdef USE_ZONE_STATS
1571 		unlink(nsd->zonestatfname[0]);
1572 		unlink(nsd->zonestatfname[1]);
1573 #endif
1574 		xfrd_del_tempdir(nsd);
1575 		exit(1);
1576 	}
1577 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1578 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1579 	/* create xfrd listener structure */
1580 	nsd->xfrd_listener = region_alloc(nsd->region,
1581 		sizeof(netio_handler_type));
1582 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1583 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1584 	nsd->xfrd_listener->fd = -1;
1585 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1586 		nsd;
1587 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1588 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1589 }
1590 
1591 
1592 void
1593 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1594 {
1595 	pid_t pid;
1596 	int sockets[2] = {0,0};
1597 	struct ipc_handler_conn_data *data;
1598 
1599 	if(nsd->xfrd_listener->fd != -1)
1600 		close(nsd->xfrd_listener->fd);
1601 	if(del_db) {
1602 		/* recreate taskdb that xfrd was using, it may be corrupt */
1603 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1604 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1605 		nsd->task[1-nsd->mytask]->fname = NULL;
1606 		/* free alloc already, so udb does not shrink itself */
1607 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1608 		nsd->task[1-nsd->mytask]->alloc = NULL;
1609 		udb_base_free(nsd->task[1-nsd->mytask]);
1610 		/* create new file, overwrite the old one */
1611 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1612 		free(tmpfile);
1613 	}
1614 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1615 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1616 		return;
1617 	}
1618 	pid = fork();
1619 	switch (pid) {
1620 	case -1:
1621 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1622 		break;
1623 	default:
1624 		/* PARENT: close first socket, use second one */
1625 		close(sockets[0]);
1626 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1627 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1628 		}
1629 		if(del_db) xfrd_free_namedb(nsd);
1630 		/* use other task than I am using, since if xfrd died and is
1631 		 * restarted, the reload is using nsd->mytask */
1632 		nsd->mytask = 1 - nsd->mytask;
1633 
1634 #ifdef HAVE_SETPROCTITLE
1635 		setproctitle("xfrd");
1636 #endif
1637 #ifdef HAVE_CPUSET_T
1638 		if(nsd->use_cpu_affinity) {
1639 			set_cpu_affinity(nsd->xfrd_cpuset);
1640 		}
1641 #endif
1642 
1643 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1644 		/* ENOTREACH */
1645 		break;
1646 	case 0:
1647 		/* CHILD: close second socket, use first one */
1648 		close(sockets[1]);
1649 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1650 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1651 		}
1652 		nsd->xfrd_listener->fd = sockets[0];
1653 		break;
1654 	}
1655 	/* server-parent only */
1656 	nsd->xfrd_listener->timeout = NULL;
1657 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1658 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1659 	/* clear ongoing ipc reads */
1660 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1661 	data->conn->is_reading = 0;
1662 }
1663 
1664 /** add all soainfo to taskdb */
1665 static void
1666 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1667 {
1668 	struct radnode* n;
1669 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1670 	/* add all SOA INFO to mytask */
1671 	udb_ptr_init(&task_last, taskudb);
1672 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1673 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1674 	}
1675 	udb_ptr_unlink(&task_last, taskudb);
1676 }
1677 
1678 void
1679 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1680 {
1681 	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1682 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1683 	 *   then they exchange and process.
1684 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1685 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1686 	 *   expire notifications can be sent back via a normal reload later
1687 	 *   (xfrd will wait for current running reload to finish if any).
1688 	 */
1689 	sig_atomic_t cmd = 0;
1690 	pid_t mypid;
1691 	int xfrd_sock = nsd->xfrd_listener->fd;
1692 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1693 	udb_ptr t;
1694 	if(!shortsoa) {
1695 		if(nsd->signal_hint_shutdown) {
1696 		shutdown:
1697 			log_msg(LOG_WARNING, "signal received, shutting down...");
1698 			server_close_all_sockets(nsd->udp, nsd->ifs);
1699 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1700 #ifdef HAVE_SSL
1701 			daemon_remote_close(nsd->rc);
1702 #endif
1703 			/* Unlink it if possible... */
1704 			unlinkpid(nsd->pidfile);
1705 			unlink(nsd->task[0]->fname);
1706 			unlink(nsd->task[1]->fname);
1707 #ifdef USE_ZONE_STATS
1708 			unlink(nsd->zonestatfname[0]);
1709 			unlink(nsd->zonestatfname[1]);
1710 #endif
1711 			/* write the nsd.db to disk, wait for it to complete */
1712 			udb_base_sync(nsd->db->udb, 1);
1713 			udb_base_close(nsd->db->udb);
1714 			server_shutdown(nsd);
1715 			/* ENOTREACH */
1716 			exit(0);
1717 		}
1718 	}
1719 	if(shortsoa) {
1720 		/* put SOA in xfrd task because mytask may be in use */
1721 		taskudb = nsd->task[1-nsd->mytask];
1722 	}
1723 
1724 	add_all_soa_to_task(nsd, taskudb);
1725 	if(!shortsoa) {
1726 		/* wait for xfrd to signal task is ready, RELOAD signal */
1727 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1728 			cmd != NSD_RELOAD) {
1729 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1730 			exit(1);
1731 		}
1732 		if(nsd->signal_hint_shutdown) {
1733 			goto shutdown;
1734 		}
1735 	}
1736 	/* give xfrd our task, signal it with RELOAD_DONE */
1737 	task_process_sync(taskudb);
1738 	cmd = NSD_RELOAD_DONE;
1739 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1740 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1741 			(int)nsd->pid, strerror(errno));
1742 	}
1743 	mypid = getpid();
1744 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1745 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1746 			strerror(errno));
1747 	}
1748 
1749 	if(!shortsoa) {
1750 		/* process the xfrd task works (expiry data) */
1751 		nsd->mytask = 1 - nsd->mytask;
1752 		taskudb = nsd->task[nsd->mytask];
1753 		task_remap(taskudb);
1754 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1755 		while(!udb_ptr_is_null(&t)) {
1756 			task_process_expire(nsd->db, TASKLIST(&t));
1757 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1758 		}
1759 		udb_ptr_unlink(&t, taskudb);
1760 		task_clear(taskudb);
1761 
1762 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1763 		cmd = NSD_RELOAD_DONE;
1764 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1765 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1766 				(int)nsd->pid, strerror(errno));
1767 		}
1768 	}
1769 }
1770 
1771 #ifdef HAVE_SSL
1772 static void
1773 log_crypto_from_err(const char* str, unsigned long err)
1774 {
1775 	/* error:[error code]:[library name]:[function name]:[reason string] */
1776 	char buf[128];
1777 	unsigned long e;
1778 	ERR_error_string_n(err, buf, sizeof(buf));
1779 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1780 	while( (e=ERR_get_error()) ) {
1781 		ERR_error_string_n(e, buf, sizeof(buf));
1782 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1783 	}
1784 }
1785 
/* Log the error returned by ERR_get_error(), prefixed by str. */
void
log_crypto_err(const char* str)
{
	unsigned long err = ERR_get_error();

	log_crypto_from_err(str, err);
}
1791 
1792 /** true if the ssl handshake error has to be squelched from the logs */
1793 static int
1794 squelch_err_ssl_handshake(unsigned long err)
1795 {
1796 	if(verbosity >= 3)
1797 		return 0; /* only squelch on low verbosity */
1798 	/* this is very specific, we could filter on ERR_GET_REASON()
1799 	 * (the third element in ERR_PACK) */
1800 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1801 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1802 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1803 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1804 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1805 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1806 #endif
1807 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1808 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1809 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1810 #  ifdef SSL_R_VERSION_TOO_LOW
1811 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1812 #  endif
1813 #endif
1814 		)
1815 		return 1;
1816 	return 0;
1817 }
1818 
1819 void
1820 perform_openssl_init(void)
1821 {
1822 	/* init SSL library */
1823 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1824 	ERR_load_crypto_strings();
1825 #endif
1826 #if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
1827 	ERR_load_SSL_strings();
1828 #endif
1829 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1830 	OpenSSL_add_all_algorithms();
1831 #else
1832 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1833 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1834 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1835 #endif
1836 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1837 	(void)SSL_library_init();
1838 #else
1839 	OPENSSL_init_ssl(0, NULL);
1840 #endif
1841 
1842 	if(!RAND_status()) {
1843 		/* try to seed it */
1844 		unsigned char buf[256];
1845 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1846 		size_t i;
1847 		v = seed;
1848 		for(i=0; i<256/sizeof(v); i++) {
1849 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1850 			v = v*seed + (unsigned int)i;
1851 		}
1852 		RAND_seed(buf, 256);
1853 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1854 	}
1855 }
1856 
1857 static int
1858 get_ocsp(char *filename, unsigned char **ocsp)
1859 {
1860 	BIO *bio;
1861 	OCSP_RESPONSE *response;
1862 	int len = -1;
1863 	unsigned char *p, *buf;
1864 	assert(filename);
1865 
1866 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1867 		log_crypto_err("get_ocsp: BIO_new_file failed");
1868 		return -1;
1869 	}
1870 
1871 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1872 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1873 		BIO_free(bio);
1874 		return -1;
1875 	}
1876 
1877 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1878 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1879 		OCSP_RESPONSE_free(response);
1880 		BIO_free(bio);
1881 		return -1;
1882 	}
1883 
1884 	if ((buf = malloc((size_t) len)) == NULL) {
1885 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1886 		OCSP_RESPONSE_free(response);
1887 		BIO_free(bio);
1888 		return -1;
1889 	}
1890 
1891 	p = buf;
1892 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1893 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1894 		free(buf);
1895 		OCSP_RESPONSE_free(response);
1896 		BIO_free(bio);
1897 		return -1;
1898 	}
1899 
1900 	OCSP_RESPONSE_free(response);
1901 	BIO_free(bio);
1902 
1903 	*ocsp = buf;
1904 	return len;
1905 }
1906 
1907 /* further setup ssl ctx after the keys are loaded */
1908 static void
1909 listen_sslctx_setup_2(void* ctxt)
1910 {
1911 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
1912 	(void)ctx;
1913 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1914 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1915 		/* ENOTREACH */
1916 		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
1917 	}
1918 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1919 	if(1) {
1920 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1921 		if (!ecdh) {
1922 			log_crypto_err("could not find p256, not enabling ECDHE");
1923 		} else {
1924 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1925 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1926 			}
1927 			EC_KEY_free (ecdh);
1928 		}
1929 	}
1930 #endif
1931 }
1932 
1933 static int
1934 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1935 {
1936 	if(ocspdata) {
1937 		unsigned char *p;
1938 		if ((p=malloc(ocspdata_len)) == NULL) {
1939 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1940 			return SSL_TLSEXT_ERR_NOACK;
1941 		}
1942 		memcpy(p, ocspdata, ocspdata_len);
1943 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1944 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1945 			free(p);
1946 			return SSL_TLSEXT_ERR_NOACK;
1947 		}
1948 		return SSL_TLSEXT_ERR_OK;
1949 	} else {
1950 		return SSL_TLSEXT_ERR_NOACK;
1951 	}
1952 }
1953 
1954 SSL_CTX*
1955 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1956 {
1957 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1958 	if(!ctx) {
1959 		log_crypto_err("could not SSL_CTX_new");
1960 		return NULL;
1961 	}
1962 	/* no SSLv2, SSLv3 because has defects */
1963 #if SSL_OP_NO_SSLv2 != 0
1964 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1965 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1966 		SSL_CTX_free(ctx);
1967 		return NULL;
1968 	}
1969 #endif
1970 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1971 		!= SSL_OP_NO_SSLv3){
1972 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1973 		SSL_CTX_free(ctx);
1974 		return 0;
1975 	}
1976 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1977 	/* if we have tls 1.1 disable 1.0 */
1978 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1979 		!= SSL_OP_NO_TLSv1){
1980 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1981 		SSL_CTX_free(ctx);
1982 		return 0;
1983 	}
1984 #endif
1985 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1986 	/* if we have tls 1.2 disable 1.1 */
1987 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1988 		!= SSL_OP_NO_TLSv1_1){
1989 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
1990 		SSL_CTX_free(ctx);
1991 		return 0;
1992 	}
1993 #endif
1994 #if defined(SSL_OP_NO_RENEGOTIATION)
1995 	/* disable client renegotiation */
1996 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
1997 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
1998 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
1999 		SSL_CTX_free(ctx);
2000 		return 0;
2001 	}
2002 #endif
2003 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2004 	/* if we detect system-wide crypto policies, use those */
2005 	if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) {
2006 		/* if we have sha256, set the cipher list to have no known vulns */
2007 		if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2008 			log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2009 	}
2010 #endif
2011 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2012 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2013 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
2014 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2015 		SSL_CTX_free(ctx);
2016 		return 0;
2017 	}
2018 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2019 	SSL_CTX_set_security_level(ctx, 0);
2020 #endif
2021 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2022 		log_msg(LOG_ERR, "error for cert file: %s", pem);
2023 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2024 		SSL_CTX_free(ctx);
2025 		return NULL;
2026 	}
2027 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2028 		log_msg(LOG_ERR, "error for private key file: %s", key);
2029 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2030 		SSL_CTX_free(ctx);
2031 		return NULL;
2032 	}
2033 	if(!SSL_CTX_check_private_key(ctx)) {
2034 		log_msg(LOG_ERR, "error for key file: %s", key);
2035 		log_crypto_err("Error in SSL_CTX check_private_key");
2036 		SSL_CTX_free(ctx);
2037 		return NULL;
2038 	}
2039 	listen_sslctx_setup_2(ctx);
2040 	if(verifypem && verifypem[0]) {
2041 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2042 			log_crypto_err("Error in SSL_CTX verify locations");
2043 			SSL_CTX_free(ctx);
2044 			return NULL;
2045 		}
2046 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2047 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2048 	}
2049 	return ctx;
2050 }
2051 
2052 SSL_CTX*
2053 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2054 {
2055 	char *key, *pem;
2056 	SSL_CTX *ctx;
2057 
2058 	key = nsd->options->tls_service_key;
2059 	pem = nsd->options->tls_service_pem;
2060 	if(!key || key[0] == 0) {
2061 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2062 		return NULL;
2063 	}
2064 	if(!pem || pem[0] == 0) {
2065 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2066 		return NULL;
2067 	}
2068 
2069 	/* NOTE:This mimics the existing code in Unbound 1.5.1 by supporting SSL but
2070 	 * raft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2*/
2071 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2072 	if(!ctx) {
2073 		log_msg(LOG_ERR, "could not setup server TLS context");
2074 		return NULL;
2075 	}
2076 	if(ocspfile && ocspfile[0]) {
2077 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2078 			log_crypto_err("Error reading OCSPfile");
2079 			SSL_CTX_free(ctx);
2080 			return NULL;
2081 		} else {
2082 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2083 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2084 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2085 				SSL_CTX_free(ctx);
2086 				return NULL;
2087 			}
2088 		}
2089 	}
2090 	return ctx;
2091 }
2092 
/*
 * Check whether a socket address uses the dedicated TLS port.
 *
 * addr:     socket address; AF_INET and AF_INET6 are understood, any other
 *           family is treated as port 0.
 * tls_port: configured TLS port as a decimal string (parsed with atoi).
 *
 * Returns 1 when the port in addr equals tls_port, 0 otherwise.
 *
 * The IPv6 branch used to be guarded by #ifndef HAVE_STRUCT_SOCKADDR_IN6,
 * i.e. it was compiled only when the struct was reported missing, and it
 * ran for every family other than AF_INET. It now tests the family
 * explicitly and is compiled whenever AF_INET6 exists.
 */
int
using_tls_port(struct sockaddr* addr, const char* tls_port)
{
	in_port_t port = 0;

	if (addr->sa_family == AF_INET) {
		port = ((struct sockaddr_in*)addr)->sin_port;
	}
#ifdef AF_INET6
	else if (addr->sa_family == AF_INET6) {
		port = ((struct sockaddr_in6*)addr)->sin6_port;
	}
#endif /* AF_INET6 */

	/* the port is stored in network byte order */
	if (atoi(tls_port) == ntohs(port))
		return 1;

	return 0;
}
2110 #endif
2111 
2112 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
2113 ssize_t
2114 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2115 {
2116 	uint8_t* buf = (uint8_t*) p;
2117 	ssize_t total = 0;
2118 	struct pollfd fd;
2119 	memset(&fd, 0, sizeof(fd));
2120 	fd.fd = s;
2121 	fd.events = POLLIN;
2122 
2123 	while( total < sz) {
2124 		ssize_t ret;
2125 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2126 		if(ret == -1) {
2127 			if(errno == EAGAIN)
2128 				/* blocking read */
2129 				continue;
2130 			if(errno == EINTR) {
2131 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2132 					return -1;
2133 				/* other signals can be handled later */
2134 				continue;
2135 			}
2136 			/* some error */
2137 			return -1;
2138 		}
2139 		if(ret == 0) {
2140 			/* operation timed out */
2141 			return -2;
2142 		}
2143 		ret = read(s, buf+total, sz-total);
2144 		if(ret == -1) {
2145 			if(errno == EAGAIN)
2146 				/* blocking read */
2147 				continue;
2148 			if(errno == EINTR) {
2149 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2150 					return -1;
2151 				/* other signals can be handled later */
2152 				continue;
2153 			}
2154 			/* some error */
2155 			return -1;
2156 		}
2157 		if(ret == 0) {
2158 			/* closed connection! */
2159 			return 0;
2160 		}
2161 		total += ret;
2162 	}
2163 	return total;
2164 }
2165 
2166 static void
2167 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2168 {
2169 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2170 	udb_ptr t, next;
2171 	udb_base* u = nsd->task[nsd->mytask];
2172 	udb_ptr_init(&next, u);
2173 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2174 	udb_base_set_userdata(u, 0);
2175 	while(!udb_ptr_is_null(&t)) {
2176 		/* store next in list so this one can be deleted or reused */
2177 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2178 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2179 
2180 		/* process task t */
2181 		/* append results for task t and update last_task */
2182 		task_process_in_reload(nsd, u, last_task, &t);
2183 
2184 		/* go to next */
2185 		udb_ptr_set_ptr(&t, u, &next);
2186 
2187 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2188 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2189 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2190 			if(cmd == NSD_QUIT) {
2191 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2192 				/* sync to disk (if needed) */
2193 				udb_base_sync(nsd->db->udb, 0);
2194 				/* unlink files of remainder of tasks */
2195 				while(!udb_ptr_is_null(&t)) {
2196 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2197 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2198 					}
2199 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2200 				}
2201 				udb_ptr_unlink(&t, u);
2202 				udb_ptr_unlink(&next, u);
2203 				exit(0);
2204 			}
2205 		}
2206 
2207 	}
2208 	udb_ptr_unlink(&t, u);
2209 	udb_ptr_unlink(&next, u);
2210 }
2211 
2212 #ifdef BIND8_STATS
2213 static void
2214 parent_send_stats(struct nsd* nsd, int cmdfd)
2215 {
2216 	size_t i;
2217 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2218 		log_msg(LOG_ERR, "could not write stats to reload");
2219 		return;
2220 	}
2221 	for(i=0; i<nsd->child_count; i++)
2222 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2223 			sizeof(stc_type))) {
2224 			log_msg(LOG_ERR, "could not write stats to reload");
2225 			return;
2226 		}
2227 }
2228 
2229 static void
2230 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2231 {
2232 	struct nsdst s;
2233 	stc_type* p;
2234 	size_t i;
2235 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2236 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2237 		log_msg(LOG_ERR, "could not read stats from oldpar");
2238 		return;
2239 	}
2240 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2241 	s.db_mem = region_get_mem(nsd->db->region);
2242 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2243 		nsd->child_count);
2244 	if(!p) return;
2245 	for(i=0; i<nsd->child_count; i++) {
2246 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2247 			sizeof(stc_type))
2248 			return;
2249 	}
2250 }
2251 #endif /* BIND8_STATS */
2252 
2253 /*
2254  * Reload the database, stop parent, re-fork children and continue.
2255  * as server_main.
2256  */
2257 static void
2258 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2259 	int cmdsocket)
2260 {
2261 	pid_t mypid;
2262 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2263 	int ret;
2264 	udb_ptr last_task;
2265 	struct sigaction old_sigchld, ign_sigchld;
2266 	/* ignore SIGCHLD from the previous server_main that used this pid */
2267 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2268 	ign_sigchld.sa_handler = SIG_IGN;
2269 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2270 
2271 #ifdef HAVE_SETPROCTITLE
2272 	setproctitle("main");
2273 #endif
2274 #ifdef HAVE_CPUSET_T
2275 	if(nsd->use_cpu_affinity) {
2276 		set_cpu_affinity(nsd->cpuset);
2277 	}
2278 #endif
2279 
2280 	/* see what tasks we got from xfrd */
2281 	task_remap(nsd->task[nsd->mytask]);
2282 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2283 	udb_compact_inhibited(nsd->db->udb, 1);
2284 	reload_process_tasks(nsd, &last_task, cmdsocket);
2285 	udb_compact_inhibited(nsd->db->udb, 0);
2286 	udb_compact(nsd->db->udb);
2287 
2288 #ifndef NDEBUG
2289 	if(nsd_debug_level >= 1)
2290 		region_log_stats(nsd->db->region);
2291 #endif /* NDEBUG */
2292 	/* sync to disk (if needed) */
2293 	udb_base_sync(nsd->db->udb, 0);
2294 
2295 	initialize_dname_compression_tables(nsd);
2296 
2297 #ifdef BIND8_STATS
2298 	/* Restart dumping stats if required.  */
2299 	time(&nsd->st.boot);
2300 	set_bind8_alarm(nsd);
2301 #endif
2302 #ifdef USE_ZONE_STATS
2303 	server_zonestat_realloc(nsd); /* realloc for new children */
2304 	server_zonestat_switch(nsd);
2305 #endif
2306 
2307 	/* listen for the signals of failed children again */
2308 	sigaction(SIGCHLD, &old_sigchld, NULL);
2309 #ifdef USE_DNSTAP
2310 	if (nsd->dt_collector) {
2311 		int *swap_fd_send;
2312 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2313 		/* Swap fd_send with fd_swap so old serve child and new serve
2314 		 * childs will not write to the same pipe ends simultaneously */
2315 		swap_fd_send = nsd->dt_collector_fd_send;
2316 		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2317 		nsd->dt_collector_fd_swap = swap_fd_send;
2318 
2319 	}
2320 #endif
2321 	/* Start new child processes */
2322 	if (server_start_children(nsd, server_region, netio, &nsd->
2323 		xfrd_listener->fd) != 0) {
2324 		send_children_quit(nsd);
2325 		exit(1);
2326 	}
2327 
2328 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2329 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2330 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2331 		if(cmd == NSD_QUIT) {
2332 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2333 			send_children_quit(nsd);
2334 			exit(0);
2335 		}
2336 	}
2337 
2338 	/* Send quit command to parent: blocking, wait for receipt. */
2339 	do {
2340 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2341 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2342 		{
2343 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2344 				strerror(errno));
2345 		}
2346 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2347 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2348 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2349 			RELOAD_SYNC_TIMEOUT);
2350 		if(ret == -2) {
2351 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2352 		}
2353 	} while (ret == -2);
2354 	if(ret == -1) {
2355 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2356 			strerror(errno));
2357 	}
2358 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2359 	if(cmd == NSD_QUIT) {
2360 		/* small race condition possible here, parent got quit cmd. */
2361 		send_children_quit(nsd);
2362 		exit(1);
2363 	}
2364 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2365 #ifdef BIND8_STATS
2366 	reload_do_stats(cmdsocket, nsd, &last_task);
2367 #endif
2368 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2369 	task_process_sync(nsd->task[nsd->mytask]);
2370 #ifdef USE_ZONE_STATS
2371 	server_zonestat_realloc(nsd); /* realloc for next children */
2372 #endif
2373 
2374 	/* send soainfo to the xfrd process, signal it that reload is done,
2375 	 * it picks up the taskudb */
2376 	cmd = NSD_RELOAD_DONE;
2377 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2378 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2379 			strerror(errno));
2380 	}
2381 	mypid = getpid();
2382 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2383 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2384 			strerror(errno));
2385 	}
2386 
2387 	/* try to reopen file */
2388 	if (nsd->file_rotation_ok)
2389 		log_reopen(nsd->log_filename, 1);
2390 	/* exit reload, continue as new server_main */
2391 }
2392 
2393 /*
2394  * Get the mode depending on the signal hints that have been received.
2395  * Multiple signal hints can be received and will be handled in turn.
2396  */
2397 static sig_atomic_t
2398 server_signal_mode(struct nsd *nsd)
2399 {
2400 	if(nsd->signal_hint_quit) {
2401 		nsd->signal_hint_quit = 0;
2402 		return NSD_QUIT;
2403 	}
2404 	else if(nsd->signal_hint_shutdown) {
2405 		nsd->signal_hint_shutdown = 0;
2406 		return NSD_SHUTDOWN;
2407 	}
2408 	else if(nsd->signal_hint_child) {
2409 		nsd->signal_hint_child = 0;
2410 		return NSD_REAP_CHILDREN;
2411 	}
2412 	else if(nsd->signal_hint_reload) {
2413 		nsd->signal_hint_reload = 0;
2414 		return NSD_RELOAD;
2415 	}
2416 	else if(nsd->signal_hint_reload_hup) {
2417 		nsd->signal_hint_reload_hup = 0;
2418 		return NSD_RELOAD_REQ;
2419 	}
2420 	else if(nsd->signal_hint_stats) {
2421 		nsd->signal_hint_stats = 0;
2422 #ifdef BIND8_STATS
2423 		set_bind8_alarm(nsd);
2424 #endif
2425 		return NSD_STATS;
2426 	}
2427 	else if(nsd->signal_hint_statsusr) {
2428 		nsd->signal_hint_statsusr = 0;
2429 		return NSD_STATS;
2430 	}
2431 	return NSD_RUN;
2432 }
2433 
2434 /*
2435  * The main server simply waits for signals and child processes to
2436  * terminate.  Child processes are restarted as necessary.
2437  */
2438 void
2439 server_main(struct nsd *nsd)
2440 {
2441 	region_type *server_region = region_create(xalloc, free);
2442 	netio_type *netio = netio_create(server_region);
2443 	netio_handler_type reload_listener;
2444 	int reload_sockets[2] = {-1, -1};
2445 	struct timespec timeout_spec;
2446 	int status;
2447 	pid_t child_pid;
2448 	pid_t reload_pid = -1;
2449 	sig_atomic_t mode;
2450 
2451 	/* Ensure we are the main process */
2452 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2453 
2454 	/* Add listener for the XFRD process */
2455 	netio_add_handler(netio, nsd->xfrd_listener);
2456 
2457 	/* Start the child processes that handle incoming queries */
2458 	if (server_start_children(nsd, server_region, netio,
2459 		&nsd->xfrd_listener->fd) != 0) {
2460 		send_children_quit(nsd);
2461 		exit(1);
2462 	}
2463 	reload_listener.fd = -1;
2464 
2465 	/* This_child MUST be 0, because this is the parent process */
2466 	assert(nsd->this_child == 0);
2467 
2468 	/* Run the server until we get a shutdown signal */
2469 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2470 		/* Did we receive a signal that changes our mode? */
2471 		if(mode == NSD_RUN) {
2472 			nsd->mode = mode = server_signal_mode(nsd);
2473 		}
2474 
2475 		switch (mode) {
2476 		case NSD_RUN:
2477 			/* see if any child processes terminated */
2478 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2479 				int is_child = delete_child_pid(nsd, child_pid);
2480 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2481 					if(nsd->children[is_child].child_fd == -1)
2482 						nsd->children[is_child].has_exited = 1;
2483 					parent_check_all_children_exited(nsd);
2484 				} else if(is_child != -1) {
2485 					log_msg(LOG_WARNING,
2486 					       "server %d died unexpectedly with status %d, restarting",
2487 					       (int) child_pid, status);
2488 					restart_child_servers(nsd, server_region, netio,
2489 						&nsd->xfrd_listener->fd);
2490 				} else if (child_pid == reload_pid) {
2491 					sig_atomic_t cmd = NSD_RELOAD_DONE;
2492 					pid_t mypid;
2493 					log_msg(LOG_WARNING,
2494 					       "Reload process %d failed with status %d, continuing with old database",
2495 					       (int) child_pid, status);
2496 					reload_pid = -1;
2497 					if(reload_listener.fd != -1) close(reload_listener.fd);
2498 					reload_listener.fd = -1;
2499 					reload_listener.event_types = NETIO_EVENT_NONE;
2500 					task_process_sync(nsd->task[nsd->mytask]);
2501 					/* inform xfrd reload attempt ended */
2502 					if(!write_socket(nsd->xfrd_listener->fd,
2503 						&cmd, sizeof(cmd))) {
2504 						log_msg(LOG_ERR, "problems "
2505 						  "sending SOAEND to xfrd: %s",
2506 						  strerror(errno));
2507 					}
2508 					mypid = getpid();
2509 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2510 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2511 							strerror(errno));
2512 					}
2513 #ifdef USE_DNSTAP
2514 				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2515 					log_msg(LOG_WARNING,
2516 					       "dnstap-collector %d terminated with status %d",
2517 					       (int) child_pid, status);
2518 					if(nsd->dt_collector) {
2519 						dt_collector_close(nsd->dt_collector, nsd);
2520 						dt_collector_destroy(nsd->dt_collector, nsd);
2521 						nsd->dt_collector = NULL;
2522 					}
2523 					/* Only respawn a crashed (or exited)
2524 					 * dnstap-collector when not reloading,
2525 					 * to not induce a reload during a
2526 					 * reload (which would seriously
2527 					 * disrupt nsd procedures and lead to
2528 					 * unpredictable results)!
2529 					 *
2530 					 * This will *leave* a dnstap-collector
2531 					 * process terminated, but because
2532 					 * signalling of the reload process to
2533 					 * the main process to respawn in this
2534 					 * situation will be cumbersome, and
2535 					 * because this situation is so
2536 					 * specific (and therefore hopefully
2537 					 * extremely rare or non-existing at
2538 					 * all), plus the fact that we are left
2539 					 * with a perfectly function NSD
2540 					 * (besides not logging dnstap
2541 					 * messages), I consider it acceptable
2542 					 * to leave this unresolved.
2543 					 */
2544 					if(reload_pid == -1 && nsd->options->dnstap_enable) {
2545 						nsd->dt_collector = dt_collector_create(nsd);
2546 						dt_collector_start(nsd->dt_collector, nsd);
2547 						nsd->mode = NSD_RELOAD_REQ;
2548 					}
2549 #endif
2550 				} else if(status != 0) {
2551 					/* check for status, because we get
2552 					 * the old-servermain because reload
2553 					 * is the process-parent of old-main,
2554 					 * and we get older server-processes
2555 					 * that are exiting after a reload */
2556 					log_msg(LOG_WARNING,
2557 					       "process %d terminated with status %d",
2558 					       (int) child_pid, status);
2559 				}
2560 			}
2561 			if (child_pid == -1) {
2562 				if (errno == EINTR) {
2563 					continue;
2564 				}
2565 				if (errno != ECHILD)
2566 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2567 			}
2568 			if (nsd->mode != NSD_RUN)
2569 				break;
2570 
2571 			/* timeout to collect processes. In case no sigchild happens. */
2572 			timeout_spec.tv_sec = 60;
2573 			timeout_spec.tv_nsec = 0;
2574 
2575 			/* listen on ports, timeout for collecting terminated children */
2576 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2577 				if (errno != EINTR) {
2578 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2579 				}
2580 			}
2581 			if(nsd->restart_children) {
2582 				restart_child_servers(nsd, server_region, netio,
2583 					&nsd->xfrd_listener->fd);
2584 				nsd->restart_children = 0;
2585 			}
2586 			if(nsd->reload_failed) {
2587 				sig_atomic_t cmd = NSD_RELOAD_DONE;
2588 				pid_t mypid;
2589 				nsd->reload_failed = 0;
2590 				log_msg(LOG_WARNING,
2591 				       "Reload process %d failed, continuing with old database",
2592 				       (int) reload_pid);
2593 				reload_pid = -1;
2594 				if(reload_listener.fd != -1) close(reload_listener.fd);
2595 				reload_listener.fd = -1;
2596 				reload_listener.event_types = NETIO_EVENT_NONE;
2597 				task_process_sync(nsd->task[nsd->mytask]);
2598 				/* inform xfrd reload attempt ended */
2599 				if(!write_socket(nsd->xfrd_listener->fd,
2600 					&cmd, sizeof(cmd))) {
2601 					log_msg(LOG_ERR, "problems "
2602 					  "sending SOAEND to xfrd: %s",
2603 					  strerror(errno));
2604 				}
2605 				mypid = getpid();
2606 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2607 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2608 						strerror(errno));
2609 				}
2610 			}
2611 
2612 			break;
2613 		case NSD_RELOAD_REQ: {
2614 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2615 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2616 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2617 				"main: ipc send reload_req to xfrd"));
2618 			if(!write_socket(nsd->xfrd_listener->fd,
2619 				&cmd, sizeof(cmd))) {
2620 				log_msg(LOG_ERR, "server_main: could not send "
2621 				"reload_req to xfrd: %s", strerror(errno));
2622 			}
2623 			nsd->mode = NSD_RUN;
2624 			} break;
2625 		case NSD_RELOAD:
2626 			/* Continue to run nsd after reload */
2627 			nsd->mode = NSD_RUN;
2628 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2629 			if (reload_pid != -1) {
2630 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2631 				       (int) reload_pid);
2632 				break;
2633 			}
2634 
2635 			/* switch the mytask to keep track of who owns task*/
2636 			nsd->mytask = 1 - nsd->mytask;
2637 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2638 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2639 				reload_pid = -1;
2640 				break;
2641 			}
2642 
2643 			/* Do actual reload */
2644 			reload_pid = fork();
2645 			switch (reload_pid) {
2646 			case -1:
2647 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2648 				break;
2649 			default:
2650 				/* PARENT */
2651 				close(reload_sockets[0]);
2652 				server_reload(nsd, server_region, netio,
2653 					reload_sockets[1]);
2654 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2655 				close(reload_sockets[1]);
2656 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2657 				/* drop stale xfrd ipc data */
2658 				((struct ipc_handler_conn_data*)nsd->
2659 					xfrd_listener->user_data)
2660 					->conn->is_reading = 0;
2661 				reload_pid = -1;
2662 				reload_listener.fd = -1;
2663 				reload_listener.event_types = NETIO_EVENT_NONE;
2664 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2665 				break;
2666 			case 0:
2667 				/* CHILD */
2668 				/* server_main keep running until NSD_QUIT_SYNC
2669 				 * received from reload. */
2670 				close(reload_sockets[1]);
2671 				reload_listener.fd = reload_sockets[0];
2672 				reload_listener.timeout = NULL;
2673 				reload_listener.user_data = nsd;
2674 				reload_listener.event_types = NETIO_EVENT_READ;
2675 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2676 				netio_add_handler(netio, &reload_listener);
2677 				reload_pid = getppid();
2678 				break;
2679 			}
2680 			break;
2681 		case NSD_QUIT_SYNC:
2682 			/* synchronisation of xfrd, parent and reload */
2683 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2684 				sig_atomic_t cmd = NSD_RELOAD;
2685 				/* stop xfrd ipc writes in progress */
2686 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2687 					"main: ipc send indication reload"));
2688 				if(!write_socket(nsd->xfrd_listener->fd,
2689 					&cmd, sizeof(cmd))) {
2690 					log_msg(LOG_ERR, "server_main: could not send reload "
2691 					"indication to xfrd: %s", strerror(errno));
2692 				}
2693 				/* wait for ACK from xfrd */
2694 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2695 				nsd->quit_sync_done = 1;
2696 			}
2697 			nsd->mode = NSD_RUN;
2698 			break;
2699 		case NSD_QUIT:
2700 			/* silent shutdown during reload */
2701 			if(reload_listener.fd != -1) {
2702 				/* acknowledge the quit, to sync reload that we will really quit now */
2703 				sig_atomic_t cmd = NSD_RELOAD;
2704 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2705 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2706 					log_msg(LOG_ERR, "server_main: "
2707 						"could not ack quit: %s", strerror(errno));
2708 				}
2709 #ifdef BIND8_STATS
2710 				parent_send_stats(nsd, reload_listener.fd);
2711 #endif /* BIND8_STATS */
2712 				close(reload_listener.fd);
2713 			}
2714 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2715 			/* only quit children after xfrd has acked */
2716 			send_children_quit(nsd);
2717 
2718 #ifdef MEMCLEAN /* OS collects memory pages */
2719 			region_destroy(server_region);
2720 #endif
2721 			server_shutdown(nsd);
2722 
2723 			/* ENOTREACH */
2724 			break;
2725 		case NSD_SHUTDOWN:
2726 			break;
2727 		case NSD_REAP_CHILDREN:
2728 			/* continue; wait for child in run loop */
2729 			nsd->mode = NSD_RUN;
2730 			break;
2731 		case NSD_STATS:
2732 #ifdef BIND8_STATS
2733 			set_children_stats(nsd);
2734 #endif
2735 			nsd->mode = NSD_RUN;
2736 			break;
2737 		default:
2738 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2739 			nsd->mode = NSD_RUN;
2740 			break;
2741 		}
2742 	}
2743 	log_msg(LOG_WARNING, "signal received, shutting down...");
2744 
2745 	/* close opened ports to avoid race with restart of nsd */
2746 	server_close_all_sockets(nsd->udp, nsd->ifs);
2747 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2748 #ifdef HAVE_SSL
2749 	daemon_remote_close(nsd->rc);
2750 #endif
2751 	send_children_quit_and_wait(nsd);
2752 
2753 	/* Unlink it if possible... */
2754 	unlinkpid(nsd->pidfile);
2755 	unlink(nsd->task[0]->fname);
2756 	unlink(nsd->task[1]->fname);
2757 #ifdef USE_ZONE_STATS
2758 	unlink(nsd->zonestatfname[0]);
2759 	unlink(nsd->zonestatfname[1]);
2760 #endif
2761 #ifdef USE_DNSTAP
2762 	dt_collector_close(nsd->dt_collector, nsd);
2763 #endif
2764 
2765 	if(reload_listener.fd != -1) {
2766 		sig_atomic_t cmd = NSD_QUIT;
2767 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2768 			"main: ipc send quit to reload-process"));
2769 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2770 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2771 				strerror(errno));
2772 		}
2773 		fsync(reload_listener.fd);
2774 		close(reload_listener.fd);
2775 		/* wait for reload to finish processing */
2776 		while(1) {
2777 			if(waitpid(reload_pid, NULL, 0) == -1) {
2778 				if(errno == EINTR) continue;
2779 				if(errno == ECHILD) break;
2780 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2781 					(int)reload_pid, strerror(errno));
2782 			}
2783 			break;
2784 		}
2785 	}
2786 	if(nsd->xfrd_listener->fd != -1) {
2787 		/* complete quit, stop xfrd */
2788 		sig_atomic_t cmd = NSD_QUIT;
2789 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2790 			"main: ipc send quit to xfrd"));
2791 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2792 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2793 				strerror(errno));
2794 		}
2795 		fsync(nsd->xfrd_listener->fd);
2796 		close(nsd->xfrd_listener->fd);
2797 		(void)kill(nsd->pid, SIGTERM);
2798 	}
2799 
2800 #ifdef MEMCLEAN /* OS collects memory pages */
2801 	region_destroy(server_region);
2802 #endif
2803 	/* write the nsd.db to disk, wait for it to complete */
2804 	udb_base_sync(nsd->db->udb, 1);
2805 	udb_base_close(nsd->db->udb);
2806 	server_shutdown(nsd);
2807 }
2808 
2809 static query_state_type
2810 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
2811 {
2812 	return query_process(query, nsd, now_p);
2813 }
2814 
2815 static query_state_type
2816 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
2817 {
2818 #ifdef RATELIMIT
2819 	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
2820 		if(query->edns.cookie_status != COOKIE_VALID
2821 		&& query->edns.cookie_status != COOKIE_VALID_REUSE
2822 		&& rrl_process_query(query))
2823 			return rrl_slip(query);
2824 		else	return QUERY_PROCESSED;
2825 	}
2826 	return QUERY_DISCARDED;
2827 #else
2828 	return query_process(query, nsd, now_p);
2829 #endif
2830 }
2831 
/*
 * Return the version string of the event library: the empty string when
 * built with USE_MINI_EVENT, otherwise the result of event_get_version().
 */
const char*
nsd_event_vs(void)
{
	const char* version;

#ifdef USE_MINI_EVENT
	version = "";
#else
	version = event_get_version();
#endif
	return version;
}
2841 
#if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
/*
 * Map an EVBACKEND_* value to a printable backend name; values not listed
 * here yield "unknown".
 */
static const char* ub_ev_backend2str(int b)
{
	if(b == EVBACKEND_SELECT)
		return "select";
	if(b == EVBACKEND_POLL)
		return "poll";
	if(b == EVBACKEND_EPOLL)
		return "epoll";
	if(b == EVBACKEND_KQUEUE)
		return "kqueue";
	if(b == EVBACKEND_DEVPOLL)
		return "devpoll";
	if(b == EVBACKEND_PORT)
		return "evport";
	return "unknown";
}
#endif
2856 
/*
 * Return the name of the event notification method in use.
 *
 * With USE_MINI_EVENT this is always "select". Otherwise an event base is
 * obtained from nsd_child_event_base() and asked for its backend; "?" is
 * returned when the library offers no way to query it. Under MEMCLEAN the
 * event base is freed again before returning.
 */
const char*
nsd_event_method(void)
{
#ifdef USE_MINI_EVENT
	return "select";
#else
	const char* method = "?";
	struct event_base* base = nsd_child_event_base();

#  ifdef EV_FEATURE_BACKENDS
	method = ub_ev_backend2str(ev_backend((struct ev_loop*)base));
#  elif defined(HAVE_EVENT_BASE_GET_METHOD)
	method = event_base_get_method(base);
#  endif
#  ifdef MEMCLEAN
	event_base_free(base);
#  endif
	return method;
#endif
}
2876 
/*
 * Obtain an event base from whichever event library the build uses:
 * mini_event (with static time storage), the libev default loop, or
 * libevent via event_base_new() when available and event_init() otherwise.
 * The library's result is returned unchecked.
 */
struct event_base*
nsd_child_event_base(void)
{
#ifdef USE_MINI_EVENT
	static time_t secs;
	static struct timeval now;

	return event_init(&secs, &now);
#elif defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
	/* libev */
	return (struct event_base *)ev_default_loop(EVFLAG_AUTO);
#elif defined(HAVE_EVENT_BASE_NEW)
	/* libevent */
	return event_base_new();
#else
	/* libevent without event_base_new */
	return event_init();
#endif
}
2900 
2901 static void
2902 add_udp_handler(
2903 	struct nsd *nsd,
2904 	struct nsd_socket *sock,
2905 	struct udp_handler_data *data)
2906 {
2907 	struct event *handler = &data->event;
2908 
2909 	data->nsd = nsd;
2910 	data->socket = sock;
2911 
2912 	memset(handler, 0, sizeof(*handler));
2913 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2914 	if(event_base_set(nsd->event_base, handler) != 0)
2915 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2916 	if(event_add(handler, NULL) != 0)
2917 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2918 }
2919 
2920 void
2921 add_tcp_handler(
2922 	struct nsd *nsd,
2923 	struct nsd_socket *sock,
2924 	struct tcp_accept_handler_data *data)
2925 {
2926 	struct event *handler = &data->event;
2927 
2928 	data->nsd = nsd;
2929 	data->socket = sock;
2930 
2931 #ifdef HAVE_SSL
2932 	if (nsd->tls_ctx &&
2933 	    nsd->options->tls_port &&
2934 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2935 	{
2936 		data->tls_accept = 1;
2937 		if(verbosity >= 2) {
2938 			char buf[48];
2939 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
2940 			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
2941 		}
2942 	} else {
2943 		data->tls_accept = 0;
2944 	}
2945 #endif
2946 
2947 	memset(handler, 0, sizeof(*handler));
2948 	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
2949 	if(event_base_set(nsd->event_base, handler) != 0)
2950 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2951 	if(event_add(handler, NULL) != 0)
2952 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
2953 	data->event_added = 1;
2954 }
2955 
2956 /*
2957  * Serve DNS requests.
2958  */
2959 void
2960 server_child(struct nsd *nsd)
2961 {
2962 	size_t i, from, numifs;
2963 	region_type *server_region = region_create(xalloc, free);
2964 	struct event_base* event_base = nsd_child_event_base();
2965 	sig_atomic_t mode;
2966 
2967 	if(!event_base) {
2968 		log_msg(LOG_ERR, "nsd server could not create event base");
2969 		exit(1);
2970 	}
2971 	nsd->event_base = event_base;
2972 	nsd->server_region = server_region;
2973 
2974 #ifdef RATELIMIT
2975 	rrl_init(nsd->this_child->child_num);
2976 #endif
2977 
2978 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2979 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2980 
2981 #ifdef HAVE_SETPROCTITLE
2982 	setproctitle("server %d", nsd->this_child->child_num + 1);
2983 #endif
2984 #ifdef HAVE_CPUSET_T
2985 	if(nsd->use_cpu_affinity) {
2986 		set_cpu_affinity(nsd->this_child->cpuset);
2987 	}
2988 #endif
2989 
2990 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2991 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2992 	}
2993 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2994 		server_close_all_sockets(nsd->udp, nsd->ifs);
2995 	}
2996 
2997 	if (nsd->this_child->parent_fd != -1) {
2998 		struct event *handler;
2999 		struct ipc_handler_conn_data* user_data =
3000 			(struct ipc_handler_conn_data*)region_alloc(
3001 			server_region, sizeof(struct ipc_handler_conn_data));
3002 		user_data->nsd = nsd;
3003 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
3004 
3005 		handler = (struct event*) region_alloc(
3006 			server_region, sizeof(*handler));
3007 		memset(handler, 0, sizeof(*handler));
3008 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
3009 			EV_READ, child_handle_parent_command, user_data);
3010 		if(event_base_set(event_base, handler) != 0)
3011 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
3012 		if(event_add(handler, NULL) != 0)
3013 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
3014 	}
3015 
3016 	if(nsd->reuseport) {
3017 		numifs = nsd->ifs / nsd->reuseport;
3018 		from = numifs * nsd->this_child->child_num;
3019 		if(from+numifs > nsd->ifs) { /* should not happen */
3020 			from = 0;
3021 			numifs = nsd->ifs;
3022 		}
3023 	} else {
3024 		from = 0;
3025 		numifs = nsd->ifs;
3026 	}
3027 
3028 	if (nsd->server_kind & NSD_SERVER_UDP) {
3029 		int child = nsd->this_child->child_num;
3030 		memset(msgs, 0, sizeof(msgs));
3031 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3032 			queries[i] = query_create(server_region,
3033 				compressed_dname_offsets,
3034 				compression_table_size, compressed_dnames);
3035 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3036 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
3037 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
3038 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
3039 			msgs[i].msg_hdr.msg_iovlen  = 1;
3040 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
3041 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3042 		}
3043 
3044 		for (i = 0; i < nsd->ifs; i++) {
3045 			int listen;
3046 			struct udp_handler_data *data;
3047 
3048 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
3049 
3050 			if(i >= from && i < (from + numifs) && listen) {
3051 				data = region_alloc_zero(
3052 					nsd->server_region, sizeof(*data));
3053 				add_udp_handler(nsd, &nsd->udp[i], data);
3054 			} else {
3055 				/* close sockets intended for other servers */
3056 				server_close_socket(&nsd->udp[i]);
3057 			}
3058 		}
3059 	}
3060 
3061 	/*
3062 	 * Keep track of all the TCP accept handlers so we can enable
3063 	 * and disable them based on the current number of active TCP
3064 	 * connections.
3065 	 */
3066 	if (nsd->server_kind & NSD_SERVER_TCP) {
3067 		int child = nsd->this_child->child_num;
3068 		tcp_accept_handler_count = numifs;
3069 		tcp_accept_handlers = region_alloc_array(server_region,
3070 			numifs, sizeof(*tcp_accept_handlers));
3071 
3072 		for (i = 0; i < nsd->ifs; i++) {
3073 			int listen;
3074 			struct tcp_accept_handler_data *data;
3075 
3076 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
3077 
3078 			if(i >= from && i < (from + numifs) && listen) {
3079 				data = &tcp_accept_handlers[i-from];
3080 				memset(data, 0, sizeof(*data));
3081 				add_tcp_handler(nsd, &nsd->tcp[i], data);
3082 			} else {
3083 				/* close sockets intended for other servers */
3084 				/*
3085 				 * uncomment this once tcp servers are no
3086 				 * longer copied in the tcp fd copy line
3087 				 * in server_init().
3088 				server_close_socket(&nsd->tcp[i]);
3089 				*/
3090 				/* close sockets not meant for this server*/
3091 				if(!listen)
3092 					server_close_socket(&nsd->tcp[i]);
3093 			}
3094 		}
3095 	} else {
3096 		tcp_accept_handler_count = 0;
3097 	}
3098 
3099 	/* The main loop... */
3100 	while ((mode = nsd->mode) != NSD_QUIT) {
3101 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
3102 
3103 		/* Do we need to do the statistics... */
3104 		if (mode == NSD_STATS) {
3105 #ifdef BIND8_STATS
3106 			int p = nsd->st.period;
3107 			nsd->st.period = 1; /* force stats printout */
3108 			/* Dump the statistics */
3109 			bind8_stats(nsd);
3110 			nsd->st.period = p;
3111 #else /* !BIND8_STATS */
3112 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3113 #endif /* BIND8_STATS */
3114 
3115 			nsd->mode = NSD_RUN;
3116 		}
3117 		else if (mode == NSD_REAP_CHILDREN) {
3118 			/* got signal, notify parent. parent reaps terminated children. */
3119 			if (nsd->this_child->parent_fd != -1) {
3120 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3121 				if (write(nsd->this_child->parent_fd,
3122 				    &parent_notify,
3123 				    sizeof(parent_notify)) == -1)
3124 				{
3125 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3126 						(int) nsd->this_child->pid, strerror(errno));
3127 				}
3128 			} else /* no parent, so reap 'em */
3129 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3130 			nsd->mode = NSD_RUN;
3131 		}
3132 		else if(mode == NSD_RUN) {
3133 			/* Wait for a query... */
3134 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3135 				if (errno != EINTR) {
3136 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3137 					break;
3138 				}
3139 			}
3140 		} else if(mode == NSD_QUIT) {
3141 			/* ignore here, quit */
3142 		} else {
3143 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
3144 				(int)mode);
3145 			nsd->mode = NSD_RUN;
3146 		}
3147 	}
3148 
3149 	service_remaining_tcp(nsd);
3150 #ifdef	BIND8_STATS
3151 	bind8_stats(nsd);
3152 #endif /* BIND8_STATS */
3153 
3154 #ifdef MEMCLEAN /* OS collects memory pages */
3155 #ifdef RATELIMIT
3156 	rrl_deinit(nsd->this_child->child_num);
3157 #endif
3158 	event_base_free(event_base);
3159 	region_destroy(server_region);
3160 #endif
3161 	server_shutdown(nsd);
3162 }
3163 
3164 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3165 {
3166 	int* timed_out = (int*)arg;
3167         assert(event & EV_TIMEOUT); (void)event;
3168 	/* wake up the service tcp thread, note event is no longer
3169 	 * registered */
3170 	*timed_out = 1;
3171 }
3172 
3173 void
3174 service_remaining_tcp(struct nsd* nsd)
3175 {
3176 	struct tcp_handler_data* p;
3177 	struct event_base* event_base;
3178 	/* check if it is needed */
3179 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3180 		return;
3181 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3182 #ifdef USE_DNSTAP
3183 	/* remove dnstap collector, we cannot write there because the new
3184 	 * child process is using the file descriptor, or the child
3185 	 * process after that. */
3186 	dt_collector_destroy(nsd->dt_collector, nsd);
3187 	nsd->dt_collector = NULL;
3188 #endif
3189 	/* setup event base */
3190 	event_base = nsd_child_event_base();
3191 	if(!event_base) {
3192 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3193 		return;
3194 	}
3195 	/* register tcp connections */
3196 	for(p = tcp_active_list; p != NULL; p = p->next) {
3197 		struct timeval timeout;
3198 		int fd = p->event.ev_fd;
3199 #ifdef USE_MINI_EVENT
3200 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3201 #else
3202 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3203 #endif
3204 		void (*fn)(int, short, void*);
3205 #ifdef HAVE_SSL
3206 		if(p->tls) {
3207 			if((event&EV_READ))
3208 				fn = handle_tls_reading;
3209 			else	fn = handle_tls_writing;
3210 		} else {
3211 #endif
3212 			if((event&EV_READ))
3213 				fn = handle_tcp_reading;
3214 			else	fn = handle_tcp_writing;
3215 #ifdef HAVE_SSL
3216 		}
3217 #endif
3218 
3219 		p->tcp_no_more_queries = 1;
3220 		/* set timeout to 1/10 second */
3221 		if(p->tcp_timeout > 100)
3222 			p->tcp_timeout = 100;
3223 		timeout.tv_sec = p->tcp_timeout / 1000;
3224 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3225 		event_del(&p->event);
3226 		memset(&p->event, 0, sizeof(p->event));
3227 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3228 			fn, p);
3229 		if(event_base_set(event_base, &p->event) != 0)
3230 			log_msg(LOG_ERR, "event base set failed");
3231 		if(event_add(&p->event, &timeout) != 0)
3232 			log_msg(LOG_ERR, "event add failed");
3233 	}
3234 
3235 	/* handle it */
3236 	while(nsd->current_tcp_count > 0) {
3237 		mode_t m = server_signal_mode(nsd);
3238 		struct event timeout;
3239 		struct timeval tv;
3240 		int timed_out = 0;
3241 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3242 			m == NSD_REAP_CHILDREN) {
3243 			/* quit */
3244 			break;
3245 		}
3246 		/* timer */
3247 		/* have to do something every second */
3248 		tv.tv_sec = 1;
3249 		tv.tv_usec = 0;
3250 		memset(&timeout, 0, sizeof(timeout));
3251 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3252 			&timed_out);
3253 		if(event_base_set(event_base, &timeout) != 0)
3254 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3255 		if(event_add(&timeout, &tv) != 0)
3256 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3257 
3258 		/* service loop */
3259 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3260 			if (errno != EINTR) {
3261 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3262 				break;
3263 			}
3264 		}
3265 		if(!timed_out) {
3266 			event_del(&timeout);
3267 		} else {
3268 			/* timed out, quit */
3269 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3270 			break;
3271 		}
3272 	}
3273 #ifdef MEMCLEAN
3274 	event_base_free(event_base);
3275 #endif
3276 	/* continue to quit after return */
3277 }
3278 
3279 /* Implement recvmmsg and sendmmsg if the platform does not. These functions
3280  * are always used, even if nonblocking operations are broken, in which case
3281  * NUM_RECV_PER_SELECT is defined to 1 (one).
3282  */
3283 #if defined(HAVE_RECVMMSG)
3284 #define nsd_recvmmsg recvmmsg
3285 #else /* !HAVE_RECVMMSG */
3286 
3287 static int
3288 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3289              int flags, struct timespec *timeout)
3290 {
3291 	unsigned int vpos = 0;
3292 	ssize_t rcvd;
3293 
3294 	/* timeout is ignored, ensure caller does not expect it to work */
3295 	assert(timeout == NULL); (void)timeout;
3296 
3297 	while(vpos < vlen) {
3298 		rcvd = recvfrom(sockfd,
3299 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3300 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3301 		                flags,
3302 		                msgvec[vpos].msg_hdr.msg_name,
3303 		               &msgvec[vpos].msg_hdr.msg_namelen);
3304 		if(rcvd < 0) {
3305 			break;
3306 		} else {
3307 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3308 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3309 			vpos++;
3310 		}
3311 	}
3312 
3313 	if(vpos) {
3314 		/* error will be picked up next time */
3315 		return (int)vpos;
3316 	} else if(errno == 0) {
3317 		return 0;
3318 	} else if(errno == EAGAIN) {
3319 		return 0;
3320 	}
3321 
3322 	return -1;
3323 }
3324 #endif /* HAVE_RECVMMSG */
3325 
3326 #ifdef HAVE_SENDMMSG
3327 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3328 #else /* !HAVE_SENDMMSG */
3329 
3330 static int
3331 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3332 {
3333 	unsigned int vpos = 0;
3334 	ssize_t snd;
3335 
3336 	while(vpos < vlen) {
3337 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3338 		snd = sendto(sockfd,
3339 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3340 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3341 		             flags,
3342 		             msgvec[vpos].msg_hdr.msg_name,
3343 		             msgvec[vpos].msg_hdr.msg_namelen);
3344 		if(snd < 0) {
3345 			break;
3346 		} else {
3347 			msgvec[vpos].msg_len = (unsigned int)snd;
3348 			vpos++;
3349 		}
3350 	}
3351 
3352 	if(vpos) {
3353 		return (int)vpos;
3354 	} else if(errno == 0) {
3355 		return 0;
3356 	}
3357 
3358 	return -1;
3359 }
3360 #endif /* HAVE_SENDMMSG */
3361 
/*
 * Tell whether the port in the socket address `addr` is 0.
 *
 * Returns 1 for an IPv4 address (or, with INET6, an IPv6 address) that
 * has port 0.  Returns 0 otherwise, also for other address families.
 */
static int
port_is_zero(
#ifdef INET6
        struct sockaddr_storage *addr
#else
        struct sockaddr_in *addr
#endif
	)
{
#ifdef INET6
	switch(addr->ss_family) {
	case AF_INET6:
		return ((struct sockaddr_in6 *)addr)->sin6_port == 0;
	case AF_INET:
		return ((struct sockaddr_in *)addr)->sin_port == 0;
	default:
		return 0;
	}
#else
	return addr->sin_family == AF_INET && addr->sin_port == 0;
#endif
}
3385 
3386 static void
3387 handle_udp(int fd, short event, void* arg)
3388 {
3389 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3390 	int received, sent, recvcount, i;
3391 	struct query *q;
3392 	uint32_t now = 0;
3393 
3394 	if (!(event & EV_READ)) {
3395 		return;
3396 	}
3397 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3398 	/* this printf strangely gave a performance increase on Linux */
3399 	/* printf("recvcount %d \n", recvcount); */
3400 	if (recvcount == -1) {
3401 		if (errno != EAGAIN && errno != EINTR) {
3402 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3403 			STATUP(data->nsd, rxerr);
3404 			/* No zone statup */
3405 		}
3406 		/* Simply no data available */
3407 		return;
3408 	}
3409 	for (i = 0; i < recvcount; i++) {
3410 	loopstart:
3411 		received = msgs[i].msg_len;
3412 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
3413 		q = queries[i];
3414 		if (received == -1) {
3415 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3416 #if defined(HAVE_RECVMMSG)
3417 				msgs[i].msg_hdr.msg_flags
3418 #else
3419 				errno
3420 #endif
3421 				));
3422 			STATUP(data->nsd, rxerr);
3423 			/* No zone statup */
3424 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3425 			iovecs[i].iov_len = buffer_remaining(q->packet);
3426 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3427 			goto swap_drop;
3428 		}
3429 
3430 		/* Account... */
3431 #ifdef BIND8_STATS
3432 		if (data->socket->addr.ai_family == AF_INET) {
3433 			STATUP(data->nsd, qudp);
3434 		} else if (data->socket->addr.ai_family == AF_INET6) {
3435 			STATUP(data->nsd, qudp6);
3436 		}
3437 #endif
3438 
3439 		buffer_skip(q->packet, received);
3440 		buffer_flip(q->packet);
3441 #ifdef USE_DNSTAP
3442 		/*
3443 		 * sending UDP-query with server address (local) and client address to dnstap process
3444 		 */
3445 		log_addr("query from client", &q->addr);
3446 		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3447 		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->addr, q->addrlen,
3448 			q->tcp, q->packet);
3449 #endif /* USE_DNSTAP */
3450 
3451 		/* Process and answer the query... */
3452 		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
3453 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3454 				STATUP(data->nsd, nona);
3455 				ZTATUP(data->nsd, q->zone, nona);
3456 			}
3457 
3458 #ifdef USE_ZONE_STATS
3459 			if (data->socket->addr.ai_family == AF_INET) {
3460 				ZTATUP(data->nsd, q->zone, qudp);
3461 			} else if (data->socket->addr.ai_family == AF_INET6) {
3462 				ZTATUP(data->nsd, q->zone, qudp6);
3463 			}
3464 #endif
3465 
3466 			/* Add EDNS0 and TSIG info if necessary.  */
3467 			query_add_optional(q, data->nsd, &now);
3468 
3469 			buffer_flip(q->packet);
3470 			iovecs[i].iov_len = buffer_remaining(q->packet);
3471 #ifdef BIND8_STATS
3472 			/* Account the rcode & TC... */
3473 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3474 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3475 			if (TC(q->packet)) {
3476 				STATUP(data->nsd, truncated);
3477 				ZTATUP(data->nsd, q->zone, truncated);
3478 			}
3479 #endif /* BIND8_STATS */
3480 #ifdef USE_DNSTAP
3481 			/*
3482 			 * sending UDP-response with server address (local) and client address to dnstap process
3483 			 */
3484 			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3485 			log_addr("response to client", &q->addr);
3486 			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
3487 				&q->addr, q->addrlen, q->tcp, q->packet,
3488 				q->zone);
3489 #endif /* USE_DNSTAP */
3490 		} else {
3491 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3492 			iovecs[i].iov_len = buffer_remaining(q->packet);
3493 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3494 		swap_drop:
3495 			STATUP(data->nsd, dropped);
3496 			ZTATUP(data->nsd, q->zone, dropped);
3497 			if(i != recvcount-1) {
3498 				/* swap with last and decrease recvcount */
3499 				struct mmsghdr mtmp = msgs[i];
3500 				struct iovec iotmp = iovecs[i];
3501 				recvcount--;
3502 				msgs[i] = msgs[recvcount];
3503 				iovecs[i] = iovecs[recvcount];
3504 				queries[i] = queries[recvcount];
3505 				msgs[recvcount] = mtmp;
3506 				iovecs[recvcount] = iotmp;
3507 				queries[recvcount] = q;
3508 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3509 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3510 				goto loopstart;
3511 			} else { recvcount --; }
3512 		}
3513 	}
3514 
3515 	/* send until all are sent */
3516 	i = 0;
3517 	while(i<recvcount) {
3518 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3519 		if(sent == -1) {
3520 			if(errno == ENOBUFS ||
3521 #ifdef EWOULDBLOCK
3522 				errno == EWOULDBLOCK ||
3523 #endif
3524 				errno == EAGAIN) {
3525 				/* block to wait until send buffer avail */
3526 				int flag, errstore;
3527 				if((flag = fcntl(fd, F_GETFL)) == -1) {
3528 					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
3529 					flag = 0;
3530 				}
3531 				flag &= ~O_NONBLOCK;
3532 				if(fcntl(fd, F_SETFL, flag) == -1)
3533 					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
3534 				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3535 				errstore = errno;
3536 				flag |= O_NONBLOCK;
3537 				if(fcntl(fd, F_SETFL, flag) == -1)
3538 					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
3539 				if(sent != -1) {
3540 					i += sent;
3541 					continue;
3542 				}
3543 				errno = errstore;
3544 			}
3545 			if(errno == EINVAL) {
3546 				/* skip the invalid argument entry,
3547 				 * send the remaining packets in the list */
3548 				if(!(port_is_zero((void*)&queries[i]->addr) &&
3549 					verbosity < 3)) {
3550 					const char* es = strerror(errno);
3551 					char a[64];
3552 					addrport2str((void*)&queries[i]->addr, a, sizeof(a));
3553 					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3554 				}
3555 				i += 1;
3556 				continue;
3557 			}
3558 			/* don't log transient network full errors, unless
3559 			 * on higher verbosity */
3560 			if(!(errno == ENOBUFS && verbosity < 1) &&
3561 #ifdef EWOULDBLOCK
3562 			   errno != EWOULDBLOCK &&
3563 #endif
3564 			   errno != EAGAIN) {
3565 				const char* es = strerror(errno);
3566 				char a[64];
3567 				addrport2str((void*)&queries[i]->addr, a, sizeof(a));
3568 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3569 			}
3570 #ifdef BIND8_STATS
3571 			data->nsd->st.txerr += recvcount-i;
3572 #endif /* BIND8_STATS */
3573 			break;
3574 		}
3575 		i += sent;
3576 	}
3577 	for(i=0; i<recvcount; i++) {
3578 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3579 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3580 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3581 	}
3582 }
3583 
3584 #ifdef HAVE_SSL
3585 /*
3586  * Setup an event for the tcp handler.
3587  */
3588 static void
3589 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3590        int fd, short event)
3591 {
3592 	struct timeval timeout;
3593 	struct event_base* ev_base;
3594 
3595 	timeout.tv_sec = data->nsd->tcp_timeout;
3596 	timeout.tv_usec = 0L;
3597 
3598 	ev_base = data->event.ev_base;
3599 	event_del(&data->event);
3600 	memset(&data->event, 0, sizeof(data->event));
3601 	event_set(&data->event, fd, event, fn, data);
3602 	if(event_base_set(ev_base, &data->event) != 0)
3603 		log_msg(LOG_ERR, "event base set failed");
3604 	if(event_add(&data->event, &timeout) != 0)
3605 		log_msg(LOG_ERR, "event add failed");
3606 }
3607 #endif /* HAVE_SSL */
3608 
3609 static void
3610 cleanup_tcp_handler(struct tcp_handler_data* data)
3611 {
3612 	event_del(&data->event);
3613 #ifdef HAVE_SSL
3614 	if(data->tls) {
3615 		SSL_shutdown(data->tls);
3616 		SSL_free(data->tls);
3617 		data->tls = NULL;
3618 	}
3619 #endif
3620 	close(data->event.ev_fd);
3621 	if(data->prev)
3622 		data->prev->next = data->next;
3623 	else	tcp_active_list = data->next;
3624 	if(data->next)
3625 		data->next->prev = data->prev;
3626 
3627 	/*
3628 	 * Enable the TCP accept handlers when the current number of
3629 	 * TCP connections is about to drop below the maximum number
3630 	 * of TCP connections.
3631 	 */
3632 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3633 		configure_handler_event_types(EV_READ|EV_PERSIST);
3634 		if(slowaccept) {
3635 			event_del(&slowaccept_event);
3636 			slowaccept = 0;
3637 		}
3638 	}
3639 	--data->nsd->current_tcp_count;
3640 	assert(data->nsd->current_tcp_count >= 0);
3641 
3642 	region_destroy(data->region);
3643 }
3644 
3645 static void
3646 handle_tcp_reading(int fd, short event, void* arg)
3647 {
3648 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3649 	ssize_t received;
3650 	struct event_base* ev_base;
3651 	struct timeval timeout;
3652 	uint32_t now = 0;
3653 
3654 	if ((event & EV_TIMEOUT)) {
3655 		/* Connection timed out.  */
3656 		cleanup_tcp_handler(data);
3657 		return;
3658 	}
3659 
3660 	if ((data->nsd->tcp_query_count > 0 &&
3661 		data->query_count >= data->nsd->tcp_query_count) ||
3662 		data->tcp_no_more_queries) {
3663 		/* No more queries allowed on this tcp connection. */
3664 		cleanup_tcp_handler(data);
3665 		return;
3666 	}
3667 
3668 	assert((event & EV_READ));
3669 
3670 	if (data->bytes_transmitted == 0) {
3671 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3672 	}
3673 
3674 	/*
3675 	 * Check if we received the leading packet length bytes yet.
3676 	 */
3677 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3678 		received = read(fd,
3679 				(char *) &data->query->tcplen
3680 				+ data->bytes_transmitted,
3681 				sizeof(uint16_t) - data->bytes_transmitted);
3682 		if (received == -1) {
3683 			if (errno == EAGAIN || errno == EINTR) {
3684 				/*
3685 				 * Read would block, wait until more
3686 				 * data is available.
3687 				 */
3688 				return;
3689 			} else {
3690 				char buf[48];
3691 				addr2str(&data->query->addr, buf, sizeof(buf));
3692 #ifdef ECONNRESET
3693 				if (verbosity >= 2 || errno != ECONNRESET)
3694 #endif /* ECONNRESET */
3695 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3696 				cleanup_tcp_handler(data);
3697 				return;
3698 			}
3699 		} else if (received == 0) {
3700 			/* EOF */
3701 			cleanup_tcp_handler(data);
3702 			return;
3703 		}
3704 
3705 		data->bytes_transmitted += received;
3706 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3707 			/*
3708 			 * Not done with the tcplen yet, wait for more
3709 			 * data to become available.
3710 			 */
3711 			return;
3712 		}
3713 
3714 		assert(data->bytes_transmitted == sizeof(uint16_t));
3715 
3716 		data->query->tcplen = ntohs(data->query->tcplen);
3717 
3718 		/*
3719 		 * Minimum query size is:
3720 		 *
3721 		 *     Size of the header (12)
3722 		 *   + Root domain name   (1)
3723 		 *   + Query class        (2)
3724 		 *   + Query type         (2)
3725 		 */
3726 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3727 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3728 			cleanup_tcp_handler(data);
3729 			return;
3730 		}
3731 
3732 		if (data->query->tcplen > data->query->maxlen) {
3733 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3734 			cleanup_tcp_handler(data);
3735 			return;
3736 		}
3737 
3738 		buffer_set_limit(data->query->packet, data->query->tcplen);
3739 	}
3740 
3741 	assert(buffer_remaining(data->query->packet) > 0);
3742 
3743 	/* Read the (remaining) query data.  */
3744 	received = read(fd,
3745 			buffer_current(data->query->packet),
3746 			buffer_remaining(data->query->packet));
3747 	if (received == -1) {
3748 		if (errno == EAGAIN || errno == EINTR) {
3749 			/*
3750 			 * Read would block, wait until more data is
3751 			 * available.
3752 			 */
3753 			return;
3754 		} else {
3755 			char buf[48];
3756 			addr2str(&data->query->addr, buf, sizeof(buf));
3757 #ifdef ECONNRESET
3758 			if (verbosity >= 2 || errno != ECONNRESET)
3759 #endif /* ECONNRESET */
3760 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3761 			cleanup_tcp_handler(data);
3762 			return;
3763 		}
3764 	} else if (received == 0) {
3765 		/* EOF */
3766 		cleanup_tcp_handler(data);
3767 		return;
3768 	}
3769 
3770 	data->bytes_transmitted += received;
3771 	buffer_skip(data->query->packet, received);
3772 	if (buffer_remaining(data->query->packet) > 0) {
3773 		/*
3774 		 * Message not yet complete, wait for more data to
3775 		 * become available.
3776 		 */
3777 		return;
3778 	}
3779 
3780 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3781 
3782 	/* Account... */
3783 #ifdef BIND8_STATS
3784 #ifndef INET6
3785 	STATUP(data->nsd, ctcp);
3786 #else
3787 	if (data->query->addr.ss_family == AF_INET) {
3788 		STATUP(data->nsd, ctcp);
3789 	} else if (data->query->addr.ss_family == AF_INET6) {
3790 		STATUP(data->nsd, ctcp6);
3791 	}
3792 #endif
3793 #endif /* BIND8_STATS */
3794 
3795 	/* We have a complete query, process it.  */
3796 
3797 	/* tcp-query-count: handle query counter ++ */
3798 	data->query_count++;
3799 
3800 	buffer_flip(data->query->packet);
3801 #ifdef USE_DNSTAP
3802 	/*
3803 	 * and send TCP-query with found address (local) and client address to dnstap process
3804 	 */
3805 	log_addr("query from client", &data->query->addr);
3806 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3807 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
3808 		data->query->addrlen, data->query->tcp, data->query->packet);
3809 #endif /* USE_DNSTAP */
3810 	data->query_state = server_process_query(data->nsd, data->query, &now);
3811 	if (data->query_state == QUERY_DISCARDED) {
3812 		/* Drop the packet and the entire connection... */
3813 		STATUP(data->nsd, dropped);
3814 		ZTATUP(data->nsd, data->query->zone, dropped);
3815 		cleanup_tcp_handler(data);
3816 		return;
3817 	}
3818 
3819 #ifdef BIND8_STATS
3820 	if (RCODE(data->query->packet) == RCODE_OK
3821 	    && !AA(data->query->packet))
3822 	{
3823 		STATUP(data->nsd, nona);
3824 		ZTATUP(data->nsd, data->query->zone, nona);
3825 	}
3826 #endif /* BIND8_STATS */
3827 
3828 #ifdef USE_ZONE_STATS
3829 #ifndef INET6
3830 	ZTATUP(data->nsd, data->query->zone, ctcp);
3831 #else
3832 	if (data->query->addr.ss_family == AF_INET) {
3833 		ZTATUP(data->nsd, data->query->zone, ctcp);
3834 	} else if (data->query->addr.ss_family == AF_INET6) {
3835 		ZTATUP(data->nsd, data->query->zone, ctcp6);
3836 	}
3837 #endif
3838 #endif /* USE_ZONE_STATS */
3839 
3840 	query_add_optional(data->query, data->nsd, &now);
3841 
3842 	/* Switch to the tcp write handler.  */
3843 	buffer_flip(data->query->packet);
3844 	data->query->tcplen = buffer_remaining(data->query->packet);
3845 #ifdef BIND8_STATS
3846 	/* Account the rcode & TC... */
3847 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
3848 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
3849 	if (TC(data->query->packet)) {
3850 		STATUP(data->nsd, truncated);
3851 		ZTATUP(data->nsd, data->query->zone, truncated);
3852 	}
3853 #endif /* BIND8_STATS */
3854 #ifdef USE_DNSTAP
3855 	/*
3856 	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
3857 	 */
3858 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3859 	log_addr("response to client", &data->query->addr);
3860 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
3861 		data->query->addrlen, data->query->tcp, data->query->packet,
3862 		data->query->zone);
3863 #endif /* USE_DNSTAP */
3864 	data->bytes_transmitted = 0;
3865 
3866 	timeout.tv_sec = data->tcp_timeout / 1000;
3867 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3868 
3869 	ev_base = data->event.ev_base;
3870 	event_del(&data->event);
3871 	memset(&data->event, 0, sizeof(data->event));
3872 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3873 		handle_tcp_reading, data);
3874 	if(event_base_set(ev_base, &data->event) != 0)
3875 		log_msg(LOG_ERR, "event base set tcpr failed");
3876 	if(event_add(&data->event, &timeout) != 0)
3877 		log_msg(LOG_ERR, "event add tcpr failed");
3878 	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
3879 	handle_tcp_writing(fd, EV_WRITE, data);
3880 }
3881 
3882 static void
3883 handle_tcp_writing(int fd, short event, void* arg)
3884 {
3885 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3886 	ssize_t sent;
3887 	struct query *q = data->query;
3888 	struct timeval timeout;
3889 	struct event_base* ev_base;
3890 	uint32_t now = 0;
3891 
3892 	if ((event & EV_TIMEOUT)) {
3893 		/* Connection timed out.  */
3894 		cleanup_tcp_handler(data);
3895 		return;
3896 	}
3897 
3898 	assert((event & EV_WRITE));
3899 
3900 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
3901 		/* Writing the response packet length.  */
3902 		uint16_t n_tcplen = htons(q->tcplen);
3903 #ifdef HAVE_WRITEV
3904 		struct iovec iov[2];
3905 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
3906 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
3907 		iov[1].iov_base = buffer_begin(q->packet);
3908 		iov[1].iov_len = buffer_limit(q->packet);
3909 		sent = writev(fd, iov, 2);
3910 #else /* HAVE_WRITEV */
3911 		sent = write(fd,
3912 			     (const char *) &n_tcplen + data->bytes_transmitted,
3913 			     sizeof(n_tcplen) - data->bytes_transmitted);
3914 #endif /* HAVE_WRITEV */
3915 		if (sent == -1) {
3916 			if (errno == EAGAIN || errno == EINTR) {
3917 				/*
3918 				 * Write would block, wait until
3919 				 * socket becomes writable again.
3920 				 */
3921 				return;
3922 			} else {
3923 #ifdef ECONNRESET
3924 				if(verbosity >= 2 || errno != ECONNRESET)
3925 #endif /* ECONNRESET */
3926 #ifdef EPIPE
3927 				  if(verbosity >= 2 || errno != EPIPE)
3928 #endif /* EPIPE 'broken pipe' */
3929 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3930 				cleanup_tcp_handler(data);
3931 				return;
3932 			}
3933 		}
3934 
3935 		data->bytes_transmitted += sent;
3936 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
3937 			/*
3938 			 * Writing not complete, wait until socket
3939 			 * becomes writable again.
3940 			 */
3941 			return;
3942 		}
3943 
3944 #ifdef HAVE_WRITEV
3945 		sent -= sizeof(n_tcplen);
3946 		/* handle potential 'packet done' code */
3947 		goto packet_could_be_done;
3948 #endif
3949  	}
3950 
3951 	sent = write(fd,
3952 		     buffer_current(q->packet),
3953 		     buffer_remaining(q->packet));
3954 	if (sent == -1) {
3955 		if (errno == EAGAIN || errno == EINTR) {
3956 			/*
3957 			 * Write would block, wait until
3958 			 * socket becomes writable again.
3959 			 */
3960 			return;
3961 		} else {
3962 #ifdef ECONNRESET
3963 			if(verbosity >= 2 || errno != ECONNRESET)
3964 #endif /* ECONNRESET */
3965 #ifdef EPIPE
3966 				  if(verbosity >= 2 || errno != EPIPE)
3967 #endif /* EPIPE 'broken pipe' */
3968 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3969 			cleanup_tcp_handler(data);
3970 			return;
3971 		}
3972 	}
3973 
3974 	data->bytes_transmitted += sent;
3975 #ifdef HAVE_WRITEV
3976   packet_could_be_done:
3977 #endif
3978 	buffer_skip(q->packet, sent);
3979 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
3980 		/*
3981 		 * Still more data to write when socket becomes
3982 		 * writable again.
3983 		 */
3984 		return;
3985 	}
3986 
3987 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
3988 
3989 	if (data->query_state == QUERY_IN_AXFR) {
3990 		/* Continue processing AXFR and writing back results.  */
3991 		buffer_clear(q->packet);
3992 		data->query_state = query_axfr(data->nsd, q);
3993 		if (data->query_state != QUERY_PROCESSED) {
3994 			query_add_optional(data->query, data->nsd, &now);
3995 
3996 			/* Reset data. */
3997 			buffer_flip(q->packet);
3998 			q->tcplen = buffer_remaining(q->packet);
3999 			data->bytes_transmitted = 0;
4000 			/* Reset timeout.  */
4001 			timeout.tv_sec = data->tcp_timeout / 1000;
4002 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4003 			ev_base = data->event.ev_base;
4004 			event_del(&data->event);
4005 			memset(&data->event, 0, sizeof(data->event));
4006 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4007 				handle_tcp_writing, data);
4008 			if(event_base_set(ev_base, &data->event) != 0)
4009 				log_msg(LOG_ERR, "event base set tcpw failed");
4010 			if(event_add(&data->event, &timeout) != 0)
4011 				log_msg(LOG_ERR, "event add tcpw failed");
4012 
4013 			/*
4014 			 * Write data if/when the socket is writable
4015 			 * again.
4016 			 */
4017 			return;
4018 		}
4019 	}
4020 
4021 	/*
4022 	 * Done sending, wait for the next request to arrive on the
4023 	 * TCP socket by installing the TCP read handler.
4024 	 */
4025 	if ((data->nsd->tcp_query_count > 0 &&
4026 		data->query_count >= data->nsd->tcp_query_count) ||
4027 		data->tcp_no_more_queries) {
4028 
4029 		(void) shutdown(fd, SHUT_WR);
4030 	}
4031 
4032 	data->bytes_transmitted = 0;
4033 
4034 	timeout.tv_sec = data->tcp_timeout / 1000;
4035 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4036 	ev_base = data->event.ev_base;
4037 	event_del(&data->event);
4038 	memset(&data->event, 0, sizeof(data->event));
4039 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4040 		handle_tcp_reading, data);
4041 	if(event_base_set(ev_base, &data->event) != 0)
4042 		log_msg(LOG_ERR, "event base set tcpw failed");
4043 	if(event_add(&data->event, &timeout) != 0)
4044 		log_msg(LOG_ERR, "event add tcpw failed");
4045 }
4046 
4047 #ifdef HAVE_SSL
4048 /** create SSL object and associate fd */
4049 static SSL*
4050 incoming_ssl_fd(SSL_CTX* ctx, int fd)
4051 {
4052 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
4053 	if(!ssl) {
4054 		log_crypto_err("could not SSL_new");
4055 		return NULL;
4056 	}
4057 	SSL_set_accept_state(ssl);
4058 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
4059 	if(!SSL_set_fd(ssl, fd)) {
4060 		log_crypto_err("could not SSL_set_fd");
4061 		SSL_free(ssl);
4062 		return NULL;
4063 	}
4064 	return ssl;
4065 }
4066 
4067 /** TLS handshake to upgrade TCP connection */
4068 static int
4069 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
4070 {
4071 	int r;
4072 	if(data->shake_state == tls_hs_read_event) {
4073 		/* read condition satisfied back to writing */
4074 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4075 		data->shake_state = tls_hs_none;
4076 		return 1;
4077 	}
4078 	if(data->shake_state == tls_hs_write_event) {
4079 		/* write condition satisfied back to reading */
4080 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4081 		data->shake_state = tls_hs_none;
4082 		return 1;
4083 	}
4084 
4085 	/* (continue to) setup the TLS connection */
4086 	ERR_clear_error();
4087 	r = SSL_do_handshake(data->tls);
4088 
4089 	if(r != 1) {
4090 		int want = SSL_get_error(data->tls, r);
4091 		if(want == SSL_ERROR_WANT_READ) {
4092 			if(data->shake_state == tls_hs_read) {
4093 				/* try again later */
4094 				return 1;
4095 			}
4096 			data->shake_state = tls_hs_read;
4097 			/* switch back to reading mode */
4098 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4099 			return 1;
4100 		} else if(want == SSL_ERROR_WANT_WRITE) {
4101 			if(data->shake_state == tls_hs_write) {
4102 				/* try again later */
4103 				return 1;
4104 			}
4105 			data->shake_state = tls_hs_write;
4106 			/* switch back to writing mode */
4107 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4108 			return 1;
4109 		} else {
4110 			if(r == 0)
4111 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
4112 			else {
4113 				unsigned long err = ERR_get_error();
4114 				if(!squelch_err_ssl_handshake(err)) {
4115 					char a[64], s[256];
4116 					addr2str(&data->query->addr, a, sizeof(a));
4117 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
4118 					log_crypto_from_err(s, err);
4119 				}
4120 			}
4121 			cleanup_tcp_handler(data);
4122 			return 0;
4123 		}
4124 	}
4125 
4126 	/* Use to log successful upgrade for testing - could be removed*/
4127 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
4128 	/* set back to the event we need to have when reading (or writing) */
4129 	if(data->shake_state == tls_hs_read && writing) {
4130 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4131 	} else if(data->shake_state == tls_hs_write && !writing) {
4132 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4133 	}
4134 	data->shake_state = tls_hs_none;
4135 	return 1;
4136 }
4137 
4138 /** handle TLS reading of incoming query */
4139 static void
4140 handle_tls_reading(int fd, short event, void* arg)
4141 {
4142 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4143 	ssize_t received;
4144 	uint32_t now = 0;
4145 
4146 	if ((event & EV_TIMEOUT)) {
4147 		/* Connection timed out.  */
4148 		cleanup_tcp_handler(data);
4149 		return;
4150 	}
4151 
4152 	if ((data->nsd->tcp_query_count > 0 &&
4153 	    data->query_count >= data->nsd->tcp_query_count) ||
4154 	    data->tcp_no_more_queries) {
4155 		/* No more queries allowed on this tcp connection. */
4156 		cleanup_tcp_handler(data);
4157 		return;
4158 	}
4159 
4160 	assert((event & EV_READ));
4161 
4162 	if (data->bytes_transmitted == 0) {
4163 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4164 	}
4165 
4166 	if(data->shake_state != tls_hs_none) {
4167 		if(!tls_handshake(data, fd, 0))
4168 			return;
4169 		if(data->shake_state != tls_hs_none)
4170 			return;
4171 	}
4172 
4173 	/*
4174 	 * Check if we received the leading packet length bytes yet.
4175 	 */
4176 	if(data->bytes_transmitted < sizeof(uint16_t)) {
4177 		ERR_clear_error();
4178 		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
4179 		    + data->bytes_transmitted,
4180 		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
4181 			int want = SSL_get_error(data->tls, received);
4182 			if(want == SSL_ERROR_ZERO_RETURN) {
4183 				cleanup_tcp_handler(data);
4184 				return; /* shutdown, closed */
4185 			} else if(want == SSL_ERROR_WANT_READ) {
4186 				/* wants to be called again */
4187 				return;
4188 			}
4189 			else if(want == SSL_ERROR_WANT_WRITE) {
4190 				/* switch to writing */
4191 				data->shake_state = tls_hs_write_event;
4192 				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4193 				return;
4194 			}
4195 			cleanup_tcp_handler(data);
4196 			log_crypto_err("could not SSL_read");
4197 			return;
4198 		}
4199 
4200 		data->bytes_transmitted += received;
4201 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4202 			/*
4203 			 * Not done with the tcplen yet, wait for more
4204 			 * data to become available.
4205 			 */
4206 			return;
4207 		}
4208 
4209 		assert(data->bytes_transmitted == sizeof(uint16_t));
4210 
4211 		data->query->tcplen = ntohs(data->query->tcplen);
4212 
4213 		/*
4214 		 * Minimum query size is:
4215 		 *
4216 		 *     Size of the header (12)
4217 		 *   + Root domain name   (1)
4218 		 *   + Query class        (2)
4219 		 *   + Query type         (2)
4220 		 */
4221 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4222 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4223 			cleanup_tcp_handler(data);
4224 			return;
4225 		}
4226 
4227 		if (data->query->tcplen > data->query->maxlen) {
4228 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4229 			cleanup_tcp_handler(data);
4230 			return;
4231 		}
4232 
4233 		buffer_set_limit(data->query->packet, data->query->tcplen);
4234 	}
4235 
4236 	assert(buffer_remaining(data->query->packet) > 0);
4237 
4238 	/* Read the (remaining) query data.  */
4239 	ERR_clear_error();
4240 	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
4241 			    (int)buffer_remaining(data->query->packet));
4242 	if(received <= 0) {
4243 		int want = SSL_get_error(data->tls, received);
4244 		if(want == SSL_ERROR_ZERO_RETURN) {
4245 			cleanup_tcp_handler(data);
4246 			return; /* shutdown, closed */
4247 		} else if(want == SSL_ERROR_WANT_READ) {
4248 			/* wants to be called again */
4249 			return;
4250 		}
4251 		else if(want == SSL_ERROR_WANT_WRITE) {
4252 			/* switch back writing */
4253 			data->shake_state = tls_hs_write_event;
4254 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4255 			return;
4256 		}
4257 		cleanup_tcp_handler(data);
4258 		log_crypto_err("could not SSL_read");
4259 		return;
4260 	}
4261 
4262 	data->bytes_transmitted += received;
4263 	buffer_skip(data->query->packet, received);
4264 	if (buffer_remaining(data->query->packet) > 0) {
4265 		/*
4266 		 * Message not yet complete, wait for more data to
4267 		 * become available.
4268 		 */
4269 		return;
4270 	}
4271 
4272 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4273 
4274 	/* Account... */
4275 #ifndef INET6
4276 	STATUP(data->nsd, ctls);
4277 #else
4278 	if (data->query->addr.ss_family == AF_INET) {
4279 		STATUP(data->nsd, ctls);
4280 	} else if (data->query->addr.ss_family == AF_INET6) {
4281 		STATUP(data->nsd, ctls6);
4282 	}
4283 #endif
4284 
4285 	/* We have a complete query, process it.  */
4286 
4287 	/* tcp-query-count: handle query counter ++ */
4288 	data->query_count++;
4289 
4290 	buffer_flip(data->query->packet);
4291 #ifdef USE_DNSTAP
4292 	/*
4293 	 * and send TCP-query with found address (local) and client address to dnstap process
4294 	 */
4295 	log_addr("query from client", &data->query->addr);
4296 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4297 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4298 		data->query->addrlen, data->query->tcp, data->query->packet);
4299 #endif /* USE_DNSTAP */
4300 	data->query_state = server_process_query(data->nsd, data->query, &now);
4301 	if (data->query_state == QUERY_DISCARDED) {
4302 		/* Drop the packet and the entire connection... */
4303 		STATUP(data->nsd, dropped);
4304 		ZTATUP(data->nsd, data->query->zone, dropped);
4305 		cleanup_tcp_handler(data);
4306 		return;
4307 	}
4308 
4309 #ifdef BIND8_STATS
4310 	if (RCODE(data->query->packet) == RCODE_OK
4311 	    && !AA(data->query->packet))
4312 	{
4313 		STATUP(data->nsd, nona);
4314 		ZTATUP(data->nsd, data->query->zone, nona);
4315 	}
4316 #endif /* BIND8_STATS */
4317 
4318 #ifdef USE_ZONE_STATS
4319 #ifndef INET6
4320 	ZTATUP(data->nsd, data->query->zone, ctls);
4321 #else
4322 	if (data->query->addr.ss_family == AF_INET) {
4323 		ZTATUP(data->nsd, data->query->zone, ctls);
4324 	} else if (data->query->addr.ss_family == AF_INET6) {
4325 		ZTATUP(data->nsd, data->query->zone, ctls6);
4326 	}
4327 #endif
4328 #endif /* USE_ZONE_STATS */
4329 
4330 	query_add_optional(data->query, data->nsd, &now);
4331 
4332 	/* Switch to the tcp write handler.  */
4333 	buffer_flip(data->query->packet);
4334 	data->query->tcplen = buffer_remaining(data->query->packet);
4335 #ifdef BIND8_STATS
4336 	/* Account the rcode & TC... */
4337 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4338 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4339 	if (TC(data->query->packet)) {
4340 		STATUP(data->nsd, truncated);
4341 		ZTATUP(data->nsd, data->query->zone, truncated);
4342 	}
4343 #endif /* BIND8_STATS */
4344 #ifdef USE_DNSTAP
4345 	/*
4346 	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
4347 	 */
4348 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4349 	log_addr("response to client", &data->query->addr);
4350 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
4351 		data->query->addrlen, data->query->tcp, data->query->packet,
4352 		data->query->zone);
4353 #endif /* USE_DNSTAP */
4354 	data->bytes_transmitted = 0;
4355 
4356 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4357 
4358 	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
4359 	handle_tls_writing(fd, EV_WRITE, data);
4360 }
4361 
4362 /** handle TLS writing of outgoing response */
4363 static void
4364 handle_tls_writing(int fd, short event, void* arg)
4365 {
4366 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4367 	ssize_t sent;
4368 	struct query *q = data->query;
4369 	/* static variable that holds reassembly buffer used to put the
4370 	 * TCP length in front of the packet, like writev. */
4371 	static buffer_type* global_tls_temp_buffer = NULL;
4372 	buffer_type* write_buffer;
4373 	uint32_t now = 0;
4374 
4375 	if ((event & EV_TIMEOUT)) {
4376 		/* Connection timed out.  */
4377 		cleanup_tcp_handler(data);
4378 		return;
4379 	}
4380 
4381 	assert((event & EV_WRITE));
4382 
4383 	if(data->shake_state != tls_hs_none) {
4384 		if(!tls_handshake(data, fd, 1))
4385 			return;
4386 		if(data->shake_state != tls_hs_none)
4387 			return;
4388 	}
4389 
4390 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
4391 
4392 	/* If we are writing the start of a message, we must include the length
4393 	 * this is done with a copy into write_buffer. */
4394 	write_buffer = NULL;
4395 	if (data->bytes_transmitted == 0) {
4396 		if(!global_tls_temp_buffer) {
4397 			/* gets deallocated when nsd shuts down from
4398 			 * nsd.region */
4399 			global_tls_temp_buffer = buffer_create(nsd.region,
4400 				QIOBUFSZ + sizeof(q->tcplen));
4401 			if (!global_tls_temp_buffer) {
4402 				return;
4403 			}
4404 		}
4405 		write_buffer = global_tls_temp_buffer;
4406 		buffer_clear(write_buffer);
4407 		buffer_write_u16(write_buffer, q->tcplen);
4408 		buffer_write(write_buffer, buffer_current(q->packet),
4409 			(int)buffer_remaining(q->packet));
4410 		buffer_flip(write_buffer);
4411 	} else {
4412 		write_buffer = q->packet;
4413 	}
4414 
4415 	/* Write the response */
4416 	ERR_clear_error();
4417 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4418 	if(sent <= 0) {
4419 		int want = SSL_get_error(data->tls, sent);
4420 		if(want == SSL_ERROR_ZERO_RETURN) {
4421 			cleanup_tcp_handler(data);
4422 			/* closed */
4423 		} else if(want == SSL_ERROR_WANT_READ) {
4424 			/* switch back to reading */
4425 			data->shake_state = tls_hs_read_event;
4426 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4427 		} else if(want != SSL_ERROR_WANT_WRITE) {
4428 			cleanup_tcp_handler(data);
4429 			log_crypto_err("could not SSL_write");
4430 		}
4431 		return;
4432 	}
4433 
4434 	buffer_skip(write_buffer, sent);
4435 	if(buffer_remaining(write_buffer) != 0) {
4436 		/* If not all sent, sync up the real buffer if it wasn't used.*/
4437 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4438 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4439 		}
4440 	}
4441 
4442 	data->bytes_transmitted += sent;
4443 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4444 		/*
4445 		 * Still more data to write when socket becomes
4446 		 * writable again.
4447 		 */
4448 		return;
4449 	}
4450 
4451 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4452 
4453 	if (data->query_state == QUERY_IN_AXFR) {
4454 		/* Continue processing AXFR and writing back results.  */
4455 		buffer_clear(q->packet);
4456 		data->query_state = query_axfr(data->nsd, q);
4457 		if (data->query_state != QUERY_PROCESSED) {
4458 			query_add_optional(data->query, data->nsd, &now);
4459 
4460 			/* Reset data. */
4461 			buffer_flip(q->packet);
4462 			q->tcplen = buffer_remaining(q->packet);
4463 			data->bytes_transmitted = 0;
4464 			/* Reset to writing mode.  */
4465 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4466 
4467 			/*
4468 			 * Write data if/when the socket is writable
4469 			 * again.
4470 			 */
4471 			return;
4472 		}
4473 	}
4474 
4475 	/*
4476 	 * Done sending, wait for the next request to arrive on the
4477 	 * TCP socket by installing the TCP read handler.
4478 	 */
4479 	if ((data->nsd->tcp_query_count > 0 &&
4480 		data->query_count >= data->nsd->tcp_query_count) ||
4481 		data->tcp_no_more_queries) {
4482 
4483 		(void) shutdown(fd, SHUT_WR);
4484 	}
4485 
4486 	data->bytes_transmitted = 0;
4487 
4488 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4489 }
4490 #endif
4491 
4492 static void
4493 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4494 	void* ATTR_UNUSED(arg))
4495 {
4496 	if(slowaccept) {
4497 		configure_handler_event_types(EV_PERSIST | EV_READ);
4498 		slowaccept = 0;
4499 	}
4500 }
4501 
4502 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4503 {
4504 #ifndef HAVE_ACCEPT4
4505 	int s = accept(fd, addr, addrlen);
4506 	if (s != -1) {
4507 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4508 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4509 			close(s);
4510 			s = -1;
4511 			errno=EINTR; /* stop error printout as error in accept4
4512 				by setting this errno, it omits printout, in
4513 				later code that calls nsd_accept4 */
4514 		}
4515 	}
4516 	return s;
4517 #else
4518 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4519 #endif /* HAVE_ACCEPT4 */
4520 }
4521 
4522 /*
4523  * Handle an incoming TCP connection.  The connection is accepted and
4524  * a new TCP reader event handler is added.  The TCP handler
4525  * is responsible for cleanup when the connection is closed.
4526  */
4527 static void
4528 handle_tcp_accept(int fd, short event, void* arg)
4529 {
4530 	struct tcp_accept_handler_data *data
4531 		= (struct tcp_accept_handler_data *) arg;
4532 	int s;
4533 	int reject = 0;
4534 	struct tcp_handler_data *tcp_data;
4535 	region_type *tcp_region;
4536 #ifdef INET6
4537 	struct sockaddr_storage addr;
4538 #else
4539 	struct sockaddr_in addr;
4540 #endif
4541 	socklen_t addrlen;
4542 	struct timeval timeout;
4543 
4544 	if (!(event & EV_READ)) {
4545 		return;
4546 	}
4547 
4548 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4549 		reject = data->nsd->options->tcp_reject_overflow;
4550 		if (!reject) {
4551 			return;
4552 		}
4553 	}
4554 
4555 	/* Accept it... */
4556 	addrlen = sizeof(addr);
4557 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4558 	if (s == -1) {
4559 		/**
4560 		 * EMFILE and ENFILE is a signal that the limit of open
4561 		 * file descriptors has been reached. Pause accept().
4562 		 * EINTR is a signal interrupt. The others are various OS ways
4563 		 * of saying that the client has closed the connection.
4564 		 */
4565 		if (errno == EMFILE || errno == ENFILE) {
4566 			if (!slowaccept) {
4567 				/* disable accept events */
4568 				struct timeval tv;
4569 				configure_handler_event_types(0);
4570 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4571 				tv.tv_usec = 0L;
4572 				memset(&slowaccept_event, 0,
4573 					sizeof(slowaccept_event));
4574 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4575 					handle_slowaccept_timeout, NULL);
4576 				(void)event_base_set(data->event.ev_base,
4577 					&slowaccept_event);
4578 				(void)event_add(&slowaccept_event, &tv);
4579 				slowaccept = 1;
4580 				/* We don't want to spam the logs here */
4581 			}
4582 		} else if (errno != EINTR
4583 			&& errno != EWOULDBLOCK
4584 #ifdef ECONNABORTED
4585 			&& errno != ECONNABORTED
4586 #endif /* ECONNABORTED */
4587 #ifdef EPROTO
4588 			&& errno != EPROTO
4589 #endif /* EPROTO */
4590 			) {
4591 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4592 		}
4593 		return;
4594 	}
4595 
4596 	if (reject) {
4597 		shutdown(s, SHUT_RDWR);
4598 		close(s);
4599 		return;
4600 	}
4601 
4602 	/*
4603 	 * This region is deallocated when the TCP connection is
4604 	 * closed by the TCP handler.
4605 	 */
4606 	tcp_region = region_create(xalloc, free);
4607 	tcp_data = (struct tcp_handler_data *) region_alloc(
4608 		tcp_region, sizeof(struct tcp_handler_data));
4609 	tcp_data->region = tcp_region;
4610 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4611 		compression_table_size, compressed_dnames);
4612 	tcp_data->nsd = data->nsd;
4613 	tcp_data->query_count = 0;
4614 #ifdef HAVE_SSL
4615 	tcp_data->shake_state = tls_hs_none;
4616 	tcp_data->tls = NULL;
4617 #endif
4618 	tcp_data->prev = NULL;
4619 	tcp_data->next = NULL;
4620 
4621 	tcp_data->query_state = QUERY_PROCESSED;
4622 	tcp_data->bytes_transmitted = 0;
4623 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4624 	tcp_data->query->addrlen = addrlen;
4625 
4626 	tcp_data->tcp_no_more_queries = 0;
4627 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4628 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4629 		/* very busy, give smaller timeout */
4630 		tcp_data->tcp_timeout = 200;
4631 	}
4632 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4633 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4634 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
4635 
4636 #ifdef USE_DNSTAP
4637 	/* save the address of the connection */
4638 	tcp_data->socket = data->socket;
4639 #endif /* USE_DNSTAP */
4640 
4641 #ifdef HAVE_SSL
4642 	if (data->tls_accept) {
4643 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4644 		if(!tcp_data->tls) {
4645 			close(s);
4646 			return;
4647 		}
4648 		tcp_data->shake_state = tls_hs_read;
4649 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4650 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4651 			  handle_tls_reading, tcp_data);
4652 	} else {
4653 #endif
4654 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4655 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4656 			  handle_tcp_reading, tcp_data);
4657 #ifdef HAVE_SSL
4658 	}
4659 #endif
4660 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4661 		log_msg(LOG_ERR, "cannot set tcp event base");
4662 		close(s);
4663 		region_destroy(tcp_region);
4664 		return;
4665 	}
4666 	if(event_add(&tcp_data->event, &timeout) != 0) {
4667 		log_msg(LOG_ERR, "cannot add tcp to event base");
4668 		close(s);
4669 		region_destroy(tcp_region);
4670 		return;
4671 	}
4672 	if(tcp_active_list) {
4673 		tcp_active_list->prev = tcp_data;
4674 		tcp_data->next = tcp_active_list;
4675 	}
4676 	tcp_active_list = tcp_data;
4677 
4678 	/*
4679 	 * Keep track of the total number of TCP handlers installed so
4680 	 * we can stop accepting connections when the maximum number
4681 	 * of simultaneous TCP connections is reached.
4682 	 *
4683 	 * If tcp-reject-overflow is enabled, however, then we do not
4684 	 * change the handler event type; we keep it as-is and accept
4685 	 * overflow TCP connections only so that we can forcibly kill
4686 	 * them off.
4687 	 */
4688 	++data->nsd->current_tcp_count;
4689 	if (!data->nsd->options->tcp_reject_overflow &&
4690 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4691 	{
4692 		configure_handler_event_types(0);
4693 	}
4694 }
4695 
4696 static void
4697 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4698 {
4699 	size_t i;
4700 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4701 	for (i = 0; i < nsd->child_count; ++i) {
4702 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4703 			if (write(nsd->children[i].child_fd,
4704 				&command,
4705 				sizeof(command)) == -1)
4706 			{
4707 				if(errno != EAGAIN && errno != EINTR)
4708 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4709 					(int) command,
4710 					(int) nsd->children[i].pid,
4711 					strerror(errno));
4712 			} else if (timeout > 0) {
4713 				(void)block_read(NULL,
4714 					nsd->children[i].child_fd,
4715 					&command, sizeof(command), timeout);
4716 			}
4717 			fsync(nsd->children[i].child_fd);
4718 			close(nsd->children[i].child_fd);
4719 			nsd->children[i].child_fd = -1;
4720 		}
4721 	}
4722 }
4723 
4724 static void
4725 send_children_quit(struct nsd* nsd)
4726 {
4727 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4728 	send_children_command(nsd, NSD_QUIT, 0);
4729 }
4730 
4731 static void
4732 send_children_quit_and_wait(struct nsd* nsd)
4733 {
4734 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4735 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4736 }
4737 
4738 #ifdef BIND8_STATS
4739 static void
4740 set_children_stats(struct nsd* nsd)
4741 {
4742 	size_t i;
4743 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4744 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
4745 	for (i = 0; i < nsd->child_count; ++i) {
4746 		nsd->children[i].need_to_send_STATS = 1;
4747 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
4748 	}
4749 }
4750 #endif /* BIND8_STATS */
4751 
4752 static void
4753 configure_handler_event_types(short event_types)
4754 {
4755 	size_t i;
4756 
4757 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4758 		struct event* handler = &tcp_accept_handlers[i].event;
4759 		if(event_types) {
4760 			/* reassign */
4761 			int fd = handler->ev_fd;
4762 			struct event_base* base = handler->ev_base;
4763 			if(tcp_accept_handlers[i].event_added)
4764 				event_del(handler);
4765 			memset(handler, 0, sizeof(*handler));
4766 			event_set(handler, fd, event_types,
4767 				handle_tcp_accept, &tcp_accept_handlers[i]);
4768 			if(event_base_set(base, handler) != 0)
4769 				log_msg(LOG_ERR, "conhand: cannot event_base");
4770 			if(event_add(handler, NULL) != 0)
4771 				log_msg(LOG_ERR, "conhand: cannot event_add");
4772 			tcp_accept_handlers[i].event_added = 1;
4773 		} else {
4774 			/* remove */
4775 			if(tcp_accept_handlers[i].event_added) {
4776 				event_del(handler);
4777 				tcp_accept_handlers[i].event_added = 0;
4778 			}
4779 		}
4780 	}
4781 }
4782