1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <limits.h>
15 #include <sys/socket.h>
16 #include <sys/uio.h>
17 #include <sys/wait.h>
18 
19 #include <netinet/in.h>
20 #ifdef USE_TCP_FASTOPEN
21   #include <netinet/tcp.h>
22 #endif
23 #include <arpa/inet.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <signal.h>
36 #include <netdb.h>
37 #include <poll.h>
38 #ifndef SHUT_WR
39 #define SHUT_WR 1
40 #endif
41 #ifdef HAVE_MMAP
42 #include <sys/mman.h>
43 #endif /* HAVE_MMAP */
44 #ifdef HAVE_OPENSSL_RAND_H
45 #include <openssl/rand.h>
46 #endif
47 #ifdef HAVE_OPENSSL_SSL_H
48 #include <openssl/ssl.h>
49 #endif
50 #ifdef HAVE_OPENSSL_ERR_H
51 #include <openssl/err.h>
52 #endif
53 #ifdef HAVE_OPENSSL_OCSP_H
54 #include <openssl/ocsp.h>
55 #endif
56 #ifndef USE_MINI_EVENT
57 #  ifdef HAVE_EVENT_H
58 #    include <event.h>
59 #  else
60 #    include <event2/event.h>
61 #    include "event2/event_struct.h"
62 #    include "event2/event_compat.h"
63 #  endif
64 #else
65 #  include "mini_event.h"
66 #endif
67 
68 #include "axfr.h"
69 #include "namedb.h"
70 #include "netio.h"
71 #include "xfrd.h"
72 #include "xfrd-tcp.h"
73 #include "xfrd-disk.h"
74 #include "difffile.h"
75 #include "nsec3.h"
76 #include "ipc.h"
77 #include "udb.h"
78 #include "remote.h"
79 #include "lookup3.h"
80 #include "rrl.h"
81 #ifdef USE_DNSTAP
82 #include "dnstap/dnstap_collector.h"
83 #endif
84 
85 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
86 
87 #ifdef USE_TCP_FASTOPEN
88   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
89   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
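  /* On Linux, net.ipv4.tcp_fastopen is a bitmask: 0x1 enables client
     support and 0x2 enables server support (0x3 enables both).  NSD
     only needs the server bit, hence the mask above. */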
90 #endif
91 
92 /*
93  * Data for the UDP handlers.
94  */
95 struct udp_handler_data
96 {
97 	struct nsd        *nsd;
98 	struct nsd_socket *socket;
99 	struct event       event;
100 };
101 
102 struct tcp_accept_handler_data {
103 	struct nsd        *nsd;
104 	struct nsd_socket *socket;
105 	int                event_added;
106 	struct event       event;
107 #ifdef HAVE_SSL
108 	/* handler accepts TLS connections on the dedicated port */
109 	int                tls_accept;
110 #endif
111 };
112 
113 /*
114  * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
116  * number of TCP connections.
117  */
118 static size_t tcp_accept_handler_count;
119 static struct tcp_accept_handler_data *tcp_accept_handlers;
120 
121 static struct event slowaccept_event;
122 static int slowaccept;
123 
124 #ifdef HAVE_SSL
125 static unsigned char *ocspdata = NULL;
126 static long ocspdata_len = 0;
127 #endif
128 
129 #ifdef NONBLOCKING_IS_BROKEN
130 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
131    read multiple times from a socket when reported ready by select. */
132 # define NUM_RECV_PER_SELECT (1)
133 #else /* !NONBLOCKING_IS_BROKEN */
134 # define NUM_RECV_PER_SELECT (100)
135 #endif /* NONBLOCKING_IS_BROKEN */
136 
137 #ifndef HAVE_MMSGHDR
138 struct mmsghdr {
139 	struct msghdr msg_hdr;
140 	unsigned int  msg_len;
141 };
142 #endif
143 
144 static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
145 static struct iovec iovecs[NUM_RECV_PER_SELECT];
146 static struct query *queries[NUM_RECV_PER_SELECT];
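
/*
 * Illustrative sketch (disabled), not NSD's actual handle_udp: the
 * msgs/iovecs/queries arrays above allow a batched receive, so that
 * where recvmmsg(2) is available one readable event can service up to
 * NUM_RECV_PER_SELECT datagrams in a single system call.  Assumes the
 * arrays were initialized to point at per-query packet buffers.
 */
#if 0
static void
example_batched_udp_receive(int fd)
{
	int i, received;

	/* fill msgs[0..received-1]; msg_len is set per datagram */
	received = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	for (i = 0; i < received; i++) {
		/* account for the bytes received into the packet buffer */
		buffer_skip(queries[i]->packet, msgs[i].msg_len);
		/* ... parse the query and write back a response ... */
	}
}
#endif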
147 
148 /*
149  * Data for the TCP connection handlers.
150  *
151  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
152  * blocking the entire server on a slow TCP connection, but does make
153  * reading from and writing to the socket more complicated.
154  *
 * Basically, whenever a read/write would block (indicated by
 * errno being set to EAGAIN) we remember the position we were reading
157  * from/writing to and return from the TCP reading/writing event
158  * handler.  When the socket becomes readable/writable again we
159  * continue from the same position.
160  */
161 struct tcp_handler_data
162 {
163 	/*
164 	 * The region used to allocate all TCP connection related
165 	 * data, including this structure.  This region is destroyed
166 	 * when the connection is closed.
167 	 */
168 	region_type*		region;
169 
170 	/*
171 	 * The global nsd structure.
172 	 */
173 	struct nsd*			nsd;
174 
175 	/*
176 	 * The current query data for this TCP connection.
177 	 */
178 	query_type*			query;
179 
180 	/*
181 	 * The query_state is used to remember if we are performing an
182 	 * AXFR, if we're done processing, or if we should discard the
183 	 * query and connection.
184 	 */
185 	query_state_type	query_state;
186 
187 	/*
188 	 * The event for the file descriptor and tcp timeout
189 	 */
190 	struct event event;
191 
192 	/*
193 	 * The bytes_transmitted field is used to remember the number
194 	 * of bytes transmitted when receiving or sending a DNS
195 	 * packet.  The count includes the two additional bytes used
196 	 * to specify the packet length on a TCP connection.
197 	 */
198 	size_t				bytes_transmitted;
199 
200 	/*
201 	 * The number of queries handled by this specific TCP connection.
202 	 */
203 	int					query_count;
204 
205 	/*
206 	 * The timeout in msec for this tcp connection
207 	 */
208 	int	tcp_timeout;
209 #ifdef HAVE_SSL
210 	/*
211 	 * TLS object.
212 	 */
213 	SSL* tls;
214 
215 	/*
216 	 * TLS handshake state.
217 	 */
218 	enum { tls_hs_none, tls_hs_read, tls_hs_write,
219 		tls_hs_read_event, tls_hs_write_event } shake_state;
220 #endif
221 	/* list of connections, for service of remaining tcp channels */
222 	struct tcp_handler_data *prev, *next;
223 };
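
/*
 * Illustrative sketch (disabled) of how bytes_transmitted makes the
 * non-blocking I/O above resumable; a hypothetical helper, not part
 * of NSD.  Here msg points at the complete wire message, i.e. the
 * two-octet network-order length prefix followed by the DNS payload,
 * and total is its full size.  A short write merely advances
 * bytes_transmitted; the next writable event continues from there.
 */
#if 0
static int
example_resumable_write(struct tcp_handler_data *data, int fd,
	const uint8_t *msg, size_t total)
{
	ssize_t sent = write(fd, msg + data->bytes_transmitted,
		total - data->bytes_transmitted);
	if (sent == -1)
		return (errno == EAGAIN || errno == EINTR) ? 0 : -1;
	data->bytes_transmitted += sent;
	return data->bytes_transmitted == total; /* 1 when complete */
}
#endif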
224 /* global that is the list of active tcp channels */
225 static struct tcp_handler_data *tcp_active_list = NULL;
226 
227 /*
228  * Handle incoming queries on the UDP server sockets.
229  */
230 static void handle_udp(int fd, short event, void* arg);
231 
232 /*
233  * Handle incoming connections on the TCP sockets.  These handlers
234  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
235  * connection) but are disabled when the number of current TCP
236  * connections is equal to the maximum number of TCP connections.
237  * Disabling is done by changing the handler to wait for the
238  * NETIO_EVENT_NONE type.  This is done using the function
239  * configure_tcp_accept_handlers.
240  */
241 static void handle_tcp_accept(int fd, short event, void* arg);
242 
243 /*
244  * Handle incoming queries on a TCP connection.  The TCP connections
245  * are configured to be non-blocking and the handler may be called
246  * multiple times before a complete query is received.
247  */
248 static void handle_tcp_reading(int fd, short event, void* arg);
249 
250 /*
251  * Handle outgoing responses on a TCP connection.  The TCP connections
252  * are configured to be non-blocking and the handler may be called
253  * multiple times before a complete response is sent.
254  */
255 static void handle_tcp_writing(int fd, short event, void* arg);
256 
257 #ifdef HAVE_SSL
258 /* Create SSL object and associate fd */
259 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
260 /*
261  * Handle TLS handshake. May be called multiple times if incomplete.
262  */
263 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
264 
265 /*
266  * Handle incoming queries on a TLS over TCP connection.  The TLS
267  * connections are configured to be non-blocking and the handler may
268  * be called multiple times before a complete query is received.
269  */
270 static void handle_tls_reading(int fd, short event, void* arg);
271 
272 /*
273  * Handle outgoing responses on a TLS over TCP connection.  The TLS
274  * connections are configured to be non-blocking and the handler may
275  * be called multiple times before a complete response is sent.
276  */
277 static void handle_tls_writing(int fd, short event, void* arg);
278 #endif
279 
280 /*
 * Send all children the quit command without blocking, then close the pipes.
282  */
283 static void send_children_quit(struct nsd* nsd);
284 /* same, for shutdown time, waits for child to exit to avoid restart issues */
285 static void send_children_quit_and_wait(struct nsd* nsd);
286 
/* set children's flags to send NSD_STATS to them */
288 #ifdef BIND8_STATS
289 static void set_children_stats(struct nsd* nsd);
290 #endif /* BIND8_STATS */
291 
292 /*
293  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
294  */
295 static void configure_handler_event_types(short event_types);
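
/*
 * Illustrative usage (disabled): when the number of open TCP
 * connections reaches the maximum, the accept handlers are parked by
 * asking for no events; once a connection closes they are re-armed
 * for reads.  The connection counters here are hypothetical names.
 */
#if 0
	if (current_tcp_count == maximum_tcp_count)
		configure_handler_event_types(0);
	else
		configure_handler_event_types(EV_READ|EV_PERSIST);
#endif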
296 
297 static uint16_t *compressed_dname_offsets = 0;
298 static uint32_t compression_table_capacity = 0;
299 static uint32_t compression_table_size = 0;
300 static domain_type* compressed_dnames[MAXRRSPP];
301 
302 #ifdef USE_TCP_FASTOPEN
303 /* Checks to see if the kernel value must be manually changed in order for
304    TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void) {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However, the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	close(tcp_fastopen_fp);
}
328 #endif
329 
330 /*
331  * Remove the specified pid from the list of child pids.  Returns -1 if
 * the pid is not in the list, child_num otherwise.  The child's pid field is set to 0.
333  */
334 static int
335 delete_child_pid(struct nsd *nsd, pid_t pid)
336 {
337 	size_t i;
338 	for (i = 0; i < nsd->child_count; ++i) {
339 		if (nsd->children[i].pid == pid) {
340 			nsd->children[i].pid = 0;
341 			if(!nsd->children[i].need_to_exit) {
342 				if(nsd->children[i].child_fd != -1)
343 					close(nsd->children[i].child_fd);
344 				nsd->children[i].child_fd = -1;
345 				if(nsd->children[i].handler)
346 					nsd->children[i].handler->fd = -1;
347 			}
348 			return i;
349 		}
350 	}
351 	return -1;
352 }
353 
354 /*
355  * Restart child servers if necessary.
356  */
357 static int
358 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
359 	int* xfrd_sock_p)
360 {
361 	struct main_ipc_handler_data *ipc_data;
362 	size_t i;
363 	int sv[2];
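	/* after the socketpair() call below, sv[0] stays with the parent
	 * as child_fd and sv[1] goes to the forked child as parent_fd */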
364 
365 	/* Fork the child processes... */
366 	for (i = 0; i < nsd->child_count; ++i) {
367 		if (nsd->children[i].pid <= 0) {
368 			if (nsd->children[i].child_fd != -1)
369 				close(nsd->children[i].child_fd);
370 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
371 				log_msg(LOG_ERR, "socketpair: %s",
372 					strerror(errno));
373 				return -1;
374 			}
375 			nsd->children[i].child_fd = sv[0];
376 			nsd->children[i].parent_fd = sv[1];
377 			nsd->children[i].pid = fork();
378 			switch (nsd->children[i].pid) {
379 			default: /* SERVER MAIN */
380 				close(nsd->children[i].parent_fd);
381 				nsd->children[i].parent_fd = -1;
382 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
383 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
384 				}
385 				if(!nsd->children[i].handler)
386 				{
387 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
388 						region, sizeof(struct main_ipc_handler_data));
389 					ipc_data->nsd = nsd;
390 					ipc_data->child = &nsd->children[i];
391 					ipc_data->child_num = i;
392 					ipc_data->xfrd_sock = xfrd_sock_p;
393 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
394 					ipc_data->forward_mode = 0;
395 					ipc_data->got_bytes = 0;
396 					ipc_data->total_bytes = 0;
397 					ipc_data->acl_num = 0;
398 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
399 						region, sizeof(struct netio_handler));
400 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
401 					nsd->children[i].handler->timeout = NULL;
402 					nsd->children[i].handler->user_data = ipc_data;
403 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
404 					nsd->children[i].handler->event_handler = parent_handle_child_command;
405 					netio_add_handler(netio, nsd->children[i].handler);
406 				}
407 				/* clear any ongoing ipc */
408 				ipc_data = (struct main_ipc_handler_data*)
409 					nsd->children[i].handler->user_data;
410 				ipc_data->forward_mode = 0;
411 				/* restart - update fd */
412 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
413 				break;
414 			case 0: /* CHILD */
415 				/* the child need not be able to access the
416 				 * nsd.db file */
417 				namedb_close_udb(nsd->db);
418 #ifdef MEMCLEAN /* OS collects memory pages */
419 				region_destroy(region);
420 #endif
421 				nsd->pid = 0;
422 				nsd->child_count = 0;
423 				nsd->server_kind = nsd->children[i].kind;
424 				nsd->this_child = &nsd->children[i];
425 				nsd->this_child->child_num = i;
				/* remove signal flags inherited from the parent;
				   the parent will handle them. */
428 				nsd->signal_hint_reload_hup = 0;
429 				nsd->signal_hint_reload = 0;
430 				nsd->signal_hint_child = 0;
431 				nsd->signal_hint_quit = 0;
432 				nsd->signal_hint_shutdown = 0;
433 				nsd->signal_hint_stats = 0;
434 				nsd->signal_hint_statsusr = 0;
435 				close(*xfrd_sock_p);
436 				close(nsd->this_child->child_fd);
437 				nsd->this_child->child_fd = -1;
438 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
439 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
440 				}
441 				server_child(nsd);
442 				/* NOTREACH */
443 				exit(0);
444 			case -1:
445 				log_msg(LOG_ERR, "fork failed: %s",
446 					strerror(errno));
447 				return -1;
448 			}
449 		}
450 	}
451 	return 0;
452 }
453 
454 #ifdef BIND8_STATS
455 static void set_bind8_alarm(struct nsd* nsd)
456 {
	/* resync so that the next alarm is on the next whole stats period */
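	/* e.g. with a 3600s period at 12:42:17 UTC, time(NULL) % 3600 is
	 * 2537, so alarm(1063) fires on the 13:00:00 boundary */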
458 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
459 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
460 }
461 #endif
462 
463 /* set zone stat ids for zones initially read in */
464 static void
465 zonestatid_tree_set(struct nsd* nsd)
466 {
467 	struct radnode* n;
468 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
469 		zone_type* zone = (zone_type*)n->elem;
470 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
471 	}
472 }
473 
474 #ifdef USE_ZONE_STATS
475 void
476 server_zonestat_alloc(struct nsd* nsd)
477 {
478 	size_t num = (nsd->options->zonestatnames->count==0?1:
479 			nsd->options->zonestatnames->count);
480 	size_t sz = sizeof(struct nsdst)*num;
481 	char tmpfile[256];
482 	uint8_t z = 0;
483 
484 	/* file names */
485 	nsd->zonestatfname[0] = 0;
486 	nsd->zonestatfname[1] = 0;
487 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
488 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
489 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
490 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
491 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
492 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
493 
494 	/* file descriptors */
495 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
496 	if(nsd->zonestatfd[0] == -1) {
497 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
498 			strerror(errno));
499 		exit(1);
500 	}
501 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
503 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
504 			strerror(errno));
505 		close(nsd->zonestatfd[0]);
506 		unlink(nsd->zonestatfname[0]);
507 		exit(1);
508 	}
509 
510 #ifdef HAVE_MMAP
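	/* extend each file to sz bytes by seeking to offset sz-1 and
	 * writing one zero byte; mmap needs backing store for the whole
	 * mapped range */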
511 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
512 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
513 			strerror(errno));
514 		exit(1);
515 	}
516 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
517 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
518 			nsd->zonestatfname[0], strerror(errno));
519 		exit(1);
520 	}
521 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
522 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
523 			strerror(errno));
524 		exit(1);
525 	}
526 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
527 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
528 			nsd->zonestatfname[1], strerror(errno));
529 		exit(1);
530 	}
531 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
532 		MAP_SHARED, nsd->zonestatfd[0], 0);
533 	if(nsd->zonestat[0] == MAP_FAILED) {
534 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
535 		unlink(nsd->zonestatfname[0]);
536 		unlink(nsd->zonestatfname[1]);
537 		exit(1);
538 	}
539 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
540 		MAP_SHARED, nsd->zonestatfd[1], 0);
541 	if(nsd->zonestat[1] == MAP_FAILED) {
542 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
543 		unlink(nsd->zonestatfname[0]);
544 		unlink(nsd->zonestatfname[1]);
545 		exit(1);
546 	}
547 	memset(nsd->zonestat[0], 0, sz);
548 	memset(nsd->zonestat[1], 0, sz);
549 	nsd->zonestatsize[0] = num;
550 	nsd->zonestatsize[1] = num;
551 	nsd->zonestatdesired = num;
552 	nsd->zonestatsizenow = num;
553 	nsd->zonestatnow = nsd->zonestat[0];
554 #endif /* HAVE_MMAP */
555 }
556 
557 void
558 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
559 {
560 #ifdef HAVE_MMAP
561 #ifdef MREMAP_MAYMOVE
562 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
563 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
564 		MREMAP_MAYMOVE);
565 	if(nsd->zonestat[idx] == MAP_FAILED) {
566 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
567 		exit(1);
568 	}
#else /* !MREMAP_MAYMOVE */
570 	if(msync(nsd->zonestat[idx],
571 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
572 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
573 	if(munmap(nsd->zonestat[idx],
574 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
575 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
576 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
577 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
578 	if(nsd->zonestat[idx] == MAP_FAILED) {
579 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
580 		exit(1);
581 	}
#endif /* MREMAP_MAYMOVE */
583 #endif /* HAVE_MMAP */
584 }
585 
586 /* realloc the zonestat array for the one that is not currently in use,
587  * to match the desired new size of the array (if applicable) */
588 void
589 server_zonestat_realloc(struct nsd* nsd)
590 {
591 #ifdef HAVE_MMAP
592 	uint8_t z = 0;
593 	size_t sz;
594 	int idx = 0; /* index of the zonestat array that is not in use */
595 	if(nsd->zonestatnow == nsd->zonestat[0])
596 		idx = 1;
597 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
598 		return;
599 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
600 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
601 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
602 			strerror(errno));
603 		exit(1);
604 	}
605 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
606 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
607 			nsd->zonestatfname[idx], strerror(errno));
608 		exit(1);
609 	}
610 	zonestat_remap(nsd, idx, sz);
611 	/* zero the newly allocated region */
612 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
613 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
614 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
615 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
616 	}
617 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
618 #endif /* HAVE_MMAP */
619 }
620 
/* switch over to the other array for the new children, which briefly
 * coexist with the old children, so that the two generations do not
 * write to the same statistics arrays. */
624 void
625 server_zonestat_switch(struct nsd* nsd)
626 {
627 	if(nsd->zonestatnow == nsd->zonestat[0]) {
628 		nsd->zonestatnow = nsd->zonestat[1];
629 		nsd->zonestatsizenow = nsd->zonestatsize[1];
630 	} else {
631 		nsd->zonestatnow = nsd->zonestat[0];
632 		nsd->zonestatsizenow = nsd->zonestatsize[0];
633 	}
634 }
635 #endif /* USE_ZONE_STATS */
636 
637 static void
638 cleanup_dname_compression_tables(void *ptr)
639 {
640 	free(ptr);
641 	compressed_dname_offsets = NULL;
642 	compression_table_capacity = 0;
643 }
644 
645 static void
646 initialize_dname_compression_tables(struct nsd *nsd)
647 {
648 	size_t needed = domain_table_count(nsd->db->domains) + 1;
649 	needed += EXTRA_DOMAIN_NUMBERS;
650 	if(compression_table_capacity < needed) {
651 		if(compressed_dname_offsets) {
652 			region_remove_cleanup(nsd->db->region,
653 				cleanup_dname_compression_tables,
654 				compressed_dname_offsets);
655 			free(compressed_dname_offsets);
656 		}
657 		compressed_dname_offsets = (uint16_t *) xmallocarray(
658 			needed, sizeof(uint16_t));
659 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
660 			compressed_dname_offsets);
661 		compression_table_capacity = needed;
662 		compression_table_size=domain_table_count(nsd->db->domains)+1;
663 	}
664 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
665 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
666 }
667 
668 static int
669 set_reuseport(struct nsd_socket *sock)
670 {
671 #ifdef SO_REUSEPORT
672 	int on = 1;
673 #ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB, which load balances incoming
	 * connections the way SO_REUSEPORT does on Linux.  That is what
	 * users expect from the config option in nsd.conf; anything that
	 * actually needs local address and port reuse must set
	 * SO_REUSEPORT as well, so assume _LB is the behavior wanted.
	 */
680 	int opt = SO_REUSEPORT_LB;
681 	static const char optname[] = "SO_REUSEPORT_LB";
682 #else /* !SO_REUSEPORT_LB */
683 	int opt = SO_REUSEPORT;
684 	static const char optname[] = "SO_REUSEPORT";
685 #endif /* SO_REUSEPORT_LB */
686 
687 	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
688 		return 1;
689 	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
690 		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
691 			optname, strerror(errno));
692 	}
693 	return -1;
694 #else
695 	(void)sock;
696 #endif /* SO_REUSEPORT */
697 
698 	return 0;
699 }
700 
701 static int
702 set_reuseaddr(struct nsd_socket *sock)
703 {
704 #ifdef SO_REUSEADDR
705 	int on = 1;
706 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
707 		return 1;
708 	}
709 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
710 		strerror(errno));
711 	return -1;
#else
	(void)sock;
#endif /* SO_REUSEADDR */
	return 0;
714 }
715 
716 static int
717 set_rcvbuf(struct nsd_socket *sock, int rcv)
718 {
719 #ifdef SO_RCVBUF
720 #ifdef SO_RCVBUFFORCE
721 	if(0 == setsockopt(
722 		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
723 	{
724 		return 1;
725 	}
726 	if(errno == EPERM || errno == ENOBUFS) {
727 		return 0;
728 	}
729 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
730 		strerror(errno));
731 	return -1;
732 #else /* !SO_RCVBUFFORCE */
733 	if (0 == setsockopt(
734 		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
735 	{
736 		return 1;
737 	}
738 	if(errno == ENOSYS || errno == ENOBUFS) {
739 		return 0;
740 	}
741 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
742 		strerror(errno));
743 	return -1;
744 #endif /* SO_RCVBUFFORCE */
745 #endif /* SO_RCVBUF */
746 
747 	return 0;
748 }
749 
750 static int
751 set_sndbuf(struct nsd_socket *sock, int snd)
752 {
753 #ifdef SO_SNDBUF
754 #ifdef SO_SNDBUFFORCE
755 	if(0 == setsockopt(
756 		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
757 	{
758 		return 1;
759 	}
760 	if(errno == EPERM || errno == ENOBUFS) {
761 		return 0;
762 	}
763 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
764 		strerror(errno));
765 	return -1;
766 #else /* !SO_SNDBUFFORCE */
767 	if(0 == setsockopt(
768 		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
769 	{
770 		return 1;
771 	}
772 	if(errno == ENOSYS || errno == ENOBUFS) {
773 		return 0;
774 	}
775 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
776 		strerror(errno));
777 	return -1;
778 #endif /* SO_SNDBUFFORCE */
779 #endif /* SO_SNDBUF */
780 
781 	return 0;
782 }
783 
784 static int
785 set_nonblock(struct nsd_socket *sock)
786 {
787 	const char *socktype =
788 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
789 
790 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
		log_msg(LOG_ERR, "fcntl(..., O_NONBLOCK) failed for %s: %s",
792 			socktype, strerror(errno));
793 		return -1;
794 	}
795 
796 	return 1;
797 }
798 
799 static int
800 set_ipv6_v6only(struct nsd_socket *sock)
801 {
802 #ifdef INET6
803 #ifdef IPV6_V6ONLY
804 	int on = 1;
805 	const char *socktype =
806 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
807 
808 	if(0 == setsockopt(
809 		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
810 	{
811 		return 1;
812 	}
813 
814 	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
815 		socktype, strerror(errno));
816 	return -1;
817 #endif /* IPV6_V6ONLY */
818 #endif /* INET6 */
819 
820 	return 0;
821 }
822 
823 static int
824 set_ipv6_use_min_mtu(struct nsd_socket *sock)
825 {
826 #if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU))
827 #if defined(IPV6_USE_MIN_MTU)
828 	/* There is no fragmentation of IPv6 datagrams during forwarding in the
829 	 * network. Therefore we do not send UDP datagrams larger than the
830 	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
831 	 * larger if the network stack supports IPV6_USE_MIN_MTU.
832 	 */
833 	int opt = IPV6_USE_MIN_MTU;
834 	int optval = 1;
835 	static const char optname[] = "IPV6_USE_MIN_MTU";
836 #elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams, so set
	 * the MTU to the minimum MTU to get the same behavior.
	 */
840 	int opt = IPV6_MTU;
841 	int optval = IPV6_MIN_MTU;
842 	static const char optname[] = "IPV6_MTU";
843 #endif
844 	if(0 == setsockopt(
845 		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
846 	{
847 		return 1;
848 	}
849 
850 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
851 		optname, strerror(errno));
852 	return -1;
853 #else
854 	(void)sock;
855 #endif /* INET6 */
856 
857 	return 0;
858 }
859 
860 static int
861 set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
862 {
863 	int ret = 0;
864 
865 #if defined(IP_MTU_DISCOVER)
866 	int opt = IP_MTU_DISCOVER;
867 	int optval;
868 # if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0.  Fragmentation is allowed
	 * if and only if the packet size exceeds the outgoing interface MTU
	 * or the packet encounters a link with a smaller MTU in the network.
	 * This mitigates DNS fragmentation attacks by preventing forged PMTU
	 * information.  FreeBSD already has the same semantics without
	 * setting the option.
	 */
876 	optval = IP_PMTUDISC_OMIT;
877 	if(0 == setsockopt(
878 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
879 	{
880 		return 1;
881 	}
882 
883 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
884 		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
885 # endif /* IP_PMTUDISC_OMIT */
886 # if defined(IP_PMTUDISC_DONT)
887 	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
888 	optval = IP_PMTUDISC_DONT;
889 	if(0 == setsockopt(
890 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
891 	{
892 		return 1;
893 	}
894 
895 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
896 		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
897 # endif
898 	ret = -1;
899 #elif defined(IP_DONTFRAG)
900 	int off = 0;
901 	if (0 == setsockopt(
902 		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
903 	{
904 		return 1;
905 	}
906 
907 	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
908 		strerror(errno));
909 	ret = -1;
910 #else
911 	(void)sock;
912 #endif
913 
914 	return ret;
915 }
916 
917 static int
918 set_ip_freebind(struct nsd_socket *sock)
919 {
920 #ifdef IP_FREEBIND
921 	int on = 1;
922 	const char *socktype =
923 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
924 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
925 	{
926 		return 1;
927 	}
928 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
929 		socktype, strerror(errno));
930 	return -1;
931 #else
932 	(void)sock;
933 #endif /* IP_FREEBIND */
934 
935 	return 0;
936 }
937 
938 static int
939 set_ip_transparent(struct nsd_socket *sock)
940 {
941 #if defined(IP_TRANSPARENT)
942 	int on = 1;
943 	const char *socktype =
944 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
945 	if(0 == setsockopt(
946 		sock->s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)))
947 	{
948 		return 1;
949 	}
950 
951 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
952 		"IP_TRANSPARENT", socktype, strerror(errno));
953 	return -1;
954 #elif defined(SO_BINDANY)
955 	int on = 1;
956 	const char *socktype =
957 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
958 	if(0 == setsockopt(
959 		sock->s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)))
960 	{
961 		return 1;
962 	}
963 
964 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
965 		"SO_BINDANY", socktype, strerror(errno));
966 	return -1;
967 #else
968 	(void)sock;
969 #endif
970 
971 	return 0;
972 }
973 
974 static int
975 set_tcp_maxseg(struct nsd_socket *sock, int mss)
976 {
977 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
978 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
979 		return 1;
980 	}
981 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
982 		strerror(errno));
983 	return -1;
984 #else
985 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
986 #endif
987 	return 0;
988 }
989 
990 #ifdef USE_TCP_FASTOPEN
991 static int
992 set_tcp_fastopen(struct nsd_socket *sock)
993 {
	/* qlen specifies how many outstanding TFO requests to allow.  The
	 * limit is a defense against IP spoofing attacks as suggested in
	 * RFC 7413.
	 */
997 	int qlen;
998 
999 #ifdef __APPLE__
	/* The macOS implementation only supports a qlen of 1 via this call. The
1001 	 * actual value is configured by the net.inet.tcp.fastopen_backlog
1002 	 * kernel parameter.
1003 	 */
1004 	qlen = 1;
1005 #else
1006 	/* 5 is recommended on Linux. */
1007 	qlen = 5;
1008 #endif
1009 	if (0 == setsockopt(
1010 		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
1011 	{
1012 		return 1;
1013 	}
1014 
1015 	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s"
				 "; this is most likely because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
1021 			strerror(errno));
1022 	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
1023 	 * disabled, except when verbosity enabled for debugging
1024 	 */
1025 	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
1026 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
1027 			strerror(errno));
1028 	}
1029 
1030 	return (errno == ENOPROTOOPT ? 0 : -1);
1031 }
1032 #endif /* USE_TCP_FASTOPEN */
1033 
1034 static int
1035 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1036 {
1037 	int rcv = 1*1024*1024, snd = 1*1024*1024;
1038 
1039 	if(-1 == (sock->s = socket(
1040 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1041 	{
1042 #ifdef INET6
1043 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1044 		   (sock->addr.ai_family == AF_INET6) &&
1045 		   (errno == EAFNOSUPPORT))
1046 		{
1047 			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1048 				"not supported");
1049 			return 0;
1050 		}
1051 #endif
1052 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1053 		return -1;
1054 	}
1055 
1056 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1057 		*reuseport_works = (set_reuseport(sock) == 1);
1058 
1059 	if(nsd->options->receive_buffer_size > 0)
1060 		rcv = nsd->options->receive_buffer_size;
1061 	if(set_rcvbuf(sock, rcv) == -1)
1062 		return -1;
1063 
1064 	if(nsd->options->send_buffer_size > 0)
1065 		snd = nsd->options->send_buffer_size;
1066 	if(set_sndbuf(sock, snd) == -1)
1067 		return -1;
1068 #ifdef INET6
1069 	if(sock->addr.ai_family == AF_INET6) {
1070 		if(set_ipv6_v6only(sock) == -1 ||
1071 		   set_ipv6_use_min_mtu(sock) == -1)
1072 			return -1;
1073 	} else
1074 #endif /* INET6 */
1075 	if(sock->addr.ai_family == AF_INET) {
1076 		if(set_ipv4_no_pmtu_disc(sock) == -1)
1077 			return -1;
1078 	}
1079 
1080 	/* Set socket to non-blocking. Otherwise, on operating systems
1081 	 * with thundering herd problems, the UDP recv could block
1082 	 * after select returns readable.
1083 	 */
1084 	set_nonblock(sock);
1085 
1086 	if(nsd->options->ip_freebind)
1087 		(void)set_ip_freebind(sock);
1088 	if(nsd->options->ip_transparent)
1089 		(void)set_ip_transparent(sock);
1090 
1091 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1092 		char buf[256];
1093 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1094 		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1095 			buf, strerror(errno));
1096 		return -1;
1097 	}
1098 
1099 	return 1;
1100 }
1101 
1102 static int
1103 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1104 {
1105 #ifdef USE_TCP_FASTOPEN
1106 	report_tcp_fastopen_config();
1107 #endif
1108 
1109 	(void)reuseport_works;
1110 
1111 	if(-1 == (sock->s = socket(
1112 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1113 	{
1114 #ifdef INET6
1115 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1116 		   (sock->addr.ai_family == AF_INET6) &&
1117 		   (errno == EAFNOSUPPORT))
1118 		{
1119 			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1120 			                     "not supported");
1121 			return 0;
1122 		}
1123 #endif /* INET6 */
1124 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1125 		return -1;
1126 	}
1127 
1128 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1129 		*reuseport_works = (set_reuseport(sock) == 1);
1130 
1131 	(void)set_reuseaddr(sock);
1132 
1133 #ifdef INET6
1134 	if(sock->addr.ai_family == AF_INET6) {
1135 		if (set_ipv6_v6only(sock) == -1 ||
1136 		    set_ipv6_use_min_mtu(sock) == -1)
1137 			return -1;
1138 	}
1139 #endif
1140 
1141 	if(nsd->tcp_mss > 0)
1142 		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (Stevens UNP p463): if the TCP listening socket is blocking, it
	   may block in accept(), even if select() says it is readable. */
1145 	(void)set_nonblock(sock);
1146 	if(nsd->options->ip_freebind)
1147 		(void)set_ip_freebind(sock);
1148 	if(nsd->options->ip_transparent)
1149 		(void)set_ip_transparent(sock);
1150 
1151 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1152 		char buf[256];
1153 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1154 		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1155 			buf, strerror(errno));
1156 		return -1;
1157 	}
1158 
1159 #ifdef USE_TCP_FASTOPEN
1160 	(void)set_tcp_fastopen(sock);
1161 #endif
1162 
1163 	if(listen(sock->s, TCP_BACKLOG) == -1) {
1164 		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1165 		return -1;
1166 	}
1167 
1168 	return 1;
1169 }
1170 
1171 /*
 * Initialize the server, set up reuseport, and create and bind the sockets.
1173  */
1174 int
1175 server_init(struct nsd *nsd)
1176 {
1177 	size_t i;
1178 	int reuseport = 1; /* Determine if REUSEPORT works. */
1179 
1180 	/* open server interface ports */
1181 	for(i = 0; i < nsd->ifs; i++) {
1182 		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1183 		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1184 		{
1185 			return -1;
1186 		}
1187 	}
1188 
1189 	if(nsd->reuseport && reuseport) {
1190 		size_t ifs = nsd->ifs * nsd->reuseport;
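		/* e.g. 2 configured interfaces with 4 server processes
		 * (nsd->reuseport holds the server count when the option
		 * is enabled) yields ifs == 8: one UDP socket per
		 * interface per server */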
1191 
		/* increase the size of the interface arrays; there are going
		 * to be separate interface file descriptors for every server
		 * instance */
1195 		region_remove_cleanup(nsd->region, free, nsd->udp);
1196 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1197 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1198 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1199 		region_add_cleanup(nsd->region, free, nsd->udp);
1200 		region_add_cleanup(nsd->region, free, nsd->tcp);
1201 
1202 		for(i = nsd->ifs; i < ifs; i++) {
1203 			nsd->udp[i].addr = nsd->udp[i%nsd->ifs].addr;
1204 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1205 				return -1;
1206 			}
1207 			/* Turn off REUSEPORT for TCP by copying the socket
1208 			 * file descriptor.
1209 			 */
1210 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1211 		}
1212 
1213 		nsd->ifs = ifs;
1214 	} else {
1215 		nsd->reuseport = 0;
1216 	}
1217 
1218 	return 0;
1219 }
1220 
/*
 * Prepare the server for takeoff.
 */
1225 int
1226 server_prepare(struct nsd *nsd)
1227 {
1228 #ifdef RATELIMIT
1229 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
1230 #ifdef HAVE_ARC4RANDOM
1231 	hash_set_raninit(arc4random());
1232 #else
1233 	uint32_t v = getpid() ^ time(NULL);
1234 	srandom((unsigned long)v);
1235 #  ifdef HAVE_SSL
1236 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1237 		hash_set_raninit(v);
1238 	else
1239 #  endif
1240 		hash_set_raninit(random());
1241 #endif
1242 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1243 		nsd->options->rrl_ratelimit,
1244 		nsd->options->rrl_whitelist_ratelimit,
1245 		nsd->options->rrl_slip,
1246 		nsd->options->rrl_ipv4_prefix_length,
1247 		nsd->options->rrl_ipv6_prefix_length);
1248 #endif /* RATELIMIT */
1249 
1250 	/* Open the database... */
1251 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
1252 		log_msg(LOG_ERR, "unable to open the database %s: %s",
1253 			nsd->dbfile, strerror(errno));
1254 		unlink(nsd->task[0]->fname);
1255 		unlink(nsd->task[1]->fname);
1256 #ifdef USE_ZONE_STATS
1257 		unlink(nsd->zonestatfname[0]);
1258 		unlink(nsd->zonestatfname[1]);
1259 #endif
1260 		xfrd_del_tempdir(nsd);
1261 		return -1;
1262 	}
1263 	/* check if zone files have been modified */
1264 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1265 	 * for all zones */
1266 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
1267 		nsd->options->database[0] == 0))
1268 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1269 	zonestatid_tree_set(nsd);
1270 
1271 	compression_table_capacity = 0;
1272 	initialize_dname_compression_tables(nsd);
1273 
1274 #ifdef	BIND8_STATS
1275 	/* Initialize times... */
1276 	time(&nsd->st.boot);
1277 	set_bind8_alarm(nsd);
1278 #endif /* BIND8_STATS */
1279 
1280 	return 0;
1281 }
1282 
1283 /*
1284  * Fork the required number of servers.
1285  */
1286 static int
1287 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1288 	int* xfrd_sock_p)
1289 {
1290 	size_t i;
1291 
1292 	/* Start all child servers initially.  */
1293 	for (i = 0; i < nsd->child_count; ++i) {
1294 		nsd->children[i].pid = 0;
1295 	}
1296 
1297 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1298 }
1299 
1300 void
1301 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1302 {
1303 	size_t i;
1304 
1305 	/* Close all the sockets... */
1306 	for (i = 0; i < n; ++i) {
1307 		if (sockets[i].s != -1) {
1308 			close(sockets[i].s);
1309 			sockets[i].s = -1;
1310 		}
1311 	}
1312 }
1313 
1314 /*
1315  * Close the sockets, shutdown the server and exit.
1316  * Does not return.
1317  */
1318 void
1319 server_shutdown(struct nsd *nsd)
1320 {
1321 	size_t i;
1322 
1323 	server_close_all_sockets(nsd->udp, nsd->ifs);
1324 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1325 	/* CHILD: close command channel to parent */
1326 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1327 	{
1328 		close(nsd->this_child->parent_fd);
1329 		nsd->this_child->parent_fd = -1;
1330 	}
1331 	/* SERVER: close command channels to children */
1332 	if(!nsd->this_child)
1333 	{
1334 		for(i=0; i < nsd->child_count; ++i)
1335 			if(nsd->children[i].child_fd != -1)
1336 			{
1337 				close(nsd->children[i].child_fd);
1338 				nsd->children[i].child_fd = -1;
1339 			}
1340 	}
1341 
1342 	tsig_finalize();
1343 #ifdef HAVE_SSL
1344 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1345 	if (nsd->tls_ctx)
1346 		SSL_CTX_free(nsd->tls_ctx);
1347 #endif
1348 
1349 #ifdef MEMCLEAN /* OS collects memory pages */
1350 #ifdef RATELIMIT
1351 	rrl_mmap_deinit_keep_mmap();
1352 #endif
1353 #ifdef USE_DNSTAP
1354 	dt_collector_destroy(nsd->dt_collector, nsd);
1355 #endif
1356 	udb_base_free_keep_mmap(nsd->task[0]);
1357 	udb_base_free_keep_mmap(nsd->task[1]);
1358 	namedb_close_udb(nsd->db); /* keeps mmap */
1359 	namedb_close(nsd->db);
1360 	nsd_options_destroy(nsd->options);
1361 	region_destroy(nsd->region);
1362 #endif
1363 	log_finalize();
1364 	exit(0);
1365 }
1366 
1367 void
1368 server_prepare_xfrd(struct nsd* nsd)
1369 {
1370 	char tmpfile[256];
1371 	/* create task mmaps */
1372 	nsd->mytask = 0;
1373 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1374 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1375 	nsd->task[0] = task_file_create(tmpfile);
1376 	if(!nsd->task[0]) {
1377 #ifdef USE_ZONE_STATS
1378 		unlink(nsd->zonestatfname[0]);
1379 		unlink(nsd->zonestatfname[1]);
1380 #endif
1381 		xfrd_del_tempdir(nsd);
1382 		exit(1);
1383 	}
1384 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1385 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1386 	nsd->task[1] = task_file_create(tmpfile);
1387 	if(!nsd->task[1]) {
1388 		unlink(nsd->task[0]->fname);
1389 #ifdef USE_ZONE_STATS
1390 		unlink(nsd->zonestatfname[0]);
1391 		unlink(nsd->zonestatfname[1]);
1392 #endif
1393 		xfrd_del_tempdir(nsd);
1394 		exit(1);
1395 	}
1396 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1397 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1398 	/* create xfrd listener structure */
1399 	nsd->xfrd_listener = region_alloc(nsd->region,
1400 		sizeof(netio_handler_type));
1401 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1402 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1403 	nsd->xfrd_listener->fd = -1;
1404 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1405 		nsd;
1406 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1407 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1408 }
1409 
1410 
1411 void
1412 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1413 {
1414 	pid_t pid;
1415 	int sockets[2] = {0,0};
1416 	struct ipc_handler_conn_data *data;
1417 
1418 	if(nsd->xfrd_listener->fd != -1)
1419 		close(nsd->xfrd_listener->fd);
1420 	if(del_db) {
1421 		/* recreate taskdb that xfrd was using, it may be corrupt */
1422 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1423 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1424 		nsd->task[1-nsd->mytask]->fname = NULL;
1425 		/* free alloc already, so udb does not shrink itself */
1426 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1427 		nsd->task[1-nsd->mytask]->alloc = NULL;
1428 		udb_base_free(nsd->task[1-nsd->mytask]);
1429 		/* create new file, overwrite the old one */
1430 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1431 		free(tmpfile);
1432 	}
1433 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1434 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1435 		return;
1436 	}
1437 	pid = fork();
1438 	switch (pid) {
1439 	case -1:
1440 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1441 		break;
1442 	default:
1443 		/* PARENT: close first socket, use second one */
1444 		close(sockets[0]);
1445 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1446 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1447 		}
1448 		if(del_db) xfrd_free_namedb(nsd);
		/* use the other task, not the one I am using, since if xfrd
		 * died and is restarted, the reload is using nsd->mytask */
1451 		nsd->mytask = 1 - nsd->mytask;
1452 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* NOTREACH */
1454 		break;
1455 	case 0:
1456 		/* CHILD: close second socket, use first one */
1457 		close(sockets[1]);
1458 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1459 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1460 		}
1461 		nsd->xfrd_listener->fd = sockets[0];
1462 		break;
1463 	}
1464 	/* server-parent only */
1465 	nsd->xfrd_listener->timeout = NULL;
1466 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1467 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1468 	/* clear ongoing ipc reads */
1469 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1470 	data->conn->is_reading = 0;
1471 }
1472 
1473 /** add all soainfo to taskdb */
1474 static void
1475 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1476 {
1477 	struct radnode* n;
1478 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1479 	/* add all SOA INFO to mytask */
1480 	udb_ptr_init(&task_last, taskudb);
1481 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1482 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1483 	}
1484 	udb_ptr_unlink(&task_last, taskudb);
1485 }
1486 
1487 void
1488 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1489 {
1490 	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1491 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1492 	 *   then they exchange and process.
1493 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1494 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1495 	 *   expire notifications can be sent back via a normal reload later
1496 	 *   (xfrd will wait for current running reload to finish if any).
1497 	 */
1498 	sig_atomic_t cmd = 0;
1499 	pid_t mypid;
1500 	int xfrd_sock = nsd->xfrd_listener->fd;
1501 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1502 	udb_ptr t;
1503 	if(!shortsoa) {
1504 		if(nsd->signal_hint_shutdown) {
1505 		shutdown:
1506 			log_msg(LOG_WARNING, "signal received, shutting down...");
1507 			server_close_all_sockets(nsd->udp, nsd->ifs);
1508 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1509 #ifdef HAVE_SSL
1510 			daemon_remote_close(nsd->rc);
1511 #endif
1512 			/* Unlink it if possible... */
1513 			unlinkpid(nsd->pidfile);
1514 			unlink(nsd->task[0]->fname);
1515 			unlink(nsd->task[1]->fname);
1516 #ifdef USE_ZONE_STATS
1517 			unlink(nsd->zonestatfname[0]);
1518 			unlink(nsd->zonestatfname[1]);
1519 #endif
1520 			/* write the nsd.db to disk, wait for it to complete */
1521 			udb_base_sync(nsd->db->udb, 1);
1522 			udb_base_close(nsd->db->udb);
1523 			server_shutdown(nsd);
1524 			exit(0);
1525 		}
1526 	}
1527 	if(shortsoa) {
1528 		/* put SOA in xfrd task because mytask may be in use */
1529 		taskudb = nsd->task[1-nsd->mytask];
1530 	}
1531 
1532 	add_all_soa_to_task(nsd, taskudb);
1533 	if(!shortsoa) {
1534 		/* wait for xfrd to signal task is ready, RELOAD signal */
1535 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1536 			cmd != NSD_RELOAD) {
1537 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1538 			exit(1);
1539 		}
1540 		if(nsd->signal_hint_shutdown) {
1541 			goto shutdown;
1542 		}
1543 	}
1544 	/* give xfrd our task, signal it with RELOAD_DONE */
1545 	task_process_sync(taskudb);
1546 	cmd = NSD_RELOAD_DONE;
1547 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1548 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1549 			(int)nsd->pid, strerror(errno));
1550 	}
1551 	mypid = getpid();
1552 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1553 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1554 			strerror(errno));
1555 	}
1556 
1557 	if(!shortsoa) {
		/* process the xfrd task work items (expiry data) */
1559 		nsd->mytask = 1 - nsd->mytask;
1560 		taskudb = nsd->task[nsd->mytask];
1561 		task_remap(taskudb);
1562 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1563 		while(!udb_ptr_is_null(&t)) {
1564 			task_process_expire(nsd->db, TASKLIST(&t));
1565 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1566 		}
1567 		udb_ptr_unlink(&t, taskudb);
1568 		task_clear(taskudb);
1569 
1570 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1571 		cmd = NSD_RELOAD_DONE;
1572 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1573 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1574 				(int)nsd->pid, strerror(errno));
1575 		}
1576 	}
1577 }
1578 
1579 #ifdef HAVE_SSL
1580 static void
1581 log_crypto_from_err(const char* str, unsigned long err)
1582 {
1583 	/* error:[error code]:[library name]:[function name]:[reason string] */
1584 	char buf[128];
1585 	unsigned long e;
1586 	ERR_error_string_n(err, buf, sizeof(buf));
1587 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1588 	while( (e=ERR_get_error()) ) {
1589 		ERR_error_string_n(e, buf, sizeof(buf));
1590 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1591 	}
1592 }
1593 
1594 void
1595 log_crypto_err(const char* str)
1596 {
1597 	log_crypto_from_err(str, ERR_get_error());
1598 }
1599 
1600 /** true if the ssl handshake error has to be squelched from the logs */
1601 static int
1602 squelch_err_ssl_handshake(unsigned long err)
1603 {
1604 	if(verbosity >= 3)
1605 		return 0; /* only squelch on low verbosity */
1606 	/* this is very specific, we could filter on ERR_GET_REASON()
1607 	 * (the third element in ERR_PACK) */
1608 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1609 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1610 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1611 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1612 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1613 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1614 #endif
1615 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1616 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1617 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1618 #  ifdef SSL_R_VERSION_TOO_LOW
1619 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1620 #  endif
1621 #endif
1622 		)
1623 		return 1;
1624 	return 0;
1625 }
1626 
1627 void
1628 perform_openssl_init(void)
1629 {
1630 	/* init SSL library */
1631 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1632 	ERR_load_crypto_strings();
1633 #endif
1634 	ERR_load_SSL_strings();
1635 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1636 	OpenSSL_add_all_algorithms();
1637 #else
1638 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1639 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1640 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1641 #endif
1642 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1643 	(void)SSL_library_init();
1644 #else
1645 	OPENSSL_init_ssl(0, NULL);
1646 #endif
1647 
1648 	if(!RAND_status()) {
1649 		/* try to seed it */
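		/* expand the pid^time seed below with a simple
		 * multiplicative recurrence to fill 256 bytes; a weak,
		 * last-resort seed, hence the warning that follows */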
1650 		unsigned char buf[256];
1651 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1652 		size_t i;
1653 		v = seed;
1654 		for(i=0; i<256/sizeof(v); i++) {
1655 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1656 			v = v*seed + (unsigned int)i;
1657 		}
1658 		RAND_seed(buf, 256);
1659 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1660 	}
1661 }
1662 
1663 static int
1664 get_ocsp(char *filename, unsigned char **ocsp)
1665 {
1666 	BIO *bio;
1667 	OCSP_RESPONSE *response;
1668 	int len = -1;
1669 	unsigned char *p, *buf;
1670 	assert(filename);
1671 
1672 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1673 		log_crypto_err("get_ocsp: BIO_new_file failed");
1674 		return -1;
1675 	}
1676 
1677 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1678 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1679 		BIO_free(bio);
1680 		return -1;
1681 	}
1682 
1683 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1684 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1685 		OCSP_RESPONSE_free(response);
1686 		BIO_free(bio);
1687 		return -1;
1688 	}
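	/* the NULL call above only computed the DER length; the second
	 * i2d_OCSP_RESPONSE() call below writes the bytes into buf,
	 * advancing p */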
1689 
1690 	if ((buf = malloc((size_t) len)) == NULL) {
1691 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1692 		OCSP_RESPONSE_free(response);
1693 		BIO_free(bio);
1694 		return -1;
1695 	}
1696 
1697 	p = buf;
1698 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1699 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1700 		free(buf);
1701 		OCSP_RESPONSE_free(response);
1702 		BIO_free(bio);
1703 		return -1;
1704 	}
1705 
1706 	OCSP_RESPONSE_free(response);
1707 	BIO_free(bio);
1708 
1709 	*ocsp = buf;
1710 	return len;
1711 }
1712 
/* further set up the ssl ctx after the keys are loaded */
1714 static void
1715 listen_sslctx_setup_2(void* ctxt)
1716 {
1717 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
1718 	(void)ctx;
1719 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1720 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1721 		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
1722 	}
1723 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1724 	if(1) {
1725 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1726 		if (!ecdh) {
1727 			log_crypto_err("could not find p256, not enabling ECDHE");
1728 		} else {
1729 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1730 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1731 			}
1732 			EC_KEY_free (ecdh);
1733 		}
1734 	}
1735 #endif
1736 }
1737 
1738 static int
1739 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1740 {
1741 	if(ocspdata) {
1742 		unsigned char *p;
1743 		if ((p=malloc(ocspdata_len)) == NULL) {
1744 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1745 			return SSL_TLSEXT_ERR_NOACK;
1746 		}
1747 		memcpy(p, ocspdata, ocspdata_len);
1748 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1749 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1750 			free(p);
1751 			return SSL_TLSEXT_ERR_NOACK;
1752 		}
1753 		return SSL_TLSEXT_ERR_OK;
1754 	} else {
1755 		return SSL_TLSEXT_ERR_NOACK;
1756 	}
1757 }
1758 
1759 SSL_CTX*
1760 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1761 {
1762 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1763 	if(!ctx) {
1764 		log_crypto_err("could not SSL_CTX_new");
1765 		return NULL;
1766 	}
	/* no SSLv2, SSLv3 because they have defects */
1768 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1769 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1770 		SSL_CTX_free(ctx);
1771 		return NULL;
1772 	}
1773 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1774 		!= SSL_OP_NO_SSLv3){
1775 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1776 		SSL_CTX_free(ctx);
		return NULL;
1778 	}
1779 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1780 	/* if we have tls 1.1 disable 1.0 */
1781 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1782 		!= SSL_OP_NO_TLSv1){
1783 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1784 		SSL_CTX_free(ctx);
1785 		return NULL;
1786 	}
1787 #endif
1788 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1789 	/* if we have tls 1.2 disable 1.1 */
1790 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1791 		!= SSL_OP_NO_TLSv1_1){
1792 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
1793 		SSL_CTX_free(ctx);
1794 		return NULL;
1795 	}
1796 #endif
1797 #if defined(SSL_OP_NO_RENEGOTIATION)
1798 	/* disable client renegotiation */
1799 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
1800 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
1801 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
1802 		SSL_CTX_free(ctx);
1803 		return NULL;
1804 	}
1805 #endif
1806 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
1807 	/* if we have sha256, set the cipher list to have no known vulns */
1808 	if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
1809 		log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
1810 #endif
1811 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
1812 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
1813 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
1814 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
1815 		SSL_CTX_free(ctx);
1816 		return NULL;
1817 	}
1818 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
1819 	SSL_CTX_set_security_level(ctx, 0);
1820 #endif
1821 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
1822 		log_msg(LOG_ERR, "error for cert file: %s", pem);
1823 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
1824 		SSL_CTX_free(ctx);
1825 		return NULL;
1826 	}
1827 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
1828 		log_msg(LOG_ERR, "error for private key file: %s", key);
1829 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
1830 		SSL_CTX_free(ctx);
1831 		return NULL;
1832 	}
1833 	if(!SSL_CTX_check_private_key(ctx)) {
1834 		log_msg(LOG_ERR, "error for key file: %s", key);
1835 		log_crypto_err("Error in SSL_CTX check_private_key");
1836 		SSL_CTX_free(ctx);
1837 		return NULL;
1838 	}
1839 	listen_sslctx_setup_2(ctx);
1840 	if(verifypem && verifypem[0]) {
1841 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
1842 			log_crypto_err("Error in SSL_CTX verify locations");
1843 			SSL_CTX_free(ctx);
1844 			return NULL;
1845 		}
1846 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
1847 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
1848 	}
1849 	return ctx;
1850 }
1851 
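/* create the server TLS context from the nsd options and optionally
 * load an OCSP response to staple; returns NULL on failure */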
1852 SSL_CTX*
1853 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
1854 {
1855 	char *key, *pem;
1856 	SSL_CTX *ctx;
1857 
1858 	key = nsd->options->tls_service_key;
1859 	pem = nsd->options->tls_service_pem;
1860 	if(!key || key[0] == 0) {
1861 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
1862 		return NULL;
1863 	}
1864 	if(!pem || pem[0] == 0) {
1865 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
1866 		return NULL;
1867 	}
1868 
1869 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL,
1870 	 * but draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
1871 	ctx = server_tls_ctx_setup(key, pem, verifypem);
1872 	if(!ctx) {
1873 		log_msg(LOG_ERR, "could not setup server TLS context");
1874 		return NULL;
1875 	}
1876 	if(ocspfile && ocspfile[0]) {
1877 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
1878 			log_crypto_err("Error reading OCSPfile");
1879 			SSL_CTX_free(ctx);
1880 			return NULL;
1881 		} else {
1882 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
1883 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
1884 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
1885 				SSL_CTX_free(ctx);
1886 				return NULL;
1887 			}
1888 		}
1889 	}
1890 	return ctx;
1891 }
1892 
1893 /* check if tcp_accept_handler_data is created for the dedicated TLS port */
1894 int
1895 using_tls_port(struct sockaddr* addr, const char* tls_port)
1896 {
1897 	in_port_t port = 0;
1898 
1899 	if (addr->sa_family == AF_INET)
1900 		port = ((struct sockaddr_in*)addr)->sin_port;
1901 #ifdef HAVE_STRUCT_SOCKADDR_IN6
1902 	else
1903 		port = ((struct sockaddr_in6*)addr)->sin6_port;
1904 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
1905 	if (atoi(tls_port) == ntohs(port))
1906 		return 1;
1907 
1908 	return 0;
1909 }
1910 #endif
1911 
1912 /* read sz bytes from socket s; pass timeout=-1 to block, else timeout is in seconds. Returns sz, 0 (closed), -1 (error), or -2 (timeout) */
1913 ssize_t
1914 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
1915 {
1916 	uint8_t* buf = (uint8_t*) p;
1917 	ssize_t total = 0;
1918 	struct pollfd fd;
1919 	memset(&fd, 0, sizeof(fd));
1920 	fd.fd = s;
1921 	fd.events = POLLIN;
1922 
1923 	while(total < sz) {
1924 		ssize_t ret;
1925 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
1926 		if(ret == -1) {
1927 			if(errno == EAGAIN)
1928 				/* blocking read */
1929 				continue;
1930 			if(errno == EINTR) {
1931 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1932 					return -1;
1933 				/* other signals can be handled later */
1934 				continue;
1935 			}
1936 			/* some error */
1937 			return -1;
1938 		}
1939 		if(ret == 0) {
1940 			/* operation timed out */
1941 			return -2;
1942 		}
1943 		ret = read(s, buf+total, sz-total);
1944 		if(ret == -1) {
1945 			if(errno == EAGAIN)
1946 				/* blocking read */
1947 				continue;
1948 			if(errno == EINTR) {
1949 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1950 					return -1;
1951 				/* other signals can be handled later */
1952 				continue;
1953 			}
1954 			/* some error */
1955 			return -1;
1956 		}
1957 		if(ret == 0) {
1958 			/* closed connection! */
1959 			return 0;
1960 		}
1961 		total += ret;
1962 	}
1963 	return total;
1964 }
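/* Typical use (sketch), as in the reload/ipc code below: read one
 * command, waiting at most RELOAD_SYNC_TIMEOUT seconds:
 *	sig_atomic_t cmd;
 *	if(block_read(nsd, fd, &cmd, sizeof(cmd), RELOAD_SYNC_TIMEOUT)
 *		== sizeof(cmd))
 *		... handle cmd ...
 */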
1965 
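/*
 * Walk the task list that xfrd handed over and apply every task to the
 * database; in between tasks, poll the command socket so that a parent
 * NSD_QUIT makes this reload child clean up the remaining transfer
 * files and exit promptly.
 */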
1966 static void
1967 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
1968 {
1969 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1970 	udb_ptr t, next;
1971 	udb_base* u = nsd->task[nsd->mytask];
1972 	udb_ptr_init(&next, u);
1973 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
1974 	udb_base_set_userdata(u, 0);
1975 	while(!udb_ptr_is_null(&t)) {
1976 		/* store next in list so this one can be deleted or reused */
1977 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
1978 		udb_rptr_zero(&TASKLIST(&t)->next, u);
1979 
1980 		/* process task t */
1981 		/* append results for task t and update last_task */
1982 		task_process_in_reload(nsd, u, last_task, &t);
1983 
1984 		/* go to next */
1985 		udb_ptr_set_ptr(&t, u, &next);
1986 
1987 		/* if the parent has quit, we must quit too, poll the fd for cmds */
1988 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1989 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1990 			if(cmd == NSD_QUIT) {
1991 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1992 				/* sync to disk (if needed) */
1993 				udb_base_sync(nsd->db->udb, 0);
1994 				/* unlink files of remainder of tasks */
1995 				while(!udb_ptr_is_null(&t)) {
1996 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
1997 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
1998 					}
1999 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2000 				}
2001 				udb_ptr_unlink(&t, u);
2002 				udb_ptr_unlink(&next, u);
2003 				exit(0);
2004 			}
2005 		}
2006 
2007 	}
2008 	udb_ptr_unlink(&t, u);
2009 	udb_ptr_unlink(&next, u);
2010 }
2011 
2012 #ifdef BIND8_STATS
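/* send the global statistics and the per-child query counters over the
 * command socket to the reload process */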
2013 static void
2014 parent_send_stats(struct nsd* nsd, int cmdfd)
2015 {
2016 	size_t i;
2017 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2018 		log_msg(LOG_ERR, "could not write stats to reload");
2019 		return;
2020 	}
2021 	for(i=0; i<nsd->child_count; i++)
2022 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2023 			sizeof(stc_type))) {
2024 			log_msg(LOG_ERR, "could not write stats to reload");
2025 			return;
2026 		}
2027 }
2028 
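/* read the statistics from the old parent, add current database sizes,
 * and append them as a stat_info task so xfrd can pick them up */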
2029 static void
2030 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2031 {
2032 	struct nsdst s;
2033 	stc_type* p;
2034 	size_t i;
2035 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2036 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2037 		log_msg(LOG_ERR, "could not read stats from oldpar");
2038 		return;
2039 	}
2040 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2041 	s.db_mem = region_get_mem(nsd->db->region);
2042 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2043 		nsd->child_count);
2044 	if(!p) return;
2045 	for(i=0; i<nsd->child_count; i++) {
2046 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2047 			sizeof(stc_type))
2048 			return;
2049 	}
2050 }
2051 #endif /* BIND8_STATS */
2052 
2053 /*
2054  * Reload the database, stop the parent, re-fork the children and
2055  * continue as server_main.
2056  */
2057 static void
2058 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2059 	int cmdsocket)
2060 {
2061 	pid_t mypid;
2062 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2063 	int ret;
2064 	udb_ptr last_task;
2065 	struct sigaction old_sigchld, ign_sigchld;
2066 	/* ignore SIGCHLD from the previous server_main that used this pid */
2067 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2068 	ign_sigchld.sa_handler = SIG_IGN;
2069 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2070 
2071 	/* see what tasks we got from xfrd */
2072 	task_remap(nsd->task[nsd->mytask]);
2073 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2074 	udb_compact_inhibited(nsd->db->udb, 1);
2075 	reload_process_tasks(nsd, &last_task, cmdsocket);
2076 	udb_compact_inhibited(nsd->db->udb, 0);
2077 	udb_compact(nsd->db->udb);
2078 
2079 #ifndef NDEBUG
2080 	if(nsd_debug_level >= 1)
2081 		region_log_stats(nsd->db->region);
2082 #endif /* NDEBUG */
2083 	/* sync to disk (if needed) */
2084 	udb_base_sync(nsd->db->udb, 0);
2085 
2086 	initialize_dname_compression_tables(nsd);
2087 
2088 #ifdef BIND8_STATS
2089 	/* Restart dumping stats if required.  */
2090 	time(&nsd->st.boot);
2091 	set_bind8_alarm(nsd);
2092 #endif
2093 #ifdef USE_ZONE_STATS
2094 	server_zonestat_realloc(nsd); /* realloc for new children */
2095 	server_zonestat_switch(nsd);
2096 #endif
2097 
2098 	/* listen for the signals of failed children again */
2099 	sigaction(SIGCHLD, &old_sigchld, NULL);
2100 	/* Start new child processes */
2101 	if (server_start_children(nsd, server_region, netio, &nsd->
2102 		xfrd_listener->fd) != 0) {
2103 		send_children_quit(nsd);
2104 		exit(1);
2105 	}
2106 
2107 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2108 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2109 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2110 		if(cmd == NSD_QUIT) {
2111 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2112 			send_children_quit(nsd);
2113 			exit(0);
2114 		}
2115 	}
2116 
2117 	/* Send quit command to parent: blocking, wait for receipt. */
2118 	do {
2119 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2120 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2121 		{
2122 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2123 				strerror(errno));
2124 		}
2125 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2126 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2127 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2128 			RELOAD_SYNC_TIMEOUT);
2129 		if(ret == -2) {
2130 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2131 		}
2132 	} while (ret == -2);
2133 	if(ret == -1) {
2134 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2135 			strerror(errno));
2136 	}
2137 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2138 	if(cmd == NSD_QUIT) {
2139 		/* small race condition possible here, parent got quit cmd. */
2140 		send_children_quit(nsd);
2141 		exit(1);
2142 	}
2143 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2144 #ifdef BIND8_STATS
2145 	reload_do_stats(cmdsocket, nsd, &last_task);
2146 #endif
2147 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2148 	task_process_sync(nsd->task[nsd->mytask]);
2149 #ifdef USE_ZONE_STATS
2150 	server_zonestat_realloc(nsd); /* realloc for next children */
2151 #endif
2152 
2153 	/* send soainfo to the xfrd process, signal it that reload is done,
2154 	 * it picks up the taskudb */
2155 	cmd = NSD_RELOAD_DONE;
2156 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2157 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2158 			strerror(errno));
2159 	}
2160 	mypid = getpid();
2161 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2162 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2163 			strerror(errno));
2164 	}
2165 
2166 	/* try to reopen file */
2167 	if (nsd->file_rotation_ok)
2168 		log_reopen(nsd->log_filename, 1);
2169 	/* exit reload, continue as new server_main */
2170 }
2171 
2172 /*
2173  * Get the mode depending on the signal hints that have been received.
2174  * Multiple signal hints can be received and will be handled in turn.
2175  */
2176 static sig_atomic_t
2177 server_signal_mode(struct nsd *nsd)
2178 {
2179 	if(nsd->signal_hint_quit) {
2180 		nsd->signal_hint_quit = 0;
2181 		return NSD_QUIT;
2182 	}
2183 	else if(nsd->signal_hint_shutdown) {
2184 		nsd->signal_hint_shutdown = 0;
2185 		return NSD_SHUTDOWN;
2186 	}
2187 	else if(nsd->signal_hint_child) {
2188 		nsd->signal_hint_child = 0;
2189 		return NSD_REAP_CHILDREN;
2190 	}
2191 	else if(nsd->signal_hint_reload) {
2192 		nsd->signal_hint_reload = 0;
2193 		return NSD_RELOAD;
2194 	}
2195 	else if(nsd->signal_hint_reload_hup) {
2196 		nsd->signal_hint_reload_hup = 0;
2197 		return NSD_RELOAD_REQ;
2198 	}
2199 	else if(nsd->signal_hint_stats) {
2200 		nsd->signal_hint_stats = 0;
2201 #ifdef BIND8_STATS
2202 		set_bind8_alarm(nsd);
2203 #endif
2204 		return NSD_STATS;
2205 	}
2206 	else if(nsd->signal_hint_statsusr) {
2207 		nsd->signal_hint_statsusr = 0;
2208 		return NSD_STATS;
2209 	}
2210 	return NSD_RUN;
2211 }
2212 
2213 /*
2214  * The main server simply waits for signals and child processes to
2215  * terminate.  Child processes are restarted as necessary.
2216  */
2217 void
2218 server_main(struct nsd *nsd)
2219 {
2220 	region_type *server_region = region_create(xalloc, free);
2221 	netio_type *netio = netio_create(server_region);
2222 	netio_handler_type reload_listener;
2223 	int reload_sockets[2] = {-1, -1};
2224 	struct timespec timeout_spec;
2225 	int status;
2226 	pid_t child_pid;
2227 	pid_t reload_pid = -1;
2228 	sig_atomic_t mode;
2229 
2230 	/* Ensure we are the main process */
2231 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2232 
2233 	/* Add listener for the XFRD process */
2234 	netio_add_handler(netio, nsd->xfrd_listener);
2235 
2236 	/* Start the child processes that handle incoming queries */
2237 	if (server_start_children(nsd, server_region, netio,
2238 		&nsd->xfrd_listener->fd) != 0) {
2239 		send_children_quit(nsd);
2240 		exit(1);
2241 	}
2242 	reload_listener.fd = -1;
2243 
2244 	/* This_child MUST be 0, because this is the parent process */
2245 	assert(nsd->this_child == 0);
2246 
2247 	/* Run the server until we get a shutdown signal */
2248 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2249 		/* Did we receive a signal that changes our mode? */
2250 		if(mode == NSD_RUN) {
2251 			nsd->mode = mode = server_signal_mode(nsd);
2252 		}
2253 
2254 		switch (mode) {
2255 		case NSD_RUN:
2256 			/* see if any child processes terminated */
2257 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2258 				int is_child = delete_child_pid(nsd, child_pid);
2259 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2260 					if(nsd->children[is_child].child_fd == -1)
2261 						nsd->children[is_child].has_exited = 1;
2262 					parent_check_all_children_exited(nsd);
2263 				} else if(is_child != -1) {
2264 					log_msg(LOG_WARNING,
2265 					       "server %d died unexpectedly with status %d, restarting",
2266 					       (int) child_pid, status);
2267 					restart_child_servers(nsd, server_region, netio,
2268 						&nsd->xfrd_listener->fd);
2269 				} else if (child_pid == reload_pid) {
2270 					sig_atomic_t cmd = NSD_RELOAD_DONE;
2271 					pid_t mypid;
2272 					log_msg(LOG_WARNING,
2273 					       "Reload process %d failed with status %d, continuing with old database",
2274 					       (int) child_pid, status);
2275 					reload_pid = -1;
2276 					if(reload_listener.fd != -1) close(reload_listener.fd);
2277 					reload_listener.fd = -1;
2278 					reload_listener.event_types = NETIO_EVENT_NONE;
2279 					task_process_sync(nsd->task[nsd->mytask]);
2280 					/* inform xfrd reload attempt ended */
2281 					if(!write_socket(nsd->xfrd_listener->fd,
2282 						&cmd, sizeof(cmd))) {
2283 						log_msg(LOG_ERR, "problems "
2284 						  "sending SOAEND to xfrd: %s",
2285 						  strerror(errno));
2286 					}
2287 					mypid = getpid();
2288 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2289 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2290 							strerror(errno));
2291 					}
2292 				} else if(status != 0) {
2293 					/* check the status, because we also
2294 					 * reap the old server main (reload is
2295 					 * its process parent) and the older
2296 					 * server processes that exit after a
2297 					 * reload */
2298 					log_msg(LOG_WARNING,
2299 					       "process %d terminated with status %d",
2300 					       (int) child_pid, status);
2301 				}
2302 			}
2303 			if (child_pid == -1) {
2304 				if (errno == EINTR) {
2305 					continue;
2306 				}
2307 				if (errno != ECHILD)
2308 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2309 			}
2310 			if (nsd->mode != NSD_RUN)
2311 				break;
2312 
2313 			/* timeout to collect processes, in case no SIGCHLD happens */
2314 			timeout_spec.tv_sec = 60;
2315 			timeout_spec.tv_nsec = 0;
2316 
2317 			/* listen on ports, timeout for collecting terminated children */
2318 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2319 				if (errno != EINTR) {
2320 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2321 				}
2322 			}
2323 			if(nsd->restart_children) {
2324 				restart_child_servers(nsd, server_region, netio,
2325 					&nsd->xfrd_listener->fd);
2326 				nsd->restart_children = 0;
2327 			}
2328 			if(nsd->reload_failed) {
2329 				sig_atomic_t cmd = NSD_RELOAD_DONE;
2330 				pid_t mypid;
2331 				nsd->reload_failed = 0;
2332 				log_msg(LOG_WARNING,
2333 				       "Reload process %d failed, continuing with old database",
2334 				       (int) reload_pid);
2335 				reload_pid = -1;
2336 				if(reload_listener.fd != -1) close(reload_listener.fd);
2337 				reload_listener.fd = -1;
2338 				reload_listener.event_types = NETIO_EVENT_NONE;
2339 				task_process_sync(nsd->task[nsd->mytask]);
2340 				/* inform xfrd reload attempt ended */
2341 				if(!write_socket(nsd->xfrd_listener->fd,
2342 					&cmd, sizeof(cmd))) {
2343 					log_msg(LOG_ERR, "problems "
2344 					  "sending SOAEND to xfrd: %s",
2345 					  strerror(errno));
2346 				}
2347 				mypid = getpid();
2348 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2349 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2350 						strerror(errno));
2351 				}
2352 			}
2353 
2354 			break;
2355 		case NSD_RELOAD_REQ: {
2356 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2357 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2358 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2359 				"main: ipc send reload_req to xfrd"));
2360 			if(!write_socket(nsd->xfrd_listener->fd,
2361 				&cmd, sizeof(cmd))) {
2362 				log_msg(LOG_ERR, "server_main: could not send "
2363 				"reload_req to xfrd: %s", strerror(errno));
2364 			}
2365 			nsd->mode = NSD_RUN;
2366 			} break;
2367 		case NSD_RELOAD:
2368 			/* Continue to run nsd after reload */
2369 			nsd->mode = NSD_RUN;
2370 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2371 			if (reload_pid != -1) {
2372 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2373 				       (int) reload_pid);
2374 				break;
2375 			}
2376 
2377 			/* switch mytask to keep track of who owns the taskudb */
2378 			nsd->mytask = 1 - nsd->mytask;
2379 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2380 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2381 				reload_pid = -1;
2382 				break;
2383 			}
2384 
2385 			/* Do actual reload */
2386 			reload_pid = fork();
2387 			switch (reload_pid) {
2388 			case -1:
2389 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2390 				break;
2391 			default:
2392 				/* PARENT */
2393 				close(reload_sockets[0]);
2394 				server_reload(nsd, server_region, netio,
2395 					reload_sockets[1]);
2396 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2397 				close(reload_sockets[1]);
2398 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2399 				/* drop stale xfrd ipc data */
2400 				((struct ipc_handler_conn_data*)nsd->
2401 					xfrd_listener->user_data)
2402 					->conn->is_reading = 0;
2403 				reload_pid = -1;
2404 				reload_listener.fd = -1;
2405 				reload_listener.event_types = NETIO_EVENT_NONE;
2406 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2407 				break;
2408 			case 0:
2409 				/* CHILD */
2410 				/* server_main keeps running until NSD_QUIT_SYNC
2411 				 * is received from the reload process. */
2412 				close(reload_sockets[1]);
2413 				reload_listener.fd = reload_sockets[0];
2414 				reload_listener.timeout = NULL;
2415 				reload_listener.user_data = nsd;
2416 				reload_listener.event_types = NETIO_EVENT_READ;
2417 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2418 				netio_add_handler(netio, &reload_listener);
2419 				reload_pid = getppid();
2420 				break;
2421 			}
2422 			break;
2423 		case NSD_QUIT_SYNC:
2424 			/* synchronisation of xfrd, parent and reload */
2425 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2426 				sig_atomic_t cmd = NSD_RELOAD;
2427 				/* stop xfrd ipc writes in progress */
2428 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2429 					"main: ipc send indication reload"));
2430 				if(!write_socket(nsd->xfrd_listener->fd,
2431 					&cmd, sizeof(cmd))) {
2432 					log_msg(LOG_ERR, "server_main: could not send reload "
2433 					"indication to xfrd: %s", strerror(errno));
2434 				}
2435 				/* wait for ACK from xfrd */
2436 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2437 				nsd->quit_sync_done = 1;
2438 			}
2439 			nsd->mode = NSD_RUN;
2440 			break;
2441 		case NSD_QUIT:
2442 			/* silent shutdown during reload */
2443 			if(reload_listener.fd != -1) {
2444 				/* acknowledge the quit, so the reload process knows we will really quit now */
2445 				sig_atomic_t cmd = NSD_RELOAD;
2446 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2447 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2448 					log_msg(LOG_ERR, "server_main: "
2449 						"could not ack quit: %s", strerror(errno));
2450 				}
2451 #ifdef BIND8_STATS
2452 				parent_send_stats(nsd, reload_listener.fd);
2453 #endif /* BIND8_STATS */
2454 				close(reload_listener.fd);
2455 			}
2456 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2457 			/* only quit children after xfrd has acked */
2458 			send_children_quit(nsd);
2459 
2460 #ifdef MEMCLEAN /* OS collects memory pages */
2461 			region_destroy(server_region);
2462 #endif
2463 			server_shutdown(nsd);
2464 
2465 			/* NOTREACHED */
2466 			break;
2467 		case NSD_SHUTDOWN:
2468 			break;
2469 		case NSD_REAP_CHILDREN:
2470 			/* continue; wait for child in run loop */
2471 			nsd->mode = NSD_RUN;
2472 			break;
2473 		case NSD_STATS:
2474 #ifdef BIND8_STATS
2475 			set_children_stats(nsd);
2476 #endif
2477 			nsd->mode = NSD_RUN;
2478 			break;
2479 		default:
2480 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2481 			nsd->mode = NSD_RUN;
2482 			break;
2483 		}
2484 	}
2485 	log_msg(LOG_WARNING, "signal received, shutting down...");
2486 
2487 	/* close opened ports to avoid race with restart of nsd */
2488 	server_close_all_sockets(nsd->udp, nsd->ifs);
2489 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2490 #ifdef HAVE_SSL
2491 	daemon_remote_close(nsd->rc);
2492 #endif
2493 	send_children_quit_and_wait(nsd);
2494 
2495 	/* Unlink the pidfile and the task files, if possible... */
2496 	unlinkpid(nsd->pidfile);
2497 	unlink(nsd->task[0]->fname);
2498 	unlink(nsd->task[1]->fname);
2499 #ifdef USE_ZONE_STATS
2500 	unlink(nsd->zonestatfname[0]);
2501 	unlink(nsd->zonestatfname[1]);
2502 #endif
2503 #ifdef USE_DNSTAP
2504 	dt_collector_close(nsd->dt_collector, nsd);
2505 #endif
2506 
2507 	if(reload_listener.fd != -1) {
2508 		sig_atomic_t cmd = NSD_QUIT;
2509 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2510 			"main: ipc send quit to reload-process"));
2511 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2512 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2513 				strerror(errno));
2514 		}
2515 		fsync(reload_listener.fd);
2516 		close(reload_listener.fd);
2517 		/* wait for reload to finish processing */
2518 		while(1) {
2519 			if(waitpid(reload_pid, NULL, 0) == -1) {
2520 				if(errno == EINTR) continue;
2521 				if(errno == ECHILD) break;
2522 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2523 					(int)reload_pid, strerror(errno));
2524 			}
2525 			break;
2526 		}
2527 	}
2528 	if(nsd->xfrd_listener->fd != -1) {
2529 		/* complete quit, stop xfrd */
2530 		sig_atomic_t cmd = NSD_QUIT;
2531 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2532 			"main: ipc send quit to xfrd"));
2533 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2534 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2535 				strerror(errno));
2536 		}
2537 		fsync(nsd->xfrd_listener->fd);
2538 		close(nsd->xfrd_listener->fd);
2539 		(void)kill(nsd->pid, SIGTERM);
2540 	}
2541 
2542 #ifdef MEMCLEAN /* OS collects memory pages */
2543 	region_destroy(server_region);
2544 #endif
2545 	/* write the nsd.db to disk, wait for it to complete */
2546 	udb_base_sync(nsd->db->udb, 1);
2547 	udb_base_close(nsd->db->udb);
2548 	server_shutdown(nsd);
2549 }
2550 
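/* answer a query on a TCP connection; rate limiting is applied on the
 * UDP path only */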
2551 static query_state_type
2552 server_process_query(struct nsd *nsd, struct query *query)
2553 {
2554 	return query_process(query, nsd);
2555 }
2556 
2557 static query_state_type
2558 server_process_query_udp(struct nsd *nsd, struct query *query)
2559 {
2560 #ifdef RATELIMIT
2561 	if(query_process(query, nsd) != QUERY_DISCARDED) {
2562 		if(rrl_process_query(query))
2563 			return rrl_slip(query);
2564 		else	return QUERY_PROCESSED;
2565 	}
2566 	return QUERY_DISCARDED;
2567 #else
2568 	return query_process(query, nsd);
2569 #endif
2570 }
2571 
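/* create the event base for a child server; uses mini_event, libev or
 * libevent depending on what was compiled in */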
2572 struct event_base*
2573 nsd_child_event_base(void)
2574 {
2575 	struct event_base* base;
2576 #ifdef USE_MINI_EVENT
2577 	static time_t secs;
2578 	static struct timeval now;
2579 	base = event_init(&secs, &now);
2580 #else
2581 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2582 	/* libev */
2583 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2584 #  else
2585 	/* libevent */
2586 #    ifdef HAVE_EVENT_BASE_NEW
2587 	base = event_base_new();
2588 #    else
2589 	base = event_init();
2590 #    endif
2591 #  endif
2592 #endif
2593 	return base;
2594 }
2595 
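/* register a persistent read event for one UDP socket of this child */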
2596 static void
2597 add_udp_handler(
2598 	struct nsd *nsd,
2599 	struct nsd_socket *sock,
2600 	struct udp_handler_data *data)
2601 {
2602 	struct event *handler = &data->event;
2603 
2604 	data->nsd = nsd;
2605 	data->socket = sock;
2606 
2607 	memset(handler, 0, sizeof(*handler));
2608 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2609 	if(event_base_set(nsd->event_base, handler) != 0)
2610 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2611 	if(event_add(handler, NULL) != 0)
2612 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2613 }
2614 
2615 void
2616 add_tcp_handler(
2617 	struct nsd *nsd,
2618 	struct nsd_socket *sock,
2619 	struct tcp_accept_handler_data *data)
2620 {
2621 	struct event *handler = &data->event;
2622 
2623 	data->nsd = nsd;
2624 	data->socket = sock;
2625 
2626 #ifdef HAVE_SSL
2627 	if (nsd->tls_ctx &&
2628 	    nsd->options->tls_port &&
2629 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2630 	{
2631 		data->tls_accept = 1;
2632 		if(verbosity >= 2) {
2633 			char buf[48];
2634 			addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
2635 			VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
2636 		}
2637 	} else {
2638 		data->tls_accept = 0;
2639 	}
2640 #endif
2641 
2642 	memset(handler, 0, sizeof(*handler));
2643 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data);
2644 	if(event_base_set(nsd->event_base, handler) != 0)
2645 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2646 	if(event_add(handler, NULL) != 0)
2647 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
2648 	data->event_added = 1;
2649 }
2650 
2651 /*
2652  * Serve DNS requests.
2653  */
2654 void
2655 server_child(struct nsd *nsd)
2656 {
2657 	size_t i, from, numifs;
2658 	region_type *server_region = region_create(xalloc, free);
2659 	struct event_base* event_base = nsd_child_event_base();
2660 	sig_atomic_t mode;
2661 
2662 	if(!event_base) {
2663 		log_msg(LOG_ERR, "nsd server could not create event base");
2664 		exit(1);
2665 	}
2666 	nsd->event_base = event_base;
2667 	nsd->server_region = server_region;
2668 
2669 #ifdef RATELIMIT
2670 	rrl_init(nsd->this_child->child_num);
2671 #endif
2672 
2673 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2674 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2675 
2676 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2677 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2678 	}
2679 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2680 		server_close_all_sockets(nsd->udp, nsd->ifs);
2681 	}
2682 
2683 	if (nsd->this_child->parent_fd != -1) {
2684 		struct event *handler;
2685 		struct ipc_handler_conn_data* user_data =
2686 			(struct ipc_handler_conn_data*)region_alloc(
2687 			server_region, sizeof(struct ipc_handler_conn_data));
2688 		user_data->nsd = nsd;
2689 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2690 
2691 		handler = (struct event*) region_alloc(
2692 			server_region, sizeof(*handler));
2693 		memset(handler, 0, sizeof(*handler));
2694 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2695 			EV_READ, child_handle_parent_command, user_data);
2696 		if(event_base_set(event_base, handler) != 0)
2697 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2698 		if(event_add(handler, NULL) != 0)
2699 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2700 	}
2701 
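	/* with reuseport, the sockets are divided evenly over the children;
	 * this child serves the slice [from, from+numifs) */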
2702 	if(nsd->reuseport) {
2703 		numifs = nsd->ifs / nsd->reuseport;
2704 		from = numifs * nsd->this_child->child_num;
2705 		if(from+numifs > nsd->ifs) { /* should not happen */
2706 			from = 0;
2707 			numifs = nsd->ifs;
2708 		}
2709 	} else {
2710 		from = 0;
2711 		numifs = nsd->ifs;
2712 	}
2713 
2714 	if (nsd->server_kind & NSD_SERVER_UDP) {
2715 		memset(msgs, 0, sizeof(msgs));
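		/* set up one query buffer and iovec per message slot, so a
		 * whole batch can be received with one recvmmsg call */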
2716 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2717 			queries[i] = query_create(server_region,
2718 				compressed_dname_offsets,
2719 				compression_table_size, compressed_dnames);
2720 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2721 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2722 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2723 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2724 			msgs[i].msg_hdr.msg_iovlen  = 1;
2725 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2726 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2727 		}
2728 
2729 		for (i = from; i < from+numifs; ++i) {
2730 			struct udp_handler_data *data =	region_alloc_zero(
2731 				nsd->server_region, sizeof(*data));
2732 			add_udp_handler(nsd, &nsd->udp[i], data);
2733 		}
2734 	}
2735 
2736 	/*
2737 	 * Keep track of all the TCP accept handlers so we can enable
2738 	 * and disable them based on the current number of active TCP
2739 	 * connections.
2740 	 */
2741 	if (nsd->server_kind & NSD_SERVER_TCP) {
2742 		tcp_accept_handler_count = numifs;
2743 		tcp_accept_handlers = region_alloc_array(server_region,
2744 			numifs, sizeof(*tcp_accept_handlers));
2745 
2746 		for (i = from; i < from+numifs; i++) {
2747 			struct tcp_accept_handler_data *data =
2748 				&tcp_accept_handlers[i-from];
2749 			memset(data, 0, sizeof(*data));
2750 			add_tcp_handler(nsd, &nsd->tcp[i], data);
2751 		}
2752 	} else {
2753 		tcp_accept_handler_count = 0;
2754 	}
2755 
2756 	/* The main loop... */
2757 	while ((mode = nsd->mode) != NSD_QUIT) {
2758 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2759 
2760 		/* Do we need to do the statistics... */
2761 		if (mode == NSD_STATS) {
2762 #ifdef BIND8_STATS
2763 			int p = nsd->st.period;
2764 			nsd->st.period = 1; /* force stats printout */
2765 			/* Dump the statistics */
2766 			bind8_stats(nsd);
2767 			nsd->st.period = p;
2768 #else /* !BIND8_STATS */
2769 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2770 #endif /* BIND8_STATS */
2771 
2772 			nsd->mode = NSD_RUN;
2773 		}
2774 		else if (mode == NSD_REAP_CHILDREN) {
2775 			/* got signal, notify parent. parent reaps terminated children. */
2776 			if (nsd->this_child->parent_fd != -1) {
2777 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2778 				if (write(nsd->this_child->parent_fd,
2779 				    &parent_notify,
2780 				    sizeof(parent_notify)) == -1)
2781 				{
2782 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2783 						(int) nsd->this_child->pid, strerror(errno));
2784 				}
2785 			} else /* no parent, so reap 'em */
2786 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2787 			nsd->mode = NSD_RUN;
2788 		}
2789 		else if(mode == NSD_RUN) {
2790 			/* Wait for a query... */
2791 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2792 				if (errno != EINTR) {
2793 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2794 					break;
2795 				}
2796 			}
2797 		} else if(mode == NSD_QUIT) {
2798 			/* ignore here, quit */
2799 		} else {
2800 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
2801 				(int)mode);
2802 			nsd->mode = NSD_RUN;
2803 		}
2804 	}
2805 
2806 	service_remaining_tcp(nsd);
2807 #ifdef	BIND8_STATS
2808 	bind8_stats(nsd);
2809 #endif /* BIND8_STATS */
2810 
2811 #ifdef MEMCLEAN /* OS collects memory pages */
2812 #ifdef RATELIMIT
2813 	rrl_deinit(nsd->this_child->child_num);
2814 #endif
2815 	event_base_free(event_base);
2816 	region_destroy(server_region);
2817 #endif
2818 	server_shutdown(nsd);
2819 }
2820 
2821 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
2822 {
2823 	int* timed_out = (int*)arg;
2824 	assert(event & EV_TIMEOUT);
2825 	/* wake up the remaining-tcp service loop; note the event is no
2826 	 * longer registered */
2827 	*timed_out = 1;
2828 }
2829 
2830 void
2831 service_remaining_tcp(struct nsd* nsd)
2832 {
2833 	struct tcp_handler_data* p;
2834 	struct event_base* event_base;
2835 	/* check if it is needed */
2836 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
2837 		return;
2838 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
2839 
2840 	/* setup event base */
2841 	event_base = nsd_child_event_base();
2842 	if(!event_base) {
2843 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
2844 		return;
2845 	}
2846 	/* register tcp connections */
2847 	for(p = tcp_active_list; p != NULL; p = p->next) {
2848 		struct timeval timeout;
2849 		int fd = p->event.ev_fd;
2850 #ifdef USE_MINI_EVENT
2851 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
2852 #else
2853 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
2854 #endif
2855 		void (*fn)(int, short, void*);
2856 #ifdef HAVE_SSL
2857 		if(p->tls) {
2858 			if((event&EV_READ))
2859 				fn = handle_tls_reading;
2860 			else	fn = handle_tls_writing;
2861 		} else {
2862 #endif
2863 			if((event&EV_READ))
2864 				fn = handle_tcp_reading;
2865 			else	fn = handle_tcp_writing;
2866 #ifdef HAVE_SSL
2867 		}
2868 #endif
2869 
2870 		/* cap the timeout at 1/10 second */
2871 		if(p->tcp_timeout > 100)
2872 			p->tcp_timeout = 100;
2873 		timeout.tv_sec = p->tcp_timeout / 1000;
2874 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
2875 		event_del(&p->event);
2876 		memset(&p->event, 0, sizeof(p->event));
2877 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
2878 			fn, p);
2879 		if(event_base_set(event_base, &p->event) != 0)
2880 			log_msg(LOG_ERR, "event base set failed");
2881 		if(event_add(&p->event, &timeout) != 0)
2882 			log_msg(LOG_ERR, "event add failed");
2883 	}
2884 
2885 	/* handle it */
2886 	while(nsd->current_tcp_count > 0) {
2887 		sig_atomic_t m = server_signal_mode(nsd);
2888 		struct event timeout;
2889 		struct timeval tv;
2890 		int timed_out = 0;
2891 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
2892 			m == NSD_REAP_CHILDREN) {
2893 			/* quit */
2894 			break;
2895 		}
2896 		/* timer */
2897 		/* have to do something every second */
2898 		tv.tv_sec = 1;
2899 		tv.tv_usec = 0;
2900 		memset(&timeout, 0, sizeof(timeout));
2901 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
2902 			&timed_out);
2903 		if(event_base_set(event_base, &timeout) != 0)
2904 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
2905 		if(event_add(&timeout, &tv) != 0)
2906 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
2907 
2908 		/* service loop */
2909 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2910 			if (errno != EINTR) {
2911 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2912 				break;
2913 			}
2914 		}
2915 		if(!timed_out) {
2916 			event_del(&timeout);
2917 		} else {
2918 			/* timed out, quit */
2919 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
2920 			break;
2921 		}
2922 	}
2923 #ifdef MEMCLEAN
2924 	event_base_free(event_base);
2925 #endif
2926 	/* continue to quit after return */
2927 }
2928 
2929 /* Implement recvmmsg and sendmmsg if the platform does not provide them. These functions
2930  * are always used, even if nonblocking operations are broken, in which case
2931  * NUM_RECV_PER_SELECT is defined to 1 (one).
2932  */
2933 #if defined(HAVE_RECVMMSG)
2934 #define nsd_recvmmsg recvmmsg
2935 #else /* !HAVE_RECVMMSG */
2936 
2937 static int
2938 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
2939              int flags, struct timespec *timeout)
2940 {
2941 	int orig_errno;
2942 	unsigned int vpos = 0;
2943 	ssize_t rcvd;
2944 
2945 	/* timeout is ignored, ensure caller does not expect it to work */
2946 	assert(timeout == NULL);
2947 
2948 	orig_errno = errno;
2949 	errno = 0;
2950 	while(vpos < vlen) {
2951 		rcvd = recvfrom(sockfd,
2952 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
2953 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
2954 		                flags,
2955 		                msgvec[vpos].msg_hdr.msg_name,
2956 		               &msgvec[vpos].msg_hdr.msg_namelen);
2957 		if(rcvd < 0) {
2958 			break;
2959 		} else {
2960 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
2961 			msgvec[vpos].msg_len = (unsigned int)rcvd;
2962 			vpos++;
2963 		}
2964 	}
2965 
2966 	if(vpos) {
2967 		/* error will be picked up next time */
2968 		return (int)vpos;
2969 	} else if(errno == 0) {
2970 		errno = orig_errno;
2971 		return 0;
2972 	} else if(errno == EAGAIN) {
2973 		return 0;
2974 	}
2975 
2976 	return -1;
2977 }
2978 #endif /* HAVE_RECVMMSG */
2979 
2980 #ifdef HAVE_SENDMMSG
2981 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
2982 #else /* !HAVE_SENDMMSG */
2983 
2984 static int
2985 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
2986 {
2987 	int orig_errno;
2988 	unsigned int vpos = 0;
2989 	ssize_t snd;
2990 
2991 	orig_errno = errno;
2992 	errno = 0;
2993 	while(vpos < vlen) {
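		/* this fallback sends one datagram at a time; the handlers
		 * set up exactly one iovec per message */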
2994 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
2995 		snd = sendto(sockfd,
2996 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
2997 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
2998 		             flags,
2999 		             msgvec[vpos].msg_hdr.msg_name,
3000 		             msgvec[vpos].msg_hdr.msg_namelen);
3001 		if(snd < 0) {
3002 			break;
3003 		} else {
3004 			msgvec[vpos].msg_len = (unsigned int)snd;
3005 			vpos++;
3006 		}
3007 	}
3008 
3009 	if(vpos) {
3010 		return (int)vpos;
3011 	} else if(errno == 0) {
3012 		errno = orig_errno;
3013 		return 0;
3014 	}
3015 
3016 	return -1;
3017 }
3018 #endif /* HAVE_SENDMMSG */
3019 
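/* handle readiness on a UDP socket: receive a batch of packets with
 * recvmmsg, answer each query in place, and send the answers back in
 * batches with sendmmsg */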
3020 static void
3021 handle_udp(int fd, short event, void* arg)
3022 {
3023 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3024 	int received, sent, recvcount, i;
3025 	struct query *q;
3026 
3027 	if (!(event & EV_READ)) {
3028 		return;
3029 	}
3030 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3031 	/* this printf strangely gave a performance increase on Linux */
3032 	/* printf("recvcount %d \n", recvcount); */
3033 	if (recvcount == -1) {
3034 		if (errno != EAGAIN && errno != EINTR) {
3035 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3036 			STATUP(data->nsd, rxerr);
3037 			/* No zone statup */
3038 		}
3039 		/* Simply no data available */
3040 		return;
3041 	}
3042 	for (i = 0; i < recvcount; i++) {
3043 	loopstart:
3044 		received = msgs[i].msg_len;
3045 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
3046 		q = queries[i];
3047 		if (received == -1) {
3048 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3049 #if defined(HAVE_RECVMMSG)
3050 				msgs[i].msg_hdr.msg_flags
3051 #else
3052 				errno
3053 #endif
3054 				));
3055 			STATUP(data->nsd, rxerr);
3056 			/* No zone statup */
3057 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3058 			iovecs[i].iov_len = buffer_remaining(q->packet);
3059 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3060 			goto swap_drop;
3061 		}
3062 
3063 		/* Account... */
3064 #ifdef BIND8_STATS
3065 		if (data->socket->addr.ai_family == AF_INET) {
3066 			STATUP(data->nsd, qudp);
3067 		} else if (data->socket->addr.ai_family == AF_INET6) {
3068 			STATUP(data->nsd, qudp6);
3069 		}
3070 #endif
3071 
3072 		buffer_skip(q->packet, received);
3073 		buffer_flip(q->packet);
3074 #ifdef USE_DNSTAP
3075 		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
3076 			q->tcp, q->packet);
3077 #endif /* USE_DNSTAP */
3078 
3079 		/* Process and answer the query... */
3080 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
3081 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3082 				STATUP(data->nsd, nona);
3083 				ZTATUP(data->nsd, q->zone, nona);
3084 			}
3085 
3086 #ifdef USE_ZONE_STATS
3087 			if (data->socket->addr.ai_family == AF_INET) {
3088 				ZTATUP(data->nsd, q->zone, qudp);
3089 			} else if (data->socket->addr.ai_family == AF_INET6) {
3090 				ZTATUP(data->nsd, q->zone, qudp6);
3091 			}
3092 #endif
3093 
3094 			/* Add EDNS0 and TSIG info if necessary.  */
3095 			query_add_optional(q, data->nsd);
3096 
3097 			buffer_flip(q->packet);
3098 			iovecs[i].iov_len = buffer_remaining(q->packet);
3099 #ifdef BIND8_STATS
3100 			/* Account the rcode & TC... */
3101 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3102 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3103 			if (TC(q->packet)) {
3104 				STATUP(data->nsd, truncated);
3105 				ZTATUP(data->nsd, q->zone, truncated);
3106 			}
3107 #endif /* BIND8_STATS */
3108 #ifdef USE_DNSTAP
3109 			dt_collector_submit_auth_response(data->nsd,
3110 				&q->addr, q->addrlen, q->tcp, q->packet,
3111 				q->zone);
3112 #endif /* USE_DNSTAP */
3113 		} else {
3114 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3115 			iovecs[i].iov_len = buffer_remaining(q->packet);
3116 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3117 		swap_drop:
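			/* move the dropped message to the tail and shrink
			 * recvcount, so that msgs[0..recvcount) only holds
			 * answers that still have to be sent */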
3118 			STATUP(data->nsd, dropped);
3119 			ZTATUP(data->nsd, q->zone, dropped);
3120 			if(i != recvcount-1) {
3121 				/* swap with last and decrease recvcount */
3122 				struct mmsghdr mtmp = msgs[i];
3123 				struct iovec iotmp = iovecs[i];
3124 				recvcount--;
3125 				msgs[i] = msgs[recvcount];
3126 				iovecs[i] = iovecs[recvcount];
3127 				queries[i] = queries[recvcount];
3128 				msgs[recvcount] = mtmp;
3129 				iovecs[recvcount] = iotmp;
3130 				queries[recvcount] = q;
3131 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3132 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3133 				goto loopstart;
3134 			} else { recvcount --; }
3135 		}
3136 	}
3137 
3138 	/* send until all are sent */
3139 	i = 0;
3140 	while(i<recvcount) {
3141 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3142 		if(sent == -1) {
3143 			/* don't log transient network full errors, unless
3144 			 * on higher verbosity */
3145 			if(!(errno == ENOBUFS && verbosity < 1) &&
3146 #ifdef EWOULDBLOCK
3147 			   !(errno == EWOULDBLOCK && verbosity < 1) &&
3148 #endif
3149 			   !(errno == EAGAIN && verbosity < 1)) {
3150 				const char* es = strerror(errno);
3151 				char a[48];
3152 				addr2str(&queries[i]->addr, a, sizeof(a));
3153 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3154 			}
3155 #ifdef BIND8_STATS
3156 			data->nsd->st.txerr += recvcount-i;
3157 #endif /* BIND8_STATS */
3158 			break;
3159 		}
3160 		i += sent;
3161 	}
3162 	for(i=0; i<recvcount; i++) {
3163 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3164 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3165 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3166 	}
3167 }
3168 
3169 #ifdef HAVE_SSL
3170 /*
3171  * Setup an event for the tcp handler.
3172  */
3173 static void
3174 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3175        int fd, short event)
3176 {
3177 	struct timeval timeout;
3178 	struct event_base* ev_base;
3179 
3180 	timeout.tv_sec = data->nsd->tcp_timeout;
3181 	timeout.tv_usec = 0L;
3182 
3183 	ev_base = data->event.ev_base;
3184 	event_del(&data->event);
3185 	memset(&data->event, 0, sizeof(data->event));
3186 	event_set(&data->event, fd, event, fn, data);
3187 	if(event_base_set(ev_base, &data->event) != 0)
3188 		log_msg(LOG_ERR, "event base set failed");
3189 	if(event_add(&data->event, &timeout) != 0)
3190 		log_msg(LOG_ERR, "event add failed");
3191 }
3192 #endif /* HAVE_SSL */
3193 
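/* tear down a TCP connection: free its event and TLS state, close the
 * socket, unlink it from the active list, and re-enable the accept
 * handlers if we were at the connection limit */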
3194 static void
3195 cleanup_tcp_handler(struct tcp_handler_data* data)
3196 {
3197 	event_del(&data->event);
3198 #ifdef HAVE_SSL
3199 	if(data->tls) {
3200 		SSL_shutdown(data->tls);
3201 		SSL_free(data->tls);
3202 		data->tls = NULL;
3203 	}
3204 #endif
3205 	close(data->event.ev_fd);
3206 	if(data->prev)
3207 		data->prev->next = data->next;
3208 	else	tcp_active_list = data->next;
3209 	if(data->next)
3210 		data->next->prev = data->prev;
3211 
3212 	/*
3213 	 * Enable the TCP accept handlers when the current number of
3214 	 * TCP connections is about to drop below the maximum number
3215 	 * of TCP connections.
3216 	 */
3217 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3218 		configure_handler_event_types(EV_READ|EV_PERSIST);
3219 		if(slowaccept) {
3220 			event_del(&slowaccept_event);
3221 			slowaccept = 0;
3222 		}
3223 	}
3224 	--data->nsd->current_tcp_count;
3225 	assert(data->nsd->current_tcp_count >= 0);
3226 
3227 	region_destroy(data->region);
3228 }
3229 
3230 static void
3231 handle_tcp_reading(int fd, short event, void* arg)
3232 {
3233 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3234 	ssize_t received;
3235 	struct event_base* ev_base;
3236 	struct timeval timeout;
3237 
3238 	if ((event & EV_TIMEOUT)) {
3239 		/* Connection timed out.  */
3240 		cleanup_tcp_handler(data);
3241 		return;
3242 	}
3243 
3244 	if (data->nsd->tcp_query_count > 0 &&
3245 		data->query_count >= data->nsd->tcp_query_count) {
3246 		/* No more queries allowed on this tcp connection. */
3247 		cleanup_tcp_handler(data);
3248 		return;
3249 	}
3250 
3251 	assert((event & EV_READ));
3252 
3253 	if (data->bytes_transmitted == 0) {
3254 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3255 	}
3256 
3257 	/*
3258 	 * Check if we received the leading packet length bytes yet.
3259 	 */
3260 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3261 		received = read(fd,
3262 				(char *) &data->query->tcplen
3263 				+ data->bytes_transmitted,
3264 				sizeof(uint16_t) - data->bytes_transmitted);
3265 		if (received == -1) {
3266 			if (errno == EAGAIN || errno == EINTR) {
3267 				/*
3268 				 * Read would block, wait until more
3269 				 * data is available.
3270 				 */
3271 				return;
3272 			} else {
3273 				char buf[48];
3274 				addr2str(&data->query->addr, buf, sizeof(buf));
3275 #ifdef ECONNRESET
3276 				if (verbosity >= 2 || errno != ECONNRESET)
3277 #endif /* ECONNRESET */
3278 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3279 				cleanup_tcp_handler(data);
3280 				return;
3281 			}
3282 		} else if (received == 0) {
3283 			/* EOF */
3284 			cleanup_tcp_handler(data);
3285 			return;
3286 		}
3287 
3288 		data->bytes_transmitted += received;
3289 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3290 			/*
3291 			 * Not done with the tcplen yet, wait for more
3292 			 * data to become available.
3293 			 */
3294 			return;
3295 		}
3296 
3297 		assert(data->bytes_transmitted == sizeof(uint16_t));
3298 
3299 		data->query->tcplen = ntohs(data->query->tcplen);
3300 
3301 		/*
3302 		 * Minimum query size is:
3303 		 *
3304 		 *     Size of the header (12)
3305 		 *   + Root domain name   (1)
3306 		 *   + Query class        (2)
3307 		 *   + Query type         (2)
3308 		 */
3309 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3310 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3311 			cleanup_tcp_handler(data);
3312 			return;
3313 		}
3314 
3315 		if (data->query->tcplen > data->query->maxlen) {
3316 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3317 			cleanup_tcp_handler(data);
3318 			return;
3319 		}
3320 
3321 		buffer_set_limit(data->query->packet, data->query->tcplen);
3322 	}
3323 
3324 	assert(buffer_remaining(data->query->packet) > 0);
3325 
3326 	/* Read the (remaining) query data.  */
3327 	received = read(fd,
3328 			buffer_current(data->query->packet),
3329 			buffer_remaining(data->query->packet));
3330 	if (received == -1) {
3331 		if (errno == EAGAIN || errno == EINTR) {
3332 			/*
3333 			 * Read would block, wait until more data is
3334 			 * available.
3335 			 */
3336 			return;
3337 		} else {
3338 			char buf[48];
3339 			addr2str(&data->query->addr, buf, sizeof(buf));
3340 #ifdef ECONNRESET
3341 			if (verbosity >= 2 || errno != ECONNRESET)
3342 #endif /* ECONNRESET */
3343 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3344 			cleanup_tcp_handler(data);
3345 			return;
3346 		}
3347 	} else if (received == 0) {
3348 		/* EOF */
3349 		cleanup_tcp_handler(data);
3350 		return;
3351 	}
3352 
3353 	data->bytes_transmitted += received;
3354 	buffer_skip(data->query->packet, received);
3355 	if (buffer_remaining(data->query->packet) > 0) {
3356 		/*
3357 		 * Message not yet complete, wait for more data to
3358 		 * become available.
3359 		 */
3360 		return;
3361 	}
3362 
3363 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3364 
3365 	/* Account... */
3366 #ifdef BIND8_STATS
3367 #ifndef INET6
3368 	STATUP(data->nsd, ctcp);
3369 #else
3370 	if (data->query->addr.ss_family == AF_INET) {
3371 		STATUP(data->nsd, ctcp);
3372 	} else if (data->query->addr.ss_family == AF_INET6) {
3373 		STATUP(data->nsd, ctcp6);
3374 	}
3375 #endif
3376 #endif /* BIND8_STATS */
3377 
3378 	/* We have a complete query, process it.  */
3379 
3380 	/* tcp-query-count: handle query counter ++ */
3381 	data->query_count++;
3382 
3383 	buffer_flip(data->query->packet);
3384 #ifdef USE_DNSTAP
3385 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
3386 		data->query->addrlen, data->query->tcp, data->query->packet);
3387 #endif /* USE_DNSTAP */
3388 	data->query_state = server_process_query(data->nsd, data->query);
3389 	if (data->query_state == QUERY_DISCARDED) {
3390 		/* Drop the packet and the entire connection... */
3391 		STATUP(data->nsd, dropped);
3392 		ZTATUP(data->nsd, data->query->zone, dropped);
3393 		cleanup_tcp_handler(data);
3394 		return;
3395 	}
3396 
3397 #ifdef BIND8_STATS
3398 	if (RCODE(data->query->packet) == RCODE_OK
3399 	    && !AA(data->query->packet))
3400 	{
3401 		STATUP(data->nsd, nona);
3402 		ZTATUP(data->nsd, data->query->zone, nona);
3403 	}
3404 #endif /* BIND8_STATS */
3405 
3406 #ifdef USE_ZONE_STATS
3407 #ifndef INET6
3408 	ZTATUP(data->nsd, data->query->zone, ctcp);
3409 #else
3410 	if (data->query->addr.ss_family == AF_INET) {
3411 		ZTATUP(data->nsd, data->query->zone, ctcp);
3412 	} else if (data->query->addr.ss_family == AF_INET6) {
3413 		ZTATUP(data->nsd, data->query->zone, ctcp6);
3414 	}
3415 #endif
3416 #endif /* USE_ZONE_STATS */
3417 
3418 	query_add_optional(data->query, data->nsd);
3419 
3420 	/* Switch to the tcp write handler.  */
3421 	buffer_flip(data->query->packet);
3422 	data->query->tcplen = buffer_remaining(data->query->packet);
3423 #ifdef BIND8_STATS
3424 	/* Account the rcode & TC... */
3425 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
3426 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
3427 	if (TC(data->query->packet)) {
3428 		STATUP(data->nsd, truncated);
3429 		ZTATUP(data->nsd, data->query->zone, truncated);
3430 	}
3431 #endif /* BIND8_STATS */
3432 #ifdef USE_DNSTAP
3433 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
3434 		data->query->addrlen, data->query->tcp, data->query->packet,
3435 		data->query->zone);
3436 #endif /* USE_DNSTAP */
3437 	data->bytes_transmitted = 0;
3438 
3439 	timeout.tv_sec = data->tcp_timeout / 1000;
3440 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3441 
3442 	ev_base = data->event.ev_base;
3443 	event_del(&data->event);
3444 	memset(&data->event, 0, sizeof(data->event));
3445 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3446 		handle_tcp_reading, data);
3447 	if(event_base_set(ev_base, &data->event) != 0)
3448 		log_msg(LOG_ERR, "event base set tcpr failed");
3449 	if(event_add(&data->event, &timeout) != 0)
3450 		log_msg(LOG_ERR, "event add tcpr failed");
3451 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
3452 	handle_tcp_writing(fd, EV_WRITE, data);
3453 }
3454 
3455 static void
3456 handle_tcp_writing(int fd, short event, void* arg)
3457 {
3458 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3459 	ssize_t sent;
3460 	struct query *q = data->query;
3461 	struct timeval timeout;
3462 	struct event_base* ev_base;
3463 
3464 	if ((event & EV_TIMEOUT)) {
3465 		/* Connection timed out.  */
3466 		cleanup_tcp_handler(data);
3467 		return;
3468 	}
3469 
3470 	assert((event & EV_WRITE));
3471 
3472 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
3473 		/* Writing the response packet length.  */
3474 		uint16_t n_tcplen = htons(q->tcplen);
3475 #ifdef HAVE_WRITEV
3476 		struct iovec iov[2];
3477 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
3478 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
3479 		iov[1].iov_base = buffer_begin(q->packet);
3480 		iov[1].iov_len = buffer_limit(q->packet);
3481 		sent = writev(fd, iov, 2);
3482 #else /* HAVE_WRITEV */
3483 		sent = write(fd,
3484 			     (const char *) &n_tcplen + data->bytes_transmitted,
3485 			     sizeof(n_tcplen) - data->bytes_transmitted);
3486 #endif /* HAVE_WRITEV */
3487 		if (sent == -1) {
3488 			if (errno == EAGAIN || errno == EINTR) {
3489 				/*
3490 				 * Write would block, wait until
3491 				 * socket becomes writable again.
3492 				 */
3493 				return;
3494 			} else {
3495 #ifdef ECONNRESET
3496 				if(verbosity >= 2 || errno != ECONNRESET)
3497 #endif /* ECONNRESET */
3498 #ifdef EPIPE
3499 				  if(verbosity >= 2 || errno != EPIPE)
3500 #endif /* EPIPE 'broken pipe' */
3501 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3502 				cleanup_tcp_handler(data);
3503 				return;
3504 			}
3505 		}
3506 
3507 		data->bytes_transmitted += sent;
3508 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
3509 			/*
3510 			 * Writing not complete, wait until socket
3511 			 * becomes writable again.
3512 			 */
3513 			return;
3514 		}
3515 
3516 #ifdef HAVE_WRITEV
3517 		/* subtract the still-unsent prefix part sent by this writev */
3518 		sent -= iov[0].iov_len;
3519 		goto packet_could_be_done; /* payload may be (partly) done */
3520 #endif
3521  	}
3522 
3523 	sent = write(fd,
3524 		     buffer_current(q->packet),
3525 		     buffer_remaining(q->packet));
3526 	if (sent == -1) {
3527 		if (errno == EAGAIN || errno == EINTR) {
3528 			/*
3529 			 * Write would block, wait until
3530 			 * socket becomes writable again.
3531 			 */
3532 			return;
3533 		} else {
3534 #ifdef ECONNRESET
3535 			if(verbosity >= 2 || errno != ECONNRESET)
3536 #endif /* ECONNRESET */
3537 #ifdef EPIPE
3538 			  if(verbosity >= 2 || errno != EPIPE)
3539 #endif /* EPIPE 'broken pipe' */
3540 			    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3541 			cleanup_tcp_handler(data);
3542 			return;
3543 		}
3544 	}
3545 
3546 	data->bytes_transmitted += sent;
3547 #ifdef HAVE_WRITEV
3548   packet_could_be_done:
3549 #endif
3550 	buffer_skip(q->packet, sent);
3551 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
3552 		/*
3553 		 * Still more data to write when socket becomes
3554 		 * writable again.
3555 		 */
3556 		return;
3557 	}
3558 
3559 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
3560 
3561 	if (data->query_state == QUERY_IN_AXFR) {
3562 		/* Continue processing AXFR and writing back results.  */
3563 		buffer_clear(q->packet);
3564 		data->query_state = query_axfr(data->nsd, q);
3565 		if (data->query_state != QUERY_PROCESSED) {
3566 			query_add_optional(data->query, data->nsd);
3567 
3568 			/* Reset data. */
3569 			buffer_flip(q->packet);
3570 			q->tcplen = buffer_remaining(q->packet);
3571 			data->bytes_transmitted = 0;
3572 			/* Reset timeout.  */
3573 			timeout.tv_sec = data->tcp_timeout / 1000;
3574 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3575 			ev_base = data->event.ev_base;
3576 			event_del(&data->event);
3577 			memset(&data->event, 0, sizeof(data->event));
3578 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
3579 				handle_tcp_writing, data);
3580 			if(event_base_set(ev_base, &data->event) != 0)
3581 				log_msg(LOG_ERR, "event base set tcpw failed");
3582 			if(event_add(&data->event, &timeout) != 0)
3583 				log_msg(LOG_ERR, "event add tcpw failed");
3584 
3585 			/*
3586 			 * Write data if/when the socket is writable
3587 			 * again.
3588 			 */
3589 			return;
3590 		}
3591 	}
3592 
3593 	/*
3594 	 * Done sending, wait for the next request to arrive on the
3595 	 * TCP socket by installing the TCP read handler.
3596 	 */
3597 	if (data->nsd->tcp_query_count > 0 &&
3598 		data->query_count >= data->nsd->tcp_query_count) {
3599 
3600 		(void) shutdown(fd, SHUT_WR);
3601 	}
3602 
3603 	data->bytes_transmitted = 0;
3604 
3605 	timeout.tv_sec = data->tcp_timeout / 1000;
3606 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3607 	ev_base = data->event.ev_base;
3608 	event_del(&data->event);
3609 	memset(&data->event, 0, sizeof(data->event));
3610 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3611 		handle_tcp_reading, data);
3612 	if(event_base_set(ev_base, &data->event) != 0)
3613 		log_msg(LOG_ERR, "event base set tcpw failed");
3614 	if(event_add(&data->event, &timeout) != 0)
3615 		log_msg(LOG_ERR, "event add tcpw failed");
3616 }
3617 
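/*
 * The read and write handlers hand the connection back and forth by
 * tearing down and re-creating the libevent event on the same fd, as
 * done inline above.  The recurring pattern, expressed as an
 * illustrative helper (nsd repeats it inline and, for TLS, wraps it
 * in tcp_handler_setup_event):
 */
#if 0
static void
example_rearm_event(struct event* ev, struct event_base* base, int fd,
	short flags, void (*cb)(int, short, void*), void* arg,
	struct timeval* timeout)
{
	event_del(ev);
	memset(ev, 0, sizeof(*ev));
	event_set(ev, fd, flags, cb, arg);
	if(event_base_set(base, ev) != 0)
		log_msg(LOG_ERR, "example: cannot set event base");
	if(event_add(ev, timeout) != 0)
		log_msg(LOG_ERR, "example: cannot add event");
}
#endif
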
3618 #ifdef HAVE_SSL
3619 /** create SSL object and associate fd */
3620 static SSL*
3621 incoming_ssl_fd(SSL_CTX* ctx, int fd)
3622 {
3623 	SSL* ssl = SSL_new(ctx);
3624 	if(!ssl) {
3625 		log_crypto_err("could not SSL_new");
3626 		return NULL;
3627 	}
3628 	SSL_set_accept_state(ssl);
3629 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
3630 	if(!SSL_set_fd(ssl, fd)) {
3631 		log_crypto_err("could not SSL_set_fd");
3632 		SSL_free(ssl);
3633 		return NULL;
3634 	}
3635 	return ssl;
3636 }
3637 
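/*
 * Usage sketch for incoming_ssl_fd (illustrative; the real call site
 * is handle_tcp_accept below): wrap a freshly accepted, nonblocking
 * socket, then let the event loop drive the handshake.  The variable
 * names here are hypothetical.
 */
#if 0
	SSL* ssl = incoming_ssl_fd(nsd->tls_ctx, connfd);
	if(!ssl) {
		close(connfd);
		return;
	}
	/* defer SSL_do_handshake to tls_handshake() via the event loop */
#endif
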
3638 /** TLS handshake to upgrade TCP connection */
3639 static int
3640 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
3641 {
3642 	int r;
3643 	if(data->shake_state == tls_hs_read_event) {
3644 		/* the read condition is satisfied; switch back to writing */
3645 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3646 		data->shake_state = tls_hs_none;
3647 		return 1;
3648 	}
3649 	if(data->shake_state == tls_hs_write_event) {
3650 		/* the write condition is satisfied; switch back to reading */
3651 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3652 		data->shake_state = tls_hs_none;
3653 		return 1;
3654 	}
3655 
3656 	/* (continue to) setup the TLS connection */
3657 	ERR_clear_error();
3658 	r = SSL_do_handshake(data->tls);
3659 
3660 	if(r != 1) {
3661 		int want = SSL_get_error(data->tls, r);
3662 		if(want == SSL_ERROR_WANT_READ) {
3663 			if(data->shake_state == tls_hs_read) {
3664 				/* try again later */
3665 				return 1;
3666 			}
3667 			data->shake_state = tls_hs_read;
3668 			/* switch back to reading mode */
3669 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3670 			return 1;
3671 		} else if(want == SSL_ERROR_WANT_WRITE) {
3672 			if(data->shake_state == tls_hs_write) {
3673 				/* try again later */
3674 				return 1;
3675 			}
3676 			data->shake_state = tls_hs_write;
3677 			/* switch back to writing mode */
3678 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3679 			return 1;
3680 		} else {
3681 			if(r == 0)
3682 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
3683 			else {
3684 				unsigned long err = ERR_get_error();
3685 				if(!squelch_err_ssl_handshake(err)) {
3686 					char a[64], s[256];
3687 					addr2str(&data->query->addr, a, sizeof(a));
3688 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
3689 					log_crypto_from_err(s, err);
3690 				}
3691 			}
3692 			cleanup_tcp_handler(data);
3693 			return 0;
3694 		}
3695 	}
3696 
3697 	/* Log the successful upgrade; useful for testing, could be removed. */
3698 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
3699 	/* set back to the event we need to have when reading (or writing) */
3700 	if(data->shake_state == tls_hs_read && writing) {
3701 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3702 	} else if(data->shake_state == tls_hs_write && !writing) {
3703 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3704 	}
3705 	data->shake_state = tls_hs_none;
3706 	return 1;
3707 }
3708 
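/*
 * tls_handshake above implements the standard nonblocking OpenSSL
 * pattern: call SSL_do_handshake until it returns 1, re-arming the
 * event loop for read or write according to SSL_ERROR_WANT_READ and
 * SSL_ERROR_WANT_WRITE.  The same pattern driven directly by poll(2),
 * as a standalone sketch (not part of nsd):
 */
#if 0
static int
example_poll_handshake(SSL* ssl, int fd)
{
	for(;;) {
		int r;
		struct pollfd p;
		ERR_clear_error();
		if((r = SSL_do_handshake(ssl)) == 1)
			return 1; /* handshake complete */
		p.fd = fd;
		p.revents = 0;
		switch(SSL_get_error(ssl, r)) {
		case SSL_ERROR_WANT_READ:
			p.events = POLLIN;
			break;
		case SSL_ERROR_WANT_WRITE:
			p.events = POLLOUT;
			break;
		default:
			return 0; /* fatal error or peer closed */
		}
		if(poll(&p, 1, -1) == -1)
			return 0;
	}
}
#endif
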
3709 /** handle TLS reading of incoming query */
3710 static void
3711 handle_tls_reading(int fd, short event, void* arg)
3712 {
3713 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3714 	ssize_t received;
3715 
3716 	if ((event & EV_TIMEOUT)) {
3717 		/* Connection timed out.  */
3718 		cleanup_tcp_handler(data);
3719 		return;
3720 	}
3721 
3722 	if (data->nsd->tcp_query_count > 0 &&
3723 	    data->query_count >= data->nsd->tcp_query_count) {
3724 		/* No more queries allowed on this tcp connection. */
3725 		cleanup_tcp_handler(data);
3726 		return;
3727 	}
3728 
3729 	assert((event & EV_READ));
3730 
3731 	if (data->bytes_transmitted == 0) {
3732 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3733 	}
3734 
3735 	if(data->shake_state != tls_hs_none) {
3736 		if(!tls_handshake(data, fd, 0))
3737 			return;
3738 		if(data->shake_state != tls_hs_none)
3739 			return;
3740 	}
3741 
3742 	/*
3743 	 * Check if we received the leading packet length bytes yet.
3744 	 */
3745 	if(data->bytes_transmitted < sizeof(uint16_t)) {
3746 		ERR_clear_error();
3747 		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
3748 		    + data->bytes_transmitted,
3749 		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
3750 			int want = SSL_get_error(data->tls, received);
3751 			if(want == SSL_ERROR_ZERO_RETURN) {
3752 				cleanup_tcp_handler(data);
3753 				return; /* shutdown, closed */
3754 			} else if(want == SSL_ERROR_WANT_READ) {
3755 				/* wants to be called again */
3756 				return;
3757 			}
3758 			else if(want == SSL_ERROR_WANT_WRITE) {
3759 				/* switch to writing */
3760 				data->shake_state = tls_hs_write_event;
3761 				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
3762 				return;
3763 			}
3764 			cleanup_tcp_handler(data);
3765 			log_crypto_err("could not SSL_read");
3766 			return;
3767 		}
3768 
3769 		data->bytes_transmitted += received;
3770 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3771 			/*
3772 			 * Not done with the tcplen yet, wait for more
3773 			 * data to become available.
3774 			 */
3775 			return;
3776 		}
3777 
3778 		assert(data->bytes_transmitted == sizeof(uint16_t));
3779 
3780 		data->query->tcplen = ntohs(data->query->tcplen);
3781 
3782 		/*
3783 		 * Minimum query size is:
3784 		 *
3785 		 *     Size of the header (12)
3786 		 *   + Root domain name   (1)
3787 		 *   + Query class        (2)
3788 		 *   + Query type         (2)
3789 		 */
3790 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3791 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3792 			cleanup_tcp_handler(data);
3793 			return;
3794 		}
3795 
3796 		if (data->query->tcplen > data->query->maxlen) {
3797 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3798 			cleanup_tcp_handler(data);
3799 			return;
3800 		}
3801 
3802 		buffer_set_limit(data->query->packet, data->query->tcplen);
3803 	}
3804 
3805 	assert(buffer_remaining(data->query->packet) > 0);
3806 
3807 	/* Read the (remaining) query data.  */
3808 	ERR_clear_error();
3809 	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
3810 			    (int)buffer_remaining(data->query->packet));
3811 	if(received <= 0) {
3812 		int want = SSL_get_error(data->tls, received);
3813 		if(want == SSL_ERROR_ZERO_RETURN) {
3814 			cleanup_tcp_handler(data);
3815 			return; /* shutdown, closed */
3816 		} else if(want == SSL_ERROR_WANT_READ) {
3817 			/* wants to be called again */
3818 			return;
3819 		}
3820 		else if(want == SSL_ERROR_WANT_WRITE) {
3821 			/* switch back to writing */
3822 			data->shake_state = tls_hs_write_event;
3823 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
3824 			return;
3825 		}
3826 		cleanup_tcp_handler(data);
3827 		log_crypto_err("could not SSL_read");
3828 		return;
3829 	}
3830 
3831 	data->bytes_transmitted += received;
3832 	buffer_skip(data->query->packet, received);
3833 	if (buffer_remaining(data->query->packet) > 0) {
3834 		/*
3835 		 * Message not yet complete, wait for more data to
3836 		 * become available.
3837 		 */
3838 		return;
3839 	}
3840 
3841 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3842 
3843 	/* Account... */
3844 #ifndef INET6
3845 	STATUP(data->nsd, ctls);
3846 #else
3847 	if (data->query->addr.ss_family == AF_INET) {
3848 		STATUP(data->nsd, ctls);
3849 	} else if (data->query->addr.ss_family == AF_INET6) {
3850 		STATUP(data->nsd, ctls6);
3851 	}
3852 #endif
3853 
3854 	/* We have a complete query, process it.  */
3855 
3856 	/* Increment the per-connection counter for the tcp-query-count limit. */
3857 	data->query_count++;
3858 
3859 	buffer_flip(data->query->packet);
3860 #ifdef USE_DNSTAP
3861 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
3862 		data->query->addrlen, data->query->tcp, data->query->packet);
3863 #endif /* USE_DNSTAP */
3864 	data->query_state = server_process_query(data->nsd, data->query);
3865 	if (data->query_state == QUERY_DISCARDED) {
3866 		/* Drop the packet and the entire connection... */
3867 		STATUP(data->nsd, dropped);
3868 		ZTATUP(data->nsd, data->query->zone, dropped);
3869 		cleanup_tcp_handler(data);
3870 		return;
3871 	}
3872 
3873 #ifdef BIND8_STATS
3874 	if (RCODE(data->query->packet) == RCODE_OK
3875 	    && !AA(data->query->packet))
3876 	{
3877 		STATUP(data->nsd, nona);
3878 		ZTATUP(data->nsd, data->query->zone, nona);
3879 	}
3880 #endif /* BIND8_STATS */
3881 
3882 #ifdef USE_ZONE_STATS
3883 #ifndef INET6
3884 	ZTATUP(data->nsd, data->query->zone, ctls);
3885 #else
3886 	if (data->query->addr.ss_family == AF_INET) {
3887 		ZTATUP(data->nsd, data->query->zone, ctls);
3888 	} else if (data->query->addr.ss_family == AF_INET6) {
3889 		ZTATUP(data->nsd, data->query->zone, ctls6);
3890 	}
3891 #endif
3892 #endif /* USE_ZONE_STATS */
3893 
3894 	query_add_optional(data->query, data->nsd);
3895 
3896 	/* Switch to the tcp write handler.  */
3897 	buffer_flip(data->query->packet);
3898 	data->query->tcplen = buffer_remaining(data->query->packet);
3899 #ifdef BIND8_STATS
3900 	/* Account the rcode & TC... */
3901 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
3902 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
3903 	if (TC(data->query->packet)) {
3904 		STATUP(data->nsd, truncated);
3905 		ZTATUP(data->nsd, data->query->zone, truncated);
3906 	}
3907 #endif /* BIND8_STATS */
3908 #ifdef USE_DNSTAP
3909 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
3910 		data->query->addrlen, data->query->tcp, data->query->packet,
3911 		data->query->zone);
3912 #endif /* USE_DNSTAP */
3913 	data->bytes_transmitted = 0;
3914 
3915 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
3916 
3917 	/* See if we can write the answer right away (usually we can; EAGAIN if not). */
3918 	handle_tls_writing(fd, EV_WRITE, data);
3919 }
3920 
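/*
 * handle_tcp_reading and handle_tls_reading share the same two-phase
 * parse: first accumulate the two-byte length prefix, then read
 * exactly that many payload bytes before dispatching the query.  A
 * condensed blocking sketch (names illustrative, not part of nsd; a
 * real reader must also loop on short reads):
 */
#if 0
static ssize_t
example_read_tcp_dns_message(int fd, uint8_t* buf, size_t bufsz)
{
	uint16_t len;
	if(read(fd, &len, sizeof(len)) != (ssize_t)sizeof(len))
		return -1;
	len = ntohs(len);
	if(len > bufsz)
		return -1; /* message would overflow the caller's buffer */
	return read(fd, buf, len);
}
#endif
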
3921 /** handle TLS writing of outgoing response */
3922 static void
3923 handle_tls_writing(int fd, short event, void* arg)
3924 {
3925 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3926 	ssize_t sent;
3927 	struct query *q = data->query;
3928 	/* static buffer used to reassemble the response so the two-byte
3929 	 * TCP length precedes the packet, emulating writev */
3930 	static buffer_type* global_tls_temp_buffer = NULL;
3931 	buffer_type* write_buffer;
3932 
3933 	if ((event & EV_TIMEOUT)) {
3934 		/* Connection timed out.  */
3935 		cleanup_tcp_handler(data);
3936 		return;
3937 	}
3938 
3939 	assert((event & EV_WRITE));
3940 
3941 	if(data->shake_state != tls_hs_none) {
3942 		if(!tls_handshake(data, fd, 1))
3943 			return;
3944 		if(data->shake_state != tls_hs_none)
3945 			return;
3946 	}
3947 
3948 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
3949 
3950 	/* If we are writing the start of a message, we must include the length;
3951 	 * this is done with a copy into write_buffer. */
3952 	write_buffer = NULL;
3953 	if (data->bytes_transmitted == 0) {
3954 		if(!global_tls_temp_buffer) {
3955 			/* allocated from nsd.region, so it is deallocated
3956 			 * when nsd shuts down */
3957 			global_tls_temp_buffer = buffer_create(nsd.region,
3958 				QIOBUFSZ + sizeof(q->tcplen));
3959 			if (!global_tls_temp_buffer) {
3960 				return;
3961 			}
3962 		}
3963 		write_buffer = global_tls_temp_buffer;
3964 		buffer_clear(write_buffer);
3965 		buffer_write_u16(write_buffer, q->tcplen);
3966 		buffer_write(write_buffer, buffer_current(q->packet),
3967 			(int)buffer_remaining(q->packet));
3968 		buffer_flip(write_buffer);
3969 	} else {
3970 		write_buffer = q->packet;
3971 	}
3972 
3973 	/* Write the response */
3974 	ERR_clear_error();
3975 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
3976 	if(sent <= 0) {
3977 		int want = SSL_get_error(data->tls, sent);
3978 		if(want == SSL_ERROR_ZERO_RETURN) {
3979 			cleanup_tcp_handler(data);
3980 			/* closed */
3981 		} else if(want == SSL_ERROR_WANT_READ) {
3982 			/* switch back to reading */
3983 			data->shake_state = tls_hs_read_event;
3984 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
3985 		} else if(want != SSL_ERROR_WANT_WRITE) {
3986 			cleanup_tcp_handler(data);
3987 			log_crypto_err("could not SSL_write");
3988 		}
3989 		return;
3990 	}
3991 
3992 	buffer_skip(write_buffer, sent);
3993 	if(buffer_remaining(write_buffer) != 0) {
3994 		/* Not everything was sent; if the temporary buffer was used, keep the real packet buffer in sync by skipping the payload bytes that went out. */
3995 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
3996 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
3997 		}
3998 	}
3999 
4000 	data->bytes_transmitted += sent;
4001 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4002 		/*
4003 		 * Still more data to write when socket becomes
4004 		 * writable again.
4005 		 */
4006 		return;
4007 	}
4008 
4009 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4010 
4011 	if (data->query_state == QUERY_IN_AXFR) {
4012 		/* Continue processing AXFR and writing back results.  */
4013 		buffer_clear(q->packet);
4014 		data->query_state = query_axfr(data->nsd, q);
4015 		if (data->query_state != QUERY_PROCESSED) {
4016 			query_add_optional(data->query, data->nsd);
4017 
4018 			/* Reset data. */
4019 			buffer_flip(q->packet);
4020 			q->tcplen = buffer_remaining(q->packet);
4021 			data->bytes_transmitted = 0;
4022 			/* Reset to writing mode.  */
4023 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4024 
4025 			/*
4026 			 * Write data if/when the socket is writable
4027 			 * again.
4028 			 */
4029 			return;
4030 		}
4031 	}
4032 
4033 	/*
4034 	 * Done sending, wait for the next request to arrive on the
4035 	 * TCP socket by installing the TCP read handler.
4036 	 */
4037 	if (data->nsd->tcp_query_count > 0 &&
4038 		data->query_count >= data->nsd->tcp_query_count) {
4039 
4040 		(void) shutdown(fd, SHUT_WR);
4041 	}
4042 
4043 	data->bytes_transmitted = 0;
4044 
4045 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4046 }
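
/*
 * Unlike the plain-TCP writer, TLS has no writev(), so the handler
 * above copies the length prefix and payload into one buffer and
 * enables SSL_MODE_ENABLE_PARTIAL_WRITE so that SSL_write may return
 * after sending only part of it; without that mode OpenSSL reports
 * success only once the whole buffer is written.  A blocking-socket
 * sketch of draining a buffer under partial-write mode (illustrative):
 */
#if 0
static ssize_t
example_ssl_write_all(SSL* ssl, const uint8_t* buf, size_t len)
{
	size_t off = 0;
	(void)SSL_set_mode(ssl, SSL_MODE_ENABLE_PARTIAL_WRITE);
	while(off < len) {
		int r = SSL_write(ssl, buf + off, (int)(len - off));
		if(r <= 0)
			return -1; /* inspect SSL_get_error in real code */
		off += (size_t)r;
	}
	return (ssize_t)off;
}
#endif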
4047 #endif
4048 
4049 static void
4050 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4051 	void* ATTR_UNUSED(arg))
4052 {
4053 	if(slowaccept) {
4054 		configure_handler_event_types(EV_PERSIST | EV_READ);
4055 		slowaccept = 0;
4056 	}
4057 }
4058 
4059 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4060 {
4061 #ifndef HAVE_ACCEPT4
4062 	int s = accept(fd, addr, addrlen);
4063 	if (s != -1) {
4064 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4065 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4066 			close(s);
4067 			s = -1;
4068 			errno=EINTR; /* suppress the error printout: the
4069 				caller treats EINTR as a transient accept
4070 				failure and does not log it */
4071 		}
4072 	}
4073 	return s;
4074 #else
4075 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4076 #endif /* HAVE_ACCEPT4 */
4077 }
4078 
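/*
 * accept4(SOCK_NONBLOCK) sets the nonblocking flag atomically with the
 * accept; the fallback path needs a separate fcntl and can therefore
 * fail after the connection is already accepted.  Caller-side sketch
 * (illustrative; handle_tcp_accept below is the real caller):
 */
#if 0
	struct sockaddr_storage sa;
	socklen_t salen = sizeof(sa);
	int s = perform_accept(listen_fd, (struct sockaddr *)&sa, &salen);
	if(s == -1 && errno != EINTR && errno != EWOULDBLOCK)
		log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
#endif
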
4079 /*
4080  * Handle an incoming TCP connection.  The connection is accepted and
4081  * a new TCP reader event handler is added.  The TCP handler
4082  * is responsible for cleanup when the connection is closed.
4083  */
4084 static void
4085 handle_tcp_accept(int fd, short event, void* arg)
4086 {
4087 	struct tcp_accept_handler_data *data
4088 		= (struct tcp_accept_handler_data *) arg;
4089 	int s;
4090 	int reject = 0;
4091 	struct tcp_handler_data *tcp_data;
4092 	region_type *tcp_region;
4093 #ifdef INET6
4094 	struct sockaddr_storage addr;
4095 #else
4096 	struct sockaddr_in addr;
4097 #endif
4098 	socklen_t addrlen;
4099 	struct timeval timeout;
4100 
4101 	if (!(event & EV_READ)) {
4102 		return;
4103 	}
4104 
4105 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4106 		reject = data->nsd->options->tcp_reject_overflow;
4107 		if (!reject) {
4108 			return;
4109 		}
4110 	}
4111 
4112 	/* Accept it... */
4113 	addrlen = sizeof(addr);
4114 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4115 	if (s == -1) {
4116 		/*
4117 		 * EMFILE and ENFILE signal that the limit on open file
4118 		 * descriptors has been reached, so pause accept().  EINTR
4119 		 * means the call was interrupted by a signal.  The others
4120 		 * are various OS ways of saying the client closed the connection.
4121 		 */
4122 		if (errno == EMFILE || errno == ENFILE) {
4123 			if (!slowaccept) {
4124 				/* disable accept events */
4125 				struct timeval tv;
4126 				configure_handler_event_types(0);
4127 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4128 				tv.tv_usec = 0L;
4129 				memset(&slowaccept_event, 0,
4130 					sizeof(slowaccept_event));
4131 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4132 					handle_slowaccept_timeout, NULL);
4133 				(void)event_base_set(data->event.ev_base,
4134 					&slowaccept_event);
4135 				(void)event_add(&slowaccept_event, &tv);
4136 				slowaccept = 1;
4137 				/* We don't want to spam the logs here */
4138 			}
4139 		} else if (errno != EINTR
4140 			&& errno != EWOULDBLOCK
4141 #ifdef ECONNABORTED
4142 			&& errno != ECONNABORTED
4143 #endif /* ECONNABORTED */
4144 #ifdef EPROTO
4145 			&& errno != EPROTO
4146 #endif /* EPROTO */
4147 			) {
4148 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4149 		}
4150 		return;
4151 	}
4152 
4153 	if (reject) {
4154 		shutdown(s, SHUT_RDWR);
4155 		close(s);
4156 		return;
4157 	}
4158 
4159 	/*
4160 	 * This region is deallocated when the TCP connection is
4161 	 * closed by the TCP handler.
4162 	 */
4163 	tcp_region = region_create(xalloc, free);
4164 	tcp_data = (struct tcp_handler_data *) region_alloc(
4165 		tcp_region, sizeof(struct tcp_handler_data));
4166 	tcp_data->region = tcp_region;
4167 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4168 		compression_table_size, compressed_dnames);
4169 	tcp_data->nsd = data->nsd;
4170 	tcp_data->query_count = 0;
4171 #ifdef HAVE_SSL
4172 	tcp_data->shake_state = tls_hs_none;
4173 	tcp_data->tls = NULL;
4174 #endif
4175 	tcp_data->prev = NULL;
4176 	tcp_data->next = NULL;
4177 
4178 	tcp_data->query_state = QUERY_PROCESSED;
4179 	tcp_data->bytes_transmitted = 0;
4180 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4181 	tcp_data->query->addrlen = addrlen;
4182 
4183 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4184 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4185 		/* very busy, give smaller timeout */
4186 		tcp_data->tcp_timeout = 200;
4187 	}
4188 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4189 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4190 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
4191 
4192 #ifdef HAVE_SSL
4193 	if (data->tls_accept) {
4194 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4195 		if(!tcp_data->tls) {
4196 			close(s);
4197 			return;
4198 		}
4199 		tcp_data->shake_state = tls_hs_read;
4200 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4201 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4202 			  handle_tls_reading, tcp_data);
4203 	} else {
4204 #endif
4205 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4206 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4207 			  handle_tcp_reading, tcp_data);
4208 #ifdef HAVE_SSL
4209 	}
4210 #endif
4211 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4212 		log_msg(LOG_ERR, "cannot set tcp event base");
4213 		close(s);
4214 		region_destroy(tcp_region);
4215 		return;
4216 	}
4217 	if(event_add(&tcp_data->event, &timeout) != 0) {
4218 		log_msg(LOG_ERR, "cannot add tcp to event base");
4219 		close(s);
4220 		region_destroy(tcp_region);
4221 		return;
4222 	}
4223 	if(tcp_active_list) {
4224 		tcp_active_list->prev = tcp_data;
4225 		tcp_data->next = tcp_active_list;
4226 	}
4227 	tcp_active_list = tcp_data;
4228 
4229 	/*
4230 	 * Keep track of the total number of TCP handlers installed so
4231 	 * we can stop accepting connections when the maximum number
4232 	 * of simultaneous TCP connections is reached.
4233 	 *
4234 	 * If tcp-reject-overflow is enabled, however, then we do not
4235 	 * change the handler event type; we keep it as-is and accept
4236 	 * overflow TCP connections only so that we can forcibly kill
4237 	 * them off.
4238 	 */
4239 	++data->nsd->current_tcp_count;
4240 	if (!data->nsd->options->tcp_reject_overflow &&
4241 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4242 	{
4243 		configure_handler_event_types(0);
4244 	}
4245 }
4246 
4247 static void
4248 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4249 {
4250 	size_t i;
4251 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4252 	for (i = 0; i < nsd->child_count; ++i) {
4253 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4254 			if (write(nsd->children[i].child_fd,
4255 				&command,
4256 				sizeof(command)) == -1)
4257 			{
4258 				if(errno != EAGAIN && errno != EINTR)
4259 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4260 					(int) command,
4261 					(int) nsd->children[i].pid,
4262 					strerror(errno));
4263 			} else if (timeout > 0) {
4264 				(void)block_read(NULL,
4265 					nsd->children[i].child_fd,
4266 					&command, sizeof(command), timeout);
4267 			}
4268 			fsync(nsd->children[i].child_fd);
4269 			close(nsd->children[i].child_fd);
4270 			nsd->children[i].child_fd = -1;
4271 		}
4272 	}
4273 }
4274 
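/*
 * send_children_command writes a single sig_atomic_t command value
 * down each child's socketpair and, when a timeout is given, waits for
 * the child to echo it back before closing the descriptor.  A matching
 * child-side read, shown only as a sketch (the real handling lives in
 * the IPC code):
 */
#if 0
static int
example_child_read_command(int fd, sig_atomic_t* cmd)
{
	ssize_t r = read(fd, cmd, sizeof(*cmd));
	return r == (ssize_t)sizeof(*cmd); /* 0 on EOF or error */
}
#endif
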
4275 static void
4276 send_children_quit(struct nsd* nsd)
4277 {
4278 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4279 	send_children_command(nsd, NSD_QUIT, 0);
4280 }
4281 
4282 static void
4283 send_children_quit_and_wait(struct nsd* nsd)
4284 {
4285 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4286 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4287 }
4288 
4289 #ifdef BIND8_STATS
4290 static void
4291 set_children_stats(struct nsd* nsd)
4292 {
4293 	size_t i;
4294 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4295 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
4296 	for (i = 0; i < nsd->child_count; ++i) {
4297 		nsd->children[i].need_to_send_STATS = 1;
4298 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
4299 	}
4300 }
4301 #endif /* BIND8_STATS */
4302 
4303 static void
4304 configure_handler_event_types(short event_types)
4305 {
4306 	size_t i;
4307 
4308 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4309 		struct event* handler = &tcp_accept_handlers[i].event;
4310 		if(event_types) {
4311 			/* reassign */
4312 			int fd = handler->ev_fd;
4313 			struct event_base* base = handler->ev_base;
4314 			if(tcp_accept_handlers[i].event_added)
4315 				event_del(handler);
4316 			memset(handler, 0, sizeof(*handler));
4317 			event_set(handler, fd, event_types,
4318 				handle_tcp_accept, &tcp_accept_handlers[i]);
4319 			if(event_base_set(base, handler) != 0)
4320 				log_msg(LOG_ERR, "conhand: cannot event_base");
4321 			if(event_add(handler, NULL) != 0)
4322 				log_msg(LOG_ERR, "conhand: cannot event_add");
4323 			tcp_accept_handlers[i].event_added = 1;
4324 		} else {
4325 			/* remove */
4326 			if(tcp_accept_handlers[i].event_added) {
4327 				event_del(handler);
4328 				tcp_accept_handlers[i].event_added = 0;
4329 			}
4330 		}
4331 	}
4332 }
4333
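
/*
 * configure_handler_event_types(0) removes every TCP accept event;
 * passing EV_PERSIST|EV_READ re-installs them.  This is how accepting
 * is paused under EMFILE/ENFILE (resumed by handle_slowaccept_timeout)
 * and when maximum_tcp_count is reached (resumed as connections
 * close).  Usage sketch:
 */
#if 0
	configure_handler_event_types(0);                    /* stop accepting */
	/* ... connections drain, or the slow-accept timer fires ... */
	configure_handler_event_types(EV_PERSIST | EV_READ); /* resume */
#endif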