/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
  #include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_TCP_FASTOPEN
  #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
  #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
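  /* On Linux, net.ipv4.tcp_fastopen is a bit mask: 0x1 enables client
     support and 0x2 enables server support, so a value of 3 enables
     both; the mask above tests the server bit. */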
#endif

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd        *nsd;
	struct nsd_socket *socket;
	struct event       event;
};

struct tcp_accept_handler_data {
	struct nsd        *nsd;
	struct nsd_socket *socket;
	int                event_added;
	struct event       event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int                tls_accept;
#endif
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

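/* Back-off state for the TCP accept handlers: after accept() fails with a
 * transient resource error, accepting is presumably paused briefly and this
 * event is used to re-enable it. */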
static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

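/* Systems without recvmmsg()/sendmmsg() also lack struct mmsghdr; declare a
 * compatible structure so the rest of the code can use a single code path. */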
#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int  msg_len;
};
#endif

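/* Per-process scratch buffers for receiving up to NUM_RECV_PER_SELECT
 * datagrams per wakeup. */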
static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by errno
 * being set to EAGAIN) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler.  When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;
#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets.  These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type.  This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit command (nonblocking), then close the pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

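/* Dname compression state: offsets into the response packet, indexed by
 * domain number; the table is (re)sized to track the domain table. */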
static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void) {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	close(tcp_fastopen_fp);
}
#endif

/*
 * Remove the specified pid from the list of child pids.  Returns -1 if
 * the pid is not in the list, child_num otherwise.  The field is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge");
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACHED */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; we want to avoid both writing to the
 * same statistics arrays. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

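/* The set_* socket helpers below share a return convention: 1 on success,
 * 0 if the option is not supported on this platform, -1 on error. */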
static int
set_cloexec(struct nsd_socket *sock)
{
	assert(sock != NULL);

	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
		const char *socktype =
			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
	int on = 1;
#ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT on Linux.  This is what users want from the config
	 * option in nsd.conf; if they actually need local address and port
	 * reuse they will also have SO_REUSEPORT set, so assume it is _LB
	 * they want.
	 */
	int opt = SO_REUSEPORT_LB;
	static const char optname[] = "SO_REUSEPORT_LB";
#else /* !SO_REUSEPORT_LB */
	int opt = SO_REUSEPORT;
	static const char optname[] = "SO_REUSEPORT";
#endif /* SO_REUSEPORT_LB */

	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
		return 1;
	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
			optname, strerror(errno));
	}
	return -1;
#else
	(void)sock;
#endif /* SO_REUSEPORT */

	return 0;
}

static int
set_reuseaddr(struct nsd_socket *sock)
{
#ifdef SO_REUSEADDR
	int on = 1;
	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_REUSEADDR */
	return 0;
}

static int
set_rcvbuf(struct nsd_socket *sock, int rcv)
{
#ifdef SO_RCVBUF
#ifdef SO_RCVBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_RCVBUFFORCE */
	if (0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_RCVBUFFORCE */
#endif /* SO_RCVBUF */

	return 0;
}

static int
set_sndbuf(struct nsd_socket *sock, int snd)
{
#ifdef SO_SNDBUF
#ifdef SO_SNDBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_SNDBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_SNDBUFFORCE */
#endif /* SO_SNDBUF */

	return 0;
}

808 
809 static int
810 set_nonblock(struct nsd_socket *sock)
811 {
812 	const char *socktype =
813 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
814 
815 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
816 		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
817 			socktype, strerror(errno));
818 		return -1;
819 	}
820 
821 	return 1;
822 }
823 
static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef INET6
#ifdef IPV6_V6ONLY
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#endif /* IPV6_V6ONLY */
#endif /* INET6 */

	return 0;
}

static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU))
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network.  Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets.  The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
	 * to the MIN MTU to get the same behavior.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* INET6 */

	return 0;
}

static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0.  Fragmentation is allowed
	 * if and only if the packet size exceeds the outgoing interface MTU
	 * or the packet encounters a smaller MTU link in the network.  This
	 * mitigates DNS fragmentation attacks by preventing forged PMTU
	 * information.  FreeBSD already has the same semantics without
	 * setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platforms manage this setting
	differently for different address families (IPv4 vs IPv6).
	The scandalous preprocessor blob below abstracts such variability
	in a way that leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
#	define NSD_SOCKET_OPTION_TRANSPARENT 			IP_TRANSPARENT
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 		"IP_TRANSPARENT"
/* as of 2020-01, Linux does not support this on IPv6 programmatically */
#elif defined(SO_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT			SO_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 		"SO_BINDANY"
#elif defined(IP_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT 			IP_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT6			IPV6_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6		IPPROTO_IPV6
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 		"IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
#	endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
	return -1;
#endif

	return 0;
}

static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow.  The
	 * limit is a defense against IP spoofing attacks, as suggested in
	 * RFC 7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* The macOS implementation only supports a qlen of 1 via this call.
	 * The actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s; "
				 "this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity is enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
		                 "SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
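	/* Default to 1 MiB socket buffers; the configured receive/send
	 * buffer sizes from nsd.conf override these below. */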
	int rcv = 1*1024*1024, snd = 1*1024*1024;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(nsd->options->receive_buffer_size > 0)
		rcv = nsd->options->receive_buffer_size;
	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(nsd->options->send_buffer_size > 0)
		snd = nsd->options->send_buffer_size;
	if(set_sndbuf(sock, snd) == -1)
		return -1;
#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable.
	 */
	set_nonblock(sock);

	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}

static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
			                     "not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;
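		/* nsd->reuseport evidently holds the number of server
		 * instances here, so each server gets its own set of
		 * socket file descriptors; ifs is the new array size. */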

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i].addr = nsd->udp[i%nsd->ifs].addr;
			nsd->udp[i].servers = nsd->udp[i%nsd->ifs].servers;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
			 * This means we should not close TCP sockets used by
			 * other servers in reuseport enabled mode, in
			 * server_child().
			 */
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	return 0;
}

/*
 * Prepare the server for take off.
 *
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
#  ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
#  endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef	BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially.  */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

static void
server_close_socket(struct nsd_socket *sock)
{
	if(sock->s != -1) {
		close(sock->s);
		sock->s = -1;
	}
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		server_close_socket(&sockets[i]);
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_close_udb(nsd->db); /* keeps mmap */
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}


void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* NOTREACHED */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 *   parent fills one taskdb with soas, xfrd fills other with expires.
	 *   then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
	 *   expire notifications can be sent back via a normal reload later
	 *   (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task list (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}

#ifdef HAVE_SSL
static void
log_crypto_from_err(const char* str, unsigned long err)
{
	/* error:[error code]:[library name]:[function name]:[reason string] */
	char buf[128];
	unsigned long e;
	ERR_error_string_n(err, buf, sizeof(buf));
	log_msg(LOG_ERR, "%s crypto %s", str, buf);
	while( (e=ERR_get_error()) ) {
		ERR_error_string_n(e, buf, sizeof(buf));
		log_msg(LOG_ERR, "and additionally crypto %s", buf);
	}
}

void
log_crypto_err(const char* str)
{
	log_crypto_from_err(str, ERR_get_error());
}

/** true if the ssl handshake error has to be squelched from the logs */
static int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	/* this is very specific, we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
#  ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
#  endif
#endif
		)
		return 1;
	return 0;
}

void
perform_openssl_init(void)
{
	/* init SSL library */
#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
	ERR_load_crypto_strings();
#endif
	ERR_load_SSL_strings();
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
	OpenSSL_add_all_algorithms();
#else
	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
		| OPENSSL_INIT_ADD_ALL_DIGESTS
		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
	(void)SSL_library_init();
#else
	OPENSSL_init_ssl(0, NULL);
#endif

	if(!RAND_status()) {
		/* try to seed it */
		unsigned char buf[256];
		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
		size_t i;
		v = seed;
		for(i=0; i<256/sizeof(v); i++) {
			memmove(buf+i*sizeof(v), &v, sizeof(v));
			v = v*seed + (unsigned int)i;
		}
		RAND_seed(buf, 256);
		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
	}
}
1795 
1796 static int
1797 get_ocsp(char *filename, unsigned char **ocsp)
1798 {
1799 	BIO *bio;
1800 	OCSP_RESPONSE *response;
1801 	int len = -1;
1802 	unsigned char *p, *buf;
1803 	assert(filename);
1804 
1805 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1806 		log_crypto_err("get_ocsp: BIO_new_file failed");
1807 		return -1;
1808 	}
1809 
1810 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1811 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1812 		BIO_free(bio);
1813 		return -1;
1814 	}
1815 
1816 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1817 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1818 		OCSP_RESPONSE_free(response);
1819 		BIO_free(bio);
1820 		return -1;
1821 	}
1822 
1823 	if ((buf = malloc((size_t) len)) == NULL) {
1824 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1825 		OCSP_RESPONSE_free(response);
1826 		BIO_free(bio);
1827 		return -1;
1828 	}
1829 
1830 	p = buf;
1831 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1832 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1833 		free(buf);
1834 		OCSP_RESPONSE_free(response);
1835 		BIO_free(bio);
1836 		return -1;
1837 	}
1838 
1839 	OCSP_RESPONSE_free(response);
1840 	BIO_free(bio);
1841 
1842 	*ocsp = buf;
1843 	return len;
1844 }
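/*
 * Note on the pattern above: the two i2d_OCSP_RESPONSE() calls follow
 * the standard OpenSSL i2d idiom. With a NULL output pointer the call
 * only computes the encoded length; the second call then writes the DER
 * bytes into the allocated buffer. The separate cursor 'p' is needed
 * because i2d advances the pointer it is given, while 'buf' must keep
 * pointing at the start of the encoding.
 */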
1845 
1846 /* further setup ssl ctx after the keys are loaded */
1847 static void
1848 listen_sslctx_setup_2(void* ctxt)
1849 {
1850 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
1851 	(void)ctx;
1852 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1853 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1854 		log_crypto_err("Error in SSL_CTX_set_ecdh_auto, not enabling ECDHE");
1855 	}
1856 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1857 	if(1) {
1858 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1859 		if (!ecdh) {
1860 			log_crypto_err("could not find p256, not enabling ECDHE");
1861 		} else {
1862 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1863 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1864 			}
1865 			EC_KEY_free (ecdh);
1866 		}
1867 	}
1868 #endif
1869 }
1870 
1871 static int
1872 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1873 {
1874 	if(ocspdata) {
1875 		unsigned char *p;
1876 		if ((p=malloc(ocspdata_len)) == NULL) {
1877 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1878 			return SSL_TLSEXT_ERR_NOACK;
1879 		}
1880 		memcpy(p, ocspdata, ocspdata_len);
1881 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1882 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1883 			free(p);
1884 			return SSL_TLSEXT_ERR_NOACK;
1885 		}
1886 		return SSL_TLSEXT_ERR_OK;
1887 	} else {
1888 		return SSL_TLSEXT_ERR_NOACK;
1889 	}
1890 }
1891 
1892 SSL_CTX*
1893 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1894 {
1895 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1896 	if(!ctx) {
1897 		log_crypto_err("could not SSL_CTX_new");
1898 		return NULL;
1899 	}
1900 	/* no SSLv2, SSLv3 because they have defects */
1901 #if SSL_OP_NO_SSLv2 != 0
1902 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1903 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1904 		SSL_CTX_free(ctx);
1905 		return NULL;
1906 	}
1907 #endif
1908 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1909 		!= SSL_OP_NO_SSLv3){
1910 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1911 		SSL_CTX_free(ctx);
1912 		return NULL;
1913 	}
1914 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1915 	/* if we have TLS 1.1, disable 1.0 */
1916 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1917 		!= SSL_OP_NO_TLSv1){
1918 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1919 		SSL_CTX_free(ctx);
1920 		return NULL;
1921 	}
1922 #endif
1923 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1924 	/* if we have TLS 1.2, disable 1.1 */
1925 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1926 		!= SSL_OP_NO_TLSv1_1){
1927 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
1928 		SSL_CTX_free(ctx);
1929 		return NULL;
1930 	}
1931 #endif
1932 #if defined(SSL_OP_NO_RENEGOTIATION)
1933 	/* disable client renegotiation */
1934 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
1935 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
1936 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
1937 		SSL_CTX_free(ctx);
1938 		return NULL;
1939 	}
1940 #endif
1941 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
1942 	/* if we have sha256, set the cipher list to have no known vulns */
1943 	if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
1944 		log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
1945 #endif
1946 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
1947 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
1948 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
1949 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
1950 		SSL_CTX_free(ctx);
1951 		return NULL;
1952 	}
1953 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
1954 	SSL_CTX_set_security_level(ctx, 0);
1955 #endif
1956 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
1957 		log_msg(LOG_ERR, "error for cert file: %s", pem);
1958 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
1959 		SSL_CTX_free(ctx);
1960 		return NULL;
1961 	}
1962 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
1963 		log_msg(LOG_ERR, "error for private key file: %s", key);
1964 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
1965 		SSL_CTX_free(ctx);
1966 		return NULL;
1967 	}
1968 	if(!SSL_CTX_check_private_key(ctx)) {
1969 		log_msg(LOG_ERR, "error for key file: %s", key);
1970 		log_crypto_err("Error in SSL_CTX check_private_key");
1971 		SSL_CTX_free(ctx);
1972 		return NULL;
1973 	}
1974 	listen_sslctx_setup_2(ctx);
1975 	if(verifypem && verifypem[0]) {
1976 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
1977 			log_crypto_err("Error in SSL_CTX verify locations");
1978 			SSL_CTX_free(ctx);
1979 			return NULL;
1980 		}
1981 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
1982 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
1983 	}
1984 	return ctx;
1985 }
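/*
 * A minimal sketch, assuming standard OpenSSL calls (the actual accept
 * path lives elsewhere in this file), of how a context built here is
 * attached to an accepted socket 'fd':
 *
 *	SSL *ssl = SSL_new(ctx);
 *	if(ssl && SSL_set_fd(ssl, fd) == 1)
 *		r = SSL_accept(ssl);
 *
 * On nonblocking sockets the handshake is retried when
 * SSL_get_error(ssl, r) reports SSL_ERROR_WANT_READ or
 * SSL_ERROR_WANT_WRITE.
 */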
1986 
1987 SSL_CTX*
1988 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
1989 {
1990 	char *key, *pem;
1991 	SSL_CTX *ctx;
1992 
1993 	key = nsd->options->tls_service_key;
1994 	pem = nsd->options->tls_service_pem;
1995 	if(!key || key[0] == 0) {
1996 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
1997 		return NULL;
1998 	}
1999 	if(!pem || pem[0] == 0) {
2000 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2001 		return NULL;
2002 	}
2003 
2004 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting
2005 	 * SSL, but draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2. */
2006 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2007 	if(!ctx) {
2008 		log_msg(LOG_ERR, "could not setup server TLS context");
2009 		return NULL;
2010 	}
2011 	if(ocspfile && ocspfile[0]) {
2012 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2013 			log_crypto_err("Error reading OCSP file");
2014 			SSL_CTX_free(ctx);
2015 			return NULL;
2016 		} else {
2017 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2018 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2019 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2020 				SSL_CTX_free(ctx);
2021 				return NULL;
2022 			}
2023 		}
2024 	}
2025 	return ctx;
2026 }
2027 
2028 /* check if tcp_accept_handler_data was created for the dedicated TLS port */
2029 int
2030 using_tls_port(struct sockaddr* addr, const char* tls_port)
2031 {
2032 	in_port_t port = 0;
2033 
2034 	if (addr->sa_family == AF_INET)
2035 		port = ((struct sockaddr_in*)addr)->sin_port;
2036 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2037 	else
2038 		port = ((struct sockaddr_in6*)addr)->sin6_port;
2039 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2040 	if (atoi(tls_port) == ntohs(port))
2041 		return 1;
2042 
2043 	return 0;
2044 }
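/*
 * Example (hypothetical values): with tls-port "853", an accept handler
 * whose socket is bound to port 853 is marked for TLS, because
 * using_tls_port(addr, "853") returns 1 for that address.
 */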
2045 #endif
2046 
2047 /* pass timeout=-1 to block. Returns the size read, 0 (closed), -1 (error), or -2 (timeout) */
2048 ssize_t
2049 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2050 {
2051 	uint8_t* buf = (uint8_t*) p;
2052 	ssize_t total = 0;
2053 	struct pollfd fd;
2054 	memset(&fd, 0, sizeof(fd));
2055 	fd.fd = s;
2056 	fd.events = POLLIN;
2057 
2058 	while(total < sz) {
2059 		ssize_t ret;
2060 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2061 		if(ret == -1) {
2062 			if(errno == EAGAIN)
2063 				/* blocking read */
2064 				continue;
2065 			if(errno == EINTR) {
2066 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2067 					return -1;
2068 				/* other signals can be handled later */
2069 				continue;
2070 			}
2071 			/* some error */
2072 			return -1;
2073 		}
2074 		if(ret == 0) {
2075 			/* operation timed out */
2076 			return -2;
2077 		}
2078 		ret = read(s, buf+total, sz-total);
2079 		if(ret == -1) {
2080 			if(errno == EAGAIN)
2081 				/* blocking read */
2082 				continue;
2083 			if(errno == EINTR) {
2084 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2085 					return -1;
2086 				/* other signals can be handled later */
2087 				continue;
2088 			}
2089 			/* some error */
2090 			return -1;
2091 		}
2092 		if(ret == 0) {
2093 			/* closed connection! */
2094 			return 0;
2095 		}
2096 		total += ret;
2097 	}
2098 	return total;
2099 }
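/*
 * Usage sketch for block_read(), mirroring the callers below ('fd'
 * stands for one of the ipc sockets):
 *
 *	sig_atomic_t cmd;
 *	ssize_t r = block_read(nsd, fd, &cmd, sizeof(cmd),
 *		RELOAD_SYNC_TIMEOUT);
 *	if(r == sizeof(cmd))	... full command received ...
 *	else if(r == 0)		... peer closed the socket ...
 *	else if(r == -2)	... timed out, caller may retry ...
 *	else			... error, or interrupted by shutdown ...
 */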
2100 
2101 static void
2102 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2103 {
2104 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2105 	udb_ptr t, next;
2106 	udb_base* u = nsd->task[nsd->mytask];
2107 	udb_ptr_init(&next, u);
2108 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2109 	udb_base_set_userdata(u, 0);
2110 	while(!udb_ptr_is_null(&t)) {
2111 		/* store next in list so this one can be deleted or reused */
2112 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2113 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2114 
2115 		/* process task t */
2116 		/* append results for task t and update last_task */
2117 		task_process_in_reload(nsd, u, last_task, &t);
2118 
2119 		/* go to next */
2120 		udb_ptr_set_ptr(&t, u, &next);
2121 
2122 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2123 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2124 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2125 			if(cmd == NSD_QUIT) {
2126 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2127 				/* sync to disk (if needed) */
2128 				udb_base_sync(nsd->db->udb, 0);
2129 				/* unlink files of remainder of tasks */
2130 				while(!udb_ptr_is_null(&t)) {
2131 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2132 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2133 					}
2134 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2135 				}
2136 				udb_ptr_unlink(&t, u);
2137 				udb_ptr_unlink(&next, u);
2138 				exit(0);
2139 			}
2140 		}
2141 
2142 	}
2143 	udb_ptr_unlink(&t, u);
2144 	udb_ptr_unlink(&next, u);
2145 }
2146 
2147 #ifdef BIND8_STATS
2148 static void
2149 parent_send_stats(struct nsd* nsd, int cmdfd)
2150 {
2151 	size_t i;
2152 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2153 		log_msg(LOG_ERR, "could not write stats to reload");
2154 		return;
2155 	}
2156 	for(i=0; i<nsd->child_count; i++)
2157 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2158 			sizeof(stc_type))) {
2159 			log_msg(LOG_ERR, "could not write stats to reload");
2160 			return;
2161 		}
2162 }
2163 
2164 static void
2165 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2166 {
2167 	struct nsdst s;
2168 	stc_type* p;
2169 	size_t i;
2170 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2171 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2172 		log_msg(LOG_ERR, "could not read stats from oldpar");
2173 		return;
2174 	}
2175 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2176 	s.db_mem = region_get_mem(nsd->db->region);
2177 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2178 		nsd->child_count);
2179 	if(!p) return;
2180 	for(i=0; i<nsd->child_count; i++) {
2181 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2182 			sizeof(stc_type))
2183 			return;
2184 	}
2185 }
2186 #endif /* BIND8_STATS */
2187 
2188 /*
2189  * Reload the database, stop the parent, re-fork the children and
2190  * continue as server_main.
2191  */
2192 static void
2193 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2194 	int cmdsocket)
2195 {
2196 	pid_t mypid;
2197 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2198 	int ret;
2199 	udb_ptr last_task;
2200 	struct sigaction old_sigchld, ign_sigchld;
2201 	/* ignore SIGCHLD from the previous server_main that used this pid */
2202 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2203 	ign_sigchld.sa_handler = SIG_IGN;
2204 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2205 
2206 #ifdef HAVE_SETPROCTITLE
2207 	setproctitle("main");
2208 #endif
2209 #ifdef HAVE_CPUSET_T
2210 	if(nsd->use_cpu_affinity) {
2211 		set_cpu_affinity(nsd->cpuset);
2212 	}
2213 #endif
2214 
2215 	/* see what tasks we got from xfrd */
2216 	task_remap(nsd->task[nsd->mytask]);
2217 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2218 	udb_compact_inhibited(nsd->db->udb, 1);
2219 	reload_process_tasks(nsd, &last_task, cmdsocket);
2220 	udb_compact_inhibited(nsd->db->udb, 0);
2221 	udb_compact(nsd->db->udb);
2222 
2223 #ifndef NDEBUG
2224 	if(nsd_debug_level >= 1)
2225 		region_log_stats(nsd->db->region);
2226 #endif /* NDEBUG */
2227 	/* sync to disk (if needed) */
2228 	udb_base_sync(nsd->db->udb, 0);
2229 
2230 	initialize_dname_compression_tables(nsd);
2231 
2232 #ifdef BIND8_STATS
2233 	/* Restart dumping stats if required.  */
2234 	time(&nsd->st.boot);
2235 	set_bind8_alarm(nsd);
2236 #endif
2237 #ifdef USE_ZONE_STATS
2238 	server_zonestat_realloc(nsd); /* realloc for new children */
2239 	server_zonestat_switch(nsd);
2240 #endif
2241 
2242 	/* listen for the signals of failed children again */
2243 	sigaction(SIGCHLD, &old_sigchld, NULL);
2244 	/* Start new child processes */
2245 	if (server_start_children(nsd, server_region, netio, &nsd->
2246 		xfrd_listener->fd) != 0) {
2247 		send_children_quit(nsd);
2248 		exit(1);
2249 	}
2250 
2251 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2252 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2253 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2254 		if(cmd == NSD_QUIT) {
2255 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2256 			send_children_quit(nsd);
2257 			exit(0);
2258 		}
2259 	}
2260 
2261 	/* Send quit command to parent: blocking, wait for receipt. */
2262 	do {
2263 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2264 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2265 		{
2266 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2267 				strerror(errno));
2268 		}
2269 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2270 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2271 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2272 			RELOAD_SYNC_TIMEOUT);
2273 		if(ret == -2) {
2274 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2275 		}
2276 	} while (ret == -2);
2277 	if(ret == -1) {
2278 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2279 			strerror(errno));
2280 	}
2281 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2282 	if(cmd == NSD_QUIT) {
2283 		/* small race condition possible here, parent got quit cmd. */
2284 		send_children_quit(nsd);
2285 		exit(1);
2286 	}
2287 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2288 #ifdef BIND8_STATS
2289 	reload_do_stats(cmdsocket, nsd, &last_task);
2290 #endif
2291 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2292 	task_process_sync(nsd->task[nsd->mytask]);
2293 #ifdef USE_ZONE_STATS
2294 	server_zonestat_realloc(nsd); /* realloc for next children */
2295 #endif
2296 
2297 	/* send soainfo to the xfrd process, signal it that reload is done,
2298 	 * it picks up the taskudb */
2299 	cmd = NSD_RELOAD_DONE;
2300 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2301 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2302 			strerror(errno));
2303 	}
2304 	mypid = getpid();
2305 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2306 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2307 			strerror(errno));
2308 	}
2309 
2310 	/* try to reopen the log file */
2311 	if (nsd->file_rotation_ok)
2312 		log_reopen(nsd->log_filename, 1);
2313 	/* exit reload, continue as new server_main */
2314 }
2315 
2316 /*
2317  * Get the mode depending on the signal hints that have been received.
2318  * Multiple signal hints can be received and will be handled in turn.
2319  */
2320 static sig_atomic_t
2321 server_signal_mode(struct nsd *nsd)
2322 {
2323 	if(nsd->signal_hint_quit) {
2324 		nsd->signal_hint_quit = 0;
2325 		return NSD_QUIT;
2326 	}
2327 	else if(nsd->signal_hint_shutdown) {
2328 		nsd->signal_hint_shutdown = 0;
2329 		return NSD_SHUTDOWN;
2330 	}
2331 	else if(nsd->signal_hint_child) {
2332 		nsd->signal_hint_child = 0;
2333 		return NSD_REAP_CHILDREN;
2334 	}
2335 	else if(nsd->signal_hint_reload) {
2336 		nsd->signal_hint_reload = 0;
2337 		return NSD_RELOAD;
2338 	}
2339 	else if(nsd->signal_hint_reload_hup) {
2340 		nsd->signal_hint_reload_hup = 0;
2341 		return NSD_RELOAD_REQ;
2342 	}
2343 	else if(nsd->signal_hint_stats) {
2344 		nsd->signal_hint_stats = 0;
2345 #ifdef BIND8_STATS
2346 		set_bind8_alarm(nsd);
2347 #endif
2348 		return NSD_STATS;
2349 	}
2350 	else if(nsd->signal_hint_statsusr) {
2351 		nsd->signal_hint_statsusr = 0;
2352 		return NSD_STATS;
2353 	}
2354 	return NSD_RUN;
2355 }
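/*
 * For example, the SIGHUP hint: the signal handler sets
 * nsd->signal_hint_reload_hup, which this function maps to
 * NSD_RELOAD_REQ; the NSD_RELOAD_REQ case in server_main below then
 * forwards the reload request to the xfrd process.
 */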
2356 
2357 /*
2358  * The main server simply waits for signals and child processes to
2359  * terminate.  Child processes are restarted as necessary.
2360  */
2361 void
2362 server_main(struct nsd *nsd)
2363 {
2364 	region_type *server_region = region_create(xalloc, free);
2365 	netio_type *netio = netio_create(server_region);
2366 	netio_handler_type reload_listener;
2367 	int reload_sockets[2] = {-1, -1};
2368 	struct timespec timeout_spec;
2369 	int status;
2370 	pid_t child_pid;
2371 	pid_t reload_pid = -1;
2372 	sig_atomic_t mode;
2373 
2374 	/* Ensure we are the main process */
2375 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2376 
2377 	/* Add listener for the XFRD process */
2378 	netio_add_handler(netio, nsd->xfrd_listener);
2379 
2380 	/* Start the child processes that handle incoming queries */
2381 	if (server_start_children(nsd, server_region, netio,
2382 		&nsd->xfrd_listener->fd) != 0) {
2383 		send_children_quit(nsd);
2384 		exit(1);
2385 	}
2386 	reload_listener.fd = -1;
2387 
2388 	/* this_child MUST be 0, because this is the parent process */
2389 	assert(nsd->this_child == 0);
2390 
2391 	/* Run the server until we get a shutdown signal */
2392 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2393 		/* Did we receive a signal that changes our mode? */
2394 		if(mode == NSD_RUN) {
2395 			nsd->mode = mode = server_signal_mode(nsd);
2396 		}
2397 
2398 		switch (mode) {
2399 		case NSD_RUN:
2400 			/* see if any child processes terminated */
2401 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2402 				int is_child = delete_child_pid(nsd, child_pid);
2403 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2404 					if(nsd->children[is_child].child_fd == -1)
2405 						nsd->children[is_child].has_exited = 1;
2406 					parent_check_all_children_exited(nsd);
2407 				} else if(is_child != -1) {
2408 					log_msg(LOG_WARNING,
2409 					       "server %d died unexpectedly with status %d, restarting",
2410 					       (int) child_pid, status);
2411 					restart_child_servers(nsd, server_region, netio,
2412 						&nsd->xfrd_listener->fd);
2413 				} else if (child_pid == reload_pid) {
2414 					sig_atomic_t cmd = NSD_RELOAD_DONE;
2415 					pid_t mypid;
2416 					log_msg(LOG_WARNING,
2417 					       "Reload process %d failed with status %d, continuing with old database",
2418 					       (int) child_pid, status);
2419 					reload_pid = -1;
2420 					if(reload_listener.fd != -1) close(reload_listener.fd);
2421 					reload_listener.fd = -1;
2422 					reload_listener.event_types = NETIO_EVENT_NONE;
2423 					task_process_sync(nsd->task[nsd->mytask]);
2424 					/* inform xfrd reload attempt ended */
2425 					if(!write_socket(nsd->xfrd_listener->fd,
2426 						&cmd, sizeof(cmd))) {
2427 						log_msg(LOG_ERR, "problems "
2428 						  "sending SOAEND to xfrd: %s",
2429 						  strerror(errno));
2430 					}
2431 					mypid = getpid();
2432 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2433 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2434 							strerror(errno));
2435 					}
2436 				} else if(status != 0) {
2437 					/* check for status, because we get
2438 					/* check the status, because we also
2439 					 * reap the old server_main (reload is
2440 					 * the process parent of the old main)
2441 					 * and older server processes that
2442 					 * exit after a reload */
2443 					       "process %d terminated with status %d",
2444 					       (int) child_pid, status);
2445 				}
2446 			}
2447 			if (child_pid == -1) {
2448 				if (errno == EINTR) {
2449 					continue;
2450 				}
2451 				if (errno != ECHILD)
2452 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2453 			}
2454 			if (nsd->mode != NSD_RUN)
2455 				break;
2456 
2457 			/* timeout to collect processes, in case no SIGCHLD arrives. */
2458 			timeout_spec.tv_sec = 60;
2459 			timeout_spec.tv_nsec = 0;
2460 
2461 			/* listen on ports, timeout for collecting terminated children */
2462 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2463 				if (errno != EINTR) {
2464 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2465 				}
2466 			}
2467 			if(nsd->restart_children) {
2468 				restart_child_servers(nsd, server_region, netio,
2469 					&nsd->xfrd_listener->fd);
2470 				nsd->restart_children = 0;
2471 			}
2472 			if(nsd->reload_failed) {
2473 				sig_atomic_t cmd = NSD_RELOAD_DONE;
2474 				pid_t mypid;
2475 				nsd->reload_failed = 0;
2476 				log_msg(LOG_WARNING,
2477 				       "Reload process %d failed, continuing with old database",
2478 				       (int) reload_pid);
2479 				reload_pid = -1;
2480 				if(reload_listener.fd != -1) close(reload_listener.fd);
2481 				reload_listener.fd = -1;
2482 				reload_listener.event_types = NETIO_EVENT_NONE;
2483 				task_process_sync(nsd->task[nsd->mytask]);
2484 				/* inform xfrd reload attempt ended */
2485 				if(!write_socket(nsd->xfrd_listener->fd,
2486 					&cmd, sizeof(cmd))) {
2487 					log_msg(LOG_ERR, "problems "
2488 					  "sending SOAEND to xfrd: %s",
2489 					  strerror(errno));
2490 				}
2491 				mypid = getpid();
2492 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2493 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2494 						strerror(errno));
2495 				}
2496 			}
2497 
2498 			break;
2499 		case NSD_RELOAD_REQ: {
2500 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2501 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2502 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2503 				"main: ipc send reload_req to xfrd"));
2504 			if(!write_socket(nsd->xfrd_listener->fd,
2505 				&cmd, sizeof(cmd))) {
2506 				log_msg(LOG_ERR, "server_main: could not send "
2507 				"reload_req to xfrd: %s", strerror(errno));
2508 			}
2509 			nsd->mode = NSD_RUN;
2510 			} break;
2511 		case NSD_RELOAD:
2512 			/* Continue to run nsd after reload */
2513 			nsd->mode = NSD_RUN;
2514 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2515 			if (reload_pid != -1) {
2516 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2517 				       (int) reload_pid);
2518 				break;
2519 			}
2520 
2521 			/* switch the mytask to keep track of who owns task*/
2522 			/* switch mytask to keep track of who owns the task db */
2523 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2524 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2525 				reload_pid = -1;
2526 				break;
2527 			}
2528 
2529 			/* Do actual reload */
2530 			reload_pid = fork();
2531 			switch (reload_pid) {
2532 			case -1:
2533 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2534 				break;
2535 			default:
2536 				/* PARENT */
2537 				close(reload_sockets[0]);
2538 				server_reload(nsd, server_region, netio,
2539 					reload_sockets[1]);
2540 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2541 				close(reload_sockets[1]);
2542 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2543 				/* drop stale xfrd ipc data */
2544 				((struct ipc_handler_conn_data*)nsd->
2545 					xfrd_listener->user_data)
2546 					->conn->is_reading = 0;
2547 				reload_pid = -1;
2548 				reload_listener.fd = -1;
2549 				reload_listener.event_types = NETIO_EVENT_NONE;
2550 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2551 				break;
2552 			case 0:
2553 				/* CHILD */
2554 				/* server_main keeps running until NSD_QUIT_SYNC
2555 				 * is received from reload. */
2556 				close(reload_sockets[1]);
2557 				reload_listener.fd = reload_sockets[0];
2558 				reload_listener.timeout = NULL;
2559 				reload_listener.user_data = nsd;
2560 				reload_listener.event_types = NETIO_EVENT_READ;
2561 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2562 				netio_add_handler(netio, &reload_listener);
2563 				reload_pid = getppid();
2564 				break;
2565 			}
2566 			break;
2567 		case NSD_QUIT_SYNC:
2568 			/* synchronisation of xfrd, parent and reload */
2569 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2570 				sig_atomic_t cmd = NSD_RELOAD;
2571 				/* stop xfrd ipc writes in progress */
2572 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2573 					"main: ipc send indication reload"));
2574 				if(!write_socket(nsd->xfrd_listener->fd,
2575 					&cmd, sizeof(cmd))) {
2576 					log_msg(LOG_ERR, "server_main: could not send reload "
2577 					"indication to xfrd: %s", strerror(errno));
2578 				}
2579 				/* wait for ACK from xfrd */
2580 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2581 				nsd->quit_sync_done = 1;
2582 			}
2583 			nsd->mode = NSD_RUN;
2584 			break;
2585 		case NSD_QUIT:
2586 			/* silent shutdown during reload */
2587 			if(reload_listener.fd != -1) {
2588 				/* acknowledge the quit, to sync reload that we will really quit now */
2589 				sig_atomic_t cmd = NSD_RELOAD;
2590 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2591 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2592 					log_msg(LOG_ERR, "server_main: "
2593 						"could not ack quit: %s", strerror(errno));
2594 				}
2595 #ifdef BIND8_STATS
2596 				parent_send_stats(nsd, reload_listener.fd);
2597 #endif /* BIND8_STATS */
2598 				close(reload_listener.fd);
2599 			}
2600 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2601 			/* only quit children after xfrd has acked */
2602 			send_children_quit(nsd);
2603 
2604 #ifdef MEMCLEAN /* OS collects memory pages */
2605 			region_destroy(server_region);
2606 #endif
2607 			server_shutdown(nsd);
2608 
2609 			/* ENOTREACH */
2610 			/* NOTREACHED */
2611 		case NSD_SHUTDOWN:
2612 			break;
2613 		case NSD_REAP_CHILDREN:
2614 			/* continue; wait for child in run loop */
2615 			nsd->mode = NSD_RUN;
2616 			break;
2617 		case NSD_STATS:
2618 #ifdef BIND8_STATS
2619 			set_children_stats(nsd);
2620 #endif
2621 			nsd->mode = NSD_RUN;
2622 			break;
2623 		default:
2624 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2625 			nsd->mode = NSD_RUN;
2626 			break;
2627 		}
2628 	}
2629 	log_msg(LOG_WARNING, "signal received, shutting down...");
2630 
2631 	/* close opened ports to avoid race with restart of nsd */
2632 	server_close_all_sockets(nsd->udp, nsd->ifs);
2633 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2634 #ifdef HAVE_SSL
2635 	daemon_remote_close(nsd->rc);
2636 #endif
2637 	send_children_quit_and_wait(nsd);
2638 
2639 	/* Unlink it if possible... */
2640 	unlinkpid(nsd->pidfile);
2641 	unlink(nsd->task[0]->fname);
2642 	unlink(nsd->task[1]->fname);
2643 #ifdef USE_ZONE_STATS
2644 	unlink(nsd->zonestatfname[0]);
2645 	unlink(nsd->zonestatfname[1]);
2646 #endif
2647 #ifdef USE_DNSTAP
2648 	dt_collector_close(nsd->dt_collector, nsd);
2649 #endif
2650 
2651 	if(reload_listener.fd != -1) {
2652 		sig_atomic_t cmd = NSD_QUIT;
2653 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2654 			"main: ipc send quit to reload-process"));
2655 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2656 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2657 				strerror(errno));
2658 		}
2659 		fsync(reload_listener.fd);
2660 		close(reload_listener.fd);
2661 		/* wait for reload to finish processing */
2662 		while(1) {
2663 			if(waitpid(reload_pid, NULL, 0) == -1) {
2664 				if(errno == EINTR) continue;
2665 				if(errno == ECHILD) break;
2666 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2667 					(int)reload_pid, strerror(errno));
2668 			}
2669 			break;
2670 		}
2671 	}
2672 	if(nsd->xfrd_listener->fd != -1) {
2673 		/* complete quit, stop xfrd */
2674 		sig_atomic_t cmd = NSD_QUIT;
2675 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2676 			"main: ipc send quit to xfrd"));
2677 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2678 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2679 				strerror(errno));
2680 		}
2681 		fsync(nsd->xfrd_listener->fd);
2682 		close(nsd->xfrd_listener->fd);
2683 		(void)kill(nsd->pid, SIGTERM);
2684 	}
2685 
2686 #ifdef MEMCLEAN /* OS collects memory pages */
2687 	region_destroy(server_region);
2688 #endif
2689 	/* write the nsd.db to disk, wait for it to complete */
2690 	udb_base_sync(nsd->db->udb, 1);
2691 	udb_base_close(nsd->db->udb);
2692 	server_shutdown(nsd);
2693 }
2694 
2695 static query_state_type
2696 server_process_query(struct nsd *nsd, struct query *query)
2697 {
2698 	return query_process(query, nsd);
2699 }
2700 
2701 static query_state_type
2702 server_process_query_udp(struct nsd *nsd, struct query *query)
2703 {
2704 #ifdef RATELIMIT
2705 	if(query_process(query, nsd) != QUERY_DISCARDED) {
2706 		if(rrl_process_query(query))
2707 			return rrl_slip(query);
2708 		else	return QUERY_PROCESSED;
2709 	}
2710 	return QUERY_DISCARDED;
2711 #else
2712 	return query_process(query, nsd);
2713 #endif
2714 }
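/*
 * A note on the RATELIMIT path above (a sketch of the intended flow,
 * assuming the rrl_process_query()/rrl_slip() semantics from rrl.c): a
 * query that processed successfully is first checked against the rate
 * limit; only when the source is over its limit does rrl_slip() decide,
 * per packet, whether to discard the answer or to "slip" a truncated
 * reply so that legitimate clients can retry over TCP.
 */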
2715 
2716 struct event_base*
2717 nsd_child_event_base(void)
2718 {
2719 	struct event_base* base;
2720 #ifdef USE_MINI_EVENT
2721 	static time_t secs;
2722 	static struct timeval now;
2723 	base = event_init(&secs, &now);
2724 #else
2725 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2726 	/* libev */
2727 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2728 #  else
2729 	/* libevent */
2730 #    ifdef HAVE_EVENT_BASE_NEW
2731 	base = event_base_new();
2732 #    else
2733 	base = event_init();
2734 #    endif
2735 #  endif
2736 #endif
2737 	return base;
2738 }
2739 
2740 static void
2741 add_udp_handler(
2742 	struct nsd *nsd,
2743 	struct nsd_socket *sock,
2744 	struct udp_handler_data *data)
2745 {
2746 	struct event *handler = &data->event;
2747 
2748 	data->nsd = nsd;
2749 	data->socket = sock;
2750 
2751 	memset(handler, 0, sizeof(*handler));
2752 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2753 	if(event_base_set(nsd->event_base, handler) != 0)
2754 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2755 	if(event_add(handler, NULL) != 0)
2756 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2757 }
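/*
 * The setup above follows the classic libevent idiom used throughout
 * this file: event_set() initializes the struct event, event_base_set()
 * binds it to this process' event base (required when not using the
 * default base), and event_add() arms it; EV_PERSIST keeps the event
 * registered across callbacks.
 */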
2758 
2759 void
2760 add_tcp_handler(
2761 	struct nsd *nsd,
2762 	struct nsd_socket *sock,
2763 	struct tcp_accept_handler_data *data)
2764 {
2765 	struct event *handler = &data->event;
2766 
2767 	data->nsd = nsd;
2768 	data->socket = sock;
2769 
2770 #ifdef HAVE_SSL
2771 	if (nsd->tls_ctx &&
2772 	    nsd->options->tls_port &&
2773 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2774 	{
2775 		data->tls_accept = 1;
2776 		if(verbosity >= 2) {
2777 			char buf[48];
2778 			addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
2779 			VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
2780 		}
2781 	} else {
2782 		data->tls_accept = 0;
2783 	}
2784 #endif
2785 
2786 	memset(handler, 0, sizeof(*handler));
2787 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data);
2788 	if(event_base_set(nsd->event_base, handler) != 0)
2789 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2790 	if(event_add(handler, NULL) != 0)
2791 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
2792 	data->event_added = 1;
2793 }
2794 
2795 /*
2796  * Serve DNS requests.
2797  */
2798 void
2799 server_child(struct nsd *nsd)
2800 {
2801 	size_t i, from, numifs;
2802 	region_type *server_region = region_create(xalloc, free);
2803 	struct event_base* event_base = nsd_child_event_base();
2804 	sig_atomic_t mode;
2805 
2806 	if(!event_base) {
2807 		log_msg(LOG_ERR, "nsd server could not create event base");
2808 		exit(1);
2809 	}
2810 	nsd->event_base = event_base;
2811 	nsd->server_region = server_region;
2812 
2813 #ifdef RATELIMIT
2814 	rrl_init(nsd->this_child->child_num);
2815 #endif
2816 
2817 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2818 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2819 
2820 #ifdef HAVE_SETPROCTITLE
2821 	setproctitle("server %d", nsd->this_child->child_num + 1);
2822 #endif
2823 #ifdef HAVE_CPUSET_T
2824 	if(nsd->use_cpu_affinity) {
2825 		set_cpu_affinity(nsd->this_child->cpuset);
2826 	}
2827 #endif
2828 
2829 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2830 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2831 	}
2832 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2833 		server_close_all_sockets(nsd->udp, nsd->ifs);
2834 	}
2835 
2836 	if (nsd->this_child->parent_fd != -1) {
2837 		struct event *handler;
2838 		struct ipc_handler_conn_data* user_data =
2839 			(struct ipc_handler_conn_data*)region_alloc(
2840 			server_region, sizeof(struct ipc_handler_conn_data));
2841 		user_data->nsd = nsd;
2842 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2843 
2844 		handler = (struct event*) region_alloc(
2845 			server_region, sizeof(*handler));
2846 		memset(handler, 0, sizeof(*handler));
2847 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2848 			EV_READ, child_handle_parent_command, user_data);
2849 		if(event_base_set(event_base, handler) != 0)
2850 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2851 		if(event_add(handler, NULL) != 0)
2852 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2853 	}
2854 
2855 	if(nsd->reuseport) {
2856 		numifs = nsd->ifs / nsd->reuseport;
2857 		from = numifs * nsd->this_child->child_num;
2858 		if(from+numifs > nsd->ifs) { /* should not happen */
2859 			from = 0;
2860 			numifs = nsd->ifs;
2861 		}
2862 	} else {
2863 		from = 0;
2864 		numifs = nsd->ifs;
2865 	}
2866 
2867 	if (nsd->server_kind & NSD_SERVER_UDP) {
2868 		int child = nsd->this_child->child_num;
2869 		memset(msgs, 0, sizeof(msgs));
2870 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2871 			queries[i] = query_create(server_region,
2872 				compressed_dname_offsets,
2873 				compression_table_size, compressed_dnames);
2874 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2875 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2876 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2877 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2878 			msgs[i].msg_hdr.msg_iovlen  = 1;
2879 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2880 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2881 		}
2882 
2883 		for (i = 0; i < nsd->ifs; i++) {
2884 			int listen;
2885 			struct udp_handler_data *data;
2886 
2887 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
2888 
2889 			if(i >= from && i < (from + numifs) && listen) {
2890 				data = region_alloc_zero(
2891 					nsd->server_region, sizeof(*data));
2892 				add_udp_handler(nsd, &nsd->udp[i], data);
2893 			} else {
2894 				/* close sockets intended for other servers */
2895 				server_close_socket(&nsd->udp[i]);
2896 			}
2897 		}
2898 	}
2899 
2900 	/*
2901 	 * Keep track of all the TCP accept handlers so we can enable
2902 	 * and disable them based on the current number of active TCP
2903 	 * connections.
2904 	 */
2905 	if (nsd->server_kind & NSD_SERVER_TCP) {
2906 		int child = nsd->this_child->child_num;
2907 		tcp_accept_handler_count = numifs;
2908 		tcp_accept_handlers = region_alloc_array(server_region,
2909 			numifs, sizeof(*tcp_accept_handlers));
2910 
2911 		for (i = 0; i < nsd->ifs; i++) {
2912 			int listen;
2913 			struct tcp_accept_handler_data *data;
2914 
2915 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
2916 
2917 			if(i >= from && i < (from + numifs) && listen) {
2918 				data = &tcp_accept_handlers[i-from];
2919 				memset(data, 0, sizeof(*data));
2920 				add_tcp_handler(nsd, &nsd->tcp[i], data);
2921 			} else {
2922 				/* close sockets intended for other servers */
2923 				/*
2924 				 * uncomment this once tcp servers are no
2925 				 * longer copied in the tcp fd copy line
2926 				 * in server_init().
2927 				server_close_socket(&nsd->tcp[i]);
2928 				*/
2929 				/* close sockets not meant for this server */
2930 				if(!listen)
2931 					server_close_socket(&nsd->tcp[i]);
2932 			}
2933 		}
2934 	} else {
2935 		tcp_accept_handler_count = 0;
2936 	}
2937 
2938 	/* The main loop... */
2939 	while ((mode = nsd->mode) != NSD_QUIT) {
2940 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2941 
2942 		/* Do we need to do the statistics... */
2943 		if (mode == NSD_STATS) {
2944 #ifdef BIND8_STATS
2945 			int p = nsd->st.period;
2946 			nsd->st.period = 1; /* force stats printout */
2947 			/* Dump the statistics */
2948 			bind8_stats(nsd);
2949 			nsd->st.period = p;
2950 #else /* !BIND8_STATS */
2951 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2952 #endif /* BIND8_STATS */
2953 
2954 			nsd->mode = NSD_RUN;
2955 		}
2956 		else if (mode == NSD_REAP_CHILDREN) {
2957 			/* got signal, notify parent. parent reaps terminated children. */
2958 			if (nsd->this_child->parent_fd != -1) {
2959 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2960 				if (write(nsd->this_child->parent_fd,
2961 				    &parent_notify,
2962 				    sizeof(parent_notify)) == -1)
2963 				{
2964 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2965 						(int) nsd->this_child->pid, strerror(errno));
2966 				}
2967 			} else /* no parent, so reap 'em */
2968 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2969 			nsd->mode = NSD_RUN;
2970 		}
2971 		else if(mode == NSD_RUN) {
2972 			/* Wait for a query... */
2973 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2974 				if (errno != EINTR) {
2975 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2976 					break;
2977 				}
2978 			}
2979 		} else if(mode == NSD_QUIT) {
2980 			/* ignore here, quit */
2981 		} else {
2982 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
2983 				(int)mode);
2984 			nsd->mode = NSD_RUN;
2985 		}
2986 	}
2987 
2988 	service_remaining_tcp(nsd);
2989 #ifdef	BIND8_STATS
2990 	bind8_stats(nsd);
2991 #endif /* BIND8_STATS */
2992 
2993 #ifdef MEMCLEAN /* OS collects memory pages */
2994 #ifdef RATELIMIT
2995 	rrl_deinit(nsd->this_child->child_num);
2996 #endif
2997 	event_base_free(event_base);
2998 	region_destroy(server_region);
2999 #endif
3000 	server_shutdown(nsd);
3001 }
3002 
3003 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3004 {
3005 	int* timed_out = (int*)arg;
3006 	assert(event & EV_TIMEOUT);
3007 	/* wake up the remaining-TCP service loop; note the event is no
3008 	 * longer registered */
3009 	*timed_out = 1;
3010 }
3011 
3012 void
3013 service_remaining_tcp(struct nsd* nsd)
3014 {
3015 	struct tcp_handler_data* p;
3016 	struct event_base* event_base;
3017 	/* check if it is needed */
3018 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3019 		return;
3020 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3021 
3022 	/* setup event base */
3023 	event_base = nsd_child_event_base();
3024 	if(!event_base) {
3025 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3026 		return;
3027 	}
3028 	/* register tcp connections */
3029 	for(p = tcp_active_list; p != NULL; p = p->next) {
3030 		struct timeval timeout;
3031 		int fd = p->event.ev_fd;
3032 #ifdef USE_MINI_EVENT
3033 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3034 #else
3035 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3036 #endif
3037 		void (*fn)(int, short, void*);
3038 #ifdef HAVE_SSL
3039 		if(p->tls) {
3040 			if((event&EV_READ))
3041 				fn = handle_tls_reading;
3042 			else	fn = handle_tls_writing;
3043 		} else {
3044 #endif
3045 			if((event&EV_READ))
3046 				fn = handle_tcp_reading;
3047 			else	fn = handle_tcp_writing;
3048 #ifdef HAVE_SSL
3049 		}
3050 #endif
3051 
3052 		/* cap the timeout at 1/10 second */
3053 		if(p->tcp_timeout > 100)
3054 			p->tcp_timeout = 100;
3055 		timeout.tv_sec = p->tcp_timeout / 1000;
3056 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3057 		event_del(&p->event);
3058 		memset(&p->event, 0, sizeof(p->event));
3059 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3060 			fn, p);
3061 		if(event_base_set(event_base, &p->event) != 0)
3062 			log_msg(LOG_ERR, "event base set failed");
3063 		if(event_add(&p->event, &timeout) != 0)
3064 			log_msg(LOG_ERR, "event add failed");
3065 	}
3066 
3067 	/* handle it */
3068 	while(nsd->current_tcp_count > 0) {
3069 		mode_t m = server_signal_mode(nsd);
3070 		struct event timeout;
3071 		struct timeval tv;
3072 		int timed_out = 0;
3073 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3074 			m == NSD_REAP_CHILDREN) {
3075 			/* quit */
3076 			break;
3077 		}
3078 		/* timer */
3079 		/* have to do something every second */
3080 		tv.tv_sec = 1;
3081 		tv.tv_usec = 0;
3082 		memset(&timeout, 0, sizeof(timeout));
3083 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3084 			&timed_out);
3085 		if(event_base_set(event_base, &timeout) != 0)
3086 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3087 		if(event_add(&timeout, &tv) != 0)
3088 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3089 
3090 		/* service loop */
3091 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3092 			if (errno != EINTR) {
3093 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3094 				break;
3095 			}
3096 		}
3097 		if(!timed_out) {
3098 			event_del(&timeout);
3099 		} else {
3100 			/* timed out, quit */
3101 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3102 			break;
3103 		}
3104 	}
3105 #ifdef MEMCLEAN
3106 	event_base_free(event_base);
3107 #endif
3108 	/* continue to quit after return */
3109 }
3110 
3111 /* Implement recvmmsg and sendmmsg if the platform does not. These functions
3112  * are always used, even if nonblocking operations are broken, in which case
3113  * NUM_RECV_PER_SELECT is defined to 1 (one).
3114  */
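/*
 * Batch I/O sketch with hypothetical names (handle_udp below does the
 * real setup): each mmsghdr wraps a single iovec and, after the call,
 * msg_len holds the per-message byte count.
 *
 *	struct mmsghdr m[2];
 *	struct iovec io[2];
 *	char b0[512], b1[512];
 *	int n;
 *	memset(m, 0, sizeof(m));
 *	io[0].iov_base = b0; io[0].iov_len = sizeof(b0);
 *	io[1].iov_base = b1; io[1].iov_len = sizeof(b1);
 *	m[0].msg_hdr.msg_iov = &io[0]; m[0].msg_hdr.msg_iovlen = 1;
 *	m[1].msg_hdr.msg_iov = &io[1]; m[1].msg_hdr.msg_iovlen = 1;
 *	n = nsd_recvmmsg(fd, m, 2, 0, NULL);
 */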
3115 #if defined(HAVE_RECVMMSG)
3116 #define nsd_recvmmsg recvmmsg
3117 #else /* !HAVE_RECVMMSG */
3118 
3119 static int
3120 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3121              int flags, struct timespec *timeout)
3122 {
3123 	int orig_errno;
3124 	unsigned int vpos = 0;
3125 	ssize_t rcvd;
3126 
3127 	/* timeout is ignored, ensure caller does not expect it to work */
3128 	assert(timeout == NULL);
3129 
3130 	orig_errno = errno;
3131 	errno = 0;
3132 	while(vpos < vlen) {
3133 		rcvd = recvfrom(sockfd,
3134 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3135 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3136 		                flags,
3137 		                msgvec[vpos].msg_hdr.msg_name,
3138 		               &msgvec[vpos].msg_hdr.msg_namelen);
3139 		if(rcvd < 0) {
3140 			break;
3141 		} else {
3142 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3143 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3144 			vpos++;
3145 		}
3146 	}
3147 
3148 	if(vpos) {
3149 		/* error will be picked up next time */
3150 		return (int)vpos;
3151 	} else if(errno == 0) {
3152 		errno = orig_errno;
3153 		return 0;
3154 	} else if(errno == EAGAIN) {
3155 		return 0;
3156 	}
3157 
3158 	return -1;
3159 }
3160 #endif /* HAVE_RECVMMSG */
3161 
3162 #ifdef HAVE_SENDMMSG
3163 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3164 #else /* !HAVE_SENDMMSG */
3165 
3166 static int
3167 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3168 {
3169 	int orig_errno;
3170 	unsigned int vpos = 0;
3171 	ssize_t snd;
3172 
3173 	orig_errno = errno;
3174 	errno = 0;
3175 	while(vpos < vlen) {
3176 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3177 		snd = sendto(sockfd,
3178 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3179 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3180 		             flags,
3181 		             msgvec[vpos].msg_hdr.msg_name,
3182 		             msgvec[vpos].msg_hdr.msg_namelen);
3183 		if(snd < 0) {
3184 			break;
3185 		} else {
3186 			msgvec[vpos].msg_len = (unsigned int)snd;
3187 			vpos++;
3188 		}
3189 	}
3190 
3191 	if(vpos) {
3192 		return (int)vpos;
3193 	} else if(errno == 0) {
3194 		errno = orig_errno;
3195 		return 0;
3196 	}
3197 
3198 	return -1;
3199 }
3200 #endif /* HAVE_SENDMMSG */
3201 
3202 static void
3203 handle_udp(int fd, short event, void* arg)
3204 {
3205 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3206 	int received, sent, recvcount, i;
3207 	struct query *q;
3208 
3209 	if (!(event & EV_READ)) {
3210 		return;
3211 	}
3212 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3213 	/* this printf strangely gave a performance increase on Linux */
3214 	/* printf("recvcount %d \n", recvcount); */
3215 	if (recvcount == -1) {
3216 		if (errno != EAGAIN && errno != EINTR) {
3217 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3218 			STATUP(data->nsd, rxerr);
3219 			/* No zone statup */
3220 		}
3221 		/* Simply no data available */
3222 		return;
3223 	}
3224 	for (i = 0; i < recvcount; i++) {
3225 	loopstart:
3226 		received = msgs[i].msg_len;
3227 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
3228 		q = queries[i];
3229 		if (received == -1) {
3230 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3231 #if defined(HAVE_RECVMMSG)
3232 				msgs[i].msg_hdr.msg_flags
3233 #else
3234 				errno
3235 #endif
3236 				));
3237 			STATUP(data->nsd, rxerr);
3238 			/* No zone statup */
3239 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3240 			iovecs[i].iov_len = buffer_remaining(q->packet);
3241 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3242 			goto swap_drop;
3243 		}
3244 
3245 		/* Account... */
3246 #ifdef BIND8_STATS
3247 		if (data->socket->addr.ai_family == AF_INET) {
3248 			STATUP(data->nsd, qudp);
3249 		} else if (data->socket->addr.ai_family == AF_INET6) {
3250 			STATUP(data->nsd, qudp6);
3251 		}
3252 #endif
3253 
3254 		buffer_skip(q->packet, received);
3255 		buffer_flip(q->packet);
3256 #ifdef USE_DNSTAP
3257 		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
3258 			q->tcp, q->packet);
3259 #endif /* USE_DNSTAP */
3260 
3261 		/* Process and answer the query... */
3262 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
3263 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3264 				STATUP(data->nsd, nona);
3265 				ZTATUP(data->nsd, q->zone, nona);
3266 			}
3267 
3268 #ifdef USE_ZONE_STATS
3269 			if (data->socket->addr.ai_family == AF_INET) {
3270 				ZTATUP(data->nsd, q->zone, qudp);
3271 			} else if (data->socket->addr.ai_family == AF_INET6) {
3272 				ZTATUP(data->nsd, q->zone, qudp6);
3273 			}
3274 #endif
3275 
3276 			/* Add EDNS0 and TSIG info if necessary.  */
3277 			query_add_optional(q, data->nsd);
3278 
3279 			buffer_flip(q->packet);
3280 			iovecs[i].iov_len = buffer_remaining(q->packet);
3281 #ifdef BIND8_STATS
3282 			/* Account the rcode & TC... */
3283 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3284 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3285 			if (TC(q->packet)) {
3286 				STATUP(data->nsd, truncated);
3287 				ZTATUP(data->nsd, q->zone, truncated);
3288 			}
3289 #endif /* BIND8_STATS */
3290 #ifdef USE_DNSTAP
3291 			dt_collector_submit_auth_response(data->nsd,
3292 				&q->addr, q->addrlen, q->tcp, q->packet,
3293 				q->zone);
3294 #endif /* USE_DNSTAP */
3295 		} else {
3296 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3297 			iovecs[i].iov_len = buffer_remaining(q->packet);
3298 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3299 		swap_drop:
3300 			STATUP(data->nsd, dropped);
3301 			ZTATUP(data->nsd, q->zone, dropped);
3302 			if(i != recvcount-1) {
3303 				/* swap with last and decrease recvcount */
3304 				struct mmsghdr mtmp = msgs[i];
3305 				struct iovec iotmp = iovecs[i];
3306 				recvcount--;
3307 				msgs[i] = msgs[recvcount];
3308 				iovecs[i] = iovecs[recvcount];
3309 				queries[i] = queries[recvcount];
3310 				msgs[recvcount] = mtmp;
3311 				iovecs[recvcount] = iotmp;
3312 				queries[recvcount] = q;
3313 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3314 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3315 				goto loopstart;
3316 			} else { recvcount --; }
3317 		}
3318 	}
3319 
3320 	/* send until all are sent */
3321 	i = 0;
3322 	while(i<recvcount) {
3323 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3324 		if(sent == -1) {
3325 			/* don't log transient network full errors, unless
3326 			 * on higher verbosity */
3327 			if(!(errno == ENOBUFS && verbosity < 1) &&
3328 #ifdef EWOULDBLOCK
3329 			   !(errno == EWOULDBLOCK && verbosity < 1) &&
3330 #endif
3331 			   !(errno == EAGAIN && verbosity < 1)) {
3332 				const char* es = strerror(errno);
3333 				char a[48];
3334 				addr2str(&queries[i]->addr, a, sizeof(a));
3335 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3336 			}
3337 #ifdef BIND8_STATS
3338 			data->nsd->st.txerr += recvcount-i;
3339 #endif /* BIND8_STATS */
3340 			break;
3341 		}
3342 		i += sent;
3343 	}
3344 	for(i=0; i<recvcount; i++) {
3345 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3346 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3347 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3348 	}
3349 }
3350 
3351 #ifdef HAVE_SSL
3352 /*
3353  * Setup an event for the tcp handler.
3354  */
3355 static void
3356 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3357        int fd, short event)
3358 {
3359 	struct timeval timeout;
3360 	struct event_base* ev_base;
3361 
3362 	timeout.tv_sec = data->nsd->tcp_timeout;
3363 	timeout.tv_usec = 0L;
3364 
3365 	ev_base = data->event.ev_base;
3366 	event_del(&data->event);
3367 	memset(&data->event, 0, sizeof(data->event));
3368 	event_set(&data->event, fd, event, fn, data);
3369 	if(event_base_set(ev_base, &data->event) != 0)
3370 		log_msg(LOG_ERR, "event base set failed");
3371 	if(event_add(&data->event, &timeout) != 0)
3372 		log_msg(LOG_ERR, "event add failed");
3373 }
3374 #endif /* HAVE_SSL */
3375 
3376 static void
3377 cleanup_tcp_handler(struct tcp_handler_data* data)
3378 {
3379 	event_del(&data->event);
3380 #ifdef HAVE_SSL
3381 	if(data->tls) {
3382 		SSL_shutdown(data->tls);
3383 		SSL_free(data->tls);
3384 		data->tls = NULL;
3385 	}
3386 #endif
3387 	close(data->event.ev_fd);
3388 	if(data->prev)
3389 		data->prev->next = data->next;
3390 	else	tcp_active_list = data->next;
3391 	if(data->next)
3392 		data->next->prev = data->prev;
3393 
3394 	/*
3395 	 * Enable the TCP accept handlers when the current number of
3396 	 * TCP connections is about to drop below the maximum number
3397 	 * of TCP connections.
3398 	 */
3399 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3400 		configure_handler_event_types(EV_READ|EV_PERSIST);
3401 		if(slowaccept) {
3402 			event_del(&slowaccept_event);
3403 			slowaccept = 0;
3404 		}
3405 	}
3406 	--data->nsd->current_tcp_count;
3407 	assert(data->nsd->current_tcp_count >= 0);
3408 
3409 	region_destroy(data->region);
3410 }
3411 
3412 static void
3413 handle_tcp_reading(int fd, short event, void* arg)
3414 {
3415 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3416 	ssize_t received;
3417 	struct event_base* ev_base;
3418 	struct timeval timeout;
3419 
3420 	if ((event & EV_TIMEOUT)) {
3421 		/* Connection timed out.  */
3422 		cleanup_tcp_handler(data);
3423 		return;
3424 	}
3425 
3426 	if (data->nsd->tcp_query_count > 0 &&
3427 		data->query_count >= data->nsd->tcp_query_count) {
3428 		/* No more queries allowed on this tcp connection. */
3429 		cleanup_tcp_handler(data);
3430 		return;
3431 	}
3432 
3433 	assert((event & EV_READ));
3434 
3435 	if (data->bytes_transmitted == 0) {
3436 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3437 	}
3438 
3439 	/*
3440 	 * Check if we received the leading packet length bytes yet.
3441 	 */
3442 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3443 		received = read(fd,
3444 				(char *) &data->query->tcplen
3445 				+ data->bytes_transmitted,
3446 				sizeof(uint16_t) - data->bytes_transmitted);
3447 		if (received == -1) {
3448 			if (errno == EAGAIN || errno == EINTR) {
3449 				/*
3450 				 * Read would block, wait until more
3451 				 * data is available.
3452 				 */
3453 				return;
3454 			} else {
3455 				char buf[48];
3456 				addr2str(&data->query->addr, buf, sizeof(buf));
3457 #ifdef ECONNRESET
3458 				if (verbosity >= 2 || errno != ECONNRESET)
3459 #endif /* ECONNRESET */
3460 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3461 				cleanup_tcp_handler(data);
3462 				return;
3463 			}
3464 		} else if (received == 0) {
3465 			/* EOF */
3466 			cleanup_tcp_handler(data);
3467 			return;
3468 		}
3469 
3470 		data->bytes_transmitted += received;
3471 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3472 			/*
3473 			 * Not done with the tcplen yet, wait for more
3474 			 * data to become available.
3475 			 */
3476 			return;
3477 		}
3478 
3479 		assert(data->bytes_transmitted == sizeof(uint16_t));
3480 
3481 		data->query->tcplen = ntohs(data->query->tcplen);
3482 
3483 		/*
3484 		 * Minimum query size is:
3485 		 *
3486 		 *     Size of the header (12)
3487 		 *   + Root domain name   (1)
3488 		 *   + Query class        (2)
3489 		 *   + Query type         (2)
3490 		 */
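		/* i.e. at least 17 octets; anything shorter cannot be a
		 * well-formed question and the connection is dropped */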
3491 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3492 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3493 			cleanup_tcp_handler(data);
3494 			return;
3495 		}
3496 
3497 		if (data->query->tcplen > data->query->maxlen) {
3498 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3499 			cleanup_tcp_handler(data);
3500 			return;
3501 		}
3502 
3503 		buffer_set_limit(data->query->packet, data->query->tcplen);
3504 	}
3505 
3506 	assert(buffer_remaining(data->query->packet) > 0);
3507 
3508 	/* Read the (remaining) query data.  */
3509 	received = read(fd,
3510 			buffer_current(data->query->packet),
3511 			buffer_remaining(data->query->packet));
3512 	if (received == -1) {
3513 		if (errno == EAGAIN || errno == EINTR) {
3514 			/*
3515 			 * Read would block, wait until more data is
3516 			 * available.
3517 			 */
3518 			return;
3519 		} else {
3520 			char buf[48];
3521 			addr2str(&data->query->addr, buf, sizeof(buf));
3522 #ifdef ECONNRESET
3523 			if (verbosity >= 2 || errno != ECONNRESET)
3524 #endif /* ECONNRESET */
3525 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3526 			cleanup_tcp_handler(data);
3527 			return;
3528 		}
3529 	} else if (received == 0) {
3530 		/* EOF */
3531 		cleanup_tcp_handler(data);
3532 		return;
3533 	}
3534 
3535 	data->bytes_transmitted += received;
3536 	buffer_skip(data->query->packet, received);
3537 	if (buffer_remaining(data->query->packet) > 0) {
3538 		/*
3539 		 * Message not yet complete, wait for more data to
3540 		 * become available.
3541 		 */
3542 		return;
3543 	}
3544 
3545 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3546 
3547 	/* Account... */
3548 #ifdef BIND8_STATS
3549 #ifndef INET6
3550 	STATUP(data->nsd, ctcp);
3551 #else
3552 	if (data->query->addr.ss_family == AF_INET) {
3553 		STATUP(data->nsd, ctcp);
3554 	} else if (data->query->addr.ss_family == AF_INET6) {
3555 		STATUP(data->nsd, ctcp6);
3556 	}
3557 #endif
3558 #endif /* BIND8_STATS */
3559 
3560 	/* We have a complete query, process it.  */
3561 
3562 	/* tcp-query-count: bump the per-connection query counter */
3563 	data->query_count++;
3564 
3565 	buffer_flip(data->query->packet);
3566 #ifdef USE_DNSTAP
3567 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
3568 		data->query->addrlen, data->query->tcp, data->query->packet);
3569 #endif /* USE_DNSTAP */
3570 	data->query_state = server_process_query(data->nsd, data->query);
3571 	if (data->query_state == QUERY_DISCARDED) {
3572 		/* Drop the packet and the entire connection... */
3573 		STATUP(data->nsd, dropped);
3574 		ZTATUP(data->nsd, data->query->zone, dropped);
3575 		cleanup_tcp_handler(data);
3576 		return;
3577 	}
3578 
3579 #ifdef BIND8_STATS
3580 	if (RCODE(data->query->packet) == RCODE_OK
3581 	    && !AA(data->query->packet))
3582 	{
3583 		STATUP(data->nsd, nona);
3584 		ZTATUP(data->nsd, data->query->zone, nona);
3585 	}
3586 #endif /* BIND8_STATS */
3587 
3588 #ifdef USE_ZONE_STATS
3589 #ifndef INET6
3590 	ZTATUP(data->nsd, data->query->zone, ctcp);
3591 #else
3592 	if (data->query->addr.ss_family == AF_INET) {
3593 		ZTATUP(data->nsd, data->query->zone, ctcp);
3594 	} else if (data->query->addr.ss_family == AF_INET6) {
3595 		ZTATUP(data->nsd, data->query->zone, ctcp6);
3596 	}
3597 #endif
3598 #endif /* USE_ZONE_STATS */
3599 
3600 	query_add_optional(data->query, data->nsd);
3601 
3602 	/* Switch to the tcp write handler.  */
3603 	buffer_flip(data->query->packet);
3604 	data->query->tcplen = buffer_remaining(data->query->packet);
3605 #ifdef BIND8_STATS
3606 	/* Account the rcode & TC... */
3607 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
3608 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
3609 	if (TC(data->query->packet)) {
3610 		STATUP(data->nsd, truncated);
3611 		ZTATUP(data->nsd, data->query->zone, truncated);
3612 	}
3613 #endif /* BIND8_STATS */
3614 #ifdef USE_DNSTAP
3615 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
3616 		data->query->addrlen, data->query->tcp, data->query->packet,
3617 		data->query->zone);
3618 #endif /* USE_DNSTAP */
3619 	data->bytes_transmitted = 0;
3620 
3621 	timeout.tv_sec = data->tcp_timeout / 1000;
3622 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3623 
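	/* Switch the libevent handler to writing: the event has to be
	 * deleted and set up again to change what it waits for. */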
3624 	ev_base = data->event.ev_base;
3625 	event_del(&data->event);
3626 	memset(&data->event, 0, sizeof(data->event));
3627 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
3628 		handle_tcp_writing, data);
3629 	if(event_base_set(ev_base, &data->event) != 0)
3630 		log_msg(LOG_ERR, "event base set tcpw failed");
3631 	if(event_add(&data->event, &timeout) != 0)
3632 		log_msg(LOG_ERR, "event add tcpw failed");
3633 	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
3634 	handle_tcp_writing(fd, EV_WRITE, data);
3635 }
3636 
3637 static void
3638 handle_tcp_writing(int fd, short event, void* arg)
3639 {
3640 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3641 	ssize_t sent;
3642 	struct query *q = data->query;
3643 	struct timeval timeout;
3644 	struct event_base* ev_base;
3645 
3646 	if ((event & EV_TIMEOUT)) {
3647 		/* Connection timed out.  */
3648 		cleanup_tcp_handler(data);
3649 		return;
3650 	}
3651 
3652 	assert((event & EV_WRITE));
3653 
3654 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
3655 		/* Writing the response packet length.  */
3656 		uint16_t n_tcplen = htons(q->tcplen);
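		/* writev(2) sends the two-byte length prefix and the payload
		 * in a single system call, instead of a separate tiny write
		 * for the length. */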
3657 #ifdef HAVE_WRITEV
3658 		struct iovec iov[2];
3659 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
3660 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
3661 		iov[1].iov_base = buffer_begin(q->packet);
3662 		iov[1].iov_len = buffer_limit(q->packet);
3663 		sent = writev(fd, iov, 2);
3664 #else /* HAVE_WRITEV */
3665 		sent = write(fd,
3666 			     (const char *) &n_tcplen + data->bytes_transmitted,
3667 			     sizeof(n_tcplen) - data->bytes_transmitted);
3668 #endif /* HAVE_WRITEV */
3669 		if (sent == -1) {
3670 			if (errno == EAGAIN || errno == EINTR) {
3671 				/*
3672 				 * Write would block, wait until
3673 				 * socket becomes writable again.
3674 				 */
3675 				return;
3676 			} else {
3677 #ifdef ECONNRESET
3678 				if(verbosity >= 2 || errno != ECONNRESET)
3679 #endif /* ECONNRESET */
3680 #ifdef EPIPE
3681 				  if(verbosity >= 2 || errno != EPIPE)
3682 #endif /* EPIPE 'broken pipe' */
3683 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3684 				cleanup_tcp_handler(data);
3685 				return;
3686 			}
3687 		}
3688 
3689 		data->bytes_transmitted += sent;
3690 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
3691 			/*
3692 			 * Writing not complete, wait until socket
3693 			 * becomes writable again.
3694 			 */
3695 			return;
3696 		}
3697 
3698 #ifdef HAVE_WRITEV
3699 		sent -= sizeof(n_tcplen);
3700 		/* writev also counted the length bytes, so the payload may already be done */
3701 		goto packet_could_be_done;
3702 #endif
3703 	}
3704 
3705 	sent = write(fd,
3706 		     buffer_current(q->packet),
3707 		     buffer_remaining(q->packet));
3708 	if (sent == -1) {
3709 		if (errno == EAGAIN || errno == EINTR) {
3710 			/*
3711 			 * Write would block, wait until
3712 			 * socket becomes writable again.
3713 			 */
3714 			return;
3715 		} else {
3716 #ifdef ECONNRESET
3717 			if(verbosity >= 2 || errno != ECONNRESET)
3718 #endif /* ECONNRESET */
3719 #ifdef EPIPE
3720 				  if(verbosity >= 2 || errno != EPIPE)
3721 #endif /* EPIPE 'broken pipe' */
3722 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3723 			cleanup_tcp_handler(data);
3724 			return;
3725 		}
3726 	}
3727 
3728 	data->bytes_transmitted += sent;
3729 #ifdef HAVE_WRITEV
3730   packet_could_be_done:
3731 #endif
3732 	buffer_skip(q->packet, sent);
3733 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
3734 		/*
3735 		 * Still more data to write when socket becomes
3736 		 * writable again.
3737 		 */
3738 		return;
3739 	}
3740 
3741 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
3742 
3743 	if (data->query_state == QUERY_IN_AXFR) {
3744 		/* Continue processing AXFR and writing back results.  */
3745 		buffer_clear(q->packet);
3746 		data->query_state = query_axfr(data->nsd, q);
3747 		if (data->query_state != QUERY_PROCESSED) {
3748 			query_add_optional(data->query, data->nsd);
3749 
3750 			/* Reset data. */
3751 			buffer_flip(q->packet);
3752 			q->tcplen = buffer_remaining(q->packet);
3753 			data->bytes_transmitted = 0;
3754 			/* Reset timeout.  */
3755 			timeout.tv_sec = data->tcp_timeout / 1000;
3756 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3757 			ev_base = data->event.ev_base;
3758 			event_del(&data->event);
3759 			memset(&data->event, 0, sizeof(data->event));
3760 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
3761 				handle_tcp_writing, data);
3762 			if(event_base_set(ev_base, &data->event) != 0)
3763 				log_msg(LOG_ERR, "event base set tcpw failed");
3764 			if(event_add(&data->event, &timeout) != 0)
3765 				log_msg(LOG_ERR, "event add tcpw failed");
3766 
3767 			/*
3768 			 * Write data if/when the socket is writable
3769 			 * again.
3770 			 */
3771 			return;
3772 		}
3773 	}
3774 
3775 	/*
3776 	 * Done sending, wait for the next request to arrive on the
3777 	 * TCP socket by installing the TCP read handler.
3778 	 */
3779 	if (data->nsd->tcp_query_count > 0 &&
3780 		data->query_count >= data->nsd->tcp_query_count) {
3781 
3782 		(void) shutdown(fd, SHUT_WR);
3783 	}
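	/* Half-close: the client reads EOF after the last response while
	 * data still in flight drains normally. */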
3784 
3785 	data->bytes_transmitted = 0;
3786 
3787 	timeout.tv_sec = data->tcp_timeout / 1000;
3788 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3789 	ev_base = data->event.ev_base;
3790 	event_del(&data->event);
3791 	memset(&data->event, 0, sizeof(data->event));
3792 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3793 		handle_tcp_reading, data);
3794 	if(event_base_set(ev_base, &data->event) != 0)
3795 		log_msg(LOG_ERR, "event base set tcpw failed");
3796 	if(event_add(&data->event, &timeout) != 0)
3797 		log_msg(LOG_ERR, "event add tcpw failed");
3798 }
3799 
3800 #ifdef HAVE_SSL
3801 /** create SSL object and associate fd */
3802 static SSL*
3803 incoming_ssl_fd(SSL_CTX* ctx, int fd)
3804 {
3805 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
3806 	if(!ssl) {
3807 		log_crypto_err("could not SSL_new");
3808 		return NULL;
3809 	}
3810 	SSL_set_accept_state(ssl);
3811 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
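	/* SSL_MODE_AUTO_RETRY matters only for blocking transports; on
	 * this nonblocking socket SSL_ERROR_WANT_READ/WANT_WRITE are
	 * still handled by the callers. */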
3812 	if(!SSL_set_fd(ssl, fd)) {
3813 		log_crypto_err("could not SSL_set_fd");
3814 		SSL_free(ssl);
3815 		return NULL;
3816 	}
3817 	return ssl;
3818 }
3819 
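/*
 * shake_state is a small state machine.  tls_hs_read/tls_hs_write mean
 * the handshake itself is waiting for the socket to become readable or
 * writable.  tls_hs_read_event marks an SSL_write that returned
 * SSL_ERROR_WANT_READ, tls_hs_write_event an SSL_read that returned
 * SSL_ERROR_WANT_WRITE; once the awaited condition occurs, the
 * original read or write handler is restored.
 */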
3820 /** TLS handshake to upgrade TCP connection */
3821 static int
3822 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
3823 {
3824 	int r;
3825 	if(data->shake_state == tls_hs_read_event) {
3826 		/* read condition satisfied; switch back to writing */
3827 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3828 		data->shake_state = tls_hs_none;
3829 		return 1;
3830 	}
3831 	if(data->shake_state == tls_hs_write_event) {
3832 		/* write condition satisfied; switch back to reading */
3833 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3834 		data->shake_state = tls_hs_none;
3835 		return 1;
3836 	}
3837 
3838 	/* (continue to) set up the TLS connection */
3839 	ERR_clear_error();
3840 	r = SSL_do_handshake(data->tls);
3841 
3842 	if(r != 1) {
3843 		int want = SSL_get_error(data->tls, r);
3844 		if(want == SSL_ERROR_WANT_READ) {
3845 			if(data->shake_state == tls_hs_read) {
3846 				/* try again later */
3847 				return 1;
3848 			}
3849 			data->shake_state = tls_hs_read;
3850 			/* switch back to reading mode */
3851 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3852 			return 1;
3853 		} else if(want == SSL_ERROR_WANT_WRITE) {
3854 			if(data->shake_state == tls_hs_write) {
3855 				/* try again later */
3856 				return 1;
3857 			}
3858 			data->shake_state = tls_hs_write;
3859 			/* switch back to writing mode */
3860 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3861 			return 1;
3862 		} else {
3863 			if(r == 0)
3864 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
3865 			else {
3866 				unsigned long err = ERR_get_error();
3867 				if(!squelch_err_ssl_handshake(err)) {
3868 					char a[64], s[256];
3869 					addr2str(&data->query->addr, a, sizeof(a));
3870 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
3871 					log_crypto_from_err(s, err);
3872 				}
3873 			}
3874 			cleanup_tcp_handler(data);
3875 			return 0;
3876 		}
3877 	}
3878 
3879 	/* Log the successful TLS upgrade; useful for testing, could be removed. */
3880 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
3881 	/* set back to the event we need to have when reading (or writing) */
3882 	if(data->shake_state == tls_hs_read && writing) {
3883 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3884 	} else if(data->shake_state == tls_hs_write && !writing) {
3885 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3886 	}
3887 	data->shake_state = tls_hs_none;
3888 	return 1;
3889 }
3890 
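/*
 * A minimal sketch of the nonblocking handshake pattern used above,
 * with hypothetical wait_for() and drop() standing in for the event
 * re-registration and cleanup_tcp_handler() calls:
 *
 *	int r;
 *	ERR_clear_error();
 *	if((r = SSL_do_handshake(ssl)) != 1) {
 *		switch(SSL_get_error(ssl, r)) {
 *		case SSL_ERROR_WANT_READ:  wait_for(fd, EV_READ); break;
 *		case SSL_ERROR_WANT_WRITE: wait_for(fd, EV_WRITE); break;
 *		default: drop(); break;
 *		}
 *	}
 */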
3891 /** handle TLS reading of incoming query */
3892 static void
3893 handle_tls_reading(int fd, short event, void* arg)
3894 {
3895 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3896 	ssize_t received;
3897 
3898 	if ((event & EV_TIMEOUT)) {
3899 		/* Connection timed out.  */
3900 		cleanup_tcp_handler(data);
3901 		return;
3902 	}
3903 
3904 	if (data->nsd->tcp_query_count > 0 &&
3905 	    data->query_count >= data->nsd->tcp_query_count) {
3906 		/* No more queries allowed on this tcp connection. */
3907 		cleanup_tcp_handler(data);
3908 		return;
3909 	}
3910 
3911 	assert((event & EV_READ));
3912 
3913 	if (data->bytes_transmitted == 0) {
3914 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3915 	}
3916 
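	/* Finish a handshake in progress, or a pending direction switch,
	 * before reading application data. */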
3917 	if(data->shake_state != tls_hs_none) {
3918 		if(!tls_handshake(data, fd, 0))
3919 			return;
3920 		if(data->shake_state != tls_hs_none)
3921 			return;
3922 	}
3923 
3924 	/*
3925 	 * Check if we received the leading packet length bytes yet.
3926 	 */
3927 	if(data->bytes_transmitted < sizeof(uint16_t)) {
3928 		ERR_clear_error();
3929 		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
3930 		    + data->bytes_transmitted,
3931 		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
3932 			int want = SSL_get_error(data->tls, received);
3933 			if(want == SSL_ERROR_ZERO_RETURN) {
3934 				cleanup_tcp_handler(data);
3935 				return; /* shutdown, closed */
3936 			} else if(want == SSL_ERROR_WANT_READ) {
3937 				/* wants to be called again */
3938 				return;
3939 			}
3940 			else if(want == SSL_ERROR_WANT_WRITE) {
3941 				/* switch to writing */
3942 				data->shake_state = tls_hs_write_event;
3943 				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
3944 				return;
3945 			}
3946 			cleanup_tcp_handler(data);
3947 			log_crypto_err("could not SSL_read");
3948 			return;
3949 		}
3950 
3951 		data->bytes_transmitted += received;
3952 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3953 			/*
3954 			 * Not done with the tcplen yet, wait for more
3955 			 * data to become available.
3956 			 */
3957 			return;
3958 		}
3959 
3960 		assert(data->bytes_transmitted == sizeof(uint16_t));
3961 
3962 		data->query->tcplen = ntohs(data->query->tcplen);
3963 
3964 		/*
3965 		 * Minimum query size is:
3966 		 *
3967 		 *     Size of the header (12)
3968 		 *   + Root domain name   (1)
3969 		 *   + Query class        (2)
3970 		 *   + Query type         (2)
3971 		 */
3972 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3973 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3974 			cleanup_tcp_handler(data);
3975 			return;
3976 		}
3977 
3978 		if (data->query->tcplen > data->query->maxlen) {
3979 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3980 			cleanup_tcp_handler(data);
3981 			return;
3982 		}
3983 
3984 		buffer_set_limit(data->query->packet, data->query->tcplen);
3985 	}
3986 
3987 	assert(buffer_remaining(data->query->packet) > 0);
3988 
3989 	/* Read the (remaining) query data.  */
3990 	ERR_clear_error();
3991 	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
3992 			    (int)buffer_remaining(data->query->packet));
3993 	if(received <= 0) {
3994 		int want = SSL_get_error(data->tls, received);
3995 		if(want == SSL_ERROR_ZERO_RETURN) {
3996 			cleanup_tcp_handler(data);
3997 			return; /* shutdown, closed */
3998 		} else if(want == SSL_ERROR_WANT_READ) {
3999 			/* wants to be called again */
4000 			return;
4001 		}
4002 		else if(want == SSL_ERROR_WANT_WRITE) {
4003 			/* switch to writing */
4004 			data->shake_state = tls_hs_write_event;
4005 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4006 			return;
4007 		}
4008 		cleanup_tcp_handler(data);
4009 		log_crypto_err("could not SSL_read");
4010 		return;
4011 	}
4012 
4013 	data->bytes_transmitted += received;
4014 	buffer_skip(data->query->packet, received);
4015 	if (buffer_remaining(data->query->packet) > 0) {
4016 		/*
4017 		 * Message not yet complete, wait for more data to
4018 		 * become available.
4019 		 */
4020 		return;
4021 	}
4022 
4023 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4024 
4025 	/* Account... */
4026 #ifndef INET6
4027 	STATUP(data->nsd, ctls);
4028 #else
4029 	if (data->query->addr.ss_family == AF_INET) {
4030 		STATUP(data->nsd, ctls);
4031 	} else if (data->query->addr.ss_family == AF_INET6) {
4032 		STATUP(data->nsd, ctls6);
4033 	}
4034 #endif
4035 
4036 	/* We have a complete query, process it.  */
4037 
4038 	/* tcp-query-count: bump the per-connection query counter */
4039 	data->query_count++;
4040 
4041 	buffer_flip(data->query->packet);
4042 #ifdef USE_DNSTAP
4043 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
4044 		data->query->addrlen, data->query->tcp, data->query->packet);
4045 #endif /* USE_DNSTAP */
4046 	data->query_state = server_process_query(data->nsd, data->query);
4047 	if (data->query_state == QUERY_DISCARDED) {
4048 		/* Drop the packet and the entire connection... */
4049 		STATUP(data->nsd, dropped);
4050 		ZTATUP(data->nsd, data->query->zone, dropped);
4051 		cleanup_tcp_handler(data);
4052 		return;
4053 	}
4054 
4055 #ifdef BIND8_STATS
4056 	if (RCODE(data->query->packet) == RCODE_OK
4057 	    && !AA(data->query->packet))
4058 	{
4059 		STATUP(data->nsd, nona);
4060 		ZTATUP(data->nsd, data->query->zone, nona);
4061 	}
4062 #endif /* BIND8_STATS */
4063 
4064 #ifdef USE_ZONE_STATS
4065 #ifndef INET6
4066 	ZTATUP(data->nsd, data->query->zone, ctls);
4067 #else
4068 	if (data->query->addr.ss_family == AF_INET) {
4069 		ZTATUP(data->nsd, data->query->zone, ctls);
4070 	} else if (data->query->addr.ss_family == AF_INET6) {
4071 		ZTATUP(data->nsd, data->query->zone, ctls6);
4072 	}
4073 #endif
4074 #endif /* USE_ZONE_STATS */
4075 
4076 	query_add_optional(data->query, data->nsd);
4077 
4078 	/* Switch to the tcp write handler.  */
4079 	buffer_flip(data->query->packet);
4080 	data->query->tcplen = buffer_remaining(data->query->packet);
4081 #ifdef BIND8_STATS
4082 	/* Account the rcode & TC... */
4083 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4084 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4085 	if (TC(data->query->packet)) {
4086 		STATUP(data->nsd, truncated);
4087 		ZTATUP(data->nsd, data->query->zone, truncated);
4088 	}
4089 #endif /* BIND8_STATS */
4090 #ifdef USE_DNSTAP
4091 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
4092 		data->query->addrlen, data->query->tcp, data->query->packet,
4093 		data->query->zone);
4094 #endif /* USE_DNSTAP */
4095 	data->bytes_transmitted = 0;
4096 
4097 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4098 
4099 	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
4100 	handle_tls_writing(fd, EV_WRITE, data);
4101 }
4102 
4103 /** handle TLS writing of outgoing response */
4104 static void
4105 handle_tls_writing(int fd, short event, void* arg)
4106 {
4107 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4108 	ssize_t sent;
4109 	struct query *q = data->query;
4110 	/* static buffer used to reassemble the response with the two-byte
4111 	 * TCP length in front of the packet, like writev does for TCP */
4112 	static buffer_type* global_tls_temp_buffer = NULL;
4113 	buffer_type* write_buffer;
4114 
4115 	if ((event & EV_TIMEOUT)) {
4116 		/* Connection timed out.  */
4117 		cleanup_tcp_handler(data);
4118 		return;
4119 	}
4120 
4121 	assert((event & EV_WRITE));
4122 
4123 	if(data->shake_state != tls_hs_none) {
4124 		if(!tls_handshake(data, fd, 1))
4125 			return;
4126 		if(data->shake_state != tls_hs_none)
4127 			return;
4128 	}
4129 
4130 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
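	/* With SSL_MODE_ENABLE_PARTIAL_WRITE, SSL_write can succeed after
	 * writing only part of the buffer, so short writes are handled
	 * below much like plain write(2). */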
4131 
4132 	/* If we are writing the start of a message, we must include the
4133 	 * length; this is done with a copy into write_buffer. */
4134 	write_buffer = NULL;
4135 	if (data->bytes_transmitted == 0) {
4136 		if(!global_tls_temp_buffer) {
4137 			/* allocated from nsd.region, so it is deallocated
4138 			 * when nsd shuts down */
4139 			global_tls_temp_buffer = buffer_create(nsd.region,
4140 				QIOBUFSZ + sizeof(q->tcplen));
4141 			if (!global_tls_temp_buffer) {
4142 				return;
4143 			}
4144 		}
4145 		write_buffer = global_tls_temp_buffer;
4146 		buffer_clear(write_buffer);
4147 		buffer_write_u16(write_buffer, q->tcplen);
4148 		buffer_write(write_buffer, buffer_current(q->packet),
4149 			(int)buffer_remaining(q->packet));
4150 		buffer_flip(write_buffer);
4151 	} else {
4152 		write_buffer = q->packet;
4153 	}
4154 
4155 	/* Write the response */
4156 	ERR_clear_error();
4157 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4158 	if(sent <= 0) {
4159 		int want = SSL_get_error(data->tls, sent);
4160 		if(want == SSL_ERROR_ZERO_RETURN) {
4161 			cleanup_tcp_handler(data);
4162 			/* closed */
4163 		} else if(want == SSL_ERROR_WANT_READ) {
4164 			/* switch back to reading */
4165 			data->shake_state = tls_hs_read_event;
4166 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4167 		} else if(want != SSL_ERROR_WANT_WRITE) {
4168 			cleanup_tcp_handler(data);
4169 			log_crypto_err("could not SSL_write");
4170 		}
4171 		return;
4172 	}
4173 
4174 	buffer_skip(write_buffer, sent);
4175 	if(buffer_remaining(write_buffer) != 0) {
4176 		/* Partial write: if the temp buffer was used, advance q->packet past the payload bytes that went out. */
4177 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4178 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4179 		}
4180 	}
4181 
4182 	data->bytes_transmitted += sent;
4183 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4184 		/*
4185 		 * Still more data to write when socket becomes
4186 		 * writable again.
4187 		 */
4188 		return;
4189 	}
4190 
4191 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4192 
4193 	if (data->query_state == QUERY_IN_AXFR) {
4194 		/* Continue processing AXFR and writing back results.  */
4195 		buffer_clear(q->packet);
4196 		data->query_state = query_axfr(data->nsd, q);
4197 		if (data->query_state != QUERY_PROCESSED) {
4198 			query_add_optional(data->query, data->nsd);
4199 
4200 			/* Reset data. */
4201 			buffer_flip(q->packet);
4202 			q->tcplen = buffer_remaining(q->packet);
4203 			data->bytes_transmitted = 0;
4204 			/* Reset to writing mode.  */
4205 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4206 
4207 			/*
4208 			 * Write data if/when the socket is writable
4209 			 * again.
4210 			 */
4211 			return;
4212 		}
4213 	}
4214 
4215 	/*
4216 	 * Done sending, wait for the next request to arrive on the
4217 	 * TCP socket by installing the TCP read handler.
4218 	 */
4219 	if (data->nsd->tcp_query_count > 0 &&
4220 		data->query_count >= data->nsd->tcp_query_count) {
4221 
4222 		(void) shutdown(fd, SHUT_WR);
4223 	}
4224 
4225 	data->bytes_transmitted = 0;
4226 
4227 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4228 }
4229 #endif
4230 
4231 static void
4232 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4233 	void* ATTR_UNUSED(arg))
4234 {
4235 	if(slowaccept) {
4236 		configure_handler_event_types(EV_PERSIST | EV_READ);
4237 		slowaccept = 0;
4238 	}
4239 }
4240 
4241 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4242 {
4243 #ifndef HAVE_ACCEPT4
4244 	int s = accept(fd, addr, addrlen);
4245 	if (s != -1) {
4246 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4247 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4248 			close(s);
4249 			s = -1;
4250 			errno=EINTR; /* set errno to EINTR so the calling
4251 				accept code treats this as transient and
4252 				omits the error printout */
4253 		}
4254 	}
4255 	return s;
4256 #else
4257 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4258 #endif /* HAVE_ACCEPT4 */
4259 }
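/* accept4(2) makes the new socket nonblocking atomically; the fallback
 * above emulates that with accept(2) followed by fcntl(2). */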
4260 
4261 /*
4262  * Handle an incoming TCP connection.  The connection is accepted and
4263  * a new TCP reader event handler is added.  The TCP handler
4264  * is responsible for cleanup when the connection is closed.
4265  */
4266 static void
4267 handle_tcp_accept(int fd, short event, void* arg)
4268 {
4269 	struct tcp_accept_handler_data *data
4270 		= (struct tcp_accept_handler_data *) arg;
4271 	int s;
4272 	int reject = 0;
4273 	struct tcp_handler_data *tcp_data;
4274 	region_type *tcp_region;
4275 #ifdef INET6
4276 	struct sockaddr_storage addr;
4277 #else
4278 	struct sockaddr_in addr;
4279 #endif
4280 	socklen_t addrlen;
4281 	struct timeval timeout;
4282 
4283 	if (!(event & EV_READ)) {
4284 		return;
4285 	}
4286 
4287 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4288 		reject = data->nsd->options->tcp_reject_overflow;
4289 		if (!reject) {
4290 			return;
4291 		}
4292 	}
4293 
4294 	/* Accept it... */
4295 	addrlen = sizeof(addr);
4296 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4297 	if (s == -1) {
4298 		/**
4299 		 * EMFILE and ENFILE signal that the limit of open
4300 		 * file descriptors has been reached. Pause accept().
4301 		 * EINTR is a signal interrupt. The other errno values are
4302 		 * various OS ways of saying the client closed the connection.
4303 		 */
4304 		if (errno == EMFILE || errno == ENFILE) {
4305 			if (!slowaccept) {
4306 				/* disable accept events */
4307 				struct timeval tv;
4308 				configure_handler_event_types(0);
4309 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4310 				tv.tv_usec = 0L;
4311 				memset(&slowaccept_event, 0,
4312 					sizeof(slowaccept_event));
4313 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4314 					handle_slowaccept_timeout, NULL);
4315 				(void)event_base_set(data->event.ev_base,
4316 					&slowaccept_event);
4317 				(void)event_add(&slowaccept_event, &tv);
4318 				slowaccept = 1;
4319 				/* We don't want to spam the logs here */
4320 			}
4321 		} else if (errno != EINTR
4322 			&& errno != EWOULDBLOCK
4323 #ifdef ECONNABORTED
4324 			&& errno != ECONNABORTED
4325 #endif /* ECONNABORTED */
4326 #ifdef EPROTO
4327 			&& errno != EPROTO
4328 #endif /* EPROTO */
4329 			) {
4330 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4331 		}
4332 		return;
4333 	}
4334 
4335 	if (reject) {
4336 		shutdown(s, SHUT_RDWR);
4337 		close(s);
4338 		return;
4339 	}
4340 
4341 	/*
4342 	 * This region is deallocated when the TCP connection is
4343 	 * closed by the TCP handler.
4344 	 */
4345 	tcp_region = region_create(xalloc, free);
4346 	tcp_data = (struct tcp_handler_data *) region_alloc(
4347 		tcp_region, sizeof(struct tcp_handler_data));
4348 	tcp_data->region = tcp_region;
4349 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4350 		compression_table_size, compressed_dnames);
4351 	tcp_data->nsd = data->nsd;
4352 	tcp_data->query_count = 0;
4353 #ifdef HAVE_SSL
4354 	tcp_data->shake_state = tls_hs_none;
4355 	tcp_data->tls = NULL;
4356 #endif
4357 	tcp_data->prev = NULL;
4358 	tcp_data->next = NULL;
4359 
4360 	tcp_data->query_state = QUERY_PROCESSED;
4361 	tcp_data->bytes_transmitted = 0;
4362 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4363 	tcp_data->query->addrlen = addrlen;
4364 
4365 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4366 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4367 		/* very busy, give smaller timeout */
4368 		tcp_data->tcp_timeout = 200;
4369 	}
4370 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4371 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4372 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
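	/* e.g. the busy-server value of 200 ms yields tv_sec = 0, tv_usec = 200000 */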
4373 
4374 #ifdef HAVE_SSL
4375 	if (data->tls_accept) {
4376 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4377 		if(!tcp_data->tls) {
4378 			close(s);
4379 			return;
4380 		}
4381 		tcp_data->shake_state = tls_hs_read;
4382 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4383 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4384 			  handle_tls_reading, tcp_data);
4385 	} else {
4386 #endif
4387 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4388 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4389 			  handle_tcp_reading, tcp_data);
4390 #ifdef HAVE_SSL
4391 	}
4392 #endif
4393 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4394 		log_msg(LOG_ERR, "cannot set tcp event base");
4395 		close(s);
4396 		region_destroy(tcp_region);
4397 		return;
4398 	}
4399 	if(event_add(&tcp_data->event, &timeout) != 0) {
4400 		log_msg(LOG_ERR, "cannot add tcp to event base");
4401 		close(s);
4402 		region_destroy(tcp_region);
4403 		return;
4404 	}
4405 	if(tcp_active_list) {
4406 		tcp_active_list->prev = tcp_data;
4407 		tcp_data->next = tcp_active_list;
4408 	}
4409 	tcp_active_list = tcp_data;
4410 
4411 	/*
4412 	 * Keep track of the total number of TCP handlers installed so
4413 	 * we can stop accepting connections when the maximum number
4414 	 * of simultaneous TCP connections is reached.
4415 	 *
4416 	 * If tcp-reject-overflow is enabled, however, then we do not
4417 	 * change the handler event type; we keep it as-is and accept
4418 	 * overflow TCP connections only so that we can forcibly kill
4419 	 * them off.
4420 	 */
4421 	++data->nsd->current_tcp_count;
4422 	if (!data->nsd->options->tcp_reject_overflow &&
4423 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4424 	{
4425 		configure_handler_event_types(0);
4426 	}
4427 }
4428 
4429 static void
4430 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4431 {
4432 	size_t i;
4433 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4434 	for (i = 0; i < nsd->child_count; ++i) {
4435 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4436 			if (write(nsd->children[i].child_fd,
4437 				&command,
4438 				sizeof(command)) == -1)
4439 			{
4440 				if(errno != EAGAIN && errno != EINTR)
4441 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4442 					(int) command,
4443 					(int) nsd->children[i].pid,
4444 					strerror(errno));
4445 			} else if (timeout > 0) {
4446 				(void)block_read(NULL,
4447 					nsd->children[i].child_fd,
4448 					&command, sizeof(command), timeout);
4449 			}
4450 			fsync(nsd->children[i].child_fd);
4451 			close(nsd->children[i].child_fd);
4452 			nsd->children[i].child_fd = -1;
4453 		}
4454 	}
4455 }
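/* The command is written down each child's socketpair; when a timeout is
 * given, block_read() waits up to that many seconds for the child to
 * acknowledge before the pipe is closed. */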
4456 
4457 static void
4458 send_children_quit(struct nsd* nsd)
4459 {
4460 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4461 	send_children_command(nsd, NSD_QUIT, 0);
4462 }
4463 
4464 static void
4465 send_children_quit_and_wait(struct nsd* nsd)
4466 {
4467 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4468 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4469 }
4470 
4471 #ifdef BIND8_STATS
4472 static void
4473 set_children_stats(struct nsd* nsd)
4474 {
4475 	size_t i;
4476 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4477 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
4478 	for (i = 0; i < nsd->child_count; ++i) {
4479 		nsd->children[i].need_to_send_STATS = 1;
4480 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
4481 	}
4482 }
4483 #endif /* BIND8_STATS */
4484 
4485 static void
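/*
 * (Re)register all TCP accept handlers with the given event types, or
 * remove them when event_types is 0.  This is how accepting new TCP
 * connections is paused at the connection or file descriptor limit and
 * resumed again later.
 */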
4486 configure_handler_event_types(short event_types)
4487 {
4488 	size_t i;
4489 
4490 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4491 		struct event* handler = &tcp_accept_handlers[i].event;
4492 		if(event_types) {
4493 			/* reassign */
4494 			int fd = handler->ev_fd;
4495 			struct event_base* base = handler->ev_base;
4496 			if(tcp_accept_handlers[i].event_added)
4497 				event_del(handler);
4498 			memset(handler, 0, sizeof(*handler));
4499 			event_set(handler, fd, event_types,
4500 				handle_tcp_accept, &tcp_accept_handlers[i]);
4501 			if(event_base_set(base, handler) != 0)
4502 				log_msg(LOG_ERR, "conhand: cannot event_base");
4503 			if(event_add(handler, NULL) != 0)
4504 				log_msg(LOG_ERR, "conhand: cannot event_add");
4505 			tcp_accept_handlers[i].event_added = 1;
4506 		} else {
4507 			/* remove */
4508 			if(tcp_accept_handlers[i].event_added) {
4509 				event_del(handler);
4510 				tcp_accept_handlers[i].event_added = 0;
4511 			}
4512 		}
4513 	}
4514 }
4515